feat: graph and measurement code for CPU lossless algorithms
This commit is contained in:
parent
9cd37f156a
commit
3bf3667849
7 changed files with 205 additions and 0 deletions
114
cpu_compression_graphs.py
Normal file
114
cpu_compression_graphs.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import os
|
||||
|
||||
|
||||
# Columns the plots read from the metrics CSV.
_REQUIRED_COLUMNS = (
    'filename',
    'compressor',
    'compression_ratio',
    'compression_time',
    'decompression_time',
)


def _save_figure(output_dir, png_name):
    """Tighten, save and close the current matplotlib figure."""
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, png_name), dpi=300, bbox_inches='tight')
    plt.close()


def _plot_grouped_bars(df, metric, title, ylabel, output_dir, png_name, log_scale=False):
    """Draw one bar group per file (one bar per compressor) for ``metric``."""
    plt.figure()
    sns.barplot(
        x='filename',
        y=metric,
        hue='compressor',
        data=df,
        palette='viridis',
    )
    plt.title(title)
    plt.xlabel('File')
    plt.ylabel(ylabel)
    if log_scale:
        plt.yscale('log')
    plt.xticks(rotation=45, ha='right')
    _save_figure(output_dir, png_name)


def plot_compression_metrics(csv_path, output_dir="results/compress/plots"):
    """Generate visualizations for compression metrics.

    Four PNG files are written to ``output_dir``:

    * ``compression_ratio_comparison.png``: compression ratio per file,
      grouped by compressor.
    * ``compression_time_comparison.png``: compression time per file,
      grouped by compressor, on a logarithmic y axis.
    * ``decompression_time_comparison.png``: decompression time per file,
      grouped by compressor, on a logarithmic y axis.
    * ``ratio_vs_time.png``: scatter of compression ratio against
      compression time, one series per compressor.

    Args:
        csv_path (str): Path to the CSV file with compression metrics. It
            must contain the columns ``filename``, ``compressor``,
            ``compression_ratio``, ``compression_time`` and
            ``decompression_time``.
        output_dir (str): Directory to save the output plots. It is created
            if it does not exist.

    Raises:
        KeyError: If the CSV is missing any of the required columns.
    """
    # Read and validate before touching the filesystem, so a bad input
    # neither leaves an empty output directory nor fails halfway through.
    df = pd.read_csv(csv_path)
    missing = [column for column in _REQUIRED_COLUMNS if column not in df.columns]
    if missing:
        raise KeyError(
            f"{csv_path} is missing required column(s): {', '.join(missing)}"
        )

    os.makedirs(output_dir, exist_ok=True)

    # Set up the plotting style
    sns.set(style="whitegrid")
    plt.rcParams['figure.figsize'] = (12, 6)

    # 1. Compression Ratio Comparison
    _plot_grouped_bars(
        df,
        metric='compression_ratio',
        title='Compression Ratio by File and Compressor',
        ylabel='Compression Ratio (Higher is better)',
        output_dir=output_dir,
        png_name='compression_ratio_comparison.png',
    )

    # 2. Compression Time Comparison
    _plot_grouped_bars(
        df,
        metric='compression_time',
        title='Compression Time by File and Compressor',
        ylabel='Compression Time (seconds) (log)',
        output_dir=output_dir,
        png_name='compression_time_comparison.png',
        log_scale=True,
    )

    # 3. Decompression Time Comparison
    _plot_grouped_bars(
        df,
        metric='decompression_time',
        title='Decompression Time by File and Compressor',
        ylabel='Decompression Time (seconds) (log)',
        output_dir=output_dir,
        png_name='decompression_time_comparison.png',
        log_scale=True,
    )

    # 4. Scatter plot: Compression Ratio vs Compression Time
    plt.figure(figsize=(10, 6))
    # sort=False keeps the compressors in order of first appearance in the CSV.
    for compressor, subset in df.groupby('compressor', sort=False):
        plt.scatter(
            subset['compression_ratio'],
            subset['compression_time'],
            # str() so that non-string compressor labels do not crash .upper()
            label=str(compressor).upper(),
            s=100,
            alpha=0.7,
        )

    plt.title('Compression Ratio vs Compression Time')
    plt.xlabel('Compression Ratio (Higher is better)')
    plt.ylabel('Compression Time (seconds)')
    plt.legend(title='Compressor')
    plt.grid(True, alpha=0.3)
    _save_figure(output_dir, 'ratio_vs_time.png')

    print(f"Plots saved to {output_dir}/")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Usage: python cpu_compression_graphs.py [csv_path]
    # With no argument, fall back to the benchmark run stored under
    # results/compress/.
    default_csv_path = "results/compress/compression_metrics_20251216_182603.csv"
    csv_path = sys.argv[1] if len(sys.argv) > 1 else default_csv_path
    plot_compression_metrics(csv_path)
|
||||
BIN
graphs/plots/compression_ratio_comparison.png
Normal file
BIN
graphs/plots/compression_ratio_comparison.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 289 KiB |
BIN
graphs/plots/compression_time_comparison.png
Normal file
BIN
graphs/plots/compression_time_comparison.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 278 KiB |
BIN
graphs/plots/decompression_time_comparison.png
Normal file
BIN
graphs/plots/decompression_time_comparison.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 278 KiB |
BIN
graphs/plots/ratio_vs_time.png
Normal file
BIN
graphs/plots/ratio_vs_time.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 143 KiB |
66
measure_gzip_lz4.sh
Executable file
66
measure_gzip_lz4.sh
Executable file
|
|
@ -0,0 +1,66 @@
|
|||
#!/bin/bash
#
# Benchmark gzip and lz4 on every regular file in a dataset directory.
#
# Usage: measure_gzip_lz4.sh <dataset_directory>
#
# For each file and each compressor, one CSV row is appended to
# compression_metrics_<timestamp>.csv in the current directory with the
# original size, compressed size, compression ratio and the wall-clock
# compression and decompression times in seconds.
#
# Relies on `stat -c%s` and `date +%s.%N` (GNU coreutils syntax) and on bc.
# The dataset directory is only read: compressed artifacts are written to a
# private scratch directory and decompressed output is discarded.

# Check if a directory is provided
if [ $# -ne 1 ]; then
    echo "Usage: $0 <dataset_directory>"
    exit 1
fi

dataset_dir="$1"

if [ ! -d "$dataset_dir" ]; then
    echo "Error: '$dataset_dir' is not a directory" >&2
    exit 1
fi

# Fail early instead of writing a CSV full of empty fields.
for tool in gzip lz4 bc; do
    if ! command -v "$tool" > /dev/null 2>&1; then
        echo "Error: required tool '$tool' was not found in PATH" >&2
        exit 1
    fi
done

output_file="compression_metrics_$(date +%Y%m%d_%H%M%S).csv"

# Scratch space for compressed artifacts, so that files already present in
# the dataset (e.g. an existing name.gz) are never overwritten or deleted.
tmp_dir=$(mktemp -d) || exit 1
trap 'rm -rf "$tmp_dir"' EXIT

# Print the seconds elapsed since the given `date +%s.%N` timestamp.
elapsed_since() {
    echo "$(date +%s.%N) - $1" | bc
}

# benchmark <compressor> <file> <filename> <original_size>
# Time one compress/decompress round trip and append a row to $output_file.
benchmark() {
    local compressor="$1" file="$2" filename="$3" original_size="$4"
    local compressed="$tmp_dir/$filename.$compressor"
    local start_time status
    local compression_time decompression_time
    local compressed_size compression_ratio

    echo " Testing $compressor..."

    start_time=$(date +%s.%N)
    case "$compressor" in
        gzip) gzip -c "$file" > "$compressed" ;;
        lz4)  lz4 -f -q "$file" "$compressed" ;;
        *)    false ;;
    esac
    status=$?
    compression_time=$(elapsed_since "$start_time")

    compressed_size=$(stat -c%s "$compressed" 2> /dev/null)
    if [ "$status" -ne 0 ] || [ -z "$compressed_size" ] || [ "$compressed_size" -eq 0 ]; then
        echo " Warning: $compressor failed to compress $filename, skipping" >&2
        rm -f "$compressed"
        return 1
    fi
    compression_ratio=$(echo "scale=2; $original_size / $compressed_size" | bc -l)

    # Decompress and measure time. Both tools write to /dev/null so the two
    # measurements are comparable and the original file is never overwritten.
    start_time=$(date +%s.%N)
    case "$compressor" in
        gzip) gzip -d -c < "$compressed" > /dev/null ;;
        lz4)  lz4 -d -c -q "$compressed" > /dev/null ;;
    esac
    status=$?
    decompression_time=$(elapsed_since "$start_time")

    # Clean up
    rm -f "$compressed"

    if [ "$status" -ne 0 ]; then
        echo " Warning: $compressor failed to decompress $filename, skipping" >&2
        return 1
    fi

    # Write results
    echo "$filename,$original_size,$compressor,$compressed_size,$compression_ratio,$compression_time,$decompression_time" >> "$output_file"
}

# Create output CSV file with header
echo "filename,original_size,compressor,compressed_size,compression_ratio,compression_time,decompression_time" > "$output_file"

# Process each file in the dataset
for file in "$dataset_dir"/*; do
    # Skip if not a regular file (this also covers the literal, unexpanded
    # glob pattern when the directory is empty).
    [ -f "$file" ] || continue

    filename=$(basename "$file")
    original_size=$(stat -c%s "$file")

    echo "Processing: $filename"

    benchmark gzip "$file" "$filename" "$original_size"
    benchmark lz4 "$file" "$filename" "$original_size"
done

echo "All tests completed. Results saved to $output_file"
|
||||
25
results/compress/compression_metrics_20251216_182603.csv
Normal file
25
results/compress/compression_metrics_20251216_182603.csv
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
filename,original_size,compressor,compressed_size,compression_ratio,compression_time,decompression_time
|
||||
genome.fna,4699745,gzip,1424004,3.30,.681197994,.015465955
|
||||
genome.fna,4699745,lz4,2655438,1.76,.012701161,.009190410
|
||||
genome_large.fna,23498433,gzip,7118154,3.30,3.384480370,.067414798
|
||||
genome_large.fna,23498433,lz4,13275544,1.77,.020719873,.025022334
|
||||
genome_small.fna,1367,gzip,589,2.32,.001937446,.001983156
|
||||
genome_small.fna,1367,lz4,1041,1.31,.001883076,.002144425
|
||||
genome_xlarge.fna,46996793,gzip,14235842,3.30,6.775190783,.131633333
|
||||
genome_xlarge.fna,46996793,lz4,26551229,1.77,.031734579,.043495412
|
||||
genome_xsmall.fna,1043,gzip,475,2.19,.002007016,.002012775
|
||||
genome_xsmall.fna,1043,lz4,814,1.28,.001954316,.002085566
|
||||
genome_xxsmall.fna,800,gzip,393,2.03,.002071485,.001958195
|
||||
genome_xxsmall.fna,800,lz4,641,1.24,.001893416,.001943666
|
||||
text_large.txt,12977332,gzip,4770044,2.72,.613155078,.043915520
|
||||
text_large.txt,12977332,lz4,7879136,1.64,.017927300,.015065196
|
||||
text_small.txt,1022,gzip,590,1.73,.002070305,.001903226
|
||||
text_small.txt,1022,lz4,857,1.19,.001967146,.002040285
|
||||
text.txt,6488666,gzip,2385264,2.72,.308393934,.023656716
|
||||
text.txt,6488666,lz4,3939378,1.64,.014891266,.009709618
|
||||
text_xlarge.txt,25954664,gzip,9539638,2.72,1.229028819,.085925486
|
||||
text_xlarge.txt,25954664,lz4,15758785,1.64,.023613977,.023486747
|
||||
text_xsmall.txt,825,gzip,473,1.74,.002110205,.001980535
|
||||
text_xsmall.txt,825,lz4,678,1.21,.001757717,.002191075
|
||||
text_xxsmall.txt,492,gzip,325,1.51,.001867306,.002114055
|
||||
text_xxsmall.txt,492,lz4,438,1.12,.001869646,.002134206
|
||||
|
Reference in a new issue