feat: graph and measurement code for CPU lossless algorithms

This commit is contained in:
RobinMeersman 2025-12-16 18:54:29 +01:00
parent 9cd37f156a
commit 3bf3667849
7 changed files with 205 additions and 0 deletions

114
cpu_compression_graphs.py Normal file
View file

@ -0,0 +1,114 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
def _format_size(size_bytes):
    """Return *size_bytes* as a human-readable string in B, KB, MB or GB."""
    for unit in ('B', 'KB', 'MB'):
        if size_bytes < 1024.0:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024.0
    return f"{size_bytes:.1f} GB"


def _save_and_close(output_dir, filename):
    """Write the current figure to ``output_dir/filename`` and always close it."""
    try:
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
    finally:
        # Close even when saving fails so figures do not pile up in memory.
        plt.close()


def _plot_metric_bars(df, metric, title, ylabel, filename, output_dir, log_scale=False):
    """Draw one grouped bar chart of *metric* per file, coloured by compressor."""
    plt.figure()
    try:
        sns.barplot(
            x='filename',
            y=metric,
            hue='compressor',
            data=df,
            palette='viridis'
        )
        plt.title(title)
        plt.xlabel('File')
        plt.ylabel(ylabel)
        if log_scale:
            plt.yscale('log')
        plt.xticks(rotation=45, ha='right')
    except Exception:
        # Drawing failed before we reached the save step: release the figure.
        plt.close()
        raise
    _save_and_close(output_dir, filename)


def plot_compression_metrics(csv_path, output_dir="results/compress/plots"):
    """Generate visualizations for compression metrics.

    Reads the CSV at ``csv_path`` and writes four PNG files into
    ``output_dir``:

    * ``compression_ratio_comparison.png`` - ratio per file and compressor
    * ``compression_time_comparison.png`` - compression time (log scale)
    * ``decompression_time_comparison.png`` - decompression time (log scale)
    * ``ratio_vs_time.png`` - scatter of ratio against compression time

    The CSV must provide the columns ``filename``, ``original_size``,
    ``compressor``, ``compression_ratio``, ``compression_time`` and
    ``decompression_time``.

    Args:
        csv_path (str): Path to the CSV file with compression metrics
        output_dir (str): Directory to save the output plots
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read the data
    df = pd.read_csv(csv_path)

    # Add a human-readable size column (not consumed by the plots below).
    df['file_size'] = df['original_size'].apply(_format_size)

    # Set up the plotting style
    sns.set(style="whitegrid")
    plt.rcParams['figure.figsize'] = (12, 6)

    # 1. Compression Ratio Comparison
    _plot_metric_bars(
        df,
        metric='compression_ratio',
        title='Compression Ratio by File and Compressor',
        ylabel='Compression Ratio (Higher is better)',
        filename='compression_ratio_comparison.png',
        output_dir=output_dir,
    )

    # 2. Compression Time Comparison (log scale: times span several orders
    #    of magnitude between the smallest and largest files)
    _plot_metric_bars(
        df,
        metric='compression_time',
        title='Compression Time by File and Compressor',
        ylabel='Compression Time (seconds) (log)',
        filename='compression_time_comparison.png',
        output_dir=output_dir,
        log_scale=True,
    )

    # 3. Decompression Time Comparison
    _plot_metric_bars(
        df,
        metric='decompression_time',
        title='Decompression Time by File and Compressor',
        ylabel='Decompression Time (seconds) (log)',
        filename='decompression_time_comparison.png',
        output_dir=output_dir,
        log_scale=True,
    )

    # 4. Scatter plot: Compression Ratio vs Compression Time
    plt.figure(figsize=(10, 6))
    try:
        # One scatter series per compressor so each gets its own legend entry.
        for compressor in df['compressor'].unique():
            subset = df[df['compressor'] == compressor]
            plt.scatter(
                subset['compression_ratio'],
                subset['compression_time'],
                label=compressor.upper(),
                s=100,
                alpha=0.7
            )
        plt.title('Compression Ratio vs Compression Time')
        plt.xlabel('Compression Ratio (Higher is better)')
        plt.ylabel('Compression Time (seconds)')
        plt.legend(title='Compressor')
        plt.grid(True, alpha=0.3)
    except Exception:
        plt.close()
        raise
    _save_and_close(output_dir, 'ratio_vs_time.png')

    print(f"Plots saved to {output_dir}/")
if __name__ == "__main__":
    import sys

    # The measurement script writes a freshly timestamped CSV on every run,
    # so accept the path as an optional first command-line argument and fall
    # back to the previously hard-coded file when none is given.
    default_csv_path = "results/compress/compression_metrics_20251216_182603.csv"
    csv_path = sys.argv[1] if len(sys.argv) > 1 else default_csv_path
    plot_compression_metrics(csv_path)

Binary file not shown.

After

Width:  |  Height:  |  Size: 289 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 278 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 278 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 143 KiB

66
measure_gzip_lz4.sh Executable file
View file

@ -0,0 +1,66 @@
#!/bin/bash
# Benchmark gzip and lz4 on every regular file in a dataset directory.
#
# For each file and compressor, one CSV row is appended with the original
# size, compressed size, compression ratio and the wall-clock compression and
# decompression times in seconds.
#
# Usage:    measure_gzip_lz4.sh <dataset_directory>
# Requires: gzip, lz4, bc, and GNU stat/date (`stat -c%s`, `date +%s.%N`).

# Check if a directory is provided
if [ $# -ne 1 ]; then
    echo "Usage: $0 <dataset_directory>"
    exit 1
fi

dataset_dir="$1"
if [ ! -d "$dataset_dir" ]; then
    echo "Error: '$dataset_dir' is not a directory" >&2
    exit 1
fi

output_file="compression_metrics_$(date +%Y%m%d_%H%M%S).csv"

# Create output CSV file with header
echo "filename,original_size,compressor,compressed_size,compression_ratio,compression_time,decompression_time" > "$output_file"

# Compressed artifacts are written next to the input file. Track the one in
# flight so an interrupted run does not leave it behind, where the next run
# would pick it up as a dataset file.
current_artifact=""
cleanup() {
    if [ -n "$current_artifact" ]; then
        rm -f "$current_artifact"
    fi
}
trap cleanup EXIT
trap 'exit 130' INT TERM

# Print the seconds elapsed since the timestamp given as $1.
elapsed_since() {
    echo "$(date +%s.%N) - $1" | bc
}

# Print the ratio original/compressed ($1 / $2) with two decimals.
compression_ratio_of() {
    echo "scale=2; $1 / $2" | bc -l
}

# Process each file in the dataset
for file in "$dataset_dir"/*; do
    # Skip anything that is not a regular file (also covers an unmatched glob)
    [ -f "$file" ] || continue

    filename=$(basename "$file")
    original_size=$(stat -c%s "$file")

    echo "Processing: $filename"

    # Test gzip
    echo "  Testing gzip..."
    current_artifact="${file}.gz"
    start_time=$(date +%s.%N)
    # -c writes to stdout and leaves the input file untouched.
    if gzip -c "$file" > "$current_artifact"; then
        compression_time=$(elapsed_since "$start_time")
        compressed_size=$(stat -c%s "$current_artifact")
        compression_ratio=$(compression_ratio_of "$original_size" "$compressed_size")

        # Decompress and measure time. Output is discarded rather than
        # written back over the original file, so the dataset is never
        # modified and the measurement matches the lz4 one below.
        start_time=$(date +%s.%N)
        gzip -d -c "$current_artifact" > /dev/null
        decompression_time=$(elapsed_since "$start_time")

        # Write results
        echo "$filename,$original_size,gzip,$compressed_size,$compression_ratio,$compression_time,$decompression_time" >> "$output_file"
    else
        echo "  Warning: gzip failed on $filename; skipping" >&2
    fi

    # Clean up
    rm -f "$current_artifact"

    # Test lz4
    echo "  Testing lz4..."
    current_artifact="${file}.lz4"
    start_time=$(date +%s.%N)
    if lz4 -f -q "$file" "$current_artifact"; then
        compression_time=$(elapsed_since "$start_time")
        compressed_size=$(stat -c%s "$current_artifact")
        compression_ratio=$(compression_ratio_of "$original_size" "$compressed_size")

        # Decompress and measure time (output discarded)
        start_time=$(date +%s.%N)
        lz4 -d -c -q "$current_artifact" > /dev/null
        decompression_time=$(elapsed_since "$start_time")

        # Write results
        echo "$filename,$original_size,lz4,$compressed_size,$compression_ratio,$compression_time,$decompression_time" >> "$output_file"
    else
        echo "  Warning: lz4 failed on $filename; skipping" >&2
    fi

    # Clean up
    rm -f "$current_artifact"
    current_artifact=""
done

echo "All tests completed. Results saved to $output_file"

View file

@ -0,0 +1,25 @@
filename,original_size,compressor,compressed_size,compression_ratio,compression_time,decompression_time
genome.fna,4699745,gzip,1424004,3.30,.681197994,.015465955
genome.fna,4699745,lz4,2655438,1.76,.012701161,.009190410
genome_large.fna,23498433,gzip,7118154,3.30,3.384480370,.067414798
genome_large.fna,23498433,lz4,13275544,1.77,.020719873,.025022334
genome_small.fna,1367,gzip,589,2.32,.001937446,.001983156
genome_small.fna,1367,lz4,1041,1.31,.001883076,.002144425
genome_xlarge.fna,46996793,gzip,14235842,3.30,6.775190783,.131633333
genome_xlarge.fna,46996793,lz4,26551229,1.77,.031734579,.043495412
genome_xsmall.fna,1043,gzip,475,2.19,.002007016,.002012775
genome_xsmall.fna,1043,lz4,814,1.28,.001954316,.002085566
genome_xxsmall.fna,800,gzip,393,2.03,.002071485,.001958195
genome_xxsmall.fna,800,lz4,641,1.24,.001893416,.001943666
text_large.txt,12977332,gzip,4770044,2.72,.613155078,.043915520
text_large.txt,12977332,lz4,7879136,1.64,.017927300,.015065196
text_small.txt,1022,gzip,590,1.73,.002070305,.001903226
text_small.txt,1022,lz4,857,1.19,.001967146,.002040285
text.txt,6488666,gzip,2385264,2.72,.308393934,.023656716
text.txt,6488666,lz4,3939378,1.64,.014891266,.009709618
text_xlarge.txt,25954664,gzip,9539638,2.72,1.229028819,.085925486
text_xlarge.txt,25954664,lz4,15758785,1.64,.023613977,.023486747
text_xsmall.txt,825,gzip,473,1.74,.002110205,.001980535
text_xsmall.txt,825,lz4,678,1.21,.001757717,.002191075
text_xxsmall.txt,492,gzip,325,1.51,.001867306,.002114055
text_xxsmall.txt,492,lz4,438,1.12,.001869646,.002134206
1 filename original_size compressor compressed_size compression_ratio compression_time decompression_time
2 genome.fna 4699745 gzip 1424004 3.30 .681197994 .015465955
3 genome.fna 4699745 lz4 2655438 1.76 .012701161 .009190410
4 genome_large.fna 23498433 gzip 7118154 3.30 3.384480370 .067414798
5 genome_large.fna 23498433 lz4 13275544 1.77 .020719873 .025022334
6 genome_small.fna 1367 gzip 589 2.32 .001937446 .001983156
7 genome_small.fna 1367 lz4 1041 1.31 .001883076 .002144425
8 genome_xlarge.fna 46996793 gzip 14235842 3.30 6.775190783 .131633333
9 genome_xlarge.fna 46996793 lz4 26551229 1.77 .031734579 .043495412
10 genome_xsmall.fna 1043 gzip 475 2.19 .002007016 .002012775
11 genome_xsmall.fna 1043 lz4 814 1.28 .001954316 .002085566
12 genome_xxsmall.fna 800 gzip 393 2.03 .002071485 .001958195
13 genome_xxsmall.fna 800 lz4 641 1.24 .001893416 .001943666
14 text_large.txt 12977332 gzip 4770044 2.72 .613155078 .043915520
15 text_large.txt 12977332 lz4 7879136 1.64 .017927300 .015065196
16 text_small.txt 1022 gzip 590 1.73 .002070305 .001903226
17 text_small.txt 1022 lz4 857 1.19 .001967146 .002040285
18 text.txt 6488666 gzip 2385264 2.72 .308393934 .023656716
19 text.txt 6488666 lz4 3939378 1.64 .014891266 .009709618
20 text_xlarge.txt 25954664 gzip 9539638 2.72 1.229028819 .085925486
21 text_xlarge.txt 25954664 lz4 15758785 1.64 .023613977 .023486747
22 text_xsmall.txt 825 gzip 473 1.74 .002110205 .001980535
23 text_xsmall.txt 825 lz4 678 1.21 .001757717 .002191075
24 text_xxsmall.txt 492 gzip 325 1.51 .001867306 .002114055
25 text_xxsmall.txt 492 lz4 438 1.12 .001869646 .002134206