diff --git a/cpu_compression_graphs.py b/cpu_compression_graphs.py
new file mode 100644
index 0000000..6bee7b8
--- /dev/null
+++ b/cpu_compression_graphs.py
@@ -0,0 +1,174 @@
+"""Plot compression benchmark results from a metrics CSV.
+
+Usage: python cpu_compression_graphs.py [csv_path] [-o OUTPUT_DIR]
+"""
+import argparse
+import os
+
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+DEFAULT_CSV_PATH = "results/compress/compression_metrics_20251216_182603.csv"
+DEFAULT_OUTPUT_DIR = "results/compress/plots"
+
+
+def _format_size(size_bytes):
+    """Return a byte count as text in B, KB, MB or GB (1024-based steps)."""
+    for unit in ['B', 'KB', 'MB']:
+        if size_bytes < 1024.0:
+            return f"{size_bytes:.1f} {unit}"
+        size_bytes /= 1024.0
+    return f"{size_bytes:.1f} GB"
+
+
+def _save_grouped_barplot(df, y, title, ylabel, output_path, log_scale=False):
+    """Save a bar chart of column *y* per file, with one bar per compressor.
+
+    Args:
+        df (pandas.DataFrame): Metrics with ``filename`` and ``compressor``
+            columns plus the column named by *y*.
+        y (str): Column plotted on the y-axis.
+        title (str): Figure title.
+        ylabel (str): Y-axis label.
+        output_path (str): Destination PNG path.
+        log_scale (bool): Use a logarithmic y-axis when true.
+    """
+    plt.figure()
+    try:
+        sns.barplot(
+            x='filename',
+            y=y,
+            hue='compressor',
+            data=df,
+            palette='viridis'
+        )
+        plt.title(title)
+        plt.xlabel('File')
+        plt.ylabel(ylabel)
+        if log_scale:
+            plt.yscale('log')
+        plt.xticks(rotation=45, ha='right')
+        plt.tight_layout()
+        plt.savefig(output_path, dpi=300, bbox_inches='tight')
+    finally:
+        # Release the figure even when plotting or saving fails.
+        plt.close()
+
+
+def plot_compression_metrics(csv_path, output_dir=DEFAULT_OUTPUT_DIR):
+    """Generate visualizations for compression metrics.
+
+    Writes ``compression_ratio_comparison.png``,
+    ``compression_time_comparison.png``,
+    ``decompression_time_comparison.png`` and ``ratio_vs_time.png`` into
+    *output_dir*.
+
+    Args:
+        csv_path (str): Path to the CSV file with compression metrics. The
+            columns ``filename``, ``original_size``, ``compressor``,
+            ``compression_ratio``, ``compression_time`` and
+            ``decompression_time`` are read.
+        output_dir (str): Directory to save the output plots; created if
+            it does not exist.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+
+    df = pd.read_csv(csv_path)
+
+    # Human-readable size column (not referenced by the plots below).
+    df['file_size'] = df['original_size'].apply(_format_size)
+
+    # Both settings change matplotlib's process-wide defaults.
+    sns.set(style="whitegrid")
+    plt.rcParams['figure.figsize'] = (12, 6)
+
+    # 1-3. Bar charts per file and compressor. The timing charts use a log
+    # axis; the ratio chart stays linear.
+    bar_charts = [
+        (
+            'compression_ratio',
+            'Compression Ratio by File and Compressor',
+            'Compression Ratio (Higher is better)',
+            'compression_ratio_comparison.png',
+            False,
+        ),
+        (
+            'compression_time',
+            'Compression Time by File and Compressor',
+            'Compression Time (seconds) (log)',
+            'compression_time_comparison.png',
+            True,
+        ),
+        (
+            'decompression_time',
+            'Decompression Time by File and Compressor',
+            'Decompression Time (seconds) (log)',
+            'decompression_time_comparison.png',
+            True,
+        ),
+    ]
+    for column, title, ylabel, png_name, log_scale in bar_charts:
+        _save_grouped_barplot(
+            df,
+            column,
+            title,
+            ylabel,
+            os.path.join(output_dir, png_name),
+            log_scale=log_scale,
+        )
+
+    # 4. Scatter plot: Compression Ratio vs Compression Time
+    plt.figure(figsize=(10, 6))
+    try:
+        # sort=False keeps compressors in order of first appearance.
+        for compressor, subset in df.groupby('compressor', sort=False):
+            plt.scatter(
+                subset['compression_ratio'],
+                subset['compression_time'],
+                # str() so non-string compressor values still get a label.
+                label=str(compressor).upper(),
+                s=100,
+                alpha=0.7
+            )
+
+        plt.title('Compression Ratio vs Compression Time')
+        plt.xlabel('Compression Ratio (Higher is better)')
+        plt.ylabel('Compression Time (seconds)')
+        plt.legend(title='Compressor')
+        plt.grid(True, alpha=0.3)
+        plt.tight_layout()
+        plt.savefig(
+            os.path.join(output_dir, "ratio_vs_time.png"),
+            dpi=300,
+            bbox_inches='tight'
+        )
+    finally:
+        plt.close()
+
+    print(f"Plots saved to {output_dir}/")
+
+
+def main():
+    """Parse the optional command-line paths and generate the plots."""
+    parser = argparse.ArgumentParser(
+        description="Plot compression benchmark metrics from a CSV file."
+    )
+    parser.add_argument(
+        "csv_path",
+        nargs="?",
+        default=DEFAULT_CSV_PATH,
+        help="metrics CSV to plot (default: %(default)s)",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-dir",
+        default=DEFAULT_OUTPUT_DIR,
+        help="directory for the PNG files (default: %(default)s)",
+    )
+    args = parser.parse_args()
+    plot_compression_metrics(args.csv_path, args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/graphs/plots/compression_ratio_comparison.png b/graphs/plots/compression_ratio_comparison.png
new file mode 100644
index 
0000000..bdb559f
Binary files /dev/null and b/graphs/plots/compression_ratio_comparison.png differ
diff --git a/graphs/plots/compression_time_comparison.png b/graphs/plots/compression_time_comparison.png
new file mode 100644
index 0000000..f9ff048
Binary files /dev/null and b/graphs/plots/compression_time_comparison.png differ
diff --git a/graphs/plots/decompression_time_comparison.png b/graphs/plots/decompression_time_comparison.png
new file mode 100644
index 0000000..b74c314
Binary files /dev/null and b/graphs/plots/decompression_time_comparison.png differ
diff --git a/graphs/plots/ratio_vs_time.png b/graphs/plots/ratio_vs_time.png
new file mode 100644
index 0000000..916e6b3
Binary files /dev/null and b/graphs/plots/ratio_vs_time.png differ
diff --git a/measure_gzip_lz4.sh b/measure_gzip_lz4.sh
new file mode 100755
index 0000000..0b5ef45
--- /dev/null
+++ b/measure_gzip_lz4.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+#
+# Benchmark gzip and lz4 on every regular file directly inside a directory.
+# For each file and compressor, one CSV row (sizes, compression ratio,
+# compression and decompression time) is appended to
+# compression_metrics_<timestamp>.csv in the current directory.
+#
+# Usage: measure_gzip_lz4.sh <dataset_dir>
+
+# Check if a directory is provided
+if [ $# -ne 1 ]; then
+    echo "Usage: $0 <dataset_dir>" >&2
+    exit 1
+fi
+
+dataset_dir="$1"
+
+if [ ! -d "$dataset_dir" ]; then
+    echo "Error: '$dataset_dir' is not a directory" >&2
+    exit 1
+fi
+
+# Fail early instead of writing rows with empty sizes or times.
+for tool in gzip lz4 bc; do
+    if ! command -v "$tool" > /dev/null 2>&1; then
+        echo "Error: required command '$tool' not found" >&2
+        exit 1
+    fi
+done
+
+output_file="compression_metrics_$(date +%Y%m%d_%H%M%S).csv"
+
+# Compressed files go to a private scratch directory, so the dataset is
+# never written to and an existing <name>.gz or <name>.lz4 is not clobbered.
+work_dir=$(mktemp -d) || exit 1
+trap 'rm -rf "$work_dir"' EXIT
+
+# Print the seconds elapsed since the `date +%s.%N` timestamp given as $1.
+elapsed_since() {
+    echo "$(date +%s.%N) - $1" | bc
+}
+
+# Append one CSV row for the file currently being processed.
+# Args: compressor name, compressed file, compression time, decompression time.
+# Reads filename, original_size and output_file set by the main loop.
+write_row() {
+    local compressed_size compression_ratio
+    compressed_size=$(stat -c%s "$2")
+    compression_ratio=$(echo "scale=2; $original_size / $compressed_size" | bc -l)
+    echo "$filename,$original_size,$1,$compressed_size,$compression_ratio,$3,$4" >> "$output_file"
+}
+
+# Create output CSV file with header
+echo "filename,original_size,compressor,compressed_size,compression_ratio,compression_time,decompression_time" > "$output_file"
+
+# Process each file in the dataset
+for file in "$dataset_dir"/*; do
+    # Skip if not a regular file
+    [ -f "$file" ] || continue
+
+    filename=$(basename "$file")
+    original_size=$(stat -c%s "$file")
+
+    echo "Processing: $filename"
+
+    # Test gzip
+    echo " Testing gzip..."
+    compressed="$work_dir/${filename}.gz"
+    start_time=$(date +%s.%N)
+    if gzip -c "$file" > "$compressed"; then
+        compression_time=$(elapsed_since "$start_time")
+
+        # Decompress to /dev/null, like the lz4 test below. Running
+        # `gzip -d -k -f` on a .gz next to the input rewrote the input file.
+        start_time=$(date +%s.%N)
+        gzip -d -c "$compressed" > /dev/null
+        decompression_time=$(elapsed_since "$start_time")
+
+        write_row gzip "$compressed" "$compression_time" "$decompression_time"
+    else
+        echo " gzip failed on $filename; no row written" >&2
+    fi
+    rm -f "$compressed"
+
+    # Test lz4
+    echo " Testing lz4..."
+    compressed="$work_dir/${filename}.lz4"
+    start_time=$(date +%s.%N)
+    if lz4 -f -q "$file" "$compressed"; then
+        compression_time=$(elapsed_since "$start_time")
+
+        # Decompress and measure time
+        start_time=$(date +%s.%N)
+        lz4 -f -d -q "$compressed" - | cat > /dev/null
+        decompression_time=$(elapsed_since "$start_time")
+
+        write_row lz4 "$compressed" "$compression_time" "$decompression_time"
+    else
+        echo " lz4 failed on $filename; no row written" >&2
+    fi
+    rm -f "$compressed"
+done
+
+echo "All tests completed. Results saved to $output_file"
diff --git a/results/compress/compression_metrics_20251216_182603.csv b/results/compress/compression_metrics_20251216_182603.csv
new file mode 100644
index 0000000..b823124
--- /dev/null
+++ b/results/compress/compression_metrics_20251216_182603.csv
@@ -0,0 +1,25 @@
+filename,original_size,compressor,compressed_size,compression_ratio,compression_time,decompression_time
+genome.fna,4699745,gzip,1424004,3.30,.681197994,.015465955
+genome.fna,4699745,lz4,2655438,1.76,.012701161,.009190410
+genome_large.fna,23498433,gzip,7118154,3.30,3.384480370,.067414798
+genome_large.fna,23498433,lz4,13275544,1.77,.020719873,.025022334
+genome_small.fna,1367,gzip,589,2.32,.001937446,.001983156
+genome_small.fna,1367,lz4,1041,1.31,.001883076,.002144425
+genome_xlarge.fna,46996793,gzip,14235842,3.30,6.775190783,.131633333
+genome_xlarge.fna,46996793,lz4,26551229,1.77,.031734579,.043495412
+genome_xsmall.fna,1043,gzip,475,2.19,.002007016,.002012775
+genome_xsmall.fna,1043,lz4,814,1.28,.001954316,.002085566
+genome_xxsmall.fna,800,gzip,393,2.03,.002071485,.001958195
+genome_xxsmall.fna,800,lz4,641,1.24,.001893416,.001943666
+text_large.txt,12977332,gzip,4770044,2.72,.613155078,.043915520
+text_large.txt,12977332,lz4,7879136,1.64,.017927300,.015065196
+text_small.txt,1022,gzip,590,1.73,.002070305,.001903226
+text_small.txt,1022,lz4,857,1.19,.001967146,.002040285
+text.txt,6488666,gzip,2385264,2.72,.308393934,.023656716
+text.txt,6488666,lz4,3939378,1.64,.014891266,.009709618
+text_xlarge.txt,25954664,gzip,9539638,2.72,1.229028819,.085925486
+text_xlarge.txt,25954664,lz4,15758785,1.64,.023613977,.023486747
+text_xsmall.txt,825,gzip,473,1.74,.002110205,.001980535
+text_xsmall.txt,825,lz4,678,1.21,.001757717,.002191075
+text_xxsmall.txt,492,gzip,325,1.51,.001867306,.002114055
+text_xxsmall.txt,492,lz4,438,1.12,.001869646,.002134206