"""Generate the evaluation figures for the compression experiments.

The module reads measurement CSV files, derives a few helper columns
(sizes in MB, times in seconds, a per-compressor label, the compression
ratio) and renders the rate / distortion / runtime figures with
matplotlib and seaborn.
"""
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.figure import Figure

ALGORITHM_COL = 'compressor'
LABEL_COL = 'label'
CONTEXT_COL = 'context_size'
INPUT_SIZE_COL = 'input_size'
OUTPUT_SIZE_COL = 'compressed_size'
COMPRESS_TIME_COL = 'compression_time'
DECOMPRESS_TIME_COL = 'decompression_time'
RATE_COL = 'compression_ratio'
DISTORTION_COL = 'mse_loss'

# Input size (MB) at which the split graphs break their x axis.
_SIZE_BREAK_POINT_MB = 0.1


def original_v_compressed_filesize(df: pd.DataFrame,
                                   unique_labels: list[str],
                                   palette_dict,
                                   markers_dict
                                   ) -> Figure:
    """The "rate" graph: compressed size against input size.

    Draws a split scatter plot (small inputs left, large inputs right) and
    overlays the ``y = x`` line, i.e. a compression ratio of 1.0, on each
    half. Returns the figure created by ``split_graph``.
    """
    break_point = _SIZE_BREAK_POINT_MB
    fig, ax_small, ax_large = split_graph(
        df, INPUT_SIZE_COL, 'Input size (MB)',
        OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
        break_point, 'Compressor', 'upper left',
        LABEL_COL, unique_labels, palette_dict, markers_dict)

    # Add baseline (y=x). The halves use the same partition as split_graph.
    sizes = df[INPUT_SIZE_COL]
    baseline_label = 'Compression ratio 1.0'
    baseline_alpha = 0.5
    for ax, half in ((ax_small, sizes[sizes < break_point]),
                     (ax_large, sizes[sizes >= break_point])):
        min_xy, max_xy = half.min(), half.max()
        ax.plot([min_xy, max_xy], [min_xy, max_xy],
                color='gray', linestyle='--',
                label=baseline_label, alpha=baseline_alpha)

    # The y axis is shared, so setting the scale on one half sets both.
    ax_large.set_yscale('log')
    return fig


def _label_boxplot(df: pd.DataFrame, value_col: str, value_label: str,
                   unique_labels, palette_dict) -> Figure:
    """Draw one horizontal box plot of ``value_col`` per compressor label."""
    fig, ax = plt.subplots()
    sns.boxplot(
        data=df,
        x=value_col,
        y=LABEL_COL,
        hue=LABEL_COL,
        hue_order=unique_labels,
        palette=palette_dict,
        ax=ax,
        fill=False
    )
    ax.set_xlabel(value_label)
    ax.set_ylabel('Compressor')
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right")
    ax.grid(True, alpha=0.3)
    return fig


def compression_ratios(df: pd.DataFrame, unique_labels, palette_dict) -> Figure:
    """The "rate" graph: distribution of compression ratios per compressor."""
    return _label_boxplot(df, RATE_COL, 'Compression ratio',
                          unique_labels, palette_dict)


def compression_ratio_v_compression_time(df: pd.DataFrame,
                                         unique_labels,
                                         palette_dict,
                                         markers_dict) -> Figure:
    """Scatter plot of compression time against compression ratio."""
    fig, ax = plt.subplots()
    sns.scatterplot(
        data=df,
        x=RATE_COL,
        y=COMPRESS_TIME_COL,
        hue=LABEL_COL,
        hue_order=unique_labels,
        palette=palette_dict,
        style=LABEL_COL,
        style_order=unique_labels,
        markers=markers_dict,
        ax=ax,
    )
    ax.legend(title='Compressor')
    ax.set_xlabel('Compression ratio')
    ax.set_ylabel('Compression time (s)')
    ax.grid(True, alpha=0.3)
    return fig


def _runtime_graph(df: pd.DataFrame, time_col: str, title: str,
                   unique_labels, palette_dict, markers_dict) -> Figure:
    """Draw a split, log-scaled scatter plot of ``time_col`` against input size."""
    fig, _, ax_large = split_graph(
        df, INPUT_SIZE_COL, 'Input size (MB)',
        time_col, 'Runtime (log, s)',
        _SIZE_BREAK_POINT_MB, 'Compressor', 'center left',
        LABEL_COL, unique_labels, palette_dict, markers_dict)
    fig.text(0.5, 1, title, va='center', ha='center')
    # The y axis is shared, so setting the scale on one half sets both.
    ax_large.set_yscale('log')
    return fig


def filesize_v_compression_time(df: pd.DataFrame,
                                unique_labels: list[str],
                                palette_dict,
                                markers_dict
                                ) -> Figure:
    """The "execution time" graph for compression."""
    return _runtime_graph(
        df, COMPRESS_TIME_COL,
        'Compression runtime for different filesizes using each compressor',
        unique_labels, palette_dict, markers_dict)


def filesize_v_decompression_time(df: pd.DataFrame,
                                  unique_labels: list[str],
                                  palette_dict,
                                  markers_dict
                                  ) -> Figure:
    """The "execution time" graph for decompression."""
    return _runtime_graph(
        df, DECOMPRESS_TIME_COL,
        'Decompression runtime for different filesizes using each compressor',
        unique_labels, palette_dict, markers_dict)


def _size_label(filename: str) -> str:
    """Turn an input filename into the ``*<size part>*`` tick label.

    ``removeprefix``/``removesuffix`` drop the literal words only;
    ``lstrip``/``rstrip`` would strip any run of those *characters* and
    could eat part of the actual name.
    """
    stem = (filename
            .removeprefix('text')
            .removeprefix('genome')
            .removesuffix('txt')
            .removesuffix('fna'))
    return f"*{stem}*"


def filesize_v_mse(df: pd.DataFrame) -> Figure:
    """The "distortion" graph: autoencoder MSE per input file.

    Only rows of the 'Autoencoder' compressor with a non-zero MSE are
    shown, ordered by input size, with one facet per training dataset and
    one bar colour per context size. The caller's frame is not modified.
    """
    keep = (df[DISTORTION_COL] != 0) & (df[ALGORITHM_COL] == 'Autoencoder')
    # `assign` returns a new frame, so neither the caller's data nor the
    # numeric input-size column is overwritten by the string labels.
    plot_df = (
        df[keep]
        .sort_values(by=INPUT_SIZE_COL)
        .assign(size_label=lambda d: d['input_filename'].map(_size_label))
    )

    g = sns.catplot(
        data=plot_df,
        kind="bar",
        x=DISTORTION_COL,
        y='size_label',
        col='training_dataset',
        hue=CONTEXT_COL,
        palette='Set2',
        height=5,
        aspect=0.6
    )
    g.set_axis_labels("MSE Loss", "Filename")
    g.set_titles("Autoencoder trained on {col_name}")
    g.tight_layout()
    return g.figure


def mse_losses(df: pd.DataFrame, unique_labels, palette_dict) -> Figure:
    """The "distortion" graph: distribution of MSE losses per compressor."""
    return _label_boxplot(df, DISTORTION_COL, 'MSE',
                          unique_labels, palette_dict)


def split_graph(
        df,
        x, x_axis_label,
        y, y_axis_label,
        break_point,
        legend_title,
        legend_loc,
        hue,
        unique_labels,
        palette_dict,
        markers_dict
) -> tuple:
    """Scatter ``y`` against ``x`` on two side-by-side axes with a broken x axis.

    Rows with ``x < break_point`` go on the left axes, all remaining rows
    (``x >= break_point``) on the right one; both share the y axis. The two
    seaborn legends are merged into a single legend on the left axes.

    Returns:
        ``(figure, ax_left, ax_right)``.
    """
    df = df.sort_values(by=x)
    f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(8, 4))

    halves = (
        (ax_left, df[df[x] < break_point]),
        # `>=` so that a row sitting exactly on the break point is not lost.
        (ax_right, df[df[x] >= break_point]),
    )
    for ax, part in halves:
        sns.scatterplot(
            data=part,
            x=x,
            y=y,
            ax=ax,
            hue=hue,
            hue_order=unique_labels,
            palette=palette_dict,
            style=hue,
            style_order=unique_labels,
            markers=markers_dict,
        )
        # The shared x label is drawn once on the figure below.
        ax.set_xlabel('')
    ax_right.set_ylabel('')

    # Combine both plots into one
    ax_left.spines['right'].set_visible(False)
    ax_right.spines['left'].set_visible(False)
    ax_right.yaxis.tick_right()
    ax_right.tick_params(labelright=False)
    ax_right.yaxis.set_ticks_position('none')

    # Add diagonal slash lines to indicate the break
    d = .015  # proportion of vertical to horizontal extent of the slanted line
    kwargs = dict(transform=ax_left.transAxes, color='k', clip_on=False)
    ax_left.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # Bottom-right diagonal
    ax_left.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # Top-right diagonal

    kwargs.update(transform=ax_right.transAxes)  # Switch to the other axes
    ax_right.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # Top-left diagonal
    ax_right.plot((-d, +d), (-d, +d), **kwargs)  # Bottom-left diagonal

    # Fix legends: merge both halves, keeping one entry per label
    handles_left, labels_left = ax_left.get_legend_handles_labels()
    handles_right, labels_right = ax_right.get_legend_handles_labels()
    unique_legend = dict(zip(labels_left + labels_right,
                             handles_left + handles_right))
    for ax in (ax_left, ax_right):
        legend = ax.get_legend()
        # An axes without any plotted rows has no legend to remove.
        if legend is not None:
            legend.remove()
    ax_left.legend(list(unique_legend.values()), list(unique_legend.keys()),
                   title=legend_title, loc=legend_loc)

    f.text(0.5, 0, x_axis_label, ha='center', va='center')
    ax_left.set_ylabel(y_axis_label)
    ax_left.grid(True, alpha=0.3)
    ax_right.grid(True, alpha=0.3)

    f.tight_layout()

    return f, ax_left, ax_right


def generate(
        df: pd.DataFrame,
        unique_labels,
        palette_dict,
        markers_dict,
        tgt_dir: str
) -> None:
    """Generate all the plots and save them as PNG files in ``tgt_dir``."""
    plots = (
        ('original_v_compressed_filesize.png',
         lambda: original_v_compressed_filesize(df, unique_labels, palette_dict, markers_dict)),
        ('filesize_v_compression_time.png',
         lambda: filesize_v_compression_time(df, unique_labels, palette_dict, markers_dict)),
        ('filesize_v_decompression_time.png',
         lambda: filesize_v_decompression_time(df, unique_labels, palette_dict, markers_dict)),
        ('compression_ratios.png',
         lambda: compression_ratios(df, unique_labels, palette_dict)),
        ('compression_ratio_v_compression_time.png',
         lambda: compression_ratio_v_compression_time(df, unique_labels, palette_dict, markers_dict)),
        ('filesize_mse.png',
         lambda: filesize_v_mse(df)),
        ('mse_losses.png',
         lambda: mse_losses(df, unique_labels, palette_dict)),
    )
    for filename, make_figure in plots:
        fig = make_figure()
        fig.savefig(os.path.join(tgt_dir, filename), bbox_inches='tight')
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)


def setup(tgt_dir, dpi=300):
    """Create the output directory and configure matplotlib for LaTeX output."""
    # Create the target directory if it does not exist
    os.makedirs(tgt_dir, exist_ok=True)

    # Prepare matplotlib for use with LaTeX (makes it look less out of place, less Pythonesque)
    params = {'text.usetex': True,
              'font.size': 11,
              'font.family': 'serif',
              'figure.dpi': dpi,
              }
    plt.rcParams.update(params)


def preprocessing(df: pd.DataFrame) -> tuple:
    """Derive the plotting columns and the per-label colour/marker mappings.

    Works on a copy, so calling it twice on the same frame does not rescale
    the data twice.

    Returns:
        ``(df, unique_labels, palette_dict, markers_dict)`` where ``df`` has
        sizes in MB, times in seconds, a label column and a compression-ratio
        column added.
    """
    df = df.copy()

    # Convert bytes to MB
    df[INPUT_SIZE_COL] = df[INPUT_SIZE_COL] / 1e6
    df[OUTPUT_SIZE_COL] = df[OUTPUT_SIZE_COL] / 1e6

    # Convert ns to s
    df[COMPRESS_TIME_COL] = df[COMPRESS_TIME_COL] / 1e9
    # The decompression plot is labelled in seconds as well. The column is
    # only converted when present so inputs without it keep working here.
    if DECOMPRESS_TIME_COL in df.columns:
        df[DECOMPRESS_TIME_COL] = df[DECOMPRESS_TIME_COL] / 1e9

    # Add labels to differentiate between algorithms with context lengths
    def create_label(row):
        compressor = row[ALGORITHM_COL]
        return compressor if pd.isna(row[CONTEXT_COL]) else f"{compressor} ($L = {int(row[CONTEXT_COL])}$)"

    df[LABEL_COL] = df.apply(create_label, axis=1)

    # Add the compression ratio
    df[RATE_COL] = df[INPUT_SIZE_COL] / df[OUTPUT_SIZE_COL]

    # Identify all categories upfront
    unique_labels = sorted(df[LABEL_COL].unique())
    n_labels = len(unique_labels)

    # Create fixed palette and marker mapping. Markers are cycled so that
    # every label gets one even when there are more labels than markers.
    palette_dict = dict(zip(unique_labels, sns.color_palette("Set2", n_labels)))
    marker_cycle = ['o', '^', 'v', 's', 'D', 'H', 'X']
    markers_dict = {
        label: marker_cycle[i % len(marker_cycle)]
        for i, label in enumerate(unique_labels)
    }

    return df, unique_labels, palette_dict, markers_dict


def main():
    """Load the data and generate the plots."""
    df = pd.read_csv("measurements.csv")
    tgt_dir = "figures"

    setup(tgt_dir, 300)
    generate(*preprocessing(df), tgt_dir=tgt_dir)


def old_results():
    """Render the figures for the legacy ``compression_results.csv`` format.

    Writes, per (dataset type, model type), an execution-time plot and a
    compressed-size plot, an extrapolated execution-time plot for the
    ``cnn`` model, and one MSE bar chart per model type into ``./graphs``.
    """
    # read in the csv
    df = pd.read_csv("compression_results.csv")
    os.makedirs("./graphs", exist_ok=True)

    # Make compatible with new code
    df[INPUT_SIZE_COL] = df['original_file_size']
    df[OUTPUT_SIZE_COL] = df['compressed_file_size']
    df['compressor'] = df['model_type']
    df[CONTEXT_COL] = df['context_length']

    for dataset_type in df["dataset_type"].unique():
        dataset_df = df[df["dataset_type"] == dataset_type]
        for model_type in df["model_type"].unique():
            model_df = dataset_df[dataset_df["model_type"] == model_type].copy()
            if model_df.empty:
                # Nothing measured for this combination.
                continue

            model_df["original_file_size_mb"] = model_df["original_file_size"] / 1e6
            model_df["compression_time_s"] = model_df["compression_time"] / 1e9
            model_df["decompression_time_s"] = model_df["decompression_time"] / 1e9

            # execution time
            plt.figure(figsize=(4, 3))
            # compression
            sns.lineplot(
                data=model_df,
                x="original_file_size_mb",
                y="compression_time_s",
                hue="context_length",
                palette="Set2",
                markers=True,
                legend="brief",
                linestyle="-"
            )
            # decompression
            sns.lineplot(
                data=model_df,
                x="original_file_size_mb",
                y="decompression_time_s",
                hue="context_length",
                palette="Set2",
                markers=True,
                legend=False,
                linestyle="--"
            )

            plt.xlabel("File size (MB)")
            plt.ylabel("Time (log, s)")
            plt.yscale("log")
            plt.legend(
                [f"{style}, {c_type}" for style, c_type in
                 zip(["Solid", "Dashed"], ["compression", "decompression"])])
            plt.tight_layout()
            plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png")
            plt.close()

            # compression ratio
            plt.figure(figsize=(4, 3))
            for length in (256, 128):
                subset = model_df[model_df["context_length"] == length]
                plt.plot(subset["original_file_size"] / 1e6,
                         subset["compressed_file_size"] / 1e6,
                         label=str(length))

            plt.xlabel("Original file size (MB)")
            plt.ylabel("Compressed file size (MB)")
            plt.ylim(0, df[df["model_type"] == model_type]["compressed_file_size"].max() / 1e6)
            plt.legend(title="Context size")
            plt.tight_layout()
            plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")
            plt.close()

            if model_type == "cnn":
                # extrapolate execution time to larger files
                plt.figure()
                x_grid = np.linspace(0, 40, 1000)
                for length, linestyle in [(128, '-'), (256, '--')]:
                    subset = model_df[model_df["context_length"] == length]
                    x = subset["original_file_size_mb"]
                    for time_col, kind in (("compression_time_s", "compression"),
                                           ("decompression_time_s", "decompression")):
                        # Fit log(t) = slope * x + log_a, i.e. t = a * exp(slope * x).
                        # polyfit returns the highest-degree coefficient first.
                        slope, log_a = np.polyfit(x, np.log(subset[time_col]), 1)
                        plt.plot(
                            x_grid,
                            np.exp(log_a) * np.exp(slope * x_grid),
                            label=f"{length} {kind}",
                            linestyle=linestyle
                        )

                plt.grid(True, alpha=0.3)
                plt.legend()
                plt.title("(Log-linear) Extrapolated execution time for CNN")
                plt.xlabel("File size (MB)")
                plt.yscale('log')
                plt.ylabel("Time (log, s)")
                plt.tight_layout()
                plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png")
                plt.close()

    for model_type in df["model_type"].unique():
        model_df = df[df["model_type"] == model_type]

        plt.figure(figsize=(10, 4))
        bar_height = 0.25

        files = model_df["input_file_name"].unique()
        y = np.arange(len(files))

        for offset, length in ((-bar_height / 2, 256), (bar_height / 2, 128)):
            # Align the losses to `files` so each bar sits on its own tick
            # even when a context length has no row for some file.
            losses = (
                model_df[model_df["context_length"] == length]
                .groupby("input_file_name")["mse_loss"]
                .first()
                .reindex(files)
            )
            plt.barh(
                y + offset,
                losses.to_numpy(),
                height=bar_height,
                label=str(length),
            )

        plt.yticks(y, files, rotation=45, ha="right")
        plt.title("MSE loss for different context lengths")
        plt.xlabel("MSE loss")
        plt.ylabel("Filename")
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_loss.png")
        plt.close()


def training_loss(df, loss) -> Figure:
    """Plot the training and validation loss curves of one training run.

    ``loss`` is used as the y-axis label; the y range is ``[0, 0.01]`` for
    ``'MSE Loss'`` and ``[0, 6]`` for anything else.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    ax.plot(df['train_loss'], label="Training loss")
    ax.plot(df['validation_loss'], label="Validation losses")
    ax.set_xlabel("Epoch")
    ax.set_ylabel(loss)
    ylim = 0.01 if loss == 'MSE Loss' else 6
    ax.set_ylim(0, ylim)
    ax.legend()
    fig.tight_layout()
    return fig


def make_training_graphs(models_dir, loss):
    """Save a loss-curve PNG next to every CSV file found in ``models_dir``."""
    for csv_path in sorted(glob.glob(os.path.join(models_dir, '*.csv'))):
        df = pd.read_csv(csv_path)
        fig = training_loss(df, loss)
        # Swap only the extension; str.replace would also rewrite a '.csv'
        # occurring elsewhere in the path.
        png_path = os.path.splitext(csv_path)[0] + '.png'
        fig.savefig(png_path, bbox_inches='tight')
        plt.close(fig)


if __name__ == "__main__":
    main()
    # old_results()
    make_training_graphs('../models/autoencoder', 'MSE Loss')
    make_training_graphs('../models/cnn', 'Cross Entropy Loss')