diff --git a/results/make_graphs.py b/results/make_graphs.py
index 934d22b..3f328e5 100644
--- a/results/make_graphs.py
+++ b/results/make_graphs.py
@@ -3,7 +3,6 @@
 import os
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import scipy
 import seaborn as sns
 from matplotlib.figure import Figure
@@ -18,21 +17,18 @@
 RATE_COL = 'compression_ratio'
 DISTORTION_COL = 'mse_loss'
 
 
-def original_v_compressed_filesize(
-    df: pd.DataFrame,
-    unique_labels: list[str],
-    palette_dict,
-    markers_dict
-) -> Figure:
+def original_v_compressed_filesize(df: pd.DataFrame,
+                                   unique_labels: list[str], palette_dict, markers_dict
+                                   ) -> Figure:
     """The "rate" graph"""
     plt.figure()
 
     break_point = 0.1
-    ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
-                                     OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
-                                     break_point, 'Compressor', 'upper left', LABEL_COL,
-                                     unique_labels, palette_dict, markers_dict)
+    _, ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
+                                        OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
+                                        break_point, 'Compressor', 'upper left', LABEL_COL,
+                                        unique_labels, palette_dict, markers_dict)
 
     # Add Baseline (y=x)
     df_small, df_large = df[df[INPUT_SIZE_COL] < break_point], df[df[INPUT_SIZE_COL] > break_point]
@@ -50,51 +46,136 @@ def original_v_compressed_filesize(
     return plt.gcf()
 
 
-def filesize_v_compression_time(
-    df: pd.DataFrame,
-    unique_labels: list[str],
-    palette_dict,
-    markers_dict
-) -> Figure:
+def compression_ratios(df: pd.DataFrame, unique_labels, palette_dict) -> Figure:
+    """The "rate" graph"""
+    plt.figure()
+
+    fig, ax = plt.subplots()
+    sns.boxplot(
+        data=df,
+        x=RATE_COL,
+        y=LABEL_COL,
+        hue=LABEL_COL,
+        hue_order=unique_labels,
+        palette=palette_dict,
+        ax=ax,
+        fill=False
+    )
+
+    ax.set_xlabel('Compression ratio')
+    ax.set_ylabel('Compressor')
+
+    plt.yticks(rotation=45, ha="right")
+
+    ax.grid(True)
+
+    return plt.gcf()
+
+
+def filesize_v_compression_time(df: pd.DataFrame,
+                                unique_labels: list[str], palette_dict, markers_dict
+                                ) -> Figure:
     """The "execution time" graph"""
     plt.figure()
 
-    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
-                COMPRESS_TIME_COL, 'Compression time (log, s)',
-                0.1, 'Compressor', 'center left', LABEL_COL,
-                unique_labels, palette_dict, markers_dict)
+    f, _, _ = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
+                          COMPRESS_TIME_COL, 'Runtime (log, s)',
+                          0.1, 'Compressor', 'center left', LABEL_COL,
+                          unique_labels, palette_dict, markers_dict)
+    f.text(0.5, 1, 'Compression runtime for different file sizes using each compressor', va='center', ha='center')
 
     plt.yscale('log')
 
     return plt.gcf()
 
 
-def filesize_v_decompression_time(
-    df: pd.DataFrame,
-    unique_labels: list[str],
-    palette_dict,
-    markers_dict
-) -> Figure:
+def filesize_v_decompression_time(df: pd.DataFrame,
+                                  unique_labels: list[str], palette_dict, markers_dict
+                                  ) -> Figure:
     """The "execution time" graph"""
     plt.figure()
 
-    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
-                DECOMPRESS_TIME_COL, 'Decompression time (log, s)',
-                0.1, 'Compressor', 'center left', LABEL_COL,
-                unique_labels, palette_dict, markers_dict)
+    f, _, _ = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
+                          DECOMPRESS_TIME_COL, 'Runtime (log, s)',
+                          0.1, 'Compressor', 'center left', LABEL_COL,
+                          unique_labels, palette_dict, markers_dict)
+    f.text(0.5, 1, 'Decompression runtime for different file sizes using each compressor', va='center', ha='center')
 
     plt.yscale('log')
 
     return plt.gcf()
 
 
+def filesize_v_mse(df: pd.DataFrame) -> Figure:
+    """The "distortion" graph"""
+    plt.figure()
+
+    df = df[df[DISTORTION_COL] != 0]
+    df = df[df[ALGORITHM_COL] == 'Autoencoder'].copy()
+
+    df = df.sort_values(by=INPUT_SIZE_COL)
+
+    def filename_and_size(row):
+        filename = row['input_filename']
+        size = row[INPUT_SIZE_COL]
+        return f"{filename} ({size:.4f} MB)"
+
+    df['input_filename_size'] = df.apply(filename_and_size, axis=1)
+
+    fig, ax = plt.subplots()
+    sns.barplot(
+        data=df,
+        y='input_filename_size',
+        x=DISTORTION_COL,
+        hue=CONTEXT_COL,
+        ax=ax,
+        palette='Set2'
+    )
+
+    plt.title('MSE for autoencoder')
+    plt.xlabel('MSE')
+    plt.ylabel('Filename')
+    plt.yticks(rotation=45, ha="right")
+    plt.legend(title='Context size')
+
+    plt.grid(True)
+
+    return plt.gcf()
+
+
+def mse_losses(df: pd.DataFrame, unique_labels, palette_dict) -> Figure:
+    """The "distortion" graph"""
+    plt.figure()
+
+    fig, ax = plt.subplots()
+    sns.boxplot(
+        data=df,
+        x=DISTORTION_COL,
+        y=LABEL_COL,
+        hue=LABEL_COL,
+        hue_order=unique_labels,
+        palette=palette_dict,
+        ax=ax,
+        fill=False
+    )
+
+    ax.set_xlabel('MSE')
+    ax.set_ylabel('Compressor')
+
+    plt.yticks(rotation=45, ha="right")
+
+    ax.grid(True)
+
+    return plt.gcf()
+
+
 def split_graph(
         df, x, x_axis_label, y, y_axis_label, break_point,
         legend_title, legend_loc, hue, unique_labels, palette_dict, markers_dict
 ) -> tuple:
     df = df.sort_values(by=x)
-    f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(10, 5))
+    f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(8, 4))
 
     df_left = df[df[x] < break_point]
     sns.scatterplot(
@@ -107,7 +188,8 @@ def split_graph(
         palette=palette_dict,
         style=hue,
         style_order=unique_labels,
-        markers=markers_dict
+        markers=markers_dict,
+        # s=150
     )
 
     ax_left.set_xlabel('')
@@ -122,7 +204,8 @@ def split_graph(
         palette=palette_dict,
         style=hue,
         style_order=unique_labels,
-        markers=markers_dict
+        markers=markers_dict,
+        # s=150
     )
     ax_right.set_xlabel('')
     ax_right.set_ylabel('')
@@ -159,48 +242,7 @@ def split_graph(
     ax_right.grid(True)
 
     plt.tight_layout()
-    return ax_left, ax_right
-
-
-def compression_v_mse_scatter(df: pd.DataFrame) -> Figure:
-    """The "distortion" graph"""
-    plt.figure()
-
-    sns.scatterplot(
-        data=df,
-        x=RATE_COL,
-        y=DISTORTION_COL
-    )
-
-    plt.xscale('log')
-    plt.xlabel('Compression ratio (log)')
-
-    # TODO This does not work properly
-
-    plt.yscale('log')
-    plt.ylabel('MSE (log)')
-
-    return plt.gcf()
-
-
-def compression_ratios(df: pd.DataFrame) -> Figure:
-    """The "distortion" graph"""
-    plt.figure()
-
-    fig, ax = plt.subplots()
-    sns.boxplot(
-        data=df,
-        x=RATE_COL,
-        y=LABEL_COL,
-        ax=ax
-    )
-
-    ax.set_xlabel('Compression ratio')
-    ax.set_ylabel('')
-
-    ax.grid(True)
-
-    return plt.gcf()
+    return f, ax_left, ax_right
 
 
 def generate(
@@ -213,22 +255,29 @@ def generate(
     original_v_compressed_filesize(df, unique_labels, palette_dict, markers_dict).savefig(
         os.path.join(tgt_dir, 'original_v_compressed_filesize.png'),
         bbox_inches='tight',
-        dpi=dpi
     )
     filesize_v_compression_time(df, unique_labels, palette_dict, markers_dict).savefig(
         os.path.join(tgt_dir, 'filesize_v_compression_time.png'),
         bbox_inches='tight',
-        dpi=dpi
     )
     filesize_v_decompression_time(df, unique_labels, palette_dict, markers_dict).savefig(
         os.path.join(tgt_dir, 'filesize_v_decompression_time.png'),
         bbox_inches='tight',
-        dpi=dpi
     )
-    # compression_v_mse_scatter(df).savefig(os.path.join(tgt_dir, 'compression_v_mse.png'), bbox_inches='tight')
-    compression_ratios(df).savefig(os.path.join(tgt_dir, 'compression_ratios.png'), bbox_inches='tight')
+    compression_ratios(df, unique_labels, palette_dict).savefig(
+        os.path.join(tgt_dir, 'compression_ratios.png'),
+        bbox_inches='tight'
+    )
+    filesize_v_mse(df).savefig(
+        os.path.join(tgt_dir, 'filesize_mse.png'),
+        bbox_inches='tight'
+    )
+    mse_losses(df, unique_labels, palette_dict).savefig(
+        os.path.join(tgt_dir, 'mse_losses.png'),
+        bbox_inches='tight'
+    )
 
 
 def setup(tgt_dir):
@@ -239,6 +288,7 @@ def setup(tgt_dir):
     params = {'text.usetex': True,
               'font.size': 11,
               'font.family': 'serif',
+              'figure.dpi': 300,
               }
 
     plt.rcParams.update(params)
@@ -266,8 +316,8 @@ def preprocessing(df: pd.DataFrame) -> tuple:
     n_labels = len(unique_labels)
 
     # Create fixed palette and marker mapping
-    palette_dict = dict(zip(unique_labels, sns.color_palette("tab10", n_labels)))
-    markers_dict = dict(zip(unique_labels, ['x', '+', '1', '2', '3', '4']))
+    palette_dict = dict(zip(unique_labels, sns.color_palette("Set2", n_labels)))
+    markers_dict = dict(zip(unique_labels, ['o', '^', 'v', 's', 'D', 'H', 'X']))
 
     return df, unique_labels, palette_dict, markers_dict
@@ -281,20 +331,26 @@ def main():
     generate(*preprocessing(df), tgt_dir=tgt_dir, dpi=150)
 
 
-if __name__ == "__main__":
-    main()
-    exit()
-
+def old_results():
     # read in the csv
     df = pd.read_csv("compression_results.csv")
 
+    # Make compatible with new code
+    df[INPUT_SIZE_COL] = df['original_file_size']
+    df[OUTPUT_SIZE_COL] = df['compressed_file_size']
+    df['compressor'] = df['model_type']
+    df[CONTEXT_COL] = df['context_length']
+    #
+
+    df, unique_labels, palette_dict, markers_dict = preprocessing(df)
+
     for dataset_type in df["dataset_type"].unique():
         for model_type in df["model_type"].unique():
             dataset_df = df[df["dataset_type"] == dataset_type]
             model_df = dataset_df[dataset_df["model_type"] == model_type].copy()
 
             # execution time
-            plt.figure()
+            plt.figure(figsize=(4, 3))
             model_df["original_file_size_mb"] = model_df["original_file_size"] / 1e6
             model_df["compression_time_s"] = model_df["compression_time"] / 1e9
             model_df["decompression_time_s"] = model_df["decompression_time"] / 1e9
@@ -304,7 +360,7 @@
                 x="original_file_size_mb",
                 y="compression_time_s",
                 hue="context_length",
-                palette="Set1",
+                palette="Set2",
                 markers=True,
                 legend="brief",
                 linestyle="-"
@@ -315,14 +371,14 @@
                 x="original_file_size_mb",
                 y="decompression_time_s",
                 hue="context_length",
-                palette="Set1",
+                palette="Set2",
                 markers=True,
                 legend=False,
                 linestyle="--"
             )
-            plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}")
-            plt.xlabel("file size [MB]")
-            plt.ylabel("Time [s]")
+            # plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}")
+            plt.xlabel("File size (MB)")
+            plt.ylabel("Time (log, s)")
             plt.yscale("log")
             plt.legend(
                 [f"{style}, {c_type}" for style, c_type in zip(["Solid", "Dashed"], ["compression", "decompression"])])
             plt.tight_layout()
@@ -330,56 +386,57 @@
             plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png")
 
             # compression ratio
-            plt.figure()
+            plt.figure(figsize=(4, 3))
             c256 = model_df[model_df["context_length"] == 256]
             c128 = model_df[model_df["context_length"] == 128]
             plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
             plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
-            plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}")
-            plt.xlabel("Original file size [MB]")
-            plt.ylabel("Compressed file size [MB]")
-            plt.legend()
+            # plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}")
{dataset_type}") + plt.xlabel("Original file size (MB)") + plt.ylabel("Compressed file size (MB)") + plt.ylim(0, model_df["compressed_file_size"].max() / 1e6) + plt.legend(title="Context size") + plt.tight_layout() plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png") - # if model_type == "cnn": - # import numpy as np - # - # plt.figure() - # for length, linestyle in [(128, '-'), (256, '--')]: - # # extrapolate execution time to larger files - # x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6 - # y = model_df[model_df["context_length"] == length]["compression_time"] - # y_decom = model_df[model_df["context_length"] == length]["decompression_time"] - # - # b1, loga1 = np.polyfit(x, np.log(y), 1) - # b2, loga2 = np.polyfit(x, np.log(y_decom), 1) - # - # x_comp = np.linspace(0, 40, 1000) - # x_decomp = np.linspace(0, 40, 1000) - # a1 = np.exp(loga1) - # a2 = np.exp(loga2) - # - # - # plt.plot( - # x_comp, a1 * np.exp(x_comp), - # label=f"{length} compression", - # linestyle=linestyle - # ) - # plt.plot( - # x_decomp, a2 * np.exp(x_decomp), - # label=f"{length} decompression", - # linestyle=linestyle - # ) - # - # - # - # plt.legend() - # plt.title(f"Extrapolated execution time for CNN compression and decompression") - # plt.xlabel("File size [MB]") - # plt.ylabel("Time [s]") - # plt.tight_layout() - # plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png") + if model_type == "cnn": + + plt.figure() + for length, linestyle in [(128, '-'), (256, '--')]: + # extrapolate execution time to larger files + x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6 + y = model_df[model_df["context_length"] == length]["compression_time"] + y_decom = model_df[model_df["context_length"] == length]["decompression_time"] + + b1, loga1 = np.polyfit(x, np.log(y), 1) + b2, loga2 = np.polyfit(x, np.log(y_decom), 1) + + x_comp = np.linspace(0, 40, 1000) + x_decomp = np.linspace(0, 40, 1000) + a1 = np.exp(loga1) + a2 = np.exp(loga2) + + plt.plot( + x_comp, a1 * np.exp(x_comp), + label=f"{length} compression", + linestyle=linestyle + ) + plt.plot( + x_decomp, a2 * np.exp(x_decomp), + label=f"{length} decompression", + linestyle=linestyle + ) + + plt.grid(True) + plt.legend() + plt.title(f"(Log-linear) Extrapolated execution time for CNN") + # plt.xscale('log') + plt.xlabel("File size (MB)") + plt.yscale('log') + plt.ylabel("Time (log, s)") + plt.tight_layout() + plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png") for model_type in df["model_type"].unique(): model_df = df[df["model_type"] == model_type] @@ -395,19 +452,25 @@ if __name__ == "__main__": y - bar_height / 2, c256["mse_loss"], height=bar_height, - label="256" + label="256", ) plt.barh( y + bar_height / 2, c128["mse_loss"], height=bar_height, - label="128" + label="128", ) plt.yticks(y, files, rotation=45, ha="right") plt.title(f"MSE loss for different context lengths") plt.xlabel("MSE loss") plt.ylabel("Filename") plt.legend() + plt.grid(True) plt.tight_layout() plt.savefig(f"./graphs/{model_type}_loss.png") + + +if __name__ == "__main__": + main() + old_results()