feat: Graphs checkpoint

This commit is contained in:
Tibo De Peuter 2025-12-19 00:06:14 +01:00
parent 15062d8884
commit b62f06018d
Signed by: tdpeuter
GPG key ID: 38297DE43F75FFE2

View file

@ -3,7 +3,6 @@ import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import scipy
import seaborn as sns import seaborn as sns
from matplotlib.figure import Figure from matplotlib.figure import Figure
@ -18,21 +17,18 @@ RATE_COL = 'compression_ratio'
DISTORTION_COL = 'mse_loss' DISTORTION_COL = 'mse_loss'
def original_v_compressed_filesize( def original_v_compressed_filesize(df: pd.DataFrame,
df: pd.DataFrame, unique_labels: list[str], palette_dict, markers_dict
unique_labels: list[str], ) -> Figure:
palette_dict,
markers_dict
) -> Figure:
"""The "rate" graph""" """The "rate" graph"""
plt.figure() plt.figure()
break_point = 0.1 break_point = 0.1
ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)', _, ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
OUTPUT_SIZE_COL, 'Compressed size (log, MB)', OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
break_point, 'Compressor', 'upper left', LABEL_COL, break_point, 'Compressor', 'upper left', LABEL_COL,
unique_labels, palette_dict, markers_dict) unique_labels, palette_dict, markers_dict)
# Add Baseline (y=x) # Add Baseline (y=x)
df_small, df_large = df[df[INPUT_SIZE_COL] < break_point], df[df[INPUT_SIZE_COL] > break_point] df_small, df_large = df[df[INPUT_SIZE_COL] < break_point], df[df[INPUT_SIZE_COL] > break_point]
@ -50,51 +46,136 @@ def original_v_compressed_filesize(
return plt.gcf() return plt.gcf()
def compression_ratios(df: pd.DataFrame, unique_labels, palette_dict) -> Figure:
    """Draw the "rate" graph: one horizontal box of compression ratios per compressor.

    Args:
        df: Results table; the ``RATE_COL`` and ``LABEL_COL`` columns are read.
        unique_labels: Compressor labels, used as the hue order.
        palette_dict: Colour to use for each compressor label.

    Returns:
        The figure holding the box plot.
    """
    # plt.subplots() creates the figure itself. A bare plt.figure() in front
    # of it would only leave an empty, never-closed figure behind per call.
    fig, ax = plt.subplots()
    sns.boxplot(
        data=df,
        x=RATE_COL,
        y=LABEL_COL,
        hue=LABEL_COL,
        hue_order=unique_labels,
        palette=palette_dict,
        ax=ax,
        fill=False,
    )
    ax.set_xlabel('Compression ratio')
    ax.set_ylabel('Compressor')
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right")
    ax.grid(True)

    # Return the figure we created instead of relying on pyplot's notion of
    # the "current" figure.
    return fig
def filesize_v_compression_time(df: pd.DataFrame,
                                unique_labels: list[str], palette_dict, markers_dict
                                ) -> Figure:
    """Draw the "execution time" graph for compression.

    Compression time is plotted against input size through ``split_graph``,
    using a break point of 0.1 on the input-size axis and a logarithmic
    time axis.

    Args:
        df: Results table; ``INPUT_SIZE_COL``, ``COMPRESS_TIME_COL`` and
            ``LABEL_COL`` are handed to ``split_graph``.
        unique_labels: Compressor labels, passed through to ``split_graph``.
        palette_dict: Colour per compressor label.
        markers_dict: Marker per compressor label.

    Returns:
        The figure created by ``split_graph``.
    """
    # split_graph() builds its own figure with plt.subplots(); calling
    # plt.figure() first would only leak an empty figure on every call.
    f, _, _ = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                          COMPRESS_TIME_COL, 'Runtime (log, s)',
                          0.1, 'Compressor', 'center left', LABEL_COL,
                          unique_labels, palette_dict, markers_dict)

    # Title in figure coordinates, centred on the top edge of the figure.
    f.text(0.5, 1, 'Compression runtime for different filesizes using each compressor', va='center', ha='center')

    # Both panels are created with sharey=True in split_graph, so setting
    # the scale once applies to the shared time axis.
    plt.yscale('log')

    return f
def filesize_v_decompression_time(df: pd.DataFrame,
                                  unique_labels: list[str], palette_dict, markers_dict
                                  ) -> Figure:
    """Draw the "execution time" graph for decompression.

    Decompression time is plotted against input size through ``split_graph``,
    using a break point of 0.1 on the input-size axis and a logarithmic
    time axis.

    Args:
        df: Results table; ``INPUT_SIZE_COL``, ``DECOMPRESS_TIME_COL`` and
            ``LABEL_COL`` are handed to ``split_graph``.
        unique_labels: Compressor labels, passed through to ``split_graph``.
        palette_dict: Colour per compressor label.
        markers_dict: Marker per compressor label.

    Returns:
        The figure created by ``split_graph``.
    """
    # split_graph() builds its own figure with plt.subplots(); calling
    # plt.figure() first would only leak an empty figure on every call.
    f, _, _ = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                          DECOMPRESS_TIME_COL, 'Runtime (log, s)',
                          0.1, 'Compressor', 'center left', LABEL_COL,
                          unique_labels, palette_dict, markers_dict)

    # Title in figure coordinates, centred on the top edge of the figure.
    f.text(0.5, 1, 'Decompression runtime for different filesizes using each compressor', va='center', ha='center')

    # Both panels are created with sharey=True in split_graph, so setting
    # the scale once applies to the shared time axis.
    plt.yscale('log')

    return f
def filesize_v_mse(df: pd.DataFrame) -> Figure:
    """Draw the "distortion" graph: MSE per input file for the autoencoder.

    Only rows whose algorithm is ``'Autoencoder'`` and whose MSE is non-zero
    are plotted. Files appear in order of increasing input size, with one
    bar per value of ``CONTEXT_COL``.

    Args:
        df: Results table; ``DISTORTION_COL``, ``ALGORITHM_COL``,
            ``INPUT_SIZE_COL``, ``CONTEXT_COL`` and ``'input_filename'``
            are read. The frame passed in is not modified.

    Returns:
        The figure holding the bar plot.
    """
    keep = (df[DISTORTION_COL] != 0) & (df[ALGORITHM_COL] == 'Autoencoder')

    # sort_values() without inplace returns a new frame, so nothing is ever
    # written into a filtered slice (which triggers pandas'
    # chained-assignment warning).
    autoencoder_df = df[keep].sort_values(by=INPUT_SIZE_COL)

    # plt.subplots() creates the figure itself; no separate plt.figure().
    fig, ax = plt.subplots()
    sns.barplot(
        data=autoencoder_df,
        y='input_filename',
        x=DISTORTION_COL,
        hue=CONTEXT_COL,
        ax=ax,
        palette='Set2',
    )

    ax.set_title('MSE for autoencoder')
    ax.set_xlabel('MSE')
    ax.set_ylabel('Filename')
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right")
    ax.legend(title='Context size')
    ax.grid(True)

    return fig
def mse_losses(df: pd.DataFrame, unique_labels, palette_dict) -> Figure:
    """Draw the "distortion" graph: one horizontal box of MSE values per compressor.

    Args:
        df: Results table; the ``DISTORTION_COL`` and ``LABEL_COL`` columns
            are read.
        unique_labels: Compressor labels, used as the hue order.
        palette_dict: Colour to use for each compressor label.

    Returns:
        The figure holding the box plot.
    """
    # plt.subplots() creates the figure itself. A bare plt.figure() in front
    # of it would only leave an empty, never-closed figure behind per call.
    fig, ax = plt.subplots()
    sns.boxplot(
        data=df,
        x=DISTORTION_COL,
        y=LABEL_COL,
        hue=LABEL_COL,
        hue_order=unique_labels,
        palette=palette_dict,
        ax=ax,
        fill=False,
    )
    ax.set_xlabel('MSE')
    ax.set_ylabel('Compressor')
    plt.setp(ax.get_yticklabels(), rotation=45, ha="right")
    ax.grid(True)

    # Return the figure we created instead of relying on pyplot's notion of
    # the "current" figure.
    return fig
def split_graph( def split_graph(
df, x, x_axis_label, y, y_axis_label, df, x, x_axis_label, y, y_axis_label,
break_point, legend_title, legend_loc, hue, unique_labels, palette_dict, markers_dict break_point, legend_title, legend_loc, hue, unique_labels, palette_dict, markers_dict
) -> tuple: ) -> tuple:
df = df.sort_values(by=x) df = df.sort_values(by=x)
f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(10, 5)) f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(8, 4))
df_left = df[df[x] < break_point] df_left = df[df[x] < break_point]
sns.scatterplot( sns.scatterplot(
@ -107,7 +188,8 @@ def split_graph(
palette=palette_dict, palette=palette_dict,
style=hue, style=hue,
style_order=unique_labels, style_order=unique_labels,
markers=markers_dict markers=markers_dict,
# s=150
) )
ax_left.set_xlabel('') ax_left.set_xlabel('')
@ -122,7 +204,8 @@ def split_graph(
palette=palette_dict, palette=palette_dict,
style=hue, style=hue,
style_order=unique_labels, style_order=unique_labels,
markers=markers_dict markers=markers_dict,
# s=150
) )
ax_right.set_xlabel('') ax_right.set_xlabel('')
ax_right.set_ylabel('') ax_right.set_ylabel('')
@ -159,48 +242,7 @@ def split_graph(
ax_right.grid(True) ax_right.grid(True)
plt.tight_layout() plt.tight_layout()
return ax_left, ax_right return f, ax_left, ax_right
def compression_v_mse_scatter(df: pd.DataFrame) -> Figure:
"""The "distortion" graph"""
plt.figure()
sns.scatterplot(
data=df,
x=RATE_COL,
y=DISTORTION_COL
)
plt.xscale('log')
plt.xlabel('Compression ratio (log)')
# TODO This does not work properly
plt.yscale('log')
plt.ylabel('MSE (log)')
return plt.gcf()
def compression_ratios(df: pd.DataFrame) -> Figure:
"""The "distortion" graph"""
plt.figure()
fig, ax = plt.subplots()
sns.boxplot(
data=df,
x=RATE_COL,
y=LABEL_COL,
ax=ax
)
ax.set_xlabel('Compression ratio')
ax.set_ylabel('')
ax.grid(True)
return plt.gcf()
def generate( def generate(
@ -213,22 +255,29 @@ def generate(
original_v_compressed_filesize(df, unique_labels, palette_dict, markers_dict).savefig( original_v_compressed_filesize(df, unique_labels, palette_dict, markers_dict).savefig(
os.path.join(tgt_dir, 'original_v_compressed_filesize.png'), os.path.join(tgt_dir, 'original_v_compressed_filesize.png'),
bbox_inches='tight', bbox_inches='tight',
dpi=dpi
) )
filesize_v_compression_time(df, unique_labels, palette_dict, markers_dict).savefig( filesize_v_compression_time(df, unique_labels, palette_dict, markers_dict).savefig(
os.path.join(tgt_dir, 'filesize_v_compression_time.png'), os.path.join(tgt_dir, 'filesize_v_compression_time.png'),
bbox_inches='tight', bbox_inches='tight',
dpi=dpi
) )
filesize_v_decompression_time(df, unique_labels, palette_dict, markers_dict).savefig( filesize_v_decompression_time(df, unique_labels, palette_dict, markers_dict).savefig(
os.path.join(tgt_dir, 'filesize_v_decompression_time.png'), os.path.join(tgt_dir, 'filesize_v_decompression_time.png'),
bbox_inches='tight', bbox_inches='tight',
dpi=dpi
) )
# compression_v_mse_scatter(df).savefig(os.path.join(tgt_dir, 'compression_v_mse.png'), bbox_inches='tight') compression_ratios(df, unique_labels, palette_dict).savefig(
compression_ratios(df).savefig(os.path.join(tgt_dir, 'compression_ratios.png'), bbox_inches='tight') os.path.join(tgt_dir, 'compression_ratios.png'),
bbox_inches='tight'
)
filesize_v_mse(df).savefig(
os.path.join(tgt_dir, 'filesize_mse.png'),
bbox_inches='tight'
)
mse_losses(df, unique_labels, palette_dict).savefig(
os.path.join(tgt_dir, 'mse_losses.png'),
bbox_inches='tight'
)
def setup(tgt_dir): def setup(tgt_dir):
@ -239,6 +288,7 @@ def setup(tgt_dir):
params = {'text.usetex': True, params = {'text.usetex': True,
'font.size': 11, 'font.size': 11,
'font.family': 'serif', 'font.family': 'serif',
'figure.dpi': 300,
} }
plt.rcParams.update(params) plt.rcParams.update(params)
@ -266,8 +316,8 @@ def preprocessing(df: pd.DataFrame) -> tuple:
n_labels = len(unique_labels) n_labels = len(unique_labels)
# Create fixed palette and marker mapping # Create fixed palette and marker mapping
palette_dict = dict(zip(unique_labels, sns.color_palette("tab10", n_labels))) palette_dict = dict(zip(unique_labels, sns.color_palette("Set2", n_labels)))
markers_dict = dict(zip(unique_labels, ['x', '+', '1', '2', '3', '4'])) markers_dict = dict(zip(unique_labels, ['o', '^', 'v', 's', 'D', 'H', 'X']))
return df, unique_labels, palette_dict, markers_dict return df, unique_labels, palette_dict, markers_dict
@ -281,20 +331,26 @@ def main():
generate(*preprocessing(df), tgt_dir=tgt_dir, dpi=150) generate(*preprocessing(df), tgt_dir=tgt_dir, dpi=150)
if __name__ == "__main__": def old_results():
main()
exit()
# read in the csv # read in the csv
df = pd.read_csv("compression_results.csv") df = pd.read_csv("compression_results.csv")
# Make compatible with new code
df[INPUT_SIZE_COL] = df['original_file_size']
df[OUTPUT_SIZE_COL] = df['compressed_file_size']
df['compressor'] = df['model_type']
df[CONTEXT_COL] = df['context_length']
#
df, unique_labels, palette_dict, markers_dict = preprocessing(df)
for dataset_type in df["dataset_type"].unique(): for dataset_type in df["dataset_type"].unique():
for model_type in df["model_type"].unique(): for model_type in df["model_type"].unique():
dataset_df = df[df["dataset_type"] == dataset_type] dataset_df = df[df["dataset_type"] == dataset_type]
model_df = dataset_df[dataset_df["model_type"] == model_type].copy() model_df = dataset_df[dataset_df["model_type"] == model_type].copy()
# execution time # execution time
plt.figure() plt.figure(figsize=(4, 3))
model_df["original_file_size_mb"] = model_df["original_file_size"] / 1e6 model_df["original_file_size_mb"] = model_df["original_file_size"] / 1e6
model_df["compression_time_s"] = model_df["compression_time"] / 1e9 model_df["compression_time_s"] = model_df["compression_time"] / 1e9
model_df["decompression_time_s"] = model_df["decompression_time"] / 1e9 model_df["decompression_time_s"] = model_df["decompression_time"] / 1e9
@ -304,7 +360,7 @@ if __name__ == "__main__":
x="original_file_size_mb", x="original_file_size_mb",
y="compression_time_s", y="compression_time_s",
hue="context_length", hue="context_length",
palette="Set1", palette="Set2",
markers=True, markers=True,
legend="brief", legend="brief",
linestyle="-" linestyle="-"
@ -315,14 +371,14 @@ if __name__ == "__main__":
x="original_file_size_mb", x="original_file_size_mb",
y="decompression_time_s", y="decompression_time_s",
hue="context_length", hue="context_length",
palette="Set1", palette="Set2",
markers=True, markers=True,
legend=False, legend=False,
linestyle="--" linestyle="--"
) )
plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}") # plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}")
plt.xlabel("file size [MB]") plt.xlabel("File size (MB)")
plt.ylabel("Time [s]") plt.ylabel("Time (log, s)")
plt.yscale("log") plt.yscale("log")
plt.legend( plt.legend(
[f"{style}, {c_type}" for style, c_type in zip(["Solid", "Dashed"], ["compression", "decompression"])]) [f"{style}, {c_type}" for style, c_type in zip(["Solid", "Dashed"], ["compression", "decompression"])])
@ -330,56 +386,57 @@ if __name__ == "__main__":
plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png") plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png")
# compression ratio # compression ratio
plt.figure() plt.figure(figsize=(4, 3))
c256 = model_df[model_df["context_length"] == 256] c256 = model_df[model_df["context_length"] == 256]
c128 = model_df[model_df["context_length"] == 128] c128 = model_df[model_df["context_length"] == 128]
plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256") plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128") plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}") # plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}")
plt.xlabel("Original file size [MB]") plt.xlabel("Original file size (MB)")
plt.ylabel("Compressed file size [MB]") plt.ylabel("Compressed file size (MB)")
plt.legend() plt.ylim(0, model_df["compressed_file_size"].max() / 1e6)
plt.legend(title="Context size")
plt.tight_layout()
plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png") plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")
# if model_type == "cnn": if model_type == "cnn":
# import numpy as np
# plt.figure()
# plt.figure() for length, linestyle in [(128, '-'), (256, '--')]:
# for length, linestyle in [(128, '-'), (256, '--')]: # extrapolate execution time to larger files
# # extrapolate execution time to larger files x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6
# x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6 y = model_df[model_df["context_length"] == length]["compression_time"]
# y = model_df[model_df["context_length"] == length]["compression_time"] y_decom = model_df[model_df["context_length"] == length]["decompression_time"]
# y_decom = model_df[model_df["context_length"] == length]["decompression_time"]
# b1, loga1 = np.polyfit(x, np.log(y), 1)
# b1, loga1 = np.polyfit(x, np.log(y), 1) b2, loga2 = np.polyfit(x, np.log(y_decom), 1)
# b2, loga2 = np.polyfit(x, np.log(y_decom), 1)
# x_comp = np.linspace(0, 40, 1000)
# x_comp = np.linspace(0, 40, 1000) x_decomp = np.linspace(0, 40, 1000)
# x_decomp = np.linspace(0, 40, 1000) a1 = np.exp(loga1)
# a1 = np.exp(loga1) a2 = np.exp(loga2)
# a2 = np.exp(loga2)
# plt.plot(
# x_comp, a1 * np.exp(x_comp),
# plt.plot( label=f"{length} compression",
# x_comp, a1 * np.exp(x_comp), linestyle=linestyle
# label=f"{length} compression", )
# linestyle=linestyle plt.plot(
# ) x_decomp, a2 * np.exp(x_decomp),
# plt.plot( label=f"{length} decompression",
# x_decomp, a2 * np.exp(x_decomp), linestyle=linestyle
# label=f"{length} decompression", )
# linestyle=linestyle
# ) plt.grid(True)
# plt.legend()
# plt.title(f"(Log-linear) Extrapolated execution time for CNN")
# # plt.xscale('log')
# plt.legend() plt.xlabel("File size (MB)")
# plt.title(f"Extrapolated execution time for CNN compression and decompression") plt.yscale('log')
# plt.xlabel("File size [MB]") plt.ylabel("Time (log, s)")
# plt.ylabel("Time [s]") plt.tight_layout()
# plt.tight_layout() plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png")
# plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png")
for model_type in df["model_type"].unique(): for model_type in df["model_type"].unique():
model_df = df[df["model_type"] == model_type] model_df = df[df["model_type"] == model_type]
@ -395,19 +452,25 @@ if __name__ == "__main__":
y - bar_height / 2, y - bar_height / 2,
c256["mse_loss"], c256["mse_loss"],
height=bar_height, height=bar_height,
label="256" label="256",
) )
plt.barh( plt.barh(
y + bar_height / 2, y + bar_height / 2,
c128["mse_loss"], c128["mse_loss"],
height=bar_height, height=bar_height,
label="128" label="128",
) )
plt.yticks(y, files, rotation=45, ha="right") plt.yticks(y, files, rotation=45, ha="right")
plt.title(f"MSE loss for different context lengths") plt.title(f"MSE loss for different context lengths")
plt.xlabel("MSE loss") plt.xlabel("MSE loss")
plt.ylabel("Filename") plt.ylabel("Filename")
plt.legend() plt.legend()
plt.grid(True)
plt.tight_layout() plt.tight_layout()
plt.savefig(f"./graphs/{model_type}_loss.png") plt.savefig(f"./graphs/{model_type}_loss.png")
if __name__ == "__main__":
main()
old_results()