"""Generate the compression benchmark figures (rate, execution time and
compression ratios) from measurements.csv."""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib.figure import Figure

# Column names used throughout (most come straight from the measurements CSV;
# LABEL_COL and RATE_COL are added during preprocessing)
ALGORITHM_COL = 'compressor'
LABEL_COL = 'label'
CONTEXT_COL = 'context_size'
INPUT_SIZE_COL = 'input_size'
OUTPUT_SIZE_COL = 'compressed_size'
COMPRESS_TIME_COL = 'compression_time'
DECOMPRESS_TIME_COL = 'decompression_time'
RATE_COL = 'compression_ratio'
DISTORTION_COL = 'mse_loss'


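# A sketch of the expected input format (illustrative values only, not taken from the
# real data): one row per compressed file, sizes in bytes, times in nanoseconds, and
# context_size left empty for classic compressors without a context length:
#
#   compressor,context_size,input_size,compressed_size,compression_time,decompression_time,mse_loss
#   gzip,,1048576,401234,1200000,900000,0.0
#   cnn,128,1048576,210987,95000000,91000000,0.013

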
def original_v_compressed_filesize(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "rate" graph"""
    plt.figure()

    break_point = 0.1

    ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                                     OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
                                     break_point, 'Compressor', 'upper left', LABEL_COL,
                                     unique_labels, palette_dict, markers_dict)

    # Add Baseline (y=x)
    df_small, df_large = df[df[INPUT_SIZE_COL] < break_point], df[df[INPUT_SIZE_COL] > break_point]
    baseline_label = 'Compression ratio 1.0'
    baseline_alpha = 0.5
    min_xy, max_xy = df_small[INPUT_SIZE_COL].min(), df_small[INPUT_SIZE_COL].max()
    ax_small.plot([min_xy, max_xy], [min_xy, max_xy],
                  color='gray', linestyle='--', label=baseline_label, alpha=baseline_alpha)
    min_xy, max_xy = df_large[INPUT_SIZE_COL].min(), df_large[INPUT_SIZE_COL].max()
    ax_large.plot([min_xy, max_xy], [min_xy, max_xy],
                  color='gray', linestyle='--', label=baseline_label, alpha=baseline_alpha)

    plt.yscale('log')

    return plt.gcf()


def filesize_v_compression_time(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "execution time" graph (compression)"""
    plt.figure()

    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                COMPRESS_TIME_COL, 'Compression time (log, s)',
                0.1, 'Compressor', 'center left', LABEL_COL,
                unique_labels, palette_dict, markers_dict)

    plt.yscale('log')

    return plt.gcf()


def filesize_v_decompression_time(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "execution time" graph (decompression)"""
    plt.figure()

    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                DECOMPRESS_TIME_COL, 'Decompression time (log, s)',
                0.1, 'Compressor', 'center left', LABEL_COL,
                unique_labels, palette_dict, markers_dict)

    plt.yscale('log')

    return plt.gcf()


def split_graph(
        df, x, x_axis_label, y, y_axis_label,
        break_point, legend_title, legend_loc, hue, unique_labels, palette_dict, markers_dict
) -> tuple:
    """Scatter plot with a broken x-axis: points below break_point go on the left axis,
    points above it on the right axis; both axes share the y-axis."""
    df = df.sort_values(by=x)

    f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(10, 5))

    df_left = df[df[x] < break_point]
    sns.scatterplot(
        data=df_left,
        x=x,
        y=y,
        ax=ax_left,
        hue=hue,
        hue_order=unique_labels,
        palette=palette_dict,
        style=hue,
        style_order=unique_labels,
        markers=markers_dict
    )
    ax_left.set_xlabel('')

    df_right = df[df[x] > break_point]
    sns.scatterplot(
        data=df_right,
        x=x,
        y=y,
        ax=ax_right,
        hue=hue,
        hue_order=unique_labels,
        palette=palette_dict,
        style=hue,
        style_order=unique_labels,
        markers=markers_dict
    )
    ax_right.set_xlabel('')
    ax_right.set_ylabel('')

    # Combine both plots into one
    ax_left.spines['right'].set_visible(False)
    ax_right.spines['left'].set_visible(False)
    ax_right.yaxis.tick_right()
    ax_right.tick_params(labelright=False)
    ax_right.yaxis.set_ticks_position('none')

    # Add diagonal slash lines to indicate the break (with help from Gemini)
    d = .015  # proportion of vertical to horizontal extent of the slanted line
    kwargs = dict(transform=ax_left.transAxes, color='k', clip_on=False)
    ax_left.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # Bottom-right diagonal
    ax_left.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # Top-right diagonal

    kwargs.update(transform=ax_right.transAxes)  # Switch to the other axes
    ax_right.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # Top-left diagonal
    ax_right.plot((-d, +d), (-d, +d), **kwargs)  # Bottom-left diagonal

    # Fix legends
    handles_left, labels_left = ax_left.get_legend_handles_labels()
    handles_right, labels_right = ax_right.get_legend_handles_labels()
    unique_legend = dict(zip(labels_left + labels_right, handles_left + handles_right))
    ax_left.get_legend().remove()
    ax_right.get_legend().remove()
    ax_left.legend(unique_legend.values(), unique_legend.keys(), title=legend_title, loc=legend_loc)

    f.text(0.5, 0, x_axis_label, ha='center', va='center')
    ax_left.set_ylabel(y_axis_label)

    ax_left.grid(True)
    ax_right.grid(True)

    plt.tight_layout()
    return ax_left, ax_right


def compression_v_mse_scatter(df: pd.DataFrame) -> Figure:
    """The "distortion" graph"""
    plt.figure()

    sns.scatterplot(
        data=df,
        x=RATE_COL,
        y=DISTORTION_COL
    )

    plt.xscale('log')
    plt.xlabel('Compression ratio (log)')

    # TODO This does not work properly
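    # (Speculation, not verified against the data: the log y-scale below cannot display
    # rows whose MSE is exactly 0, e.g. lossless compressors, so those points silently
    # vanish. Filtering them out first, or switching to a symlog scale, might help:
    #   df = df[df[DISTORTION_COL] > 0]
    # )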

    plt.yscale('log')
    plt.ylabel('MSE (log)')

    return plt.gcf()


def compression_ratios(df: pd.DataFrame) -> Figure:
    """The compression ratio box plot (one box per compressor label)"""
    fig, ax = plt.subplots()
    sns.boxplot(
        data=df,
        x=RATE_COL,
        y=LABEL_COL,
        ax=ax
    )

    ax.set_xlabel('Compression ratio')
    ax.set_ylabel('')

    ax.grid(True)

    return plt.gcf()


def generate(
        df: pd.DataFrame, unique_labels, palette_dict, markers_dict,
        tgt_dir: str, dpi: int = 300
) -> None:
    """Generate all the plots"""
    # Make plots

    original_v_compressed_filesize(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'original_v_compressed_filesize.png'),
        bbox_inches='tight',
        dpi=dpi
    )

    filesize_v_compression_time(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'filesize_v_compression_time.png'),
        bbox_inches='tight',
        dpi=dpi
    )
    filesize_v_decompression_time(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'filesize_v_decompression_time.png'),
        bbox_inches='tight',
        dpi=dpi
    )

    # compression_v_mse_scatter(df).savefig(os.path.join(tgt_dir, 'compression_v_mse.png'), bbox_inches='tight')
    compression_ratios(df).savefig(os.path.join(tgt_dir, 'compression_ratios.png'), bbox_inches='tight')


def setup(tgt_dir):
    # Create the target directory if it does not exist
    os.makedirs(tgt_dir, exist_ok=True)

    # Prepare matplotlib for use with LaTeX (makes it look less out of place, less Pythonesque)
    params = {'text.usetex': True,
              'font.size': 11,
              'font.family': 'serif',
              }
    plt.rcParams.update(params)


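# Note: 'text.usetex' above requires a working LaTeX installation on the machine
# running this script. If LaTeX is unavailable, a rough fallback (an assumption,
# not part of the original setup) would be to disable it and rely on mathtext:
#
#   plt.rcParams.update({'text.usetex': False, 'font.family': 'serif'})

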
def preprocessing(df: pd.DataFrame) -> tuple:
    # Convert bytes to MB
    df[INPUT_SIZE_COL] /= 1e6
    df[OUTPUT_SIZE_COL] /= 1e6

    # Convert ns to s (assuming decompression times are recorded in ns as well,
    # like the compression times)
    df[COMPRESS_TIME_COL] /= 1e9
    df[DECOMPRESS_TIME_COL] /= 1e9

    # Add labels to differentiate between algorithms with context lengths
    def create_label(row):
        compressor = row[ALGORITHM_COL]
        return compressor if pd.isna(row[CONTEXT_COL]) else f"{compressor} ($L = {int(row[CONTEXT_COL])}$)"

    df[LABEL_COL] = df.apply(create_label, axis=1)

    # Add the compression ratio
    df[RATE_COL] = df[INPUT_SIZE_COL] / df[OUTPUT_SIZE_COL]

    # Identify all categories upfront
    unique_labels = sorted(df[LABEL_COL].unique())
    n_labels = len(unique_labels)

    # Create fixed palette and marker mapping (note: only six marker styles are listed)
    palette_dict = dict(zip(unique_labels, sns.color_palette("tab10", n_labels)))
    markers_dict = dict(zip(unique_labels, ['x', '+', '1', '2', '3', '4']))

    return df, unique_labels, palette_dict, markers_dict


def main():
    """Load the data and generate the plots."""
    df = pd.read_csv("measurements.csv")

    tgt_dir = "figures"
    setup(tgt_dir)
    generate(*preprocessing(df), tgt_dir=tgt_dir, dpi=150)


if __name__ == "__main__":
    main()
    exit()

# Legacy plotting code below, kept for reference; the exit() above stops execution
# before it is reached when this file is run as a script.

# read in the csv
df = pd.read_csv("compression_results.csv")

for dataset_type in df["dataset_type"].unique():
    for model_type in df["model_type"].unique():
        dataset_df = df[df["dataset_type"] == dataset_type]
        model_df = dataset_df[dataset_df["model_type"] == model_type].copy()

        # execution time
        plt.figure()
        model_df["original_file_size_mb"] = model_df["original_file_size"] / 1e6
        model_df["compression_time_s"] = model_df["compression_time"] / 1e9
        model_df["decompression_time_s"] = model_df["decompression_time"] / 1e9
        # compression
        sns.lineplot(
            data=model_df,
            x="original_file_size_mb",
            y="compression_time_s",
            hue="context_length",
            palette="Set1",
            markers=True,
            legend="brief",
            linestyle="-"
        )
        # decompression
        sns.lineplot(
            data=model_df,
            x="original_file_size_mb",
            y="decompression_time_s",
            hue="context_length",
            palette="Set1",
            markers=True,
            legend=False,
            linestyle="--"
        )
        plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}")
        plt.xlabel("file size [MB]")
        plt.ylabel("Time [s]")
        plt.yscale("log")
        plt.legend(
            [f"{style}, {c_type}" for style, c_type in zip(["Solid", "Dashed"], ["compression", "decompression"])])
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png")

        # compression ratio
        plt.figure()
        c256 = model_df[model_df["context_length"] == 256]
        c128 = model_df[model_df["context_length"] == 128]

        plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
        plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
        plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}")
        plt.xlabel("Original file size [MB]")
        plt.ylabel("Compressed file size [MB]")
        plt.legend()
        plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")

        # if model_type == "cnn":
        #     import numpy as np
        #
        #     plt.figure()
        #     for length, linestyle in [(128, '-'), (256, '--')]:
        #         # extrapolate execution time to larger files
        #         x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6
        #         y = model_df[model_df["context_length"] == length]["compression_time"]
        #         y_decom = model_df[model_df["context_length"] == length]["decompression_time"]
        #
        #         b1, loga1 = np.polyfit(x, np.log(y), 1)
        #         b2, loga2 = np.polyfit(x, np.log(y_decom), 1)
        #
        #         x_comp = np.linspace(0, 40, 1000)
        #         x_decomp = np.linspace(0, 40, 1000)
        #         a1 = np.exp(loga1)
        #         a2 = np.exp(loga2)
        #
        #         plt.plot(
        #             x_comp, a1 * np.exp(x_comp),
        #             label=f"{length} compression",
        #             linestyle=linestyle
        #         )
        #         plt.plot(
        #             x_decomp, a2 * np.exp(x_decomp),
        #             label=f"{length} decompression",
        #             linestyle=linestyle
        #         )
        #
        #     plt.legend()
        #     plt.title(f"Extrapolated execution time for CNN compression and decompression")
        #     plt.xlabel("File size [MB]")
        #     plt.ylabel("Time [s]")
        #     plt.tight_layout()
        #     plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png")

for model_type in df["model_type"].unique():
    model_df = df[df["model_type"] == model_type]

    plt.figure(figsize=(10, 4))
    bar_height = 0.25
    files = model_df["input_file_name"].unique()
    y = np.arange(len(files))
    c256 = model_df[model_df["context_length"] == 256]
    c128 = model_df[model_df["context_length"] == 128]

    plt.barh(
        y - bar_height / 2,
        c256["mse_loss"],
        height=bar_height,
        label="256"
    )

    plt.barh(
        y + bar_height / 2,
        c128["mse_loss"],
        height=bar_height,
        label="128"
    )
    plt.yticks(y, files, rotation=45, ha="right")
    plt.title("MSE loss for different context lengths")
    plt.xlabel("MSE loss")
    plt.ylabel("Filename")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"./graphs/{model_type}_loss.png")