feat: Graphs checkpoint
parent 2f869a8a7a
commit 15062d8884
4 changed files with 453 additions and 4 deletions
@@ -1,9 +1,290 @@
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib.figure import Figure

ALGORITHM_COL = 'compressor'
LABEL_COL = 'label'
CONTEXT_COL = 'context_size'
INPUT_SIZE_COL = 'input_size'
OUTPUT_SIZE_COL = 'compressed_size'
COMPRESS_TIME_COL = 'compression_time'
DECOMPRESS_TIME_COL = 'decompression_time'
RATE_COL = 'compression_ratio'
DISTORTION_COL = 'mse_loss'

def original_v_compressed_filesize(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "rate" graph"""
    plt.figure()

    break_point = 0.1

    ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                                     OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
                                     break_point, 'Compressor', 'upper left', LABEL_COL,
                                     unique_labels, palette_dict, markers_dict)

    # Add baseline (y = x)
    df_small, df_large = df[df[INPUT_SIZE_COL] < break_point], df[df[INPUT_SIZE_COL] > break_point]
    baseline_label = 'Compression ratio 1.0'
    baseline_alpha = 0.5
    min_xy, max_xy = df_small[INPUT_SIZE_COL].min(), df_small[INPUT_SIZE_COL].max()
    ax_small.plot([min_xy, max_xy], [min_xy, max_xy],
                  color='gray', linestyle='--', label=baseline_label, alpha=baseline_alpha)
    min_xy, max_xy = df_large[INPUT_SIZE_COL].min(), df_large[INPUT_SIZE_COL].max()
    ax_large.plot([min_xy, max_xy], [min_xy, max_xy],
                  color='gray', linestyle='--', label=baseline_label, alpha=baseline_alpha)

    plt.yscale('log')

    return plt.gcf()

def filesize_v_compression_time(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "execution time" graph (compression)"""
    plt.figure()

    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                COMPRESS_TIME_COL, 'Compression time (log, s)',
                0.1, 'Compressor', 'center left', LABEL_COL,
                unique_labels, palette_dict, markers_dict)

    plt.yscale('log')

    return plt.gcf()

def filesize_v_decompression_time(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "execution time" graph (decompression)"""
    plt.figure()

    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                DECOMPRESS_TIME_COL, 'Decompression time (log, s)',
                0.1, 'Compressor', 'center left', LABEL_COL,
                unique_labels, palette_dict, markers_dict)

    plt.yscale('log')

    return plt.gcf()

def split_graph(
        df, x, x_axis_label, y, y_axis_label,
        break_point, legend_title, legend_loc, hue, unique_labels, palette_dict, markers_dict
) -> tuple:
    """Scatter plot split at break_point into two panels drawn as a broken x-axis."""
    df = df.sort_values(by=x)

    f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(10, 5))

    df_left = df[df[x] < break_point]
    sns.scatterplot(
        data=df_left,
        x=x,
        y=y,
        ax=ax_left,
        hue=hue,
        hue_order=unique_labels,
        palette=palette_dict,
        style=hue,
        style_order=unique_labels,
        markers=markers_dict
    )
    ax_left.set_xlabel('')

    df_right = df[df[x] > break_point]
    sns.scatterplot(
        data=df_right,
        x=x,
        y=y,
        ax=ax_right,
        hue=hue,
        hue_order=unique_labels,
        palette=palette_dict,
        style=hue,
        style_order=unique_labels,
        markers=markers_dict
    )
    ax_right.set_xlabel('')
    ax_right.set_ylabel('')

    # Combine both plots into one
    ax_left.spines['right'].set_visible(False)
    ax_right.spines['left'].set_visible(False)
    ax_right.yaxis.tick_right()
    ax_right.tick_params(labelright=False)
    ax_right.yaxis.set_ticks_position('none')

    # Add diagonal slash lines to indicate the break (with help from Gemini)
    d = 0.015  # proportion of vertical to horizontal extent of the slanted line
    kwargs = dict(transform=ax_left.transAxes, color='k', clip_on=False)
    ax_left.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # Bottom-right diagonal
    ax_left.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # Top-right diagonal

    kwargs.update(transform=ax_right.transAxes)  # Switch to the other axes
    ax_right.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # Top-left diagonal
    ax_right.plot((-d, +d), (-d, +d), **kwargs)  # Bottom-left diagonal

    # Fix legends
    handles_left, labels_left = ax_left.get_legend_handles_labels()
    handles_right, labels_right = ax_right.get_legend_handles_labels()
    unique_legend = dict(zip(labels_left + labels_right, handles_left + handles_right))
    ax_left.get_legend().remove()
    ax_right.get_legend().remove()
    ax_left.legend(unique_legend.values(), unique_legend.keys(), title=legend_title, loc=legend_loc)

    f.text(0.5, 0, x_axis_label, ha='center', va='center')
    ax_left.set_ylabel(y_axis_label)

    ax_left.grid(True)
    ax_right.grid(True)

    plt.tight_layout()
    return ax_left, ax_right

def compression_v_mse_scatter(df: pd.DataFrame) -> Figure:
    """The "distortion" graph"""
    plt.figure()

    sns.scatterplot(
        data=df,
        x=RATE_COL,
        y=DISTORTION_COL
    )

    plt.xscale('log')
    plt.xlabel('Compression ratio (log)')

    # TODO This does not work properly (likely because mse_loss is 0.0 for the lossless compressors,
    #   which cannot be drawn on a log scale)
    plt.yscale('log')
    plt.ylabel('MSE (log)')

    return plt.gcf()

def compression_ratios(df: pd.DataFrame) -> Figure:
    """The compression-ratio box plot"""
    fig, ax = plt.subplots()
    sns.boxplot(
        data=df,
        x=RATE_COL,
        y=LABEL_COL,
        ax=ax
    )

    ax.set_xlabel('Compression ratio')
    ax.set_ylabel('')

    ax.grid(True)

    return fig

def generate(
        df: pd.DataFrame, unique_labels, palette_dict, markers_dict,
        tgt_dir: str, dpi: int = 300
) -> None:
    """Generate all the plots"""
    # Make plots
    original_v_compressed_filesize(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'original_v_compressed_filesize.png'),
        bbox_inches='tight',
        dpi=dpi
    )

    filesize_v_compression_time(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'filesize_v_compression_time.png'),
        bbox_inches='tight',
        dpi=dpi
    )
    filesize_v_decompression_time(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'filesize_v_decompression_time.png'),
        bbox_inches='tight',
        dpi=dpi
    )

    # compression_v_mse_scatter(df).savefig(os.path.join(tgt_dir, 'compression_v_mse.png'), bbox_inches='tight')
    compression_ratios(df).savefig(os.path.join(tgt_dir, 'compression_ratios.png'), bbox_inches='tight')

def setup(tgt_dir):
    # Create the target directory if it does not exist
    os.makedirs(tgt_dir, exist_ok=True)

    # Prepare matplotlib for use with LaTeX (makes it look less out of place, less Pythonesque)
    params = {'text.usetex': True,
              'font.size': 11,
              'font.family': 'serif',
              }
    plt.rcParams.update(params)

def preprocessing(df: pd.DataFrame) -> tuple:
    # Convert bytes to MB
    df[INPUT_SIZE_COL] /= 1e6
    df[OUTPUT_SIZE_COL] /= 1e6

    # Convert ns to s
    df[COMPRESS_TIME_COL] /= 1e9

    # Add labels to differentiate between algorithms with context lengths
    def create_label(row):
        compressor = row[ALGORITHM_COL]
        return compressor if pd.isna(row[CONTEXT_COL]) else f"{compressor} ($L = {int(row[CONTEXT_COL])}$)"

    df[LABEL_COL] = df.apply(create_label, axis=1)

    # Add the compression ratio
    df[RATE_COL] = df[INPUT_SIZE_COL] / df[OUTPUT_SIZE_COL]

    # Identify all categories upfront
    unique_labels = sorted(df[LABEL_COL].unique())
    n_labels = len(unique_labels)

    # Create fixed palette and marker mapping (zip truncates, so this assumes at most six labels)
    palette_dict = dict(zip(unique_labels, sns.color_palette("tab10", n_labels)))
    markers_dict = dict(zip(unique_labels, ['x', '+', '1', '2', '3', '4']))

    return df, unique_labels, palette_dict, markers_dict

def main():
    """Load the data and generate the plots."""
    df = pd.read_csv("measurements.csv")

    tgt_dir = "figures"
    setup(tgt_dir)
    generate(*preprocessing(df), tgt_dir=tgt_dir, dpi=150)

if __name__ == "__main__":
    main()
    exit()

# --- Old plotting script below; the exit() above keeps it from running at this checkpoint ---

# read in the csv
df = pd.read_csv("compression_results.csv")
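For context, the new module can also be driven piecemeal rather than through main(). A minimal sketch, assuming the file added in this commit is importable as graphs (the module name is not shown in the diff) and that a CSV with the same columns as results/measurements.csv is on hand; note that setup() switches matplotlib to LaTeX text rendering, which requires a LaTeX installation:

    import pandas as pd

    import graphs  # hypothetical module name for the file added in this commit

    df = pd.read_csv("measurements.csv")
    df, labels, palette, markers = graphs.preprocessing(df)
    graphs.setup("figures")  # create the output directory and set the LaTeX rc params
    fig = graphs.original_v_compressed_filesize(df, labels, palette, markers)
    fig.savefig("figures/rate_only.png", dpi=150, bbox_inches="tight")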
@@ -43,7 +324,8 @@ if __name__ == "__main__":
    plt.xlabel("file size [MB]")
    plt.ylabel("Time [s]")
    plt.yscale("log")
    plt.legend(
        [f"{style}, {c_type}" for style, c_type in zip(["Solid", "Dashed"], ["compression", "decompression"])])
    plt.tight_layout()
    plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png")
@@ -60,7 +342,6 @@ if __name__ == "__main__":
    plt.legend()
    plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")


    # if model_type == "cnn":
    #     import numpy as np
    #
results/measurements.csv (new file, 49 lines added)
@@ -0,0 +1,49 @@
compressor,training_dataset,context_size,input_filename,input_size,compressed_size,compression_time,decompressed_size,decompression_time,mse_loss
gzip,,,genome.fna,4699745,1424004,.681197994,4699745,.015465955,0.0
gzip,,,genome_large.fna,23498433,7118154,3.384480370,23498433,.067414798,0.0
gzip,,,genome_small.fna,1367,589,.001937446,1367,.001983156,0.0
gzip,,,genome_xlarge.fna,46996793,14235842,6.775190783,46996793,.131633333,0.0
gzip,,,genome_xsmall.fna,1043,475,.002007016,1043,.002012775,0.0
gzip,,,genome_xxsmall.fna,800,393,.002071485,800,.001958195,0.0
gzip,,,text_large.txt,12977332,4770044,.613155078,12977332,.043915520,0.0
gzip,,,text_small.txt,1022,590,.002070305,1022,.001903226,0.0
gzip,,,text.txt,6488666,2385264,.308393934,6488666,.023656716,0.0
gzip,,,text_xlarge.txt,25954664,9539638,1.229028819,25954664,.085925486,0.0
gzip,,,text_xsmall.txt,825,473,.002110205,825,.001980535,0.0
gzip,,,text_xxsmall.txt,492,325,.001867306,492,.002114055,0.0
LZ4,,,genome.fna,4699745,2655438,.012701161,4699745,.009190410,0.0
LZ4,,,genome_large.fna,23498433,13275544,.020719873,23498433,.025022334,0.0
LZ4,,,genome_small.fna,1367,1041,.001883076,1367,.002144425,0.0
LZ4,,,genome_xlarge.fna,46996793,26551229,.031734579,46996793,.043495412,0.0
LZ4,,,genome_xsmall.fna,1043,814,.001954316,1043,.002085566,0.0
LZ4,,,genome_xxsmall.fna,800,641,.001893416,800,.001943666,0.0
LZ4,,,text_large.txt,12977332,7879136,.017927300,12977332,.015065196,0.0
LZ4,,,text_small.txt,1022,857,.001967146,1022,.002040285,0.0
LZ4,,,text.txt,6488666,3939378,.014891266,6488666,.009709618,0.0
LZ4,,,text_xlarge.txt,25954664,15758785,.023613977,25954664,.023486747,0.0
LZ4,,,text_xsmall.txt,825,678,.001757717,825,.002191075,0.0
LZ4,,,text_xxsmall.txt,492,438,.001869646,492,.002134206,0.0
Autoencoder,genome,256,genome.fna,4699745,4259288,636915773,,27887947,83.62875366210938
Autoencoder,genome,256,genome_large.fna,23498433,21295512,1932602305,,7778175,83.59369659423828
Autoencoder,genome,256,genome_xlarge.fna,46996793,42591024,3850901316,,10996509,83.58621215820312
Autoencoder,genome,128,genome.fna,4699745,9399552,390656081,,5804539,83.01229095458984
Autoencoder,genome,128,genome_large.fna,23498433,46996992,1932561312,,10575739,83.01190185546875
Autoencoder,genome,128,genome_xlarge.fna,46996793,93993728,3873777067,,18670984,83.00253295898438
Autoencoder,enwik9,256,text.txt,6488666,6184668,551986635,,10536259,786.6799926757812
Autoencoder,enwik9,256,text_large.txt,12977332,12369092,1065897991,,5763879,786.6173706054688
Autoencoder,enwik9,256,text_xlarge.txt,25954664,24738184,2139223055,,8369164,786.6337890625
Autoencoder,enwik9,128,text.txt,6488666,12774636,545577194,,20624030,206.2792510986328
Autoencoder,enwik9,128,text_large.txt,12977332,25549272,1073396133,,60871642,206.24131774902344
Autoencoder,enwik9,128,text_xlarge.txt,25954664,51098292,2145601924,,59481825,206.33023071289062
CNN,genome,256,genome_small.fna,1367,1743,1029290599,,890595665,0.0
CNN,genome,256,genome_xsmall.fna,1043,1343,686878467,,683701323,0.0
CNN,genome,256,genome_xxsmall.fna,800,1038,531354486,,527072394,0.0
CNN,genome,128,genome_small.fna,1367,1682,829554150,,851934528,0.0
CNN,genome,128,genome_xsmall.fna,1043,1300,654742547,,637221301,0.0
CNN,genome,128,genome_xxsmall.fna,800,1006,483840337,,488870786,0.0
CNN,enwik9,256,text_small.txt,1022,1561,693378115,,671294958,0.0
CNN,enwik9,256,text_xsmall.txt,825,1268,550333502,,550062973,0.0
CNN,enwik9,256,text_xxsmall.txt,492,790,333745012,,332073466,0.0
CNN,enwik9,128,text_small.txt,1022,1129,629310179,,621317553,0.0
CNN,enwik9,128,text_xsmall.txt,825,882,504538600,,504907940,0.0
CNN,enwik9,128,text_xxsmall.txt,492,571,305443187,,308964670,0.0
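These measurements are what preprocessing() above consumes. A quick sanity check of the raw numbers, as a sketch that assumes the CSV is read from the results/ directory:

    import pandas as pd

    df = pd.read_csv("results/measurements.csv")
    df["ratio"] = df["input_size"] / df["compressed_size"]
    # e.g. gzip on genome.fna: 4699745 / 1424004 ≈ 3.30
    print(df.groupby("compressor")["ratio"].mean().sort_values(ascending=False))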