diff --git a/graphs/autoencoder_enwik9_accuracy.png b/graphs/autoencoder_enwik9_accuracy.png
deleted file mode 100644
index fc349c9..0000000
Binary files a/graphs/autoencoder_enwik9_accuracy.png and /dev/null differ
diff --git a/graphs/autoencoder_enwik9_compression_ratio.png b/graphs/autoencoder_enwik9_compression_ratio.png
deleted file mode 100644
index 79db1d4..0000000
Binary files a/graphs/autoencoder_enwik9_compression_ratio.png and /dev/null differ
diff --git a/graphs/autoencoder_enwik9_compression_time.png b/graphs/autoencoder_enwik9_compression_time.png
deleted file mode 100644
index c6fa288..0000000
Binary files a/graphs/autoencoder_enwik9_compression_time.png and /dev/null differ
diff --git a/graphs/autoencoder_enwik9_decompression_time.png b/graphs/autoencoder_enwik9_decompression_time.png
deleted file mode 100644
index ec802d8..0000000
Binary files a/graphs/autoencoder_enwik9_decompression_time.png and /dev/null differ
diff --git a/graphs/autoencoder_genome_accuracy.png b/graphs/autoencoder_genome_accuracy.png
deleted file mode 100644
index 4a7b885..0000000
Binary files a/graphs/autoencoder_genome_accuracy.png and /dev/null differ
diff --git a/graphs/autoencoder_genome_compression_ratio.png b/graphs/autoencoder_genome_compression_ratio.png
deleted file mode 100644
index 7181f20..0000000
Binary files a/graphs/autoencoder_genome_compression_ratio.png and /dev/null differ
diff --git a/graphs/autoencoder_genome_compression_time.png b/graphs/autoencoder_genome_compression_time.png
deleted file mode 100644
index 12a4986..0000000
Binary files a/graphs/autoencoder_genome_compression_time.png and /dev/null differ
diff --git a/graphs/autoencoder_genome_decompression_time.png b/graphs/autoencoder_genome_decompression_time.png
deleted file mode 100644
index 7b4bd4f..0000000
Binary files a/graphs/autoencoder_genome_decompression_time.png and /dev/null differ
diff --git a/graphs/cnn_enwik9_accuracy.png b/graphs/cnn_enwik9_accuracy.png
deleted file mode 100644
index a978127..0000000
Binary files a/graphs/cnn_enwik9_accuracy.png and /dev/null differ
diff --git a/graphs/cnn_enwik9_compression_ratio.png b/graphs/cnn_enwik9_compression_ratio.png
deleted file mode 100644
index 04a2ce0..0000000
Binary files a/graphs/cnn_enwik9_compression_ratio.png and /dev/null differ
diff --git a/graphs/cnn_enwik9_compression_time.png b/graphs/cnn_enwik9_compression_time.png
deleted file mode 100644
index c35a8b0..0000000
Binary files a/graphs/cnn_enwik9_compression_time.png and /dev/null differ
diff --git a/graphs/cnn_enwik9_decompression_time.png b/graphs/cnn_enwik9_decompression_time.png
deleted file mode 100644
index 81bff80..0000000
Binary files a/graphs/cnn_enwik9_decompression_time.png and /dev/null differ
diff --git a/graphs/cnn_genome_accuracy.png b/graphs/cnn_genome_accuracy.png
deleted file mode 100644
index fce82c1..0000000
Binary files a/graphs/cnn_genome_accuracy.png and /dev/null differ
diff --git a/graphs/cnn_genome_compression_ratio.png b/graphs/cnn_genome_compression_ratio.png
deleted file mode 100644
index d0a2843..0000000
Binary files a/graphs/cnn_genome_compression_ratio.png and /dev/null differ
diff --git a/graphs/cnn_genome_compression_time.png b/graphs/cnn_genome_compression_time.png
deleted file mode 100644
index 9496767..0000000
Binary files a/graphs/cnn_genome_compression_time.png and /dev/null differ
diff --git a/graphs/cnn_genome_decompression_time.png b/graphs/cnn_genome_decompression_time.png
deleted file mode 100644
index 39d0dde..0000000
Binary files a/graphs/cnn_genome_decompression_time.png and /dev/null differ
diff --git a/make_graphs.py b/make_graphs.py
index 9be51ef..6738745 100644
--- a/make_graphs.py
+++ b/make_graphs.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import matplotlib.pyplot as plt
+import seaborn as sns
 import numpy as np

 if __name__ == "__main__":
@@ -9,63 +10,40 @@ if __name__ == "__main__":
     for dataset_type in df["dataset_type"].unique():
         for model_type in df["model_type"].unique():
             dataset_df = df[df["dataset_type"] == dataset_type]
-            model_df = dataset_df[dataset_df["model_type"] == model_type]
+            model_df = dataset_df[dataset_df["model_type"] == model_type].copy()

             # execution time
             plt.figure()
-            grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
-            labels = grouped.index.astype(str)  # "128", "256"
-            x = np.arange(len(labels))  # [0, 1]
-
-            plt.bar(x, grouped.values, width=0.6)
-            plt.title(f"{model_type.capitalize()} mean compression time")
-            plt.xticks(x, labels)
-            plt.xlabel("Context length")
-            plt.ylabel("Mean compression time [s]")
-            plt.tight_layout()
-            plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_time.png")
-
-            plt.figure()
-            grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
-            labels = grouped.index.astype(str)  # "128", "256"
-            x = np.arange(len(labels))  # [0, 1]
-
-            plt.bar(x, grouped.values, width=0.6)
-            plt.title(f"{model_type.capitalize()} mean decompression time")
-            plt.xticks(x, labels)
-            plt.xlabel("Context length")
-            plt.ylabel("Mean decompression time [s]")
-            plt.tight_layout()
-            plt.savefig(f"./graphs/{model_type}_{dataset_type}_decompression_time.png")
-
-            # loss
-            plt.figure(figsize=(10, 4))
-            bar_height = 0.25
-            files = model_df["input_file_name"].unique()
-            y = np.arange(len(files))
-            c256 = model_df[model_df["context_length"] == 256]
-            c128 = model_df[model_df["context_length"] == 128]
-
-            plt.barh(
-                y - bar_height / 2,
-                c256["mse_loss"],
-                height=bar_height,
-                label="256"
+            model_df["original_file_size_mb"] = model_df["original_file_size"] / 1e6
+            # compression
+            sns.lineplot(
+                data=model_df,
+                x="original_file_size_mb",
+                y="compression_time",
+                hue="context_length",
+                palette="Set1",
+                markers=True,
+                legend="brief",
+                linestyle="-"
             )
-
-            plt.barh(
-                y + bar_height / 2,
-                c128["mse_loss"],
-                height=bar_height,
-                label="128"
+            # decompression
+            sns.lineplot(
+                data=model_df,
+                x="original_file_size_mb",
+                y="decompression_time",
+                hue="context_length",
+                palette="Set1",
+                markers=True,
+                legend=False,
+                linestyle="--"
             )
-            plt.yticks(y, files, rotation=45, ha="right")
-            plt.title(f"{model_type.capitalize()} MSE loss for different context lengths")
-            plt.xlabel("MSE loss")
-            plt.ylabel("Filename")
-            plt.legend()
+            plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}")
+            plt.xlabel("file size [MB]")
+            plt.ylabel("Time [s]")
+            plt.yscale("log")
+            plt.legend([f"{style}, {c_type}" for style, c_type in zip(["Solid", "Dashed"], ["compression", "decompression"])])
             plt.tight_layout()
-            plt.savefig(f"./graphs/{model_type}_{dataset_type}_accuracy.png")
+            plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png")

             # compression ratio
             plt.figure()
@@ -74,8 +52,79 @@ if __name__ == "__main__":

             plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
             plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
-            plt.title(f"{model_type.capitalize()} compressed file evolution")
+            plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}")
             plt.xlabel("Original file size [MB]")
             plt.ylabel("Compressed file size [MB]")
             plt.legend()
-            plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")
\ No newline at end of file
+            plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")
+
+
+            # if model_type == "cnn":
+            #     import numpy as np
+            #
+            #     plt.figure()
+            #     for length, linestyle in [(128, '-'), (256, '--')]:
+            #         # extrapolate execution time to larger files
+            #         x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6
+            #         y = model_df[model_df["context_length"] == length]["compression_time"]
+            #         y_decom = model_df[model_df["context_length"] == length]["decompression_time"]
+            #
+            #         b1, loga1 = np.polyfit(x, np.log(y), 1)
+            #         b2, loga2 = np.polyfit(x, np.log(y_decom), 1)
+            #
+            #         x_comp = np.linspace(0, 40, 1000)
+            #         x_decomp = np.linspace(0, 40, 1000)
+            #         a1 = np.exp(loga1)
+            #         a2 = np.exp(loga2)
+            #
+            #
+            #         plt.plot(
+            #             x_comp, a1 * np.exp(x_comp),
+            #             label=f"{length} compression",
+            #             linestyle=linestyle
+            #         )
+            #         plt.plot(
+            #             x_decomp, a2 * np.exp(x_decomp),
+            #             label=f"{length} decompression",
+            #             linestyle=linestyle
+            #         )
+            #
+            #
+            #
+            #     plt.legend()
+            #     plt.title(f"Extrapolated execution time for CNN compression and decompression")
+            #     plt.xlabel("File size [MB]")
+            #     plt.ylabel("Time [s]")
+            #     plt.tight_layout()
+            #     plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png")
+
+    for model_type in df["model_type"].unique():
+        model_df = df[df["model_type"] == model_type]
+
+        plt.figure(figsize=(10, 4))
+        bar_height = 0.25
+        files = model_df["input_file_name"].unique()
+        y = np.arange(len(files))
+        c256 = model_df[model_df["context_length"] == 256]
+        c128 = model_df[model_df["context_length"] == 128]
+
+        plt.barh(
+            y - bar_height / 2,
+            c256["mse_loss"],
+            height=bar_height,
+            label="256"
+        )
+
+        plt.barh(
+            y + bar_height / 2,
+            c128["mse_loss"],
+            height=bar_height,
+            label="128"
+        )
+        plt.yticks(y, files, rotation=45, ha="right")
+        plt.title(f"MSE loss for different context lengths")
+        plt.xlabel("MSE loss")
+        plt.ylabel("Filename")
+        plt.legend()
+        plt.tight_layout()
+        plt.savefig(f"./graphs/{model_type}_loss.png")
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 0f40d30..97b31c3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,6 +9,8 @@ dependencies = [
     "fsspec==2024.9.0",
     "lorem>=0.1.1",
     "arithmeticencodingpython",
+    "pandas-stubs~=2.3.3",
+    "seaborn>=0.13.2",
 ]

 [project.optional-dependencies]
diff --git a/uv.lock b/uv.lock
index d927324..95a3d23 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1613,6 +1613,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/70/44/5191d2e4026f86a2a109053e194d3ba7a31a2d10a9c2348368c63ed4e85a/pandas-2.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87", size = 13202175, upload-time = "2025-09-29T23:31:59.173Z" },
 ]

+[[package]]
+name = "pandas-stubs"
+version = "2.3.3.251201"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy" },
+    { name = "types-pytz" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ee/a6/491b2af2cb3ee232765a73fb273a44cc1ac33b154f7745b2df2ee1dc4d01/pandas_stubs-2.3.3.251201.tar.gz", hash = "sha256:7a980f4f08cff2a6d7e4c6d6d26f4c5fcdb82a6f6531489b2f75c81567fe4536", size = 107787, upload-time = "2025-12-01T18:29:22.403Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e2/68/78a3c253f146254b8e2c19f4a4768f272e12ef11001d9b45ec7b165db054/pandas_stubs-2.3.3.251201-py3-none-any.whl", hash = "sha256:eb5c9b6138bd8492fd74a47b09c9497341a278fcfbc8633ea4b35b230ebf4be5", size = 164638, upload-time = "2025-12-01T18:29:21.006Z" },
+]
+
 [[package]]
 name = "pillow"
 version = "12.0.0"
@@ -1718,6 +1731,8 @@ dependencies = [
     { name = "datasets" },
     { name = "fsspec" },
     { name = "lorem" },
+    { name = "pandas-stubs" },
+    { name = "seaborn" },
 ]

 [package.optional-dependencies]
@@ -1746,7 +1761,9 @@ requires-dist = [
     { name = "matplotlib", marker = "extra == 'dev'", specifier = ">=3.10.7" },
     { name = "memray", marker = "extra == 'dev'", specifier = ">=1.19.1" },
     { name = "optuna", marker = "extra == 'dev'", specifier = "==4.5.0" },
+    { name = "pandas-stubs", specifier = "~=2.3.3" },
     { name = "regex", marker = "extra == 'dataset'", specifier = ">=2025.11.3" },
+    { name = "seaborn", specifier = ">=0.13.2" },
     { name = "torch", marker = "extra == 'dev'", specifier = "==2.9.0" },
     { name = "torchdata", marker = "extra == 'dev'", specifier = "==0.7.1" },
     { name = "torchvision", marker = "extra == 'dev'", specifier = "==0.24.0" },
@@ -2116,6 +2133,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/25/7a/b0178788f8dc6cafce37a212c99565fa1fe7872c70c6c9c1e1a372d9d88f/rich-14.2.0-py3-none-any.whl", hash = "sha256:76bc51fe2e57d2b1be1f96c524b890b816e334ab4c1e45888799bfaab0021edd", size = 243393, upload-time = "2025-10-09T14:16:51.245Z" },
 ]

+[[package]]
+name = "seaborn"
+version = "0.13.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "matplotlib" },
+    { name = "numpy" },
+    { name = "pandas" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/86/59/a451d7420a77ab0b98f7affa3a1d78a313d2f7281a57afb1a34bae8ab412/seaborn-0.13.2.tar.gz", hash = "sha256:93e60a40988f4d65e9f4885df477e2fdaff6b73a9ded434c1ab356dd57eefff7", size = 1457696, upload-time = "2024-01-25T13:21:52.551Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" },
+]
+
 [[package]]
 name = "setuptools"
 version = "80.9.0"
@@ -2361,6 +2392,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/5e/dd/5cbf31f402f1cc0ab087c94d4669cfa55bd1e818688b910631e131d74e75/typer_slim-0.20.0-py3-none-any.whl", hash = "sha256:f42a9b7571a12b97dddf364745d29f12221865acef7a2680065f9bb29c7dc89d", size = 47087, upload-time = "2025-10-20T17:03:44.546Z" },
 ]

+[[package]]
+name = "types-pytz"
+version = "2025.2.0.20251108"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/40/ff/c047ddc68c803b46470a357454ef76f4acd8c1088f5cc4891cdd909bfcf6/types_pytz-2025.2.0.20251108.tar.gz", hash = "sha256:fca87917836ae843f07129567b74c1929f1870610681b4c92cb86a3df5817bdb", size = 10961, upload-time = "2025-11-08T02:55:57.001Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e7/c1/56ef16bf5dcd255155cc736d276efa6ae0a5c26fd685e28f0412a4013c01/types_pytz-2025.2.0.20251108-py3-none-any.whl", hash = "sha256:0f1c9792cab4eb0e46c52f8845c8f77cf1e313cb3d68bf826aa867fe4717d91c", size = 10116, upload-time = "2025-11-08T02:55:56.194Z" },
+]
+
 [[package]]
 name = "typing-extensions"
 version = "4.15.0"