diff --git a/graphs/autoencoder_enwik9_accuracy.png b/graphs/autoencoder_enwik9_accuracy.png new file mode 100644 index 0000000..a833d52 Binary files /dev/null and b/graphs/autoencoder_enwik9_accuracy.png differ diff --git a/graphs/autoencoder_enwik9_compression_ratio.png b/graphs/autoencoder_enwik9_compression_ratio.png new file mode 100644 index 0000000..79db1d4 Binary files /dev/null and b/graphs/autoencoder_enwik9_compression_ratio.png differ diff --git a/graphs/autoencoder_enwik9_compression_time.png b/graphs/autoencoder_enwik9_compression_time.png new file mode 100644 index 0000000..a953c36 Binary files /dev/null and b/graphs/autoencoder_enwik9_compression_time.png differ diff --git a/graphs/autoencoder_enwik9_decompression_time.png b/graphs/autoencoder_enwik9_decompression_time.png new file mode 100644 index 0000000..5609be1 Binary files /dev/null and b/graphs/autoencoder_enwik9_decompression_time.png differ diff --git a/graphs/autoencoder_genome_accuracy.png b/graphs/autoencoder_genome_accuracy.png new file mode 100644 index 0000000..71c6e5d Binary files /dev/null and b/graphs/autoencoder_genome_accuracy.png differ diff --git a/graphs/autoencoder_genome_compression_ratio.png b/graphs/autoencoder_genome_compression_ratio.png new file mode 100644 index 0000000..7181f20 Binary files /dev/null and b/graphs/autoencoder_genome_compression_ratio.png differ diff --git a/graphs/autoencoder_genome_compression_time.png b/graphs/autoencoder_genome_compression_time.png new file mode 100644 index 0000000..7d8b17c Binary files /dev/null and b/graphs/autoencoder_genome_compression_time.png differ diff --git a/graphs/autoencoder_genome_decompression_time.png b/graphs/autoencoder_genome_decompression_time.png new file mode 100644 index 0000000..1dbd141 Binary files /dev/null and b/graphs/autoencoder_genome_decompression_time.png differ diff --git a/graphs/cnn_enwik9_accuracy.png b/graphs/cnn_enwik9_accuracy.png new file mode 100644 index 0000000..3cb4c32 Binary files /dev/null and b/graphs/cnn_enwik9_accuracy.png differ diff --git a/graphs/cnn_enwik9_compression_ratio.png b/graphs/cnn_enwik9_compression_ratio.png new file mode 100644 index 0000000..04a2ce0 Binary files /dev/null and b/graphs/cnn_enwik9_compression_ratio.png differ diff --git a/graphs/cnn_enwik9_compression_time.png b/graphs/cnn_enwik9_compression_time.png new file mode 100644 index 0000000..f12bf6d Binary files /dev/null and b/graphs/cnn_enwik9_compression_time.png differ diff --git a/graphs/cnn_enwik9_decompression_time.png b/graphs/cnn_enwik9_decompression_time.png new file mode 100644 index 0000000..4049215 Binary files /dev/null and b/graphs/cnn_enwik9_decompression_time.png differ diff --git a/graphs/cnn_genome_accuracy.png b/graphs/cnn_genome_accuracy.png new file mode 100644 index 0000000..483b0fa Binary files /dev/null and b/graphs/cnn_genome_accuracy.png differ diff --git a/graphs/cnn_genome_compression_ratio.png b/graphs/cnn_genome_compression_ratio.png new file mode 100644 index 0000000..d0a2843 Binary files /dev/null and b/graphs/cnn_genome_compression_ratio.png differ diff --git a/graphs/cnn_genome_compression_time.png b/graphs/cnn_genome_compression_time.png new file mode 100644 index 0000000..bc44282 Binary files /dev/null and b/graphs/cnn_genome_compression_time.png differ diff --git a/graphs/cnn_genome_decompression_time.png b/graphs/cnn_genome_decompression_time.png new file mode 100644 index 0000000..ef8e8a9 Binary files /dev/null and b/graphs/cnn_genome_decompression_time.png differ diff --git a/make_graphs.py b/make_graphs.py index 721dd28..2bb86dc 100644 --- a/make_graphs.py +++ b/make_graphs.py @@ -6,73 +6,76 @@ if __name__ == "__main__": # read in the csv df = pd.read_csv("./results/compress/compression_results.csv") - for model_type in df["model_type"].unique(): - model_df = df[df["model_type"] == model_type] + for dataset_type in df["dataset_type"].unique(): + for model_type in df["model_type"].unique(): + dataset_df = df[df["dataset_type"] == dataset_type] + model_df = dataset_df[dataset_df["model_type"] == model_type] - # execution time - plt.figure() - grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9 - labels = grouped.index.astype(str) # "128", "256" - x = np.arange(len(labels)) # [0, 1] + # execution time + plt.figure() + grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9 + labels = grouped.index.astype(str) # "128", "256" + x = np.arange(len(labels)) # [0, 1] - plt.bar(x, grouped.values, width=0.6) - plt.title(f"{model_type} mean compression time") - plt.xticks(x, labels) - plt.xlabel("Context length") - plt.ylabel("Mean compression time [s]") - plt.tight_layout() - plt.savefig(f"./graphs/{model_type}_{}_compression_time.png") + plt.bar(x, grouped.values, width=0.6) + plt.title(f"{model_type.capitalize()} mean compression time") + plt.xticks(x, labels) + plt.xlabel("Context length") + plt.ylabel("Mean compression time [s]") + plt.tight_layout() + plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_time.png") - plt.figure() - grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9 - labels = grouped.index.astype(str) # "128", "256" - x = np.arange(len(labels)) # [0, 1] + plt.figure() + grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9 + labels = grouped.index.astype(str) # "128", "256" + x = np.arange(len(labels)) # [0, 1] - plt.bar(x, grouped.values, width=0.6) - plt.title(f"{model_type} mean decompression time") - plt.xticks(x, labels) - plt.xlabel("Context length") - plt.ylabel("Mean decompression time [s]") - plt.tight_layout() - plt.savefig(f"./graphs/{model_type}_{}_decompression_time.png") + plt.bar(x, grouped.values, width=0.6) + plt.title(f"{model_type.capitalize()} mean decompression time") + plt.xticks(x, labels) + plt.xlabel("Context length") + plt.ylabel("Mean decompression time [s]") + plt.tight_layout() + plt.savefig(f"./graphs/{model_type}_{dataset_type}_decompression_time.png") - # accuracy - plt.figure() - bar_height = 0.25 - files = model_df["input_file_name"].unique() - y = np.arange(len(files)) - c256 = model_df[model_df["context_length"] == 256] - c128 = model_df[model_df["context_length"] == 128] + # accuracy + plt.figure(figsize=(10, 4)) + bar_height = 0.25 + files = model_df["input_file_name"].unique() + y = np.arange(len(files)) + c256 = model_df[model_df["context_length"] == 256] + c128 = model_df[model_df["context_length"] == 128] - plt.barh( - y - bar_height / 2, - c256["match_percentage"] * 100, - height=bar_height, - label="256" - ) + plt.barh( + y - bar_height / 2, + c256["match_percentage"] * 100, + height=bar_height, + label="256" + ) - plt.barh( - y + bar_height / 2, - c128["match_percentage"] * 100, - height=bar_height, - label="128" - ) - plt.yticks(y, files) - plt.title(f"{model_type} time for different context lengths") - plt.xlabel("accuracy") - plt.ylabel("Filename") - plt.legend() - plt.savefig(f"./graphs/{model_type}_{}_accuracy.png") + plt.barh( + y + bar_height / 2, + c128["match_percentage"] * 100, + height=bar_height, + label="128" + ) + plt.yticks(y, files, rotation=45, ha="right") + plt.title(f"{model_type.capitalize()} accuracy for different context lengths") + plt.xlabel("Accuracy") + plt.ylabel("Filename") + plt.legend() + plt.tight_layout() + plt.savefig(f"./graphs/{model_type}_{dataset_type}_accuracy.png") - # compression ratio - plt.figure() - c256 = model_df[model_df["context_length"] == 256] - c128 = model_df[model_df["context_length"] == 128] + # compression ratio + plt.figure() + c256 = model_df[model_df["context_length"] == 256] + c128 = model_df[model_df["context_length"] == 128] - plt.plot(c256["original_file_size"] / 1_000_000, c256["compressed_file_size"] / 1_000_000, label="256") - plt.plot(c128["original_file_size"] / 1_000_000, c128["compressed_file_size"] / 1_000_000, label="128") - plt.title(f"{model_type} compressed file evolution") - plt.xlabel("Original file size [MB]") - plt.ylabel("Compressed file size [MB]") - plt.legend() - plt.savefig(f"./graphs/{model_type}_{}_compression_ratio.png") \ No newline at end of file + plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256") + plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128") + plt.title(f"{model_type.capitalize()} compressed file evolution") + plt.xlabel("Original file size [MB]") + plt.ylabel("Compressed file size [MB]") + plt.legend() + plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png") \ No newline at end of file diff --git a/measure.py b/measure.py index 430d660..d7bc8d0 100644 --- a/measure.py +++ b/measure.py @@ -1,5 +1,4 @@ import os -from argparse import ArgumentParser from contextlib import contextmanager import torch @@ -54,13 +53,15 @@ if __name__ == "__main__": ] files_enwik9 = [ - # "text.txt", - # "txt_large.txt", - # "txt_xlarge.txt" + "text.txt", + "text_large.txt", + "text_xlarge.txt" ] files_enwik9_cnn = [ - + "text_small.txt", + "text_xsmall.txt", + "text_xxsmall.txt" ] models = [ @@ -69,7 +70,7 @@ if __name__ == "__main__": ("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn), ("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn), ("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9), - ("auto-enwik9-full-128", 128, "autoencoder", files_enwik9), + ("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9), ("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn), ("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn), ] @@ -78,10 +79,11 @@ if __name__ == "__main__": with open("./results/compress/compression_results.csv", "w") as f: # write header f.write( - "model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n" + "model_type,model_name,context_length,dataset_type,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n" ) for model, context_length, model_name, files in models: + dataset_type = "genome" if "genome" in model else "enwik9" for file in files: in_file = f"./data/compression_sets/{file}" model_path = f"./models/{model_name}/{model}" @@ -119,5 +121,5 @@ if __name__ == "__main__": os.remove("./output/tmp.pt") f.write( - f"{model_name},{model},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n" + f"{model_name},{model},{context_length},{dataset_type},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n" ) diff --git a/models/autoencoder/auto-enwik9-128.pt b/models/autoencoder/auto-enwik9-128.pt new file mode 100644 index 0000000..2d4d299 Binary files /dev/null and b/models/autoencoder/auto-enwik9-128.pt differ diff --git a/models/autoencoder/auto-enwik9-256.pt b/models/autoencoder/auto-enwik9-256.pt new file mode 100644 index 0000000..2f915a6 Binary files /dev/null and b/models/autoencoder/auto-enwik9-256.pt differ diff --git a/models/autoencoder/auto-enwik9-full-128.pt b/models/autoencoder/auto-enwik9-full-128.pt new file mode 100644 index 0000000..0a72fb2 Binary files /dev/null and b/models/autoencoder/auto-enwik9-full-128.pt differ diff --git a/models/autoencoder/auto-enwik9-full-256.pt b/models/autoencoder/auto-enwik9-full-256.pt new file mode 100644 index 0000000..6ff7d11 Binary files /dev/null and b/models/autoencoder/auto-enwik9-full-256.pt differ diff --git a/models/autoencoder/auto-genome-128.pt b/models/autoencoder/auto-genome-128.pt new file mode 100644 index 0000000..dfc9584 Binary files /dev/null and b/models/autoencoder/auto-genome-128.pt differ diff --git a/models/autoencoder/auto-genome-256.pt b/models/autoencoder/auto-genome-256.pt new file mode 100644 index 0000000..76be580 Binary files /dev/null and b/models/autoencoder/auto-genome-256.pt differ diff --git a/models/autoencoder/auto-genome-full-128.pt b/models/autoencoder/auto-genome-full-128.pt new file mode 100644 index 0000000..fc2f363 Binary files /dev/null and b/models/autoencoder/auto-genome-full-128.pt differ diff --git a/models/autoencoder/auto-genome-full-256.pt b/models/autoencoder/auto-genome-full-256.pt new file mode 100644 index 0000000..5a51a36 Binary files /dev/null and b/models/autoencoder/auto-genome-full-256.pt differ diff --git a/models/cnn/cnn-enwik9-128.pt b/models/cnn/cnn-enwik9-128.pt new file mode 100644 index 0000000..f3db085 Binary files /dev/null and b/models/cnn/cnn-enwik9-128.pt differ diff --git a/models/cnn/cnn-enwik9-256.pt b/models/cnn/cnn-enwik9-256.pt new file mode 100644 index 0000000..c1f22a7 Binary files /dev/null and b/models/cnn/cnn-enwik9-256.pt differ diff --git a/models/cnn/cnn-enwik9-full-128.pt b/models/cnn/cnn-enwik9-full-128.pt new file mode 100644 index 0000000..9d546c7 Binary files /dev/null and b/models/cnn/cnn-enwik9-full-128.pt differ diff --git a/models/cnn/cnn-enwik9-full-256.pt b/models/cnn/cnn-enwik9-full-256.pt new file mode 100644 index 0000000..e06e9d5 Binary files /dev/null and b/models/cnn/cnn-enwik9-full-256.pt differ diff --git a/models/cnn/cnn-genome-256.pt b/models/cnn/cnn-genome-256.pt new file mode 100644 index 0000000..e8bae00 Binary files /dev/null and b/models/cnn/cnn-genome-256.pt differ diff --git a/models/cnn/cnn-genome-full-128.pt b/models/cnn/cnn-genome-full-128.pt new file mode 100644 index 0000000..69da5c4 Binary files /dev/null and b/models/cnn/cnn-genome-full-128.pt differ diff --git a/models/cnn/cnn-genome-full-256.pt b/models/cnn/cnn-genome-full-256.pt new file mode 100644 index 0000000..c74437f Binary files /dev/null and b/models/cnn/cnn-genome-full-256.pt differ diff --git a/src/process.py b/src/process.py index 9c0ee53..2d4241f 100644 --- a/src/process.py +++ b/src/process.py @@ -1,18 +1,16 @@ import contextlib import math +import struct from collections import deque -from decimal import Decimal -import numpy as np import torch import torch.nn as nn from tqdm import tqdm -import struct from src.models import AutoEncoder from src.utils import reference_ae -NUMBITS = 64 +NUMBITS = 16 def probs_to_freqs(probs, total_freq=8192):