feat: graphs + models + updated/finished graph code + data in CSV

RobinMeersman 2025-12-16 10:06:47 +01:00
parent f3b07c1df3
commit 1abc2f5113
34 changed files with 76 additions and 73 deletions

Binary file not shown (image added; 25 KiB)

Binary file not shown (image added; 30 KiB)

Binary file not shown (image added; 19 KiB)

Binary file not shown (image added; 19 KiB)

Binary file not shown (image added; 28 KiB)

Binary file not shown (image added; 31 KiB)

Binary file not shown (image added; 18 KiB)

Binary file not shown (image added; 23 KiB)

Binary file not shown (image added; 27 KiB)

Binary file not shown (image added; 35 KiB)

Binary file not shown (image added; 17 KiB)

Binary file not shown (image added; 17 KiB)

Binary file not shown (image added; 30 KiB)

Binary file not shown (image added; 40 KiB)

Binary file not shown (image added; 18 KiB)

Binary file not shown (image added; 18 KiB)

Changed file (plotting script; filename not shown in this capture):

@@ -6,73 +6,76 @@ if __name__ == "__main__":
     # read in the csv
     df = pd.read_csv("./results/compress/compression_results.csv")
-    for model_type in df["model_type"].unique():
-        model_df = df[df["model_type"] == model_type]
+    for dataset_type in df["dataset_type"].unique():
+        for model_type in df["model_type"].unique():
+            dataset_df = df[df["dataset_type"] == dataset_type]
+            model_df = dataset_df[dataset_df["model_type"] == model_type]

         # execution time
         plt.figure()
         grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
         labels = grouped.index.astype(str)  # "128", "256"
         x = np.arange(len(labels))  # [0, 1]
         plt.bar(x, grouped.values, width=0.6)
-        plt.title(f"{model_type} mean compression time")
+        plt.title(f"{model_type.capitalize()} mean compression time")
         plt.xticks(x, labels)
         plt.xlabel("Context length")
         plt.ylabel("Mean compression time [s]")
         plt.tight_layout()
-        plt.savefig(f"./graphs/{model_type}_{}_compression_time.png")
+        plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_time.png")

         plt.figure()
         grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
         labels = grouped.index.astype(str)  # "128", "256"
         x = np.arange(len(labels))  # [0, 1]
         plt.bar(x, grouped.values, width=0.6)
-        plt.title(f"{model_type} mean decompression time")
+        plt.title(f"{model_type.capitalize()} mean decompression time")
         plt.xticks(x, labels)
         plt.xlabel("Context length")
         plt.ylabel("Mean decompression time [s]")
         plt.tight_layout()
-        plt.savefig(f"./graphs/{model_type}_{}_decompression_time.png")
+        plt.savefig(f"./graphs/{model_type}_{dataset_type}_decompression_time.png")

         # accuracy
-        plt.figure()
+        plt.figure(figsize=(10, 4))
         bar_height = 0.25
         files = model_df["input_file_name"].unique()
         y = np.arange(len(files))
         c256 = model_df[model_df["context_length"] == 256]
         c128 = model_df[model_df["context_length"] == 128]
         plt.barh(
             y - bar_height / 2,
             c256["match_percentage"] * 100,
             height=bar_height,
             label="256"
         )
         plt.barh(
             y + bar_height / 2,
             c128["match_percentage"] * 100,
             height=bar_height,
             label="128"
         )
-        plt.yticks(y, files)
-        plt.title(f"{model_type} time for different context lengths")
-        plt.xlabel("accuracy")
+        plt.yticks(y, files, rotation=45, ha="right")
+        plt.title(f"{model_type.capitalize()} accuracy for different context lengths")
+        plt.xlabel("Accuracy")
         plt.ylabel("Filename")
         plt.legend()
-        plt.savefig(f"./graphs/{model_type}_{}_accuracy.png")
+        plt.tight_layout()
+        plt.savefig(f"./graphs/{model_type}_{dataset_type}_accuracy.png")

         # compression ratio
         plt.figure()
         c256 = model_df[model_df["context_length"] == 256]
         c128 = model_df[model_df["context_length"] == 128]
-        plt.plot(c256["original_file_size"] / 1_000_000, c256["compressed_file_size"] / 1_000_000, label="256")
-        plt.plot(c128["original_file_size"] / 1_000_000, c128["compressed_file_size"] / 1_000_000, label="128")
-        plt.title(f"{model_type} compressed file evolution")
+        plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
+        plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
+        plt.title(f"{model_type.capitalize()} compressed file evolution")
         plt.xlabel("Original file size [MB]")
         plt.ylabel("Compressed file size [MB]")
         plt.legend()
-        plt.savefig(f"./graphs/{model_type}_{}_compression_ratio.png")
+        plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")

Changed file (compression benchmark script; filename not shown in this capture):

@@ -1,5 +1,4 @@
 import os
-from argparse import ArgumentParser
 from contextlib import contextmanager

 import torch
@@ -54,13 +53,15 @@ if __name__ == "__main__":
     ]
     files_enwik9 = [
-        # "text.txt",
-        # "txt_large.txt",
-        # "txt_xlarge.txt"
+        "text.txt",
+        "text_large.txt",
+        "text_xlarge.txt"
     ]
     files_enwik9_cnn = [
+        "text_small.txt",
+        "text_xsmall.txt",
+        "text_xxsmall.txt"
     ]
     models = [
@@ -69,7 +70,7 @@ if __name__ == "__main__":
         ("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn),
         ("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn),
         ("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9),
-        ("auto-enwik9-full-128", 128, "autoencoder", files_enwik9),
+        ("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9),
         ("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn),
         ("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn),
     ]
@@ -78,10 +79,11 @@ if __name__ == "__main__":
     with open("./results/compress/compression_results.csv", "w") as f:
         # write header
         f.write(
-            "model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
+            "model_type,model_name,context_length,dataset_type,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
        )
        for model, context_length, model_name, files in models:
+            dataset_type = "genome" if "genome" in model else "enwik9"
             for file in files:
                 in_file = f"./data/compression_sets/{file}"
                 model_path = f"./models/{model_name}/{model}"
@@ -119,5 +121,5 @@ if __name__ == "__main__":
                 os.remove("./output/tmp.pt")
                 f.write(
-                    f"{model_name},{model},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
+                    f"{model_name},{model},{context_length},{dataset_type},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
                 )
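The new dataset_type CSV column is not passed in with the model tuples; it is derived from the model filename by the one-liner added above. A quick check of that rule against two of the model names listed in this diff:

    # Model filenames from the table above; the rule tags anything without
    # "genome" in its name as enwik9.
    for model in ["cnn-genome-full-256.pt", "auto-enwik9-full-128.pt"]:
        dataset_type = "genome" if "genome" in model else "enwik9"
        print(f"{model} -> {dataset_type}")  # genome, then enwik9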

15 binary files not shown.

Changed file (AutoEncoder compression module; filename not shown in this capture):

@@ -1,18 +1,16 @@
 import contextlib
 import math
-import struct
 from collections import deque
-from decimal import Decimal

-import numpy as np
 import torch
 import torch.nn as nn
 from tqdm import tqdm
+import struct

 from src.models import AutoEncoder
 from src.utils import reference_ae

-NUMBITS = 64
+NUMBITS = 16


 def probs_to_freqs(probs, total_freq=8192):
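The hunk ends at the probs_to_freqs signature; its body is not part of this diff. For orientation only, here is a minimal sketch of what such a helper conventionally does in an arithmetic/range coder: quantize a model's probability vector into integer symbol frequencies that sum to total_freq, keeping every symbol encodable. This is an assumption about intent, not the repository's implementation:

    import numpy as np

    def probs_to_freqs(probs, total_freq=8192):
        # ASSUMED behavior, not the code from this commit.
        probs = np.asarray(probs, dtype=np.float64)
        # Floor to integers, but give every symbol a frequency of at least 1
        # so the coder can still represent it.
        freqs = np.maximum(1, np.floor(probs * total_freq).astype(np.int64))
        # Push the rounding error onto the most frequent symbol so the
        # table sums to exactly total_freq.
        freqs[np.argmax(freqs)] += total_freq - freqs.sum()
        return freqs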