feat: graphs + models + updated finished graph code + data in csv

This commit is contained in:
RobinMeersman 2025-12-16 10:06:47 +01:00
parent f3b07c1df3
commit 1abc2f5113
34 changed files with 76 additions and 73 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

View file

@ -6,73 +6,76 @@ if __name__ == "__main__":
# read in the csv
df = pd.read_csv("./results/compress/compression_results.csv")
for model_type in df["model_type"].unique():
model_df = df[df["model_type"] == model_type]
for dataset_type in df["dataset_type"].unique():
for model_type in df["model_type"].unique():
dataset_df = df[df["dataset_type"] == dataset_type]
model_df = dataset_df[dataset_df["model_type"] == model_type]
# execution time
plt.figure()
grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
labels = grouped.index.astype(str) # "128", "256"
x = np.arange(len(labels)) # [0, 1]
# execution time
plt.figure()
grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
labels = grouped.index.astype(str) # "128", "256"
x = np.arange(len(labels)) # [0, 1]
plt.bar(x, grouped.values, width=0.6)
plt.title(f"{model_type} mean compression time")
plt.xticks(x, labels)
plt.xlabel("Context length")
plt.ylabel("Mean compression time [s]")
plt.tight_layout()
plt.savefig(f"./graphs/{model_type}_{}_compression_time.png")
plt.bar(x, grouped.values, width=0.6)
plt.title(f"{model_type.capitalize()} mean compression time")
plt.xticks(x, labels)
plt.xlabel("Context length")
plt.ylabel("Mean compression time [s]")
plt.tight_layout()
plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_time.png")
plt.figure()
grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
labels = grouped.index.astype(str) # "128", "256"
x = np.arange(len(labels)) # [0, 1]
plt.figure()
grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
labels = grouped.index.astype(str) # "128", "256"
x = np.arange(len(labels)) # [0, 1]
plt.bar(x, grouped.values, width=0.6)
plt.title(f"{model_type} mean decompression time")
plt.xticks(x, labels)
plt.xlabel("Context length")
plt.ylabel("Mean decompression time [s]")
plt.tight_layout()
plt.savefig(f"./graphs/{model_type}_{}_decompression_time.png")
plt.bar(x, grouped.values, width=0.6)
plt.title(f"{model_type.capitalize()} mean decompression time")
plt.xticks(x, labels)
plt.xlabel("Context length")
plt.ylabel("Mean decompression time [s]")
plt.tight_layout()
plt.savefig(f"./graphs/{model_type}_{dataset_type}_decompression_time.png")
# accuracy
plt.figure()
bar_height = 0.25
files = model_df["input_file_name"].unique()
y = np.arange(len(files))
c256 = model_df[model_df["context_length"] == 256]
c128 = model_df[model_df["context_length"] == 128]
# accuracy
plt.figure(figsize=(10, 4))
bar_height = 0.25
files = model_df["input_file_name"].unique()
y = np.arange(len(files))
c256 = model_df[model_df["context_length"] == 256]
c128 = model_df[model_df["context_length"] == 128]
plt.barh(
y - bar_height / 2,
c256["match_percentage"] * 100,
height=bar_height,
label="256"
)
plt.barh(
y - bar_height / 2,
c256["match_percentage"] * 100,
height=bar_height,
label="256"
)
plt.barh(
y + bar_height / 2,
c128["match_percentage"] * 100,
height=bar_height,
label="128"
)
plt.yticks(y, files)
plt.title(f"{model_type} time for different context lengths")
plt.xlabel("accuracy")
plt.ylabel("Filename")
plt.legend()
plt.savefig(f"./graphs/{model_type}_{}_accuracy.png")
plt.barh(
y + bar_height / 2,
c128["match_percentage"] * 100,
height=bar_height,
label="128"
)
plt.yticks(y, files, rotation=45, ha="right")
plt.title(f"{model_type.capitalize()} accuracy for different context lengths")
plt.xlabel("Accuracy")
plt.ylabel("Filename")
plt.legend()
plt.tight_layout()
plt.savefig(f"./graphs/{model_type}_{dataset_type}_accuracy.png")
# compression ratio
plt.figure()
c256 = model_df[model_df["context_length"] == 256]
c128 = model_df[model_df["context_length"] == 128]
# compression ratio
plt.figure()
c256 = model_df[model_df["context_length"] == 256]
c128 = model_df[model_df["context_length"] == 128]
plt.plot(c256["original_file_size"] / 1_000_000, c256["compressed_file_size"] / 1_000_000, label="256")
plt.plot(c128["original_file_size"] / 1_000_000, c128["compressed_file_size"] / 1_000_000, label="128")
plt.title(f"{model_type} compressed file evolution")
plt.xlabel("Original file size [MB]")
plt.ylabel("Compressed file size [MB]")
plt.legend()
plt.savefig(f"./graphs/{model_type}_{}_compression_ratio.png")
plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
plt.title(f"{model_type.capitalize()} compressed file evolution")
plt.xlabel("Original file size [MB]")
plt.ylabel("Compressed file size [MB]")
plt.legend()
plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")

View file

@ -1,5 +1,4 @@
import os
from argparse import ArgumentParser
from contextlib import contextmanager
import torch
@ -54,13 +53,15 @@ if __name__ == "__main__":
]
files_enwik9 = [
# "text.txt",
# "txt_large.txt",
# "txt_xlarge.txt"
"text.txt",
"text_large.txt",
"text_xlarge.txt"
]
files_enwik9_cnn = [
"text_small.txt",
"text_xsmall.txt",
"text_xxsmall.txt"
]
models = [
@ -69,7 +70,7 @@ if __name__ == "__main__":
("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn),
("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn),
("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9),
("auto-enwik9-full-128", 128, "autoencoder", files_enwik9),
("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9),
("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn),
("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn),
]
@ -78,10 +79,11 @@ if __name__ == "__main__":
with open("./results/compress/compression_results.csv", "w") as f:
# write header
f.write(
"model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
"model_type,model_name,context_length,dataset_type,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
)
for model, context_length, model_name, files in models:
dataset_type = "genome" if "genome" in model else "enwik9"
for file in files:
in_file = f"./data/compression_sets/{file}"
model_path = f"./models/{model_name}/{model}"
@ -119,5 +121,5 @@ if __name__ == "__main__":
os.remove("./output/tmp.pt")
f.write(
f"{model_name},{model},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
f"{model_name},{model},{context_length},{dataset_type},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,18 +1,16 @@
import contextlib
import math
import struct
from collections import deque
from decimal import Decimal
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
import struct
from src.models import AutoEncoder
from src.utils import reference_ae
NUMBITS = 64
NUMBITS = 16
def probs_to_freqs(probs, total_freq=8192):