feat: measuring code + graph generator code

RobinMeersman 2025-12-15 22:53:32 +01:00
parent dd0b3d3945
commit f3b07c1df3
6 changed files with 325 additions and 140 deletions

File diff suppressed because one or more lines are too long

Binary file not shown (previous image: 2.3 KiB).
make_graphs.py (new file, +78 lines)

@@ -0,0 +1,78 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


if __name__ == "__main__":
    # read in the csv
    df = pd.read_csv("./results/compress/compression_results.csv")
    for model_type in df["model_type"].unique():
        model_df = df[df["model_type"] == model_type]

        # execution time (nanoseconds -> seconds)
        plt.figure()
        grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
        labels = grouped.index.astype(str)  # e.g. "128", "256"
        x = np.arange(len(labels))  # [0, 1]
        plt.bar(x, grouped.values, width=0.6)
        plt.title(f"{model_type} mean compression time")
        plt.xticks(x, labels)
        plt.xlabel("Context length")
        plt.ylabel("Mean compression time [s]")
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_compression_time.png")

        plt.figure()
        grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
        labels = grouped.index.astype(str)
        x = np.arange(len(labels))
        plt.bar(x, grouped.values, width=0.6)
        plt.title(f"{model_type} mean decompression time")
        plt.xticks(x, labels)
        plt.xlabel("Context length")
        plt.ylabel("Mean decompression time [s]")
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_decompression_time.png")

        # accuracy
        plt.figure()
        bar_height = 0.25
        files = model_df["input_file_name"].unique()
        y = np.arange(len(files))
        # index by file name so the bars line up with the y ticks regardless of row order
        c256 = model_df[model_df["context_length"] == 256].set_index("input_file_name").reindex(files)
        c128 = model_df[model_df["context_length"] == 128].set_index("input_file_name").reindex(files)
        plt.barh(
            y - bar_height / 2,
            c256["match_percentage"] * 100,
            height=bar_height,
            label="256"
        )
        plt.barh(
            y + bar_height / 2,
            c128["match_percentage"] * 100,
            height=bar_height,
            label="128"
        )
        plt.yticks(y, files)
        plt.title(f"{model_type} accuracy for different context lengths")
        plt.xlabel("Accuracy [%]")
        plt.ylabel("Filename")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_accuracy.png")

        # compression ratio
        plt.figure()
        c256 = model_df[model_df["context_length"] == 256].sort_values("original_file_size")
        c128 = model_df[model_df["context_length"] == 128].sort_values("original_file_size")
        plt.plot(c256["original_file_size"] / 1_000_000, c256["compressed_file_size"] / 1_000_000, label="256")
        plt.plot(c128["original_file_size"] / 1_000_000, c128["compressed_file_size"] / 1_000_000, label="128")
        plt.title(f"{model_type} compressed file size evolution")
        plt.xlabel("Original file size [MB]")
        plt.ylabel("Compressed file size [MB]")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_compression_ratio.png")

measure.py (new file, +123 lines)

@@ -0,0 +1,123 @@
import os
from argparse import ArgumentParser
from contextlib import contextmanager

import torch

import src.process as p
import time


@contextmanager
def timer():
    start = time.time_ns()
    elapsed = None

    def get_elapsed():
        nonlocal elapsed
        if elapsed is None:
            elapsed = time.time_ns() - start
        return elapsed

    yield get_elapsed
    get_elapsed()  # freeze the measurement at block exit


def compare_files(original, decompressed: str | torch.Tensor) -> float:
    with open(original, "rb") as file:
        original = file.read()
    original = torch.tensor(list(original), dtype=torch.uint8).cpu()
    if isinstance(decompressed, str):
        with open(decompressed, "rb") as file:
            decompressed = file.read()
        decompressed = torch.tensor(list(decompressed), dtype=torch.uint8).cpu()
    # count matching bytes over the overlapping prefix
    n = min(original.shape[0], decompressed.shape[0])
    count = torch.sum(original[:n] == decompressed[:n])
    accuracy = count / original.shape[0]
    return accuracy.item()


if __name__ == "__main__":
    files_genome = [
        "genome.fna",
        "genome_large.fna",
        "genome_xlarge.fna"
    ]
    files_genome_cnn = [
        "genome_small.fna",
        "genome_xsmall.fna",
        "genome_xxsmall.fna"
    ]
    files_enwik9 = [
        # "text.txt",
        # "txt_large.txt",
        # "txt_xlarge.txt"
    ]
    files_enwik9_cnn = [
    ]
    models = [
        ("auto-genome-full-256.pt", 256, "autoencoder", files_genome),
        ("auto-genome-full-128.pt", 128, "autoencoder", files_genome),
        ("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn),
        ("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn),
        ("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9),
        ("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9),
        ("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn),
        ("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn),
    ]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    os.makedirs("./results/compress", exist_ok=True)
    os.makedirs("./output", exist_ok=True)
    with open("./results/compress/compression_results.csv", "w") as f:
        # write header
        f.write(
            "model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
        )
        for model, context_length, model_name, files in models:
            for file in files:
                in_file = f"./data/compression_sets/{file}"
                model_path = f"./models/{model_name}/{model}"
                print(f"Running for model {model} and file {file}...")
                with timer() as t:
                    compressed = p.compress(
                        device=device,
                        input_file=in_file,
                        model_name=model_name,
                        model_path=model_path,
                        context_length=context_length,
                        output_file="./output/tmp.pt"
                    )
                compression_time = t()
                with timer() as t:
                    decompressed = p.decompress(
                        device,
                        model_name=model_name,
                        model_path=model_path,
                        context_length=context_length,
                        input_file="./output/tmp.pt"
                    )
                decompression_time = t()
                accuracy = compare_files(in_file, decompressed.flatten().cpu())
                og_file_len = os.path.getsize(in_file)
                if compressed is None:
                    # cnn path: the arithmetic coder wrote the bitstream to disk
                    compressed_size = os.path.getsize("./output/tmp.pt")
                else:
                    # autoencoder path: float32 tensor, 4 bytes per element
                    compressed_size = 4 * compressed.shape[0] * compressed.shape[1]
                os.remove("./output/tmp.pt")
                f.write(
                    f"{model_name},{model},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
                )
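
A note on the timer() helper above: the elapsed value is frozen by the get_elapsed() call that runs when the with-block exits, so calling t() after the block returns the block's duration rather than a still-growing value. A minimal usage sketch (the sleep is only for illustration):

import time

with timer() as t:
    time.sleep(0.1)
elapsed_ns = t()            # frozen at block exit, roughly 1e8 ns here
print(f"{elapsed_ns / 1e9:.3f} s")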


@@ -58,8 +58,6 @@ class AutoEncoder(Model):
         """
         x: torch.Tensor of floats
         """
-        if len(x.shape) == 2:
-            x = x.unsqueeze(1)
         return self.decoder(x)

     def forward(self, x: torch.LongTensor) -> torch.Tensor:


@@ -7,10 +7,13 @@ import numpy as np
 import torch
 import torch.nn as nn
 from tqdm import tqdm
+import struct

 from src.models import AutoEncoder
 from src.utils import reference_ae

+NUMBITS = 64
+
 def probs_to_freqs(probs, total_freq=8192):
     freqs = (probs * total_freq).round().long()
@@ -20,7 +23,7 @@ def probs_to_freqs(probs, total_freq=8192):
     # Re-normalize so the sum matches total_freq
     diff = total_freq - freqs.sum()
-    freqs[0] += diff  # fix the sum by adjusting the first bin
+    freqs[freqs.argmax()] += diff  # fix the sum by adjusting the largest bin
     return freqs
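
The changed line above is worth a second look: when diff is negative, subtracting it from the first bin (which may hold a very small count) can drive that frequency to zero or below, which an arithmetic coder cannot encode; the largest bin can always absorb the correction. A minimal sketch of the arithmetic, with made-up frequencies:

import torch

freqs = torch.tensor([1, 4100, 4095])  # sums to 8196; target total is 8192, so diff = -4
diff = 8192 - freqs.sum()
# old behaviour: freqs[0] += diff  ->  freqs[0] would become -3, an invalid frequency
freqs[freqs.argmax()] += diff          # largest bin absorbs it: [1, 4096, 4095]
assert freqs.sum() == 8192 and (freqs > 0).all()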
@@ -32,32 +35,39 @@ def ae_compress(
     model: nn.Module,
     byte_data: bytes,
     tensor: torch.Tensor
 ):
     # Init AE
     print("Initializing AE")
-    with contextlib.closing(reference_ae.BitOutputStream(open(output_file, "wb"))) as bitout:
-        enc = reference_ae.ArithmeticEncoder(len(byte_data), bitout)
+    with open(output_file, "wb") as raw_out:
+        # Write original length header (8 bytes)
+        raw_out.write(struct.pack(">Q", len(byte_data)))
+        with contextlib.closing(reference_ae.BitOutputStream(raw_out)) as bitout:
+            enc = reference_ae.ArithmeticEncoder(NUMBITS, bitout)
             context = deque([0] * context_length, maxlen=context_length)
             # Compress
             for byte in tqdm(tensor.tolist(), desc="Compressing"):
-                context_tensor = torch.tensor([list(context)], dtype=torch.long, device=device)
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )
                 with torch.inference_mode():
                     logits = model(context_tensor)
                 probabilities = torch.softmax(logits[0], dim=-1)
-                print(f"probabilities: {probabilities}")
                 probabilities = probabilities.detach()
-                probability_table = reference_ae.SimpleFrequencyTable(probs_to_freqs(probabilities))
-                # write byte to output file
+                freqs = probs_to_freqs(probabilities).tolist()
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
                 enc.write(probability_table, byte)
                 context.append(byte)
+            enc.finish()

-def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
+def chunk_data(x: bytes, context_length=128) -> torch.Tensor:
     tensor_data = torch.tensor(list(x), dtype=torch.long)
     shape = tensor_data.size(0)
     row_count = math.ceil(shape / context_length)
@@ -65,13 +75,14 @@ def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
     tensor_data = nn.functional.pad(tensor_data, (0, pad_count), value=0)
     return tensor_data.view(row_count, context_length).float() / 255.0

 def auto_encoder_compress(
     data: bytes,
     model: AutoEncoder,
-    output_file: str,
+    output_file: str | None = None,
     context_length: int = 128,
     device: str = "cuda"
-):
+) -> torch.Tensor:
     # convert data to chunks of context length tensors
     # send the data to device
     tensor = chunk_data(data, context_length).to(device)
@@ -83,10 +94,11 @@ def auto_encoder_compress(
     print(f"output shape of compress: {4 * output.shape[0] * output.shape[1]} bytes")
     # write output to file
-    print(f"saving to file {output_file}...")
-    torch.save(output.detach(), output_file)
+    if output_file is not None:
+        print(f"saving to file {output_file}...")
+        torch.save(output.detach(), output_file)
     return output

 def compress(
@@ -99,7 +111,7 @@ def compress(
 ):
     # Get input to compress
     print("Reading input")
-    if input_file:
+    if input_file is not None:
         with open(input_file, "rb") as file:
             byte_data = file.read()
     else:
@@ -111,14 +123,14 @@ def compress(
     tensor = torch.tensor(list(byte_data), dtype=torch.long)

     # Get model
-    print("Loading model")
+    print(f"Loading model: {model_name}")
     model = torch.load(model_path, weights_only=False)
     model.to(device)
     model.eval()

     match model_name:
         case "cnn":
-            ae_compress(
+            return ae_compress(
                 output_file,
                 context_length,
                 device,
@@ -127,7 +139,7 @@ def compress(
                 tensor
             )
         case "autoencoder":
-            auto_encoder_compress(
+            return auto_encoder_compress(
                 byte_data,
                 model,
                 output_file,
@@ -138,16 +150,75 @@ def compress(
         case _:
             raise ValueError(f"Unknown model type: {model_name}")

 def ae_decompress(
+    model: nn.Module,
+    input_file: str,
     context_length=128,
     device="cuda",
+    output_file: str | None = None
 ):
-    pass
+    print("Initializing AE decoder")
+    with open(input_file, "rb") as raw_in:
+        # Read original length header
+        original_length_bytes = raw_in.read(8)
+        if len(original_length_bytes) != 8:
+            raise ValueError("Invalid compressed file (missing length header)")
+        original_length = struct.unpack(">Q", original_length_bytes)[0]
+        print(f"Original length: {original_length} bytes")
+        with contextlib.closing(reference_ae.BitInputStream(raw_in)) as bitin:
+            dec = reference_ae.ArithmeticDecoder(NUMBITS, bitin)
+            context = deque([0] * context_length, maxlen=context_length)
+            output_data = []
+            # Decode exactly original_length bytes
+            for _ in range(original_length):
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )
+                with torch.inference_mode():
+                    logits = model(context_tensor)
+                probabilities = torch.softmax(logits[0], dim=-1)
+                freqs = probs_to_freqs(probabilities).tolist()
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
+                byte = dec.read(probability_table)
+                output_data.append(byte)
+                context.append(byte)
+    byte_data = torch.tensor(output_data, dtype=torch.long).byte()
+    if output_file is not None:
+        with open(output_file, "wb") as file:
+            file.write(byte_data.cpu().numpy().tobytes())
+    return byte_data

 def auto_encoder_decompress(
     data: torch.Tensor,
     model: AutoEncoder,
+    output_file: str | None = None,
     context_length=128,
     device="cuda"
-):
-    pass
+) -> torch.Tensor:
+    decompressed = model.decode(data).squeeze(1)
+    # convert result back to bytes
+    byte_data = (decompressed * 255.0).round().byte().detach()
+    if output_file is not None:
+        with open(output_file, "wb") as file:
+            file.write(byte_data.cpu().numpy().tobytes())
+    return byte_data

 def decompress(
@@ -159,11 +230,13 @@ def decompress(
     context_length: int = 128
 ):
     print("Reading in the data")
-    with open(input_file, "r") as f:
-        length = int(f.readline())
-        bytes_data = f.read()
+    if model_name != "autoencoder":
+        with open(input_file, "rb") as f:
+            data = f.read()
+    else:
+        data = torch.load(input_file, map_location=device)

-    if len(bytes_data) == 0:
+    if len(data) == 0:
         print("Input file is empty, nothing has to be done...")
         return
@@ -174,8 +247,19 @@ def decompress(
     match model_name:
         case "cnn":
-            ae_decompress()
+            return ae_decompress(
+                model=model,
+                input_file=input_file,
+                context_length=context_length,
+                output_file=output_file
+            )
         case "autoencoder":
-            auto_encoder_decompress()
+            return auto_encoder_decompress(
+                data,
+                model,
+                output_file,
+                context_length,
+                device
+            )
         case _:
             raise ValueError(f"Unknown model type: {model_name}")
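
The 8-byte header introduced in ae_compress/ae_decompress is plain length-prefix framing: the decoder reads the big-endian length first and then decodes exactly that many symbols, so no end-of-stream marker is needed. A round-trip sketch of just the framing, using an in-memory buffer and a made-up payload:

import io
import struct

payload = b"ACGTACGT"
buf = io.BytesIO()
buf.write(struct.pack(">Q", len(payload)))  # 8-byte big-endian length header
buf.write(payload)                          # followed by the coded stream

buf.seek(0)
(n,) = struct.unpack(">Q", buf.read(8))     # decoder: read the header first...
assert buf.read(n) == payload               # ...then consume exactly n bytes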