feat: add measurement script and graph generator

RobinMeersman · 2025-12-15 22:53:32 +01:00
parent dd0b3d3945 · commit f3b07c1df3
6 changed files with 325 additions and 140 deletions

(one file's diff suppressed because its lines are too long)

(binary image changed, diff not shown; previous size 2.3 KiB)
make_graphs.py (new file, +78)

@@ -0,0 +1,78 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

if __name__ == "__main__":
    # read in the csv
    df = pd.read_csv("./results/compress/compression_results.csv")

    for model_type in df["model_type"].unique():
        model_df = df[df["model_type"] == model_type]

        # execution time
        plt.figure()
        grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
        labels = grouped.index.astype(str)  # "128", "256"
        x = np.arange(len(labels))          # [0, 1]
        plt.bar(x, grouped.values, width=0.6)
        plt.title(f"{model_type} mean compression time")
        plt.xticks(x, labels)
        plt.xlabel("Context length")
        plt.ylabel("Mean compression time [s]")
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_compression_time.png")

        plt.figure()
        grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
        labels = grouped.index.astype(str)  # "128", "256"
        x = np.arange(len(labels))          # [0, 1]
        plt.bar(x, grouped.values, width=0.6)
        plt.title(f"{model_type} mean decompression time")
        plt.xticks(x, labels)
        plt.xlabel("Context length")
        plt.ylabel("Mean decompression time [s]")
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_decompression_time.png")

        # accuracy
        plt.figure()
        bar_height = 0.25
        files = model_df["input_file_name"].unique()
        y = np.arange(len(files))
        # assumes the rows in each subset appear in the same order as `files`
        c256 = model_df[model_df["context_length"] == 256]
        c128 = model_df[model_df["context_length"] == 128]
        plt.barh(
            y - bar_height / 2,
            c256["match_percentage"] * 100,
            height=bar_height,
            label="256"
        )
        plt.barh(
            y + bar_height / 2,
            c128["match_percentage"] * 100,
            height=bar_height,
            label="128"
        )
        plt.yticks(y, files)
        plt.title(f"{model_type} accuracy for different context lengths")
        plt.xlabel("Accuracy [%]")
        plt.ylabel("Filename")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_accuracy.png")

        # compression ratio
        plt.figure()
        c256 = model_df[model_df["context_length"] == 256]
        c128 = model_df[model_df["context_length"] == 128]
        plt.plot(c256["original_file_size"] / 1_000_000, c256["compressed_file_size"] / 1_000_000, label="256")
        plt.plot(c128["original_file_size"] / 1_000_000, c128["compressed_file_size"] / 1_000_000, label="128")
        plt.title(f"{model_type} compressed file size evolution")
        plt.xlabel("Original file size [MB]")
        plt.ylabel("Compressed file size [MB]")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_compression_ratio.png")
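Note that make_graphs.py assumes the CSV schema written by measure.py below. A small guard like the following could fail fast when a column is missing; the guard itself is an illustrative addition, not part of the commit:

    import pandas as pd

    # Columns as written by measure.py's CSV header
    REQUIRED_COLUMNS = {
        "model_type", "model_name", "context_length", "input_file_name",
        "original_file_size", "compressed_file_size", "match_percentage",
        "compression_time", "decompression_time",
    }

    df = pd.read_csv("./results/compress/compression_results.csv")
    missing = REQUIRED_COLUMNS - set(df.columns)
    if missing:
        raise ValueError(f"CSV is missing expected columns: {sorted(missing)}")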

measure.py (new file, +123)

@@ -0,0 +1,123 @@
import os
import time
from argparse import ArgumentParser
from contextlib import contextmanager

import torch

import src.process as p


@contextmanager
def timer():
    start = time.time_ns()
    elapsed = None

    def get_elapsed():
        nonlocal elapsed
        if elapsed is None:
            elapsed = time.time_ns() - start
        return elapsed

    yield get_elapsed
    # freeze the elapsed time at block exit if the caller never asked
    get_elapsed()
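For reference, timer() yields a closure whose first call freezes the elapsed nanoseconds; the trailing get_elapsed() freezes it at block exit if nothing called it inside. A minimal usage sketch, assuming timer from this file is in scope:

    import time

    with timer() as t:
        time.sleep(0.01)   # stand-in workload
    elapsed_ns = t()       # value frozen when the block exited
    print(f"{elapsed_ns / 1e9:.3f} s")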
def compare_files(original, decompressed: str | torch.Tensor):
    with open(original, "rb") as file:
        original = file.read()
    original = torch.tensor(list(original), dtype=torch.uint8).cpu()
    if isinstance(decompressed, str):
        with open(decompressed, "rb") as file:
            decompressed = file.read()
        decompressed = torch.tensor(list(decompressed), dtype=torch.uint8).cpu()
    # count matching bytes, ignoring any padding past the original length
    count = torch.sum(original == decompressed[:original.shape[0]])
    # return a plain float so it serializes cleanly into the CSV
    accuracy = (count / original.shape[0]).item()
    return accuracy
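Because compare_files truncates the decompressed tensor to the original length, padding bytes appended by chunking do not count against accuracy. A toy illustration (values chosen for the example):

    import torch

    original = torch.tensor([1, 2, 3], dtype=torch.uint8)
    decompressed = torch.tensor([1, 2, 9, 0], dtype=torch.uint8)  # trailing 0 is padding
    count = torch.sum(original == decompressed[:original.shape[0]])
    print(count.item() / original.shape[0])  # 0.666...; the padding byte is ignored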
if __name__ == "__main__":
    files_genome = [
        "genome.fna",
        "genome_large.fna",
        "genome_xlarge.fna"
    ]
    files_genome_cnn = [
        "genome_small.fna",
        "genome_xsmall.fna",
        "genome_xxsmall.fna"
    ]
    files_enwik9 = [
        # "text.txt",
        # "txt_large.txt",
        # "txt_xlarge.txt"
    ]
    files_enwik9_cnn = [
    ]
    models = [
        ("auto-genome-full-256.pt", 256, "autoencoder", files_genome),
        ("auto-genome-full-128.pt", 128, "autoencoder", files_genome),
        ("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn),
        ("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn),
        ("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9),
        ("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9),
        ("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn),
        ("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn),
    ]
    device = "cuda" if torch.cuda.is_available() else "cpu"

    with open("./results/compress/compression_results.csv", "w") as f:
        # write header
        f.write(
            "model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
        )
        for model, context_length, model_name, files in models:
            for file in files:
                in_file = f"./data/compression_sets/{file}"
                model_path = f"./models/{model_name}/{model}"
                print(f"Running for model {model} and file {file}...")

                with timer() as t:
                    compressed = p.compress(
                        device=device,
                        input_file=in_file,
                        model_name=model_name,
                        model_path=model_path,
                        context_length=context_length,
                        output_file="./output/tmp.pt"
                    )
                compression_time = t()

                with timer() as t:
                    decompressed = p.decompress(
                        device,
                        model_name=model_name,
                        model_path=model_path,
                        context_length=context_length,
                        input_file="./output/tmp.pt"
                    )
                decompression_time = t()

                accuracy = compare_files(in_file, decompressed.flatten().cpu())
                og_file_len = os.path.getsize(in_file)
                if compressed is None:
                    compressed_size = os.path.getsize("./output/tmp.pt")
                else:
                    # float32 latent: 4 bytes per element
                    compressed_size = 4 * compressed.shape[0] * compressed.shape[1]
                os.remove("./output/tmp.pt")

                f.write(
                    f"{model_name},{model},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
                )
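measure.py writes ./results/compress/compression_results.csv, which is exactly the file make_graphs.py reads, so the intended order is to run measure.py first and make_graphs.py second. Neither script creates directories; paths such as ./results/compress, ./graphs, ./output and ./data/compression_sets presumably have to exist beforehand.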

(AutoEncoder model file)

@@ -58,8 +58,6 @@ class AutoEncoder(Model):
""" """
x: torch.Tensor of floats x: torch.Tensor of floats
""" """
if len(x.shape) == 2:
x = x.unsqueeze(1)
return self.decoder(x) return self.decoder(x)
def forward(self, x: torch.LongTensor) -> torch.Tensor: def forward(self, x: torch.LongTensor) -> torch.Tensor:

(compression/decompression module, imported by measure.py as src.process)

@@ -7,10 +7,13 @@
 import numpy as np
 import torch
 import torch.nn as nn
 from tqdm import tqdm
+import struct

 from src.models import AutoEncoder
 from src.utils import reference_ae

+NUMBITS = 64
+

 def probs_to_freqs(probs, total_freq=8192):
     freqs = (probs * total_freq).round().long()
@@ -20,7 +23,7 @@ def probs_to_freqs(probs, total_freq=8192):
     # Re-normalize so the sum matches total_freq
     diff = total_freq - freqs.sum()
-    freqs[0] += diff  # fix the sum by adjusting the first bin
+    freqs[freqs.argmax()] += diff  # fix the sum by adjusting the largest bin
     return freqs
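Why the largest bin rather than bin 0: after rounding, the frequency sum can drift by a few counts in either direction, and absorbing a negative correction in a near-zero bin could push it to zero or below, breaking the frequency table. A toy check of the renormalization step shown in this hunk (values are illustrative):

    import torch

    total_freq = 8192
    probs = torch.tensor([0.7, 0.2, 0.1])
    freqs = (probs * total_freq).round().long()  # tensor([5734, 1638, 819]), sums to 8191
    diff = total_freq - freqs.sum()              # 1
    freqs[freqs.argmax()] += diff                # absorbed by the largest bin
    assert freqs.sum().item() == total_freq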
@@ -32,32 +35,39 @@ def ae_compress(
     model: nn.Module,
     byte_data: bytes,
     tensor: torch.Tensor
 ):
-    # Init AE
     print("Initializing AE")
-    with contextlib.closing(reference_ae.BitOutputStream(open(output_file, "wb"))) as bitout:
-        enc = reference_ae.ArithmeticEncoder(len(byte_data), bitout)
-        context = deque([0] * context_length, maxlen=context_length)
-        # Compress
-        for byte in tqdm(tensor.tolist(), desc="Compressing"):
-            context_tensor = torch.tensor([list(context)], dtype=torch.long, device=device)
-            with torch.inference_mode():
-                logits = model(context_tensor)
-                probabilities = torch.softmax(logits[0], dim=-1)
-            print(f"probabilities: {probabilities}")
-            probabilities = probabilities.detach()
-            probability_table = reference_ae.SimpleFrequencyTable(probs_to_freqs(probabilities))
-            # write byte to output file
-            enc.write(probability_table, byte)
-            context.append(byte)
+    with open(output_file, "wb") as raw_out:
+        # Write original length header (8 bytes)
+        raw_out.write(struct.pack(">Q", len(byte_data)))
+        with contextlib.closing(reference_ae.BitOutputStream(raw_out)) as bitout:
+            enc = reference_ae.ArithmeticEncoder(NUMBITS, bitout)
+
+            context = deque([0] * context_length, maxlen=context_length)
+
+            for byte in tqdm(tensor.tolist(), desc="Compressing"):
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )
+                with torch.inference_mode():
+                    logits = model(context_tensor)
+                    probabilities = torch.softmax(logits[0], dim=-1)
+                freqs = probs_to_freqs(probabilities).tolist()
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
+                enc.write(probability_table, byte)
+                context.append(byte)
+            enc.finish()


-def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
+def chunk_data(x: bytes, context_length=128) -> torch.Tensor:
     tensor_data = torch.tensor(list(x), dtype=torch.long)
     shape = tensor_data.size(0)
     row_count = math.ceil(shape / context_length)
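The new 8-byte header means the decoder no longer needs the original length passed out of band. A self-contained sketch of the header round trip, independent of the arithmetic coder:

    import io
    import struct

    payload = b"ACGT" * 3
    buf = io.BytesIO()
    buf.write(struct.pack(">Q", len(payload)))   # big-endian unsigned 64-bit length
    # ... the arithmetic-coded bitstream would follow here ...

    buf.seek(0)
    (original_length,) = struct.unpack(">Q", buf.read(8))
    assert original_length == len(payload)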
@@ -65,13 +75,14 @@
     tensor_data = nn.functional.pad(tensor_data, (0, pad_count), value=0)
     return tensor_data.view(row_count, context_length).float() / 255.0


 def auto_encoder_compress(
     data: bytes,
     model: AutoEncoder,
-    output_file: str,
+    output_file: str | None = None,
     context_length: int = 128,
     device: str = "cuda"
-):
+) -> torch.Tensor:
     # convert data to chunks of context length tensors
     # send the data to device
     tensor = chunk_data(data, context_length).to(device)
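For reference, chunk_data zero-pads the byte stream to a multiple of context_length, reshapes it into rows, and scales into [0, 1]. A toy call, assuming chunk_data from this module is in scope:

    out = chunk_data(bytes([255, 0, 128]), context_length=2)
    print(out.shape)  # torch.Size([2, 2]): 3 bytes padded to 4
    print(out)        # tensor([[1.0000, 0.0000], [0.5020, 0.0000]])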
@@ -83,10 +94,11 @@ def auto_encoder_compress(
     print(f"output shape of compress: {4 * output.shape[0] * output.shape[1]} bytes")

     # write output to file
-    print(f"saving to file {output_file}...")
-    torch.save(output.detach(), output_file)
+    if output_file is not None:
+        print(f"saving to file {output_file}...")
+        torch.save(output.detach(), output_file)
+    return output
@@ -99,7 +111,7 @@ def compress(
 ):
     # Get input to compress
     print("Reading input")
-    if input_file:
+    if input_file is not None:
         with open(input_file, "rb") as file:
             byte_data = file.read()
     else:
@@ -111,14 +123,14 @@ def compress(
     tensor = torch.tensor(list(byte_data), dtype=torch.long)

     # Get model
-    print("Loading model")
+    print(f"Loading model: {model_name}")
     model = torch.load(model_path, weights_only=False)
     model.to(device)
     model.eval()

     match model_name:
         case "cnn":
-            ae_compress(
+            return ae_compress(
                 output_file,
                 context_length,
                 device,
@@ -127,7 +139,7 @@ def compress(
                 tensor
             )
         case "autoencoder":
-            auto_encoder_compress(
+            return auto_encoder_compress(
                 byte_data,
                 model,
                 output_file,
@ -138,16 +150,75 @@ def compress(
raise ValueError(f"Unknown model type: {model_name}") raise ValueError(f"Unknown model type: {model_name}")
def ae_decompress( def ae_decompress(
model: nn.Module,
input_file: str,
context_length=128,
device="cuda",
output_file: str | None = None
): ):
pass print("Initializing AE decoder")
with open(input_file, "rb") as raw_in:
# Read original length header
original_length_bytes = raw_in.read(8)
if len(original_length_bytes) != 8:
raise ValueError("Invalid compressed file (missing length header)")
original_length = struct.unpack(">Q", original_length_bytes)[0]
print(f"Original length: {original_length} bytes")
with contextlib.closing(reference_ae.BitInputStream(raw_in)) as bitin:
dec = reference_ae.ArithmeticDecoder(NUMBITS, bitin)
context = deque([0] * context_length, maxlen=context_length)
output_data = []
# Decode exactly original_length bytes
for _ in range(original_length):
context_tensor = torch.tensor(
[list(context)],
dtype=torch.long,
device=device
)
with torch.inference_mode():
logits = model(context_tensor)
probabilities = torch.softmax(logits[0], dim=-1)
freqs = probs_to_freqs(probabilities).tolist()
probability_table = reference_ae.SimpleFrequencyTable(freqs)
byte = dec.read(probability_table)
output_data.append(byte)
context.append(byte)
byte_data = torch.tensor(output_data, dtype=torch.long).byte()
if output_file is not None:
with open(output_file, "wb") as file:
file.write(byte_data.cpu().numpy().tobytes())
return byte_data
def auto_encoder_decompress( def auto_encoder_decompress(
data: torch.Tensor,
model: AutoEncoder,
output_file: str | None = None,
context_length=128,
device="cuda"
) -> torch.Tensor:
decompressed = model.decode(data).squeeze(1)
): # convert result back to bytes
pass byte_data = (decompressed * 255.0).round().byte().detach()
if output_file is not None:
with open(output_file, "wb") as file:
file.write(byte_data.cpu().numpy().tobytes())
return byte_data
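The decode loop only works because the decoder rebuilds exactly the contexts the encoder saw: byte i is decoded with a frequency table computed from bytes 0..i-1, so both sides must produce identical probabilities for identical contexts. That holds when the model is deterministic at inference time (eval mode, inference_mode, no dropout sampling). A minimal sketch of the invariant with an illustrative stub model, not the repo's model:

    import torch

    class StubModel(torch.nn.Module):
        """Deterministic stand-in: uniform logits over the 256 byte values."""
        def forward(self, x: torch.LongTensor) -> torch.Tensor:
            return torch.zeros(x.shape[0], 256)

    model = StubModel().eval()
    ctx = torch.zeros(1, 128, dtype=torch.long)
    with torch.inference_mode():
        p_enc = torch.softmax(model(ctx)[0], dim=-1)  # encoder side
        p_dec = torch.softmax(model(ctx)[0], dim=-1)  # decoder side, same context
    assert torch.equal(p_enc, p_dec)  # identical tables => decodable stream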
@@ -156,14 +227,16 @@ def decompress(
     model_name: str,
     input_file: str,
     output_file: str | None = None,
     context_length: int = 128
 ):
     print("Reading in the data")
-    with open(input_file, "r") as f:
-        length = int(f.readline())
-        bytes_data = f.read()
+    if model_name != "autoencoder":
+        with open(input_file, "rb") as f:
+            data = f.read()
+    else:
+        data = torch.load(input_file, map_location=device)

-    if len(bytes_data) == 0:
+    if len(data) == 0:
         print("Input file is empty, nothing has to be done...")
         return
@@ -174,8 +247,19 @@ def decompress(
     match model_name:
         case "cnn":
-            ae_decompress()
+            return ae_decompress(
+                model=model,
+                input_file=input_file,
+                context_length=context_length,
+                output_file=output_file
+            )
         case "autoencoder":
-            auto_encoder_decompress()
+            return auto_encoder_decompress(
+                data,
+                model,
+                output_file,
+                context_length,
+                device
+            )
         case _:
             raise ValueError(f"Unknown model type: {model_name}")