feat: measuring code + graph generator code

commit f3b07c1df3 (parent dd0b3d3945)
6 changed files with 325 additions and 140 deletions
graphs.ipynb (98 lines changed; diff suppressed because one or more lines are too long)
graph image (binary file not shown; before: 2.3 KiB)
make_graphs.py (new file, 78 lines)

@@ -0,0 +1,78 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


if __name__ == "__main__":
    # read in the csv written by measure.py
    df = pd.read_csv("./results/compress/compression_results.csv")

    for model_type in df["model_type"].unique():
        model_df = df[df["model_type"] == model_type]

        # execution time (stored in ns, plotted in s)
        plt.figure()
        grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
        labels = grouped.index.astype(str)  # e.g. "128", "256"
        x = np.arange(len(labels))  # e.g. [0, 1]

        plt.bar(x, grouped.values, width=0.6)
        plt.title(f"{model_type} mean compression time")
        plt.xticks(x, labels)
        plt.xlabel("Context length")
        plt.ylabel("Mean compression time [s]")
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_compression_time.png")

        plt.figure()
        grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
        labels = grouped.index.astype(str)
        x = np.arange(len(labels))

        plt.bar(x, grouped.values, width=0.6)
        plt.title(f"{model_type} mean decompression time")
        plt.xticks(x, labels)
        plt.xlabel("Context length")
        plt.ylabel("Mean decompression time [s]")
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_decompression_time.png")

        # accuracy; both subsets are assumed to list files in the same order,
        # which holds because measure.py writes them in a fixed order per model
        plt.figure()
        bar_height = 0.25
        files = model_df["input_file_name"].unique()
        y = np.arange(len(files))
        c256 = model_df[model_df["context_length"] == 256]
        c128 = model_df[model_df["context_length"] == 128]

        plt.barh(
            y - bar_height / 2,
            c256["match_percentage"] * 100,
            height=bar_height,
            label="256"
        )
        plt.barh(
            y + bar_height / 2,
            c128["match_percentage"] * 100,
            height=bar_height,
            label="128"
        )
        plt.yticks(y, files)
        plt.title(f"{model_type} accuracy for different context lengths")
        plt.xlabel("Accuracy [%]")
        plt.ylabel("Filename")
        plt.legend()
        plt.savefig(f"./graphs/{model_type}_accuracy.png")

        # compression ratio
        plt.figure()
        c256 = model_df[model_df["context_length"] == 256]
        c128 = model_df[model_df["context_length"] == 128]

        plt.plot(c256["original_file_size"] / 1_000_000, c256["compressed_file_size"] / 1_000_000, label="256")
        plt.plot(c128["original_file_size"] / 1_000_000, c128["compressed_file_size"] / 1_000_000, label="128")
        plt.title(f"{model_type} compressed file size evolution")
        plt.xlabel("Original file size [MB]")
        plt.ylabel("Compressed file size [MB]")
        plt.legend()
        plt.savefig(f"./graphs/{model_type}_compression_ratio.png")
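Note: make_graphs.py consumes the CSV that measure.py (below) writes. A synthetic input like the following (every value here is made up) is enough to smoke-test the plotting loop without running any model:

    # smoke-test input for make_graphs.py; all rows are hypothetical
    import pandas as pd

    columns = [
        "model_type", "model_name", "context_length", "input_file_name",
        "original_file_size", "compressed_file_size", "match_percentage",
        "compression_time", "decompression_time",
    ]
    rows = [
        ("autoencoder", "auto-genome-full-256.pt", 256, "genome.fna",
         1_000_000, 250_000, 0.97, 3.2e9, 2.9e9),  # times in ns, as measured
        ("autoencoder", "auto-genome-full-128.pt", 128, "genome.fna",
         1_000_000, 260_000, 0.95, 1.8e9, 1.6e9),
    ]
    pd.DataFrame(rows, columns=columns).to_csv(
        "./results/compress/compression_results.csv", index=False
    )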
measure.py (new file, 123 lines)

@@ -0,0 +1,123 @@
import os
import time
from argparse import ArgumentParser
from contextlib import contextmanager

import torch

import src.process as p


@contextmanager
def timer():
    """Context manager that yields a callable returning the elapsed time in ns."""
    start = time.time_ns()
    elapsed = None

    def get_elapsed():
        nonlocal elapsed
        if elapsed is None:
            elapsed = time.time_ns() - start
        return elapsed

    yield get_elapsed
    get_elapsed()  # freeze the elapsed time when the block exits


def compare_files(original: str, decompressed: str | torch.Tensor):
    with open(original, "rb") as file:
        original = file.read()
    original = torch.tensor(list(original), dtype=torch.uint8).cpu()

    if isinstance(decompressed, str):
        with open(decompressed, "rb") as file:
            decompressed = file.read()
        decompressed = torch.tensor(list(decompressed), dtype=torch.uint8).cpu()

    # count matching bytes; the decompressed data may carry padding at the end
    count = torch.sum(original == decompressed[:original.shape[0]])
    accuracy = count / original.shape[0]
    return accuracy.item()  # plain float, so the CSV stays machine-readable


if __name__ == "__main__":
    files_genome = [
        "genome.fna",
        "genome_large.fna",
        "genome_xlarge.fna"
    ]

    files_genome_cnn = [
        "genome_small.fna",
        "genome_xsmall.fna",
        "genome_xxsmall.fna"
    ]

    files_enwik9 = [
        # "text.txt",
        # "txt_large.txt",
        # "txt_xlarge.txt"
    ]

    files_enwik9_cnn = []

    models = [
        ("auto-genome-full-256.pt", 256, "autoencoder", files_genome),
        ("auto-genome-full-128.pt", 128, "autoencoder", files_genome),
        ("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn),
        ("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn),
        ("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9),
        ("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9),
        ("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn),
        ("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn),
    ]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    with open("./results/compress/compression_results.csv", "w") as f:
        # write header
        f.write(
            "model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
        )

        # note: model_name ("autoencoder"/"cnn") fills the model_type column,
        # while the checkpoint file name (model) fills the model_name column
        for model, context_length, model_name, files in models:
            for file in files:
                in_file = f"./data/compression_sets/{file}"
                model_path = f"./models/{model_name}/{model}"
                print(f"Running for model {model} and file {file}...")
                with timer() as t:
                    compressed = p.compress(
                        device=device,
                        input_file=in_file,
                        model_name=model_name,
                        model_path=model_path,
                        context_length=context_length,
                        output_file="./output/tmp.pt"
                    )
                compression_time = t()

                with timer() as t:
                    decompressed = p.decompress(
                        device,
                        model_name=model_name,
                        model_path=model_path,
                        context_length=context_length,
                        input_file="./output/tmp.pt"
                    )
                decompression_time = t()

                accuracy = compare_files(in_file, decompressed.flatten().cpu())

                og_file_len = os.path.getsize(in_file)
                if compressed is None:
                    compressed_size = os.path.getsize("./output/tmp.pt")
                else:
                    # autoencoder output: float32 tensor, 4 bytes per element
                    compressed_size = 4 * compressed.shape[0] * compressed.shape[1]

                os.remove("./output/tmp.pt")

                f.write(
                    f"{model_name},{model},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
                )
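Note: timer() yields a getter rather than a number because the elapsed time is unknown until the with block exits; the trailing get_elapsed() call freezes the value so every later call returns the same result. A minimal self-check (the sleep is an arbitrary stand-in for real work):

    import time

    from measure import timer

    with timer() as t:
        time.sleep(0.1)  # stand-in for compress()/decompress()
    elapsed_ns = t()     # frozen at block exit, roughly 1e8 here
    print(f"{elapsed_ns / 1e9:.3f} s")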
src/models.py (AutoEncoder)

@@ -58,8 +58,6 @@ class AutoEncoder(Model):
         """
         x: torch.Tensor of floats
         """
-        if len(x.shape) == 2:
-            x = x.unsqueeze(1)
         return self.decoder(x)
 
     def forward(self, x: torch.LongTensor) -> torch.Tensor:
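Note: the removed guard added the channel dimension that Conv1d-style decoder layers expect; presumably callers now pass a 3-D tensor already, and auto_encoder_decompress in src/process.py squeezes that dimension back out after decode(). A shape sketch with hypothetical sizes:

    import torch

    x = torch.randn(4, 32)  # hypothetical 2-D latent batch
    if len(x.shape) == 2:   # the guard this commit removes
        x = x.unsqueeze(1)
    assert x.shape == (4, 1, 32)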
src/process.py (164 lines changed)

@@ -7,10 +7,13 @@ import numpy as np
 import torch
 import torch.nn as nn
 from tqdm import tqdm
+import struct
 
 from src.models import AutoEncoder
 from src.utils import reference_ae
 
+NUMBITS = 64
+
 
 def probs_to_freqs(probs, total_freq=8192):
     freqs = (probs * total_freq).round().long()
@@ -20,7 +23,7 @@ def probs_to_freqs(probs, total_freq=8192):
 
     # Re-normalize so the sum matches total_freq
     diff = total_freq - freqs.sum()
-    freqs[0] += diff  # fix the sum by adjusting the first bin
+    freqs[freqs.argmax()] += diff  # fix the sum by adjusting the largest bin
 
     return freqs
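Note: the rounding residue now lands in the largest bin instead of bin 0. The largest frequency absorbs the correction with the least relative distortion, and a negative residue can no longer push a near-zero bin negative, which would break the arithmetic coder. Worked example:

    import torch

    probs = torch.tensor([0.7, 0.2, 0.1])
    freqs = (probs * 8192).round().long()  # tensor([5734, 1638, 819]), sum 8191
    diff = 8192 - freqs.sum()              # rounding residue of +1
    freqs[freqs.argmax()] += diff          # largest bin absorbs it: 5735
    assert freqs.sum() == 8192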
@@ -32,32 +35,39 @@ def ae_compress(
     model: nn.Module,
     byte_data: bytes,
     tensor: torch.Tensor
 ):
-    # Init AE
     print("Initializing AE")
-    with contextlib.closing(reference_ae.BitOutputStream(open(output_file, "wb"))) as bitout:
-        enc = reference_ae.ArithmeticEncoder(len(byte_data), bitout)
+    with open(output_file, "wb") as raw_out:
+        # Write original length header (8 bytes)
+        raw_out.write(struct.pack(">Q", len(byte_data)))
 
-        context = deque([0] * context_length, maxlen=context_length)
+        with contextlib.closing(reference_ae.BitOutputStream(raw_out)) as bitout:
+            enc = reference_ae.ArithmeticEncoder(NUMBITS, bitout)
 
-        # Compress
-        for byte in tqdm(tensor.tolist(), desc="Compressing"):
-            context_tensor = torch.tensor([list(context)], dtype=torch.long, device=device)
+            context = deque([0] * context_length, maxlen=context_length)
 
-            with torch.inference_mode():
-                logits = model(context_tensor)
-                probabilities = torch.softmax(logits[0], dim=-1)
-                print(f"probabilities: {probabilities}")
-                probabilities = probabilities.detach()
-            probability_table = reference_ae.SimpleFrequencyTable(probs_to_freqs(probabilities))
+            for byte in tqdm(tensor.tolist(), desc="Compressing"):
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )
 
-            # write byte to output file
-            enc.write(probability_table, byte)
-            context.append(byte)
+                with torch.inference_mode():
+                    logits = model(context_tensor)
+                    probabilities = torch.softmax(logits[0], dim=-1)
+
+                freqs = probs_to_freqs(probabilities).tolist()  # plain ints for the frequency table
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
+
+                enc.write(probability_table, byte)
+                context.append(byte)
+
+            enc.finish()
 
 
-def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
+def chunk_data(x: bytes, context_length=128) -> torch.Tensor:
     tensor_data = torch.tensor(list(x), dtype=torch.long)
     shape = tensor_data.size(0)
     row_count = math.ceil(shape / context_length)
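Note: the 8-byte big-endian length header is what later lets ae_decompress decode exactly the right number of symbols, so the encoder no longer has to be sized with len(byte_data). The framing in isolation:

    import struct

    payload_length = 123_456
    header = struct.pack(">Q", payload_length)  # 8 bytes, big-endian unsigned
    assert len(header) == 8
    assert struct.unpack(">Q", header)[0] == payload_length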
@@ -65,13 +75,14 @@ def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
     tensor_data = nn.functional.pad(tensor_data, (0, pad_count), value=0)
     return tensor_data.view(row_count, context_length).float() / 255.0
 
 
 def auto_encoder_compress(
     data: bytes,
     model: AutoEncoder,
-    output_file: str,
+    output_file: str | None = None,
     context_length: int = 128,
     device: str = "cuda"
-):
+) -> torch.Tensor:
     # convert data to chunks of context length tensors
     # send the data to device
     tensor = chunk_data(data, context_length).to(device)
@@ -83,10 +94,11 @@ def auto_encoder_compress(
     print(f"output shape of compress: {4 * output.shape[0] * output.shape[1]} bytes")
 
     # write output to file
-    print(f"saving to file {output_file}...")
-    torch.save(output.detach(), output_file)
+    if output_file is not None:
+        print(f"saving to file {output_file}...")
+        torch.save(output.detach(), output_file)
+
+    return output
 
 
 def compress(
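Note: auto_encoder_compress now returns the latent even when it also saves it, which is how measure.py can size the float32 output directly (4 bytes per element) instead of re-reading the file. Sketch with a hypothetical latent shape:

    import torch

    output = torch.randn(100, 32)  # hypothetical latent: 100 chunks x 32 dims
    size_bytes = output.element_size() * output.nelement()
    assert size_bytes == 4 * output.shape[0] * output.shape[1]  # float32: 4 bytes each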
@@ -99,7 +111,7 @@ def compress(
 ):
     # Get input to compress
     print("Reading input")
-    if input_file:
+    if input_file is not None:
         with open(input_file, "rb") as file:
             byte_data = file.read()
     else:
@@ -111,14 +123,14 @@ def compress(
     tensor = torch.tensor(list(byte_data), dtype=torch.long)
 
     # Get model
-    print("Loading model")
+    print(f"Loading model: {model_name}")
     model = torch.load(model_path, weights_only=False)
     model.to(device)
     model.eval()
 
     match model_name:
         case "cnn":
-            ae_compress(
+            return ae_compress(
                 output_file,
                 context_length,
                 device,
@@ -127,7 +139,7 @@ def compress(
                 tensor
             )
         case "autoencoder":
-            auto_encoder_compress(
+            return auto_encoder_compress(
                 byte_data,
                 model,
                 output_file,
@@ -138,16 +150,75 @@ def compress(
             raise ValueError(f"Unknown model type: {model_name}")
 
 
 def ae_decompress(
+    model: nn.Module,
+    input_file: str,
+    context_length=128,
+    device="cuda",
+    output_file: str | None = None
 ):
-    pass
+    print("Initializing AE decoder")
+
+    with open(input_file, "rb") as raw_in:
+        # Read original length header
+        original_length_bytes = raw_in.read(8)
+        if len(original_length_bytes) != 8:
+            raise ValueError("Invalid compressed file (missing length header)")
+
+        original_length = struct.unpack(">Q", original_length_bytes)[0]
+        print(f"Original length: {original_length} bytes")
+
+        with contextlib.closing(reference_ae.BitInputStream(raw_in)) as bitin:
+            dec = reference_ae.ArithmeticDecoder(NUMBITS, bitin)
+
+            context = deque([0] * context_length, maxlen=context_length)
+            output_data = []
+
+            # Decode exactly original_length bytes
+            for _ in range(original_length):
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )
+
+                with torch.inference_mode():
+                    logits = model(context_tensor)
+                    probabilities = torch.softmax(logits[0], dim=-1)
+
+                freqs = probs_to_freqs(probabilities).tolist()
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
+
+                byte = dec.read(probability_table)
+                output_data.append(byte)
+                context.append(byte)
+
+    byte_data = torch.tensor(output_data, dtype=torch.long).byte()
+
+    if output_file is not None:
+        with open(output_file, "wb") as file:
+            file.write(byte_data.cpu().numpy().tobytes())
+
+    return byte_data
 
 
 def auto_encoder_decompress(
-):
-    pass
+    data: torch.Tensor,
+    model: AutoEncoder,
+    output_file: str | None = None,
+    context_length=128,
+    device="cuda"
+) -> torch.Tensor:
+    decompressed = model.decode(data).squeeze(1)
+
+    # convert result back to bytes
+    byte_data = (decompressed * 255.0).round().byte().detach()
+
+    if output_file is not None:
+        with open(output_file, "wb") as file:
+            file.write(byte_data.cpu().numpy().tobytes())
+
+    return byte_data
 
 
 def decompress(
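Note: decompression only works because the decoder rebuilds the exact context the encoder saw (both start from all zeros and append each symbol), so they derive the same frequency table at every step; any nondeterminism in the model corrupts the stream. A round-trip self-check, assuming a trained checkpoint exists at the path measure.py uses:

    import torch

    import src.process as p

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = "./models/cnn/cnn-genome-full-256.pt"
    in_file = "./data/compression_sets/genome.fna"

    p.compress(
        device=device,
        input_file=in_file,
        model_name="cnn",
        model_path=model_path,
        context_length=256,
        output_file="./output/roundtrip.bin",
    )
    restored = p.decompress(
        device,
        model_name="cnn",
        model_path=model_path,
        context_length=256,
        input_file="./output/roundtrip.bin",
    )
    with open(in_file, "rb") as f:
        assert bytes(restored.tolist()) == f.read()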
@@ -156,14 +227,16 @@ def decompress(
     model_name: str,
     input_file: str,
     output_file: str | None = None,
     context_length: int = 128
 ):
     print("Reading in the data")
-    with open(input_file, "r") as f:
-        length = int(f.readline())
-        bytes_data = f.read()
+    if model_name != "autoencoder":
+        with open(input_file, "rb") as f:
+            data = f.read()
+    else:
+        data = torch.load(input_file, map_location=device)
 
-    if len(bytes_data) == 0:
+    if len(data) == 0:
         print("Input file is empty, nothing has to be done...")
         return
@@ -174,8 +247,19 @@ def decompress(
 
     match model_name:
         case "cnn":
-            ae_decompress()
+            return ae_decompress(
+                model=model,
+                input_file=input_file,
+                context_length=context_length,
+                output_file=output_file
+            )
         case "autoencoder":
-            auto_encoder_decompress()
+            return auto_encoder_decompress(
+                data,
+                model,
+                output_file,
+                context_length,
+                device
+            )
         case _:
             raise ValueError(f"Unknown model type: {model_name}")