feat: measuring code + graph generator code

2025-12-15 22:53:32 +01:00 · 2025-12-15 22:53:32 +01:00 · f3b07c1df3
commit f3b07c1df3
parent dd0b3d3945
6 changed files with 325 additions and 140 deletions
--- a/graphs.ipynb
+++ b/graphs.ipynb
--- a/graphs/autoencoder_loss.png
+++ b/graphs/autoencoder_loss.png
--- a/make_graphs.py
+++ b/make_graphs.py
@ -0,0 +1,78 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+if __name__ == "__main__":
+    # Load the measurements produced by measure.py (one row per model/file run).
+    df = pd.read_csv("./results/compress/compression_results.csv")
+
+    for model_type in df["model_type"].unique():
+        model_df = df[df["model_type"] == model_type]
+
+        # Execution time: one bar chart per timed phase, averaged per context length.
+        for phase in ("compression", "decompression"):
+            plt.figure()
+            # The *_time columns are divided by 1e9 so the axis can be labelled in seconds.
+            grouped = model_df.groupby("context_length")[f"{phase}_time"].mean() / 1e9
+            labels = grouped.index.astype(str)  # e.g. "128", "256"
+            x = np.arange(len(labels))  # evenly spaced bar positions
+
+            plt.bar(x, grouped.values, width=0.6)
+            plt.title(f"{model_type} mean {phase} time")
+            plt.xticks(x, labels)
+            plt.xlabel("Context length")
+            plt.ylabel(f"Mean {phase} time [s]")
+            plt.tight_layout()
+            plt.savefig(f"./graphs/{model_type}_{phase}_time.png")
+            # Release the figure; otherwise every loop iteration keeps one alive.
+            plt.close()
+
+        # Accuracy: one horizontal bar per input file and context length.
+        plt.figure()
+        bar_height = 0.25
+        files = model_df["input_file_name"].unique()
+        y = np.arange(len(files))
+        for offset, context_length in ((-bar_height / 2, 256), (bar_height / 2, 128)):
+            subset = model_df[model_df["context_length"] == context_length]
+            # Align the values to `files` by name instead of relying on CSV row
+            # order; a file without a measurement becomes NaN (no bar drawn).
+            matches = (
+                subset.groupby("input_file_name")["match_percentage"]
+                .mean()
+                .reindex(files)
+            )
+            plt.barh(
+                y + offset,
+                matches.values * 100,
+                height=bar_height,
+                label=str(context_length),
+            )
+        plt.yticks(y, files)
+        plt.title(f"{model_type} accuracy for different context lengths")
+        plt.xlabel("Accuracy [%]")
+        plt.ylabel("Filename")
+        plt.legend()
+        plt.tight_layout()
+        plt.savefig(f"./graphs/{model_type}_accuracy.png")
+        plt.close()
+
+        # Compression ratio: compressed size as a function of original size.
+        plt.figure()
+        for context_length in (256, 128):
+            subset = model_df[model_df["context_length"] == context_length]
+            # Sort by x so the line runs left to right instead of following
+            # whatever order the rows happen to have in the CSV.
+            subset = subset.sort_values("original_file_size")
+            plt.plot(
+                subset["original_file_size"] / 1_000_000,
+                subset["compressed_file_size"] / 1_000_000,
+                label=str(context_length),
+            )
+        plt.title(f"{model_type} compressed file evolution")
+        plt.xlabel("Original file size [MB]")
+        plt.ylabel("Compressed file size [MB]")
+        plt.legend()
+        plt.tight_layout()
+        plt.savefig(f"./graphs/{model_type}_compression_ratio.png")
+        plt.close()
--- a/measure.py
+++ b/measure.py
@ -0,0 +1,123 @@
+import os
+from argparse import ArgumentParser
+from contextlib import contextmanager
+
+import torch
+
+import src.process as p
+
+import time
+
+
+@contextmanager
+def timer():
+    """Measure how long the ``with`` body takes, in nanoseconds.
+
+    Yields a zero-argument callable. Calling it returns the elapsed time as an
+    int; the value is frozen on the first call or when the ``with`` block
+    exits, whichever happens first, so later calls return the same number.
+    """
+    # perf_counter_ns is monotonic, so the measurement cannot be distorted by
+    # wall-clock adjustments while the body runs.
+    start = time.perf_counter_ns()
+    elapsed = None
+
+    def get_elapsed():
+        nonlocal elapsed
+        if elapsed is None:
+            elapsed = time.perf_counter_ns() - start
+        return elapsed
+
+    try:
+        yield get_elapsed
+    finally:
+        # Freeze the measurement at block exit, even if the body raised.
+        get_elapsed()
+
+
+def _read_byte_tensor(path: str) -> torch.Tensor:
+    """Read the file at ``path`` and return its bytes as a 1-D uint8 tensor."""
+    with open(path, "rb") as file:
+        data = file.read()
+    return torch.tensor(list(data), dtype=torch.uint8)
+
+
+def compare_files(original, decompressed: str | torch.Tensor):
+    """Return the fraction of bytes of ``original`` reproduced by ``decompressed``.
+
+    original: path of the reference file.
+    decompressed: either a path to the decompressed file or a 1-D tensor
+        holding the decompressed byte values.
+
+    Bytes are compared position by position. Extra trailing data in
+    ``decompressed`` is ignored, and bytes missing from a shorter
+    ``decompressed`` count as mismatches. The result is a 0-dim tensor.
+    """
+    original = _read_byte_tensor(original)
+
+    # isinstance, not `type(...) == "str"`: comparing a type object with a
+    # string literal is always False, which left the path branch unreachable.
+    if isinstance(decompressed, str):
+        decompressed = _read_byte_tensor(decompressed)
+
+    total = original.shape[0]
+    # Only compare the overlapping prefix so a short output does not raise a
+    # shape-mismatch error; the divisor stays the full original length.
+    overlap = min(total, decompressed.shape[0])
+    count = torch.sum(original[:overlap] == decompressed[:overlap])
+    accuracy = count / total
+    return accuracy
+
+
+if __name__ == "__main__":
+    # Benchmark driver: for every (model, input file) pair, compress and
+    # decompress the file, then append one CSV row with sizes, accuracy and
+    # timings (nanoseconds, as returned by timer()).
+    files_genome = [
+        "genome.fna",
+        "genome_large.fna",
+        "genome_xlarge.fna"
+    ]
+
+    files_genome_cnn = [
+        "genome_small.fna",
+        "genome_xsmall.fna",
+        "genome_xxsmall.fna"
+    ]
+
+    files_enwik9 = [
+        # "text.txt",
+        # "txt_large.txt",
+        # "txt_xlarge.txt"
+    ]
+
+    files_enwik9_cnn = [
+
+    ]
+
+    # Each entry: (model file, context length, model type, input files).
+    models = [
+        ("auto-genome-full-256.pt", 256, "autoencoder", files_genome),
+        ("auto-genome-full-128.pt", 128, "autoencoder", files_genome),
+        ("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn),
+        ("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn),
+        ("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9),
+        # ".pt" suffix added: every sibling entry carries it.
+        ("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9),
+        ("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn),
+        ("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn),
+    ]
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    tmp_file = "./output/tmp.pt"
+    results_file = "./results/compress/compression_results.csv"
+
+    # Both directories must exist before anything is opened for writing.
+    os.makedirs(os.path.dirname(tmp_file), exist_ok=True)
+    os.makedirs(os.path.dirname(results_file), exist_ok=True)
+
+    with open(results_file, "w") as f:
+        # write header
+        f.write(
+            "model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
+        )
+
+        for model_file, context_length, model_type, files in models:
+            model_path = f"./models/{model_type}/{model_file}"
+            for file in files:
+                in_file = f"./data/compression_sets/{file}"
+                print(f"Running for model {model_file} and file {file}...")
+                with timer() as t:
+                    compressed = p.compress(
+                        device=device,
+                        input_file=in_file,
+                        model_name=model_type,
+                        model_path=model_path,
+                        context_length=context_length,
+                        output_file=tmp_file
+                    )
+                compression_time = t()
+
+                with timer() as t:
+                    decompressed = p.decompress(
+                        device,
+                        model_name=model_type,
+                        model_path=model_path,
+                        context_length=context_length,
+                        input_file=tmp_file
+                    )
+                decompression_time = t()
+
+                accuracy = compare_files(in_file, decompressed.flatten().cpu())
+
+                og_file_len = os.path.getsize(in_file)
+                if compressed is None:
+                    # No tensor was returned, so measure the file that was written.
+                    compressed_size = os.path.getsize(tmp_file)
+                else:
+                    # Raw payload size of the returned tensor; for a 2-D float32
+                    # tensor this equals 4 * rows * columns.
+                    compressed_size = compressed.numel() * compressed.element_size()
+
+                os.remove(tmp_file)
+
+                f.write(
+                    f"{model_type},{model_file},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
+                )
+                # Runs are long; flush so finished rows survive a later crash.
+                f.flush()
--- a/src/models/autoencoder/autoencoder.py
+++ b/src/models/autoencoder/autoencoder.py
@ -58,8 +58,6 @@ class AutoEncoder(Model):
        """
        x: torch.Tensor of floats
        """
-        if len(x.shape) == 2:
-            x = x.unsqueeze(1)
        return self.decoder(x)

    def forward(self, x: torch.LongTensor) -> torch.Tensor:
--- a/src/process.py
+++ b/src/process.py
@ -7,10 +7,13 @@ import numpy as np
 import torch
 import torch.nn as nn
 from tqdm import tqdm
+import struct

 from src.models import AutoEncoder
 from src.utils import reference_ae

+NUMBITS = 64
+

 def probs_to_freqs(probs, total_freq=8192):
    freqs = (probs * total_freq).round().long()
@ -20,7 +23,7 @@ def probs_to_freqs(probs, total_freq=8192):

    # Re-normalize so the sum matches total_freq
    diff = total_freq - freqs.sum()
-    freqs[0] += diff  # fix the sum by adjusting the first bin
+    freqs[freqs.argmax()] += diff  # fix the sum by adjusting the largest bin

    return freqs

@ -32,32 +35,39 @@ def ae_compress(
        model: nn.Module,
        byte_data: bytes,
        tensor: torch.Tensor
-
 ):
-    # Init AE
    print("Initializing AE")
-    with contextlib.closing(reference_ae.BitOutputStream(open(output_file, "wb"))) as bitout:
-        enc = reference_ae.ArithmeticEncoder(len(byte_data), bitout)

-        context = deque([0] * context_length, maxlen=context_length)
+    with open(output_file, "wb") as raw_out:
+        # Write original length header (8 bytes)
+        raw_out.write(struct.pack(">Q", len(byte_data)))

-        # Compress
-        for byte in tqdm(tensor.tolist(), desc="Compressing"):
-            context_tensor = torch.tensor([list(context)], dtype=torch.long, device=device)
+        with contextlib.closing(reference_ae.BitOutputStream(raw_out)) as bitout:
+            enc = reference_ae.ArithmeticEncoder(NUMBITS, bitout)

-            with torch.inference_mode():
-                logits = model(context_tensor)
-                probabilities = torch.softmax(logits[0], dim=-1)
-            print(f"probabilities: {probabilities}")
-            probabilities = probabilities.detach()
-            probability_table = reference_ae.SimpleFrequencyTable(probs_to_freqs(probabilities))
+            context = deque([0] * context_length, maxlen=context_length)

-            # write byte to output file
-            enc.write(probability_table, byte)
+            for byte in tqdm(tensor.tolist(), desc="Compressing"):
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )

-            context.append(byte)
+                with torch.inference_mode():
+                    logits = model(context_tensor)
+                    probabilities = torch.softmax(logits[0], dim=-1)

-def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
+                freqs = probs_to_freqs(probabilities).tolist()
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
+
+                enc.write(probability_table, byte)
+                context.append(byte)
+
+            enc.finish()
+
+
+def chunk_data(x: bytes, context_length=128) -> torch.Tensor:
    tensor_data = torch.tensor(list(x), dtype=torch.long)
    shape = tensor_data.size(0)
    row_count = math.ceil(shape / context_length)
@ -65,13 +75,14 @@ def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
    tensor_data = nn.functional.pad(tensor_data, (0, pad_count), value=0)
    return tensor_data.view(row_count, context_length).float() / 255.0

+
 def auto_encoder_compress(
        data: bytes,
        model: AutoEncoder,
-        output_file: str,
+        output_file: str | None = None,
        context_length: int = 128,
        device: str = "cuda"
-):
+) -> torch.Tensor:
    # convert data to chunks of context length tensors
    # send the data to device
    tensor = chunk_data(data, context_length).to(device)
@ -83,10 +94,11 @@ def auto_encoder_compress(
    print(f"output shape of compress: {4 * output.shape[0] * output.shape[1]} bytes")

    # write output to file
-    print(f"saving to file {output_file}...")
-    torch.save(output.detach(), output_file)
-
+    if output_file is not None:
+        print(f"saving to file {output_file}...")
+        torch.save(output.detach(), output_file)

+    return output


 def compress(
@ -99,7 +111,7 @@ def compress(
 ):
    # Get input to compress
    print("Reading input")
-    if input_file:
+    if input_file is not None:
        with open(input_file, "rb") as file:
            byte_data = file.read()
    else:
@ -111,14 +123,14 @@ def compress(
    tensor = torch.tensor(list(byte_data), dtype=torch.long)

    # Get model
-    print("Loading model")
+    print(f"Loading model: {model_name}")
    model = torch.load(model_path, weights_only=False)
    model.to(device)
    model.eval()

    match model_name:
        case "cnn":
-            ae_compress(
+            return ae_compress(
                output_file,
                context_length,
                device,
@ -127,7 +139,7 @@ def compress(
                tensor
            )
        case "autoencoder":
-            auto_encoder_compress(
+            return auto_encoder_compress(
                byte_data,
                model,
                output_file,
@ -138,16 +150,75 @@ def compress(
            raise ValueError(f"Unknown model type: {model_name}")


-
 def ae_decompress(
-
+        model: nn.Module,
+        input_file: str,
+        context_length=128,
+        device="cuda",
+        output_file: str | None = None
 ):
-    pass
+    print("Initializing AE decoder")
+
+    with open(input_file, "rb") as raw_in:
+        # Read original length header
+        original_length_bytes = raw_in.read(8)
+        if len(original_length_bytes) != 8:
+            raise ValueError("Invalid compressed file (missing length header)")
+
+        original_length = struct.unpack(">Q", original_length_bytes)[0]
+        print(f"Original length: {original_length} bytes")
+
+        with contextlib.closing(reference_ae.BitInputStream(raw_in)) as bitin:
+            dec = reference_ae.ArithmeticDecoder(NUMBITS, bitin)
+
+            context = deque([0] * context_length, maxlen=context_length)
+            output_data = []
+
+            # Decode exactly original_length bytes
+            for _ in range(original_length):
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )
+
+                with torch.inference_mode():
+                    logits = model(context_tensor)
+                    probabilities = torch.softmax(logits[0], dim=-1)
+
+                freqs = probs_to_freqs(probabilities).tolist()
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
+
+                byte = dec.read(probability_table)
+                output_data.append(byte)
+                context.append(byte)
+
+    byte_data = torch.tensor(output_data, dtype=torch.long).byte()
+
+    if output_file is not None:
+        with open(output_file, "wb") as file:
+            file.write(byte_data.cpu().numpy().tobytes())
+
+    return byte_data
+

 def auto_encoder_decompress(
+        data: torch.Tensor,
+        model: AutoEncoder,
+        output_file: str | None = None,
+        context_length=128,
+        device="cuda"
+) -> torch.Tensor:
+    decompressed = model.decode(data).squeeze(1)

-):
-    pass
+    # convert result back to bytes
+    byte_data = (decompressed * 255.0).round().byte().detach()
+
+    if output_file is not None:
+        with open(output_file, "wb") as file:
+            file.write(byte_data.cpu().numpy().tobytes())
+
+    return byte_data


 def decompress(
@ -156,14 +227,16 @@ def decompress(
        model_name: str,
        input_file: str,
        output_file: str | None = None,
-       context_length: int = 128
+        context_length: int = 128
 ):
    print("Reading in the data")
-    with open(input_file, "r") as f:
-        length = int(f.readline())
-        bytes_data = f.read()
+    if model_name != "autoencoder":
+        with open(input_file, "rb") as f:
+            data = f.read()
+    else:
+        data = torch.load(input_file, map_location=device)

-    if len(bytes_data) == 0:
+    if len(data) == 0:
        print("Input file is empty, nothing has to be done...")
        return

@ -174,8 +247,19 @@ def decompress(

    match model_name:
        case "cnn":
-            ae_decompress()
+            # Forward the caller's device: ae_decompress defaults to "cuda",
+            # which is wrong when this function was asked to run elsewhere.
+            return ae_decompress(
+                model=model,
+                input_file=input_file,
+                context_length=context_length,
+                device=device,
+                output_file=output_file
+            )
        case "autoencoder":
-            auto_encoder_decompress()
+            return auto_encoder_decompress(
+                data,
+                model,
+                output_file,
+                context_length,
+                device
+            )
        case _:
            raise ValueError(f"Unknown model type: {model_name}")