feat: graphs + models + updated finished graph code + data in csv

2025-12-16 10:06:47 +01:00 · 2025-12-16 10:06:47 +01:00 · 1abc2f5113
commit 1abc2f5113
parent f3b07c1df3
34 changed files with 76 additions and 73 deletions
--- a/graphs/autoencoder_enwik9_accuracy.png
+++ b/graphs/autoencoder_enwik9_accuracy.png
--- a/graphs/autoencoder_enwik9_compression_ratio.png
+++ b/graphs/autoencoder_enwik9_compression_ratio.png
--- a/graphs/autoencoder_enwik9_compression_time.png
+++ b/graphs/autoencoder_enwik9_compression_time.png
--- a/graphs/autoencoder_enwik9_decompression_time.png
+++ b/graphs/autoencoder_enwik9_decompression_time.png
--- a/graphs/autoencoder_genome_accuracy.png
+++ b/graphs/autoencoder_genome_accuracy.png
--- a/graphs/autoencoder_genome_compression_ratio.png
+++ b/graphs/autoencoder_genome_compression_ratio.png
--- a/graphs/autoencoder_genome_compression_time.png
+++ b/graphs/autoencoder_genome_compression_time.png
--- a/graphs/autoencoder_genome_decompression_time.png
+++ b/graphs/autoencoder_genome_decompression_time.png
--- a/graphs/cnn_enwik9_accuracy.png
+++ b/graphs/cnn_enwik9_accuracy.png
--- a/graphs/cnn_enwik9_compression_ratio.png
+++ b/graphs/cnn_enwik9_compression_ratio.png
--- a/graphs/cnn_enwik9_compression_time.png
+++ b/graphs/cnn_enwik9_compression_time.png
--- a/graphs/cnn_enwik9_decompression_time.png
+++ b/graphs/cnn_enwik9_decompression_time.png
--- a/graphs/cnn_genome_accuracy.png
+++ b/graphs/cnn_genome_accuracy.png
--- a/graphs/cnn_genome_compression_ratio.png
+++ b/graphs/cnn_genome_compression_ratio.png
--- a/graphs/cnn_genome_compression_time.png
+++ b/graphs/cnn_genome_compression_time.png
--- a/graphs/cnn_genome_decompression_time.png
+++ b/graphs/cnn_genome_decompression_time.png
--- a/make_graphs.py
+++ b/make_graphs.py
@@ -6,73 +6,76 @@ if __name__ == "__main__":
    # read in the csv
    df = pd.read_csv("./results/compress/compression_results.csv")

-    for model_type in df["model_type"].unique():
-        model_df = df[df["model_type"] == model_type]
+    for dataset_type in df["dataset_type"].unique():
+        for model_type in df["model_type"].unique():
+            dataset_df = df[df["dataset_type"] == dataset_type]
+            model_df = dataset_df[dataset_df["model_type"] == model_type]

-        # execution time
-        plt.figure()
-        grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
-        labels = grouped.index.astype(str)  # "128", "256"
-        x = np.arange(len(labels))  # [0, 1]
+            # execution time
+            plt.figure()
+            grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
+            labels = grouped.index.astype(str)  # "128", "256"
+            x = np.arange(len(labels))  # [0, 1]

-        plt.bar(x, grouped.values, width=0.6)
-        plt.title(f"{model_type} mean compression time")
-        plt.xticks(x, labels)
-        plt.xlabel("Context length")
-        plt.ylabel("Mean compression time [s]")
-        plt.tight_layout()
-        plt.savefig(f"./graphs/{model_type}_{}_compression_time.png")
+            plt.bar(x, grouped.values, width=0.6)
+            plt.title(f"{model_type.capitalize()} mean compression time")
+            plt.xticks(x, labels)
+            plt.xlabel("Context length")
+            plt.ylabel("Mean compression time [s]")
+            plt.tight_layout()
+            plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_time.png")

-        plt.figure()
-        grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
-        labels = grouped.index.astype(str)  # "128", "256"
-        x = np.arange(len(labels))  # [0, 1]
+            plt.figure()
+            grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
+            labels = grouped.index.astype(str)  # "128", "256"
+            x = np.arange(len(labels))  # [0, 1]

-        plt.bar(x, grouped.values, width=0.6)
-        plt.title(f"{model_type} mean decompression time")
-        plt.xticks(x, labels)
-        plt.xlabel("Context length")
-        plt.ylabel("Mean decompression time [s]")
-        plt.tight_layout()
-        plt.savefig(f"./graphs/{model_type}_{}_decompression_time.png")
+            plt.bar(x, grouped.values, width=0.6)
+            plt.title(f"{model_type.capitalize()} mean decompression time")
+            plt.xticks(x, labels)
+            plt.xlabel("Context length")
+            plt.ylabel("Mean decompression time [s]")
+            plt.tight_layout()
+            plt.savefig(f"./graphs/{model_type}_{dataset_type}_decompression_time.png")

-        # accuracy
-        plt.figure()
-        bar_height = 0.25
-        files = model_df["input_file_name"].unique()
-        y = np.arange(len(files))
-        c256 = model_df[model_df["context_length"] == 256]
-        c128 = model_df[model_df["context_length"] == 128]
+            # accuracy
+            plt.figure(figsize=(10, 4))
+            bar_height = 0.25
+            files = model_df["input_file_name"].unique()
+            y = np.arange(len(files))
+            c256 = model_df[model_df["context_length"] == 256]
+            c128 = model_df[model_df["context_length"] == 128]

-        plt.barh(
-            y - bar_height / 2,
-            c256["match_percentage"] * 100,
-            height=bar_height,
-            label="256"
-        )
+            plt.barh(
+                y - bar_height / 2,
+                c256["match_percentage"] * 100,
+                height=bar_height,
+                label="256"
+            )

-        plt.barh(
-            y + bar_height / 2,
-            c128["match_percentage"] * 100,
-            height=bar_height,
-            label="128"
-        )
-        plt.yticks(y, files)
-        plt.title(f"{model_type} time for different context lengths")
-        plt.xlabel("accuracy")
-        plt.ylabel("Filename")
-        plt.legend()
-        plt.savefig(f"./graphs/{model_type}_{}_accuracy.png")
+            plt.barh(
+                y + bar_height / 2,
+                c128["match_percentage"] * 100,
+                height=bar_height,
+                label="128"
+            )
+            plt.yticks(y, files, rotation=45, ha="right")
+            plt.title(f"{model_type.capitalize()} accuracy for different context lengths")
+            plt.xlabel("Accuracy")
+            plt.ylabel("Filename")
+            plt.legend()
+            plt.tight_layout()
+            plt.savefig(f"./graphs/{model_type}_{dataset_type}_accuracy.png")

-        # compression ratio
-        plt.figure()
-        c256 = model_df[model_df["context_length"] == 256]
-        c128 = model_df[model_df["context_length"] == 128]
+            # compression ratio
+            plt.figure()
+            c256 = model_df[model_df["context_length"] == 256]
+            c128 = model_df[model_df["context_length"] == 128]

-        plt.plot(c256["original_file_size"] / 1_000_000, c256["compressed_file_size"] / 1_000_000, label="256")
-        plt.plot(c128["original_file_size"] / 1_000_000, c128["compressed_file_size"] / 1_000_000, label="128")
-        plt.title(f"{model_type} compressed file evolution")
-        plt.xlabel("Original file size [MB]")
-        plt.ylabel("Compressed file size [MB]")
-        plt.legend()
-        plt.savefig(f"./graphs/{model_type}_{}_compression_ratio.png")
+            plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
+            plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
+            plt.title(f"{model_type.capitalize()} compressed file evolution")
+            plt.xlabel("Original file size [MB]")
+            plt.ylabel("Compressed file size [MB]")
+            plt.legend()
+            plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")
--- a/measure.py
+++ b/measure.py
@@ -1,5 +1,4 @@
 import os
-from argparse import ArgumentParser
 from contextlib import contextmanager

 import torch
@@ -54,13 +53,15 @@ if __name__ == "__main__":
    ]

    files_enwik9 = [
-        # "text.txt",
-        # "txt_large.txt",
-        # "txt_xlarge.txt"
+        "text.txt",
+        "text_large.txt",
+        "text_xlarge.txt"
    ]

    files_enwik9_cnn = [
-
+        "text_small.txt",
+        "text_xsmall.txt",
+        "text_xxsmall.txt"
    ]

    models = [
@@ -69,7 +70,7 @@ if __name__ == "__main__":
        ("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn),
        ("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn),
        ("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9),
-        ("auto-enwik9-full-128", 128, "autoencoder", files_enwik9),
+        ("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9),
        ("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn),
        ("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn),
    ]
@@ -78,10 +79,11 @@ if __name__ == "__main__":
    with open("./results/compress/compression_results.csv", "w") as f:
        # write header
        f.write(
-            "model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
+            "model_type,model_name,context_length,dataset_type,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
        )

        for model, context_length, model_name, files in models:
+            dataset_type = "genome" if "genome" in model else "enwik9"
            for file in files:
                in_file = f"./data/compression_sets/{file}"
                model_path = f"./models/{model_name}/{model}"
@@ -119,5 +121,5 @@ if __name__ == "__main__":
                os.remove("./output/tmp.pt")

                f.write(
-                    f"{model_name},{model},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
+                    f"{model_name},{model},{context_length},{dataset_type},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
                )
--- a/models/autoencoder/auto-enwik9-128.pt
+++ b/models/autoencoder/auto-enwik9-128.pt
--- a/models/autoencoder/auto-enwik9-256.pt
+++ b/models/autoencoder/auto-enwik9-256.pt
--- a/models/autoencoder/auto-enwik9-full-128.pt
+++ b/models/autoencoder/auto-enwik9-full-128.pt
--- a/models/autoencoder/auto-enwik9-full-256.pt
+++ b/models/autoencoder/auto-enwik9-full-256.pt
--- a/models/autoencoder/auto-genome-128.pt
+++ b/models/autoencoder/auto-genome-128.pt
--- a/models/autoencoder/auto-genome-256.pt
+++ b/models/autoencoder/auto-genome-256.pt
--- a/models/autoencoder/auto-genome-full-128.pt
+++ b/models/autoencoder/auto-genome-full-128.pt
--- a/models/autoencoder/auto-genome-full-256.pt
+++ b/models/autoencoder/auto-genome-full-256.pt
--- a/models/cnn/cnn-enwik9-128.pt
+++ b/models/cnn/cnn-enwik9-128.pt
--- a/models/cnn/cnn-enwik9-256.pt
+++ b/models/cnn/cnn-enwik9-256.pt
--- a/models/cnn/cnn-enwik9-full-128.pt
+++ b/models/cnn/cnn-enwik9-full-128.pt
--- a/models/cnn/cnn-enwik9-full-256.pt
+++ b/models/cnn/cnn-enwik9-full-256.pt
--- a/models/cnn/cnn-genome-256.pt
+++ b/models/cnn/cnn-genome-256.pt
--- a/models/cnn/cnn-genome-full-128.pt
+++ b/models/cnn/cnn-genome-full-128.pt
--- a/models/cnn/cnn-genome-full-256.pt
+++ b/models/cnn/cnn-genome-full-256.pt
--- a/src/process.py
+++ b/src/process.py
@@ -1,18 +1,16 @@
 import contextlib
 import math
+import struct
 from collections import deque
-from decimal import Decimal

-import numpy as np
 import torch
 import torch.nn as nn
 from tqdm import tqdm
-import struct

 from src.models import AutoEncoder
 from src.utils import reference_ae

-NUMBITS = 64
+NUMBITS = 16


 def probs_to_freqs(probs, total_freq=8192):