feat (WIP): Compress

2025-12-10 21:13:09 +01:00 · 2025-12-10 21:13:09 +01:00 · 5c26a52e16
commit 5c26a52e16
parent d0457b6571
4 changed files with 70 additions and 8 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -9,6 +9,7 @@ dependencies = [
    "huggingface_hub==0.27.0",
    "fsspec==2024.9.0",
    "lorem>=0.1.1",
    "arithmeticencodingpython",
 ]
 [project.optional-dependencies]
@ -21,3 +22,6 @@ dev = [
    "torchdata==0.7.1",
    "torchvision==0.24.0",
 ]
 [tool.uv.sources]
 arithmeticencodingpython = { git = "https://github.com/ahmedfgad/ArithmeticEncodingPython.git", rev = "60aad0528c57289218b241d75993574f31b90456" }
--- a/src/process.py
+++ b/src/process.py
@ -1,13 +1,22 @@
 from collections import deque
 from decimal import Decimal
 import torch
 from pyae import ArithmeticEncoding
 from tqdm import tqdm
 def compress(
-    device,
+        device,
-    model_path: str,
+        model_path: str,
-    output_file: str,
+        input_file: str | None = None,
-    input_file: str | None = None
+        output_file: str | None = None
 ):
    # NOTE Hardcoded context length
    context_length = 128
    # Get input to compress
    print("Reading input")
    if input_file:
        with open(input_file, "rb") as file:
            byte_data = file.read()
@ -16,14 +25,56 @@ def compress(
        text = input()
        byte_data = text.encode('utf-8', errors='replace')
    print("Converting to tensor")
    tensor = torch.tensor(list(byte_data), dtype=torch.long)
    print(tensor)
    # Get model
    print("Loading model")
    model = torch.load(model_path, weights_only=False)
    model.to(device)
    model.eval()
-    # TODO Feed to model for compression, store result
+    # Init AE
-    return
+    print("Initializing AE")
    AE = ArithmeticEncoding(frequency_table={0: 1})  # These are dummies because they are not used
    stage_min, stage_max = Decimal(0), Decimal(1)
    stage = None
    # Compress
    context = deque([0] * context_length, maxlen=context_length)
    for byte in tqdm(tensor.tolist(), desc="Compressing"):
        context_tensor = torch.tensor([list(context)], dtype=torch.long, device=device)
        with torch.inference_mode():
            logits = model(context_tensor)
            probabilities = torch.softmax(logits[0], dim=-1)
        probabilities = probabilities.detach().cpu().numpy()
        eps = 1e-10
        frequency_table = {i: float(probabilities[i]) + eps for i in range(len(probabilities))}
        probability_table = AE.get_probability_table(frequency_table)
        stage = AE.process_stage(probability_table, stage_min, stage_max)
        stage_min, stage_max = stage[byte]
        context.append(byte)
    print("Getting encoded value")
    interval_min, interval_max, _ = AE.get_encoded_value(stage)
    print("Encoding in binary")
    binary_code, _ = AE.encode_binary(interval_min, interval_max)
    # Pack
    bits = binary_code.split(".", maxsplit=1)[1]
    val = int(bits, 2) if len(bits) else 0
    out_bytes = val.to_bytes((len(bits) + 7) // 8, "big")
    if output_file:
        print(f"Writing to {output_file}")
        with open(output_file, "wb") as file:
            file.write(out_bytes)
    else:
        print(out_bytes)
 def decompress():
--- a/src/train.py
+++ b/src/train.py
@ -19,7 +19,7 @@ def train(
        model_path: str | None = None,
        model_out: str | None = None
 ):
-    batch_size = 2
+    batch_size = 64
    assert model_name or model_path, "Either a model to train or a model to load from model_path must be provided"
--- a/uv.lock
+++ b/uv.lock
@ -163,6 +163,11 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/7f/9c/36c5c37947ebfb8c7f22e0eb6e4d188ee2d53aa3880f3f2744fb894f0cb1/anyio-4.12.0-py3-none-any.whl", hash = "sha256:dad2376a628f98eeca4881fc56cd06affd18f659b17a747d3ff0307ced94b1bb", size = 113362, upload-time = "2025-11-28T23:36:57.897Z" },
 ]
 [[package]]
 name = "arithmeticencodingpython"
 version = "1.0.0"
 source = { git = "https://github.com/ahmedfgad/ArithmeticEncodingPython.git?rev=60aad0528c57289218b241d75993574f31b90456#60aad0528c57289218b241d75993574f31b90456" }
 [[package]]
 name = "attrs"
 version = "25.4.0"
@ -1621,6 +1626,7 @@ name = "project-ml"
 version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
    { name = "arithmeticencodingpython" },
    { name = "datasets" },
    { name = "fsspec" },
    { name = "huggingface-hub" },
@ -1640,6 +1646,7 @@ dev = [
 [package.metadata]
 requires-dist = [
    { name = "arithmeticencodingpython", git = "https://github.com/ahmedfgad/ArithmeticEncodingPython.git?rev=60aad0528c57289218b241d75993574f31b90456" },
    { name = "datasets", specifier = ">=3.2.0" },
    { name = "fsspec", specifier = "==2024.9.0" },
    { name = "huggingface-hub", specifier = "==0.27.0" },