feat: measuring code + graph generator code

commit f3b07c1df3 (parent dd0b3d3945)
6 changed files with 325 additions and 140 deletions
graphs.ipynb (98 lines changed; diff suppressed because one or more lines are too long)
graph image (binary file not shown; before: 2.3 KiB)
make_graphs.py (new file, 78 lines)

@@ -0,0 +1,78 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


if __name__ == "__main__":
    # read in the csv written by measure.py
    df = pd.read_csv("./results/compress/compression_results.csv")

    for model_type in df["model_type"].unique():
        model_df = df[df["model_type"] == model_type]

        # execution time (stored in ns, plotted in s)
        plt.figure()
        grouped = model_df.groupby("context_length")["compression_time"].mean() / 1e9
        labels = grouped.index.astype(str)  # e.g. "128", "256"
        x = np.arange(len(labels))  # e.g. [0, 1]

        plt.bar(x, grouped.values, width=0.6)
        plt.title(f"{model_type} mean compression time")
        plt.xticks(x, labels)
        plt.xlabel("Context length")
        plt.ylabel("Mean compression time [s]")
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_compression_time.png")

        plt.figure()
        grouped = model_df.groupby("context_length")["decompression_time"].mean() / 1e9
        labels = grouped.index.astype(str)
        x = np.arange(len(labels))

        plt.bar(x, grouped.values, width=0.6)
        plt.title(f"{model_type} mean decompression time")
        plt.xticks(x, labels)
        plt.xlabel("Context length")
        plt.ylabel("Mean decompression time [s]")
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_decompression_time.png")

        # accuracy; both subsets are assumed to list files in the same order,
        # which holds because measure.py writes them in a fixed order per model
        plt.figure()
        bar_height = 0.25
        files = model_df["input_file_name"].unique()
        y = np.arange(len(files))
        c256 = model_df[model_df["context_length"] == 256]
        c128 = model_df[model_df["context_length"] == 128]

        plt.barh(
            y - bar_height / 2,
            c256["match_percentage"] * 100,
            height=bar_height,
            label="256"
        )
        plt.barh(
            y + bar_height / 2,
            c128["match_percentage"] * 100,
            height=bar_height,
            label="128"
        )
        plt.yticks(y, files)
        plt.title(f"{model_type} accuracy for different context lengths")
        plt.xlabel("Accuracy [%]")
        plt.ylabel("Filename")
        plt.legend()
        plt.savefig(f"./graphs/{model_type}_accuracy.png")

        # compression ratio
        plt.figure()
        c256 = model_df[model_df["context_length"] == 256]
        c128 = model_df[model_df["context_length"] == 128]

        plt.plot(c256["original_file_size"] / 1_000_000, c256["compressed_file_size"] / 1_000_000, label="256")
        plt.plot(c128["original_file_size"] / 1_000_000, c128["compressed_file_size"] / 1_000_000, label="128")
        plt.title(f"{model_type} compressed file size evolution")
        plt.xlabel("Original file size [MB]")
        plt.ylabel("Compressed file size [MB]")
        plt.legend()
        plt.savefig(f"./graphs/{model_type}_compression_ratio.png")
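Note: make_graphs.py consumes the CSV that measure.py (below) writes. A synthetic input like the following (every value here is made up) is enough to smoke-test the plotting loop without running any model:

    # smoke-test input for make_graphs.py; all rows are hypothetical
    import pandas as pd

    columns = [
        "model_type", "model_name", "context_length", "input_file_name",
        "original_file_size", "compressed_file_size", "match_percentage",
        "compression_time", "decompression_time",
    ]
    rows = [
        ("autoencoder", "auto-genome-full-256.pt", 256, "genome.fna",
         1_000_000, 250_000, 0.97, 3.2e9, 2.9e9),  # times in ns, as measured
        ("autoencoder", "auto-genome-full-128.pt", 128, "genome.fna",
         1_000_000, 260_000, 0.95, 1.8e9, 1.6e9),
    ]
    pd.DataFrame(rows, columns=columns).to_csv(
        "./results/compress/compression_results.csv", index=False
    )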
measure.py (new file, 123 lines)

@@ -0,0 +1,123 @@
import os
import time
from argparse import ArgumentParser
from contextlib import contextmanager

import torch

import src.process as p


@contextmanager
def timer():
    """Context manager that yields a callable returning the elapsed time in ns."""
    start = time.time_ns()
    elapsed = None

    def get_elapsed():
        nonlocal elapsed
        if elapsed is None:
            elapsed = time.time_ns() - start
        return elapsed

    yield get_elapsed
    get_elapsed()  # freeze the elapsed time when the block exits


def compare_files(original: str, decompressed: str | torch.Tensor):
    with open(original, "rb") as file:
        original = file.read()
    original = torch.tensor(list(original), dtype=torch.uint8).cpu()

    if isinstance(decompressed, str):
        with open(decompressed, "rb") as file:
            decompressed = file.read()
        decompressed = torch.tensor(list(decompressed), dtype=torch.uint8).cpu()

    # count matching bytes; the decompressed data may carry padding at the end
    count = torch.sum(original == decompressed[:original.shape[0]])
    accuracy = count / original.shape[0]
    return accuracy.item()  # plain float, so the CSV stays machine-readable


if __name__ == "__main__":
    files_genome = [
        "genome.fna",
        "genome_large.fna",
        "genome_xlarge.fna"
    ]

    files_genome_cnn = [
        "genome_small.fna",
        "genome_xsmall.fna",
        "genome_xxsmall.fna"
    ]

    files_enwik9 = [
        # "text.txt",
        # "txt_large.txt",
        # "txt_xlarge.txt"
    ]

    files_enwik9_cnn = []

    models = [
        ("auto-genome-full-256.pt", 256, "autoencoder", files_genome),
        ("auto-genome-full-128.pt", 128, "autoencoder", files_genome),
        ("cnn-genome-full-256.pt", 256, "cnn", files_genome_cnn),
        ("cnn-genome-full-128.pt", 128, "cnn", files_genome_cnn),
        ("auto-enwik9-full-256.pt", 256, "autoencoder", files_enwik9),
        ("auto-enwik9-full-128.pt", 128, "autoencoder", files_enwik9),
        ("cnn-enwik9-full-256.pt", 256, "cnn", files_enwik9_cnn),
        ("cnn-enwik9-full-128.pt", 128, "cnn", files_enwik9_cnn),
    ]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    with open("./results/compress/compression_results.csv", "w") as f:
        # write header
        f.write(
            "model_type,model_name,context_length,input_file_name,original_file_size,compressed_file_size,match_percentage,compression_time,decompression_time\n"
        )

        # note: model_name ("autoencoder"/"cnn") fills the model_type column,
        # while the checkpoint file name (model) fills the model_name column
        for model, context_length, model_name, files in models:
            for file in files:
                in_file = f"./data/compression_sets/{file}"
                model_path = f"./models/{model_name}/{model}"
                print(f"Running for model {model} and file {file}...")
                with timer() as t:
                    compressed = p.compress(
                        device=device,
                        input_file=in_file,
                        model_name=model_name,
                        model_path=model_path,
                        context_length=context_length,
                        output_file="./output/tmp.pt"
                    )
                compression_time = t()

                with timer() as t:
                    decompressed = p.decompress(
                        device,
                        model_name=model_name,
                        model_path=model_path,
                        context_length=context_length,
                        input_file="./output/tmp.pt"
                    )
                decompression_time = t()

                accuracy = compare_files(in_file, decompressed.flatten().cpu())

                og_file_len = os.path.getsize(in_file)
                if compressed is None:
                    compressed_size = os.path.getsize("./output/tmp.pt")
                else:
                    # autoencoder output: float32 tensor, 4 bytes per element
                    compressed_size = 4 * compressed.shape[0] * compressed.shape[1]

                os.remove("./output/tmp.pt")

                f.write(
                    f"{model_name},{model},{context_length},{file},{og_file_len},{compressed_size},{accuracy},{compression_time},{decompression_time}\n"
                )
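Note: timer() yields a getter rather than a number because the elapsed time is unknown until the with block exits; the trailing get_elapsed() call freezes the value so every later call returns the same result. A minimal self-check (the sleep is an arbitrary stand-in for real work):

    import time

    from measure import timer

    with timer() as t:
        time.sleep(0.1)  # stand-in for compress()/decompress()
    elapsed_ns = t()     # frozen at block exit, roughly 1e8 here
    print(f"{elapsed_ns / 1e9:.3f} s")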
src/models.py (AutoEncoder)

@@ -58,8 +58,6 @@ class AutoEncoder(Model):
         """
         x: torch.Tensor of floats
         """
-        if len(x.shape) == 2:
-            x = x.unsqueeze(1)
         return self.decoder(x)
 
     def forward(self, x: torch.LongTensor) -> torch.Tensor:
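Note: the removed guard added the channel dimension that Conv1d-style decoder layers expect; presumably callers now pass a 3-D tensor already, and auto_encoder_decompress in src/process.py squeezes that dimension back out after decode(). A shape sketch with hypothetical sizes:

    import torch

    x = torch.randn(4, 32)  # hypothetical 2-D latent batch
    if len(x.shape) == 2:   # the guard this commit removes
        x = x.unsqueeze(1)
    assert x.shape == (4, 1, 32)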
src/process.py (164 lines changed)

@@ -7,10 +7,13 @@ import numpy as np
 import torch
 import torch.nn as nn
 from tqdm import tqdm
+import struct
 
 from src.models import AutoEncoder
 from src.utils import reference_ae
 
+NUMBITS = 64
+
 
 def probs_to_freqs(probs, total_freq=8192):
     freqs = (probs * total_freq).round().long()
@@ -20,7 +23,7 @@ def probs_to_freqs(probs, total_freq=8192):
 
     # Re-normalize so the sum matches total_freq
     diff = total_freq - freqs.sum()
-    freqs[0] += diff  # fix the sum by adjusting the first bin
+    freqs[freqs.argmax()] += diff  # fix the sum by adjusting the largest bin
 
     return freqs
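Note: the rounding residue now lands in the largest bin instead of bin 0. The largest frequency absorbs the correction with the least relative distortion, and a negative residue can no longer push a near-zero bin negative, which would break the arithmetic coder. Worked example:

    import torch

    probs = torch.tensor([0.7, 0.2, 0.1])
    freqs = (probs * 8192).round().long()  # tensor([5734, 1638, 819]), sum 8191
    diff = 8192 - freqs.sum()              # rounding residue of +1
    freqs[freqs.argmax()] += diff          # largest bin absorbs it: 5735
    assert freqs.sum() == 8192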
@@ -32,32 +35,39 @@ def ae_compress(
     model: nn.Module,
     byte_data: bytes,
     tensor: torch.Tensor
 ):
-    # Init AE
     print("Initializing AE")
-    with contextlib.closing(reference_ae.BitOutputStream(open(output_file, "wb"))) as bitout:
-        enc = reference_ae.ArithmeticEncoder(len(byte_data), bitout)
+    with open(output_file, "wb") as raw_out:
+        # Write original length header (8 bytes)
+        raw_out.write(struct.pack(">Q", len(byte_data)))
 
-        context = deque([0] * context_length, maxlen=context_length)
+        with contextlib.closing(reference_ae.BitOutputStream(raw_out)) as bitout:
+            enc = reference_ae.ArithmeticEncoder(NUMBITS, bitout)
 
-        # Compress
-        for byte in tqdm(tensor.tolist(), desc="Compressing"):
-            context_tensor = torch.tensor([list(context)], dtype=torch.long, device=device)
+            context = deque([0] * context_length, maxlen=context_length)
 
-            with torch.inference_mode():
-                logits = model(context_tensor)
-                probabilities = torch.softmax(logits[0], dim=-1)
-                print(f"probabilities: {probabilities}")
-                probabilities = probabilities.detach()
-            probability_table = reference_ae.SimpleFrequencyTable(probs_to_freqs(probabilities))
+            for byte in tqdm(tensor.tolist(), desc="Compressing"):
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )
 
-            # write byte to output file
-            enc.write(probability_table, byte)
-            context.append(byte)
+                with torch.inference_mode():
+                    logits = model(context_tensor)
+                    probabilities = torch.softmax(logits[0], dim=-1)
+
+                freqs = probs_to_freqs(probabilities).tolist()  # plain ints for the frequency table
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
+
+                enc.write(probability_table, byte)
+                context.append(byte)
+
+            enc.finish()
 
 
-def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
+def chunk_data(x: bytes, context_length=128) -> torch.Tensor:
     tensor_data = torch.tensor(list(x), dtype=torch.long)
     shape = tensor_data.size(0)
     row_count = math.ceil(shape / context_length)
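Note: the 8-byte big-endian length header is what later lets ae_decompress decode exactly the right number of symbols, so the encoder no longer has to be sized with len(byte_data). The framing in isolation:

    import struct

    payload_length = 123_456
    header = struct.pack(">Q", payload_length)  # 8 bytes, big-endian unsigned
    assert len(header) == 8
    assert struct.unpack(">Q", header)[0] == payload_length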
@@ -65,13 +75,14 @@ def chunk_data(x: bytes, context_length = 128) -> torch.Tensor:
     tensor_data = nn.functional.pad(tensor_data, (0, pad_count), value=0)
     return tensor_data.view(row_count, context_length).float() / 255.0
 
 
 def auto_encoder_compress(
     data: bytes,
     model: AutoEncoder,
-    output_file: str,
+    output_file: str | None = None,
     context_length: int = 128,
     device: str = "cuda"
-):
+) -> torch.Tensor:
     # convert data to chunks of context length tensors
     # send the data to device
     tensor = chunk_data(data, context_length).to(device)
@@ -83,10 +94,11 @@ def auto_encoder_compress(
     print(f"output shape of compress: {4 * output.shape[0] * output.shape[1]} bytes")
 
     # write output to file
-    print(f"saving to file {output_file}...")
-    torch.save(output.detach(), output_file)
+    if output_file is not None:
+        print(f"saving to file {output_file}...")
+        torch.save(output.detach(), output_file)
+
+    return output
 
 
 def compress(
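Note: auto_encoder_compress now returns the latent even when it also saves it, which is how measure.py can size the float32 output directly (4 bytes per element) instead of re-reading the file. Sketch with a hypothetical latent shape:

    import torch

    output = torch.randn(100, 32)  # hypothetical latent: 100 chunks x 32 dims
    size_bytes = output.element_size() * output.nelement()
    assert size_bytes == 4 * output.shape[0] * output.shape[1]  # float32: 4 bytes each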
@@ -99,7 +111,7 @@ def compress(
 ):
     # Get input to compress
     print("Reading input")
-    if input_file:
+    if input_file is not None:
         with open(input_file, "rb") as file:
             byte_data = file.read()
     else:
@@ -111,14 +123,14 @@ def compress(
     tensor = torch.tensor(list(byte_data), dtype=torch.long)
 
     # Get model
-    print("Loading model")
+    print(f"Loading model: {model_name}")
     model = torch.load(model_path, weights_only=False)
     model.to(device)
     model.eval()
 
     match model_name:
         case "cnn":
-            ae_compress(
+            return ae_compress(
                 output_file,
                 context_length,
                 device,
@@ -127,7 +139,7 @@ def compress(
                 tensor
             )
         case "autoencoder":
-            auto_encoder_compress(
+            return auto_encoder_compress(
                 byte_data,
                 model,
                 output_file,
@@ -138,16 +150,75 @@ def compress(
             raise ValueError(f"Unknown model type: {model_name}")
 
 
 def ae_decompress(
+    model: nn.Module,
+    input_file: str,
+    context_length=128,
+    device="cuda",
+    output_file: str | None = None
 ):
-    pass
+    print("Initializing AE decoder")
+
+    with open(input_file, "rb") as raw_in:
+        # Read original length header
+        original_length_bytes = raw_in.read(8)
+        if len(original_length_bytes) != 8:
+            raise ValueError("Invalid compressed file (missing length header)")
+
+        original_length = struct.unpack(">Q", original_length_bytes)[0]
+        print(f"Original length: {original_length} bytes")
+
+        with contextlib.closing(reference_ae.BitInputStream(raw_in)) as bitin:
+            dec = reference_ae.ArithmeticDecoder(NUMBITS, bitin)
+
+            context = deque([0] * context_length, maxlen=context_length)
+            output_data = []
+
+            # Decode exactly original_length bytes
+            for _ in range(original_length):
+                context_tensor = torch.tensor(
+                    [list(context)],
+                    dtype=torch.long,
+                    device=device
+                )
+
+                with torch.inference_mode():
+                    logits = model(context_tensor)
+                    probabilities = torch.softmax(logits[0], dim=-1)
+
+                freqs = probs_to_freqs(probabilities).tolist()
+                probability_table = reference_ae.SimpleFrequencyTable(freqs)
+
+                byte = dec.read(probability_table)
+                output_data.append(byte)
+                context.append(byte)
+
+    byte_data = torch.tensor(output_data, dtype=torch.long).byte()
+
+    if output_file is not None:
+        with open(output_file, "wb") as file:
+            file.write(byte_data.cpu().numpy().tobytes())
+
+    return byte_data
 
 
 def auto_encoder_decompress(
-):
-    pass
+    data: torch.Tensor,
+    model: AutoEncoder,
+    output_file: str | None = None,
+    context_length=128,
+    device="cuda"
+) -> torch.Tensor:
+    decompressed = model.decode(data).squeeze(1)
+
+    # convert result back to bytes
+    byte_data = (decompressed * 255.0).round().byte().detach()
+
+    if output_file is not None:
+        with open(output_file, "wb") as file:
+            file.write(byte_data.cpu().numpy().tobytes())
+
+    return byte_data
 
 
 def decompress(
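Note: decompression only works because the decoder rebuilds the exact context the encoder saw (both start from all zeros and append each symbol), so they derive the same frequency table at every step; any nondeterminism in the model corrupts the stream. A round-trip self-check, assuming a trained checkpoint exists at the path measure.py uses:

    import torch

    import src.process as p

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_path = "./models/cnn/cnn-genome-full-256.pt"
    in_file = "./data/compression_sets/genome.fna"

    p.compress(
        device=device,
        input_file=in_file,
        model_name="cnn",
        model_path=model_path,
        context_length=256,
        output_file="./output/roundtrip.bin",
    )
    restored = p.decompress(
        device,
        model_name="cnn",
        model_path=model_path,
        context_length=256,
        input_file="./output/roundtrip.bin",
    )
    with open(in_file, "rb") as f:
        assert bytes(restored.tolist()) == f.read()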
@@ -156,14 +227,16 @@ def decompress(
     model_name: str,
     input_file: str,
     output_file: str | None = None,
     context_length: int = 128
 ):
     print("Reading in the data")
-    with open(input_file, "r") as f:
-        length = int(f.readline())
-        bytes_data = f.read()
+    if model_name != "autoencoder":
+        with open(input_file, "rb") as f:
+            data = f.read()
+    else:
+        data = torch.load(input_file, map_location=device)
 
-    if len(bytes_data) == 0:
+    if len(data) == 0:
         print("Input file is empty, nothing has to be done...")
         return
@@ -174,8 +247,19 @@ def decompress(
 
     match model_name:
         case "cnn":
-            ae_decompress()
+            return ae_decompress(
+                model=model,
+                input_file=input_file,
+                context_length=context_length,
+                output_file=output_file
+            )
         case "autoencoder":
-            auto_encoder_decompress()
+            return auto_encoder_decompress(
+                data,
+                model,
+                output_file,
+                context_length,
+                device
+            )
         case _:
             raise ValueError(f"Unknown model type: {model_name}")