"""Generate the compression benchmark figures (rate, execution time and
compression ratios) from measurements.csv."""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib.figure import Figure

# Column names used throughout (most come straight from the measurements CSV;
# LABEL_COL and RATE_COL are added during preprocessing)
ALGORITHM_COL = 'compressor'
LABEL_COL = 'label'
CONTEXT_COL = 'context_size'
INPUT_SIZE_COL = 'input_size'
OUTPUT_SIZE_COL = 'compressed_size'
COMPRESS_TIME_COL = 'compression_time'
DECOMPRESS_TIME_COL = 'decompression_time'
RATE_COL = 'compression_ratio'
DISTORTION_COL = 'mse_loss'


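# A sketch of the expected input format (illustrative values only, not taken from the
# real data): one row per compressed file, sizes in bytes, times in nanoseconds, and
# context_size left empty for classic compressors without a context length:
#
#   compressor,context_size,input_size,compressed_size,compression_time,decompression_time,mse_loss
#   gzip,,1048576,401234,1200000,900000,0.0
#   cnn,128,1048576,210987,95000000,91000000,0.013

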
def original_v_compressed_filesize(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "rate" graph"""
    plt.figure()

    break_point = 0.1

    ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                                     OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
                                     break_point, 'Compressor', 'upper left', LABEL_COL,
                                     unique_labels, palette_dict, markers_dict)

    # Add Baseline (y=x)
    df_small, df_large = df[df[INPUT_SIZE_COL] < break_point], df[df[INPUT_SIZE_COL] > break_point]
    baseline_label = 'Compression ratio 1.0'
    baseline_alpha = 0.5
    min_xy, max_xy = df_small[INPUT_SIZE_COL].min(), df_small[INPUT_SIZE_COL].max()
    ax_small.plot([min_xy, max_xy], [min_xy, max_xy],
                  color='gray', linestyle='--', label=baseline_label, alpha=baseline_alpha)
    min_xy, max_xy = df_large[INPUT_SIZE_COL].min(), df_large[INPUT_SIZE_COL].max()
    ax_large.plot([min_xy, max_xy], [min_xy, max_xy],
                  color='gray', linestyle='--', label=baseline_label, alpha=baseline_alpha)

    plt.yscale('log')

    return plt.gcf()


def filesize_v_compression_time(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "execution time" graph (compression)"""
    plt.figure()

    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                COMPRESS_TIME_COL, 'Compression time (log, s)',
                0.1, 'Compressor', 'center left', LABEL_COL,
                unique_labels, palette_dict, markers_dict)

    plt.yscale('log')

    return plt.gcf()


def filesize_v_decompression_time(
        df: pd.DataFrame,
        unique_labels: list[str],
        palette_dict,
        markers_dict
) -> Figure:
    """The "execution time" graph (decompression)"""
    plt.figure()

    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
                DECOMPRESS_TIME_COL, 'Decompression time (log, s)',
                0.1, 'Compressor', 'center left', LABEL_COL,
                unique_labels, palette_dict, markers_dict)

    plt.yscale('log')

    return plt.gcf()


def split_graph(
        df, x, x_axis_label, y, y_axis_label,
        break_point, legend_title, legend_loc, hue, unique_labels, palette_dict, markers_dict
) -> tuple:
    """Scatter plot with a broken x-axis: points below break_point go on the left axis,
    points above it on the right axis; both axes share the y-axis."""
    df = df.sort_values(by=x)

    f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(10, 5))

    df_left = df[df[x] < break_point]
    sns.scatterplot(
        data=df_left,
        x=x,
        y=y,
        ax=ax_left,
        hue=hue,
        hue_order=unique_labels,
        palette=palette_dict,
        style=hue,
        style_order=unique_labels,
        markers=markers_dict
    )
    ax_left.set_xlabel('')

    df_right = df[df[x] > break_point]
    sns.scatterplot(
        data=df_right,
        x=x,
        y=y,
        ax=ax_right,
        hue=hue,
        hue_order=unique_labels,
        palette=palette_dict,
        style=hue,
        style_order=unique_labels,
        markers=markers_dict
    )
    ax_right.set_xlabel('')
    ax_right.set_ylabel('')

    # Combine both plots into one
    ax_left.spines['right'].set_visible(False)
    ax_right.spines['left'].set_visible(False)
    ax_right.yaxis.tick_right()
    ax_right.tick_params(labelright=False)
    ax_right.yaxis.set_ticks_position('none')

    # Add diagonal slash lines to indicate the break (with help from Gemini)
    d = .015  # proportion of vertical to horizontal extent of the slanted line
    kwargs = dict(transform=ax_left.transAxes, color='k', clip_on=False)
    ax_left.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # Bottom-right diagonal
    ax_left.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # Top-right diagonal

    kwargs.update(transform=ax_right.transAxes)  # Switch to the other axes
    ax_right.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # Top-left diagonal
    ax_right.plot((-d, +d), (-d, +d), **kwargs)  # Bottom-left diagonal

    # Fix legends
    handles_left, labels_left = ax_left.get_legend_handles_labels()
    handles_right, labels_right = ax_right.get_legend_handles_labels()
    unique_legend = dict(zip(labels_left + labels_right, handles_left + handles_right))
    ax_left.get_legend().remove()
    ax_right.get_legend().remove()
    ax_left.legend(unique_legend.values(), unique_legend.keys(), title=legend_title, loc=legend_loc)

    f.text(0.5, 0, x_axis_label, ha='center', va='center')
    ax_left.set_ylabel(y_axis_label)

    ax_left.grid(True)
    ax_right.grid(True)

    plt.tight_layout()
    return ax_left, ax_right


def compression_v_mse_scatter(df: pd.DataFrame) -> Figure:
    """The "distortion" graph"""
    plt.figure()

    sns.scatterplot(
        data=df,
        x=RATE_COL,
        y=DISTORTION_COL
    )

    plt.xscale('log')
    plt.xlabel('Compression ratio (log)')

    # TODO This does not work properly
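    # (Speculation, not verified against the data: the log y-scale below cannot display
    # rows whose MSE is exactly 0, e.g. lossless compressors, so those points silently
    # vanish. Filtering them out first, or switching to a symlog scale, might help:
    #   df = df[df[DISTORTION_COL] > 0]
    # )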

    plt.yscale('log')
    plt.ylabel('MSE (log)')

    return plt.gcf()


def compression_ratios(df: pd.DataFrame) -> Figure:
    """The compression ratio box plot (one box per compressor label)"""
    fig, ax = plt.subplots()
    sns.boxplot(
        data=df,
        x=RATE_COL,
        y=LABEL_COL,
        ax=ax
    )

    ax.set_xlabel('Compression ratio')
    ax.set_ylabel('')

    ax.grid(True)

    return plt.gcf()


def generate(
        df: pd.DataFrame, unique_labels, palette_dict, markers_dict,
        tgt_dir: str, dpi: int = 300
) -> None:
    """Generate all the plots"""
    # Make plots

    original_v_compressed_filesize(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'original_v_compressed_filesize.png'),
        bbox_inches='tight',
        dpi=dpi
    )

    filesize_v_compression_time(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'filesize_v_compression_time.png'),
        bbox_inches='tight',
        dpi=dpi
    )
    filesize_v_decompression_time(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'filesize_v_decompression_time.png'),
        bbox_inches='tight',
        dpi=dpi
    )

    # compression_v_mse_scatter(df).savefig(os.path.join(tgt_dir, 'compression_v_mse.png'), bbox_inches='tight')
    compression_ratios(df).savefig(os.path.join(tgt_dir, 'compression_ratios.png'), bbox_inches='tight')


def setup(tgt_dir):
    # Create the target directory if it does not exist
    os.makedirs(tgt_dir, exist_ok=True)

    # Prepare matplotlib for use with LaTeX (makes it look less out of place, less Pythonesque)
    params = {'text.usetex': True,
              'font.size': 11,
              'font.family': 'serif',
              }
    plt.rcParams.update(params)


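# Note: 'text.usetex' above requires a working LaTeX installation on the machine
# running this script. If LaTeX is unavailable, a rough fallback (an assumption,
# not part of the original setup) would be to disable it and rely on mathtext:
#
#   plt.rcParams.update({'text.usetex': False, 'font.family': 'serif'})

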
def preprocessing(df: pd.DataFrame) -> tuple:
    # Convert bytes to MB
    df[INPUT_SIZE_COL] /= 1e6
    df[OUTPUT_SIZE_COL] /= 1e6

    # Convert ns to s (assuming decompression times are recorded in ns as well,
    # like the compression times)
    df[COMPRESS_TIME_COL] /= 1e9
    df[DECOMPRESS_TIME_COL] /= 1e9

    # Add labels to differentiate between algorithms with context lengths
    def create_label(row):
        compressor = row[ALGORITHM_COL]
        return compressor if pd.isna(row[CONTEXT_COL]) else f"{compressor} ($L = {int(row[CONTEXT_COL])}$)"

    df[LABEL_COL] = df.apply(create_label, axis=1)

    # Add the compression ratio
    df[RATE_COL] = df[INPUT_SIZE_COL] / df[OUTPUT_SIZE_COL]

    # Identify all categories upfront
    unique_labels = sorted(df[LABEL_COL].unique())
    n_labels = len(unique_labels)

    # Create fixed palette and marker mapping (note: only six marker styles are listed)
    palette_dict = dict(zip(unique_labels, sns.color_palette("tab10", n_labels)))
    markers_dict = dict(zip(unique_labels, ['x', '+', '1', '2', '3', '4']))

    return df, unique_labels, palette_dict, markers_dict


def main():
    """Load the data and generate the plots."""
    df = pd.read_csv("measurements.csv")

    tgt_dir = "figures"
    setup(tgt_dir)
    generate(*preprocessing(df), tgt_dir=tgt_dir, dpi=150)


if __name__ == "__main__":
    main()
    exit()

# Legacy plotting code below, kept for reference; the exit() above stops execution
# before it is reached when this file is run as a script.

# read in the csv
df = pd.read_csv("compression_results.csv")

for dataset_type in df["dataset_type"].unique():
    for model_type in df["model_type"].unique():
        dataset_df = df[df["dataset_type"] == dataset_type]
        model_df = dataset_df[dataset_df["model_type"] == model_type].copy()

        # execution time
        plt.figure()
        model_df["original_file_size_mb"] = model_df["original_file_size"] / 1e6
        model_df["compression_time_s"] = model_df["compression_time"] / 1e9
        model_df["decompression_time_s"] = model_df["decompression_time"] / 1e9
        # compression
        sns.lineplot(
            data=model_df,
            x="original_file_size_mb",
            y="compression_time_s",
            hue="context_length",
            palette="Set1",
            markers=True,
            legend="brief",
            linestyle="-"
        )
        # decompression
        sns.lineplot(
            data=model_df,
            x="original_file_size_mb",
            y="decompression_time_s",
            hue="context_length",
            palette="Set1",
            markers=True,
            legend=False,
            linestyle="--"
        )
        plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}")
        plt.xlabel("file size [MB]")
        plt.ylabel("Time [s]")
        plt.yscale("log")
        plt.legend(
            [f"{style}, {c_type}" for style, c_type in zip(["Solid", "Dashed"], ["compression", "decompression"])])
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png")

        # compression ratio
        plt.figure()
        c256 = model_df[model_df["context_length"] == 256]
        c128 = model_df[model_df["context_length"] == 128]

        plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
        plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
        plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}")
        plt.xlabel("Original file size [MB]")
        plt.ylabel("Compressed file size [MB]")
        plt.legend()
        plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")

        # if model_type == "cnn":
        #     import numpy as np
        #
        #     plt.figure()
        #     for length, linestyle in [(128, '-'), (256, '--')]:
        #         # extrapolate execution time to larger files
        #         x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6
        #         y = model_df[model_df["context_length"] == length]["compression_time"]
        #         y_decom = model_df[model_df["context_length"] == length]["decompression_time"]
        #
        #         b1, loga1 = np.polyfit(x, np.log(y), 1)
        #         b2, loga2 = np.polyfit(x, np.log(y_decom), 1)
        #
        #         x_comp = np.linspace(0, 40, 1000)
        #         x_decomp = np.linspace(0, 40, 1000)
        #         a1 = np.exp(loga1)
        #         a2 = np.exp(loga2)
        #
        #         plt.plot(
        #             x_comp, a1 * np.exp(x_comp),
        #             label=f"{length} compression",
        #             linestyle=linestyle
        #         )
        #         plt.plot(
        #             x_decomp, a2 * np.exp(x_decomp),
        #             label=f"{length} decompression",
        #             linestyle=linestyle
        #         )
        #
        #     plt.legend()
        #     plt.title(f"Extrapolated execution time for CNN compression and decompression")
        #     plt.xlabel("File size [MB]")
        #     plt.ylabel("Time [s]")
        #     plt.tight_layout()
        #     plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png")

for model_type in df["model_type"].unique():
    model_df = df[df["model_type"] == model_type]

    plt.figure(figsize=(10, 4))
    bar_height = 0.25
    files = model_df["input_file_name"].unique()
    y = np.arange(len(files))
    c256 = model_df[model_df["context_length"] == 256]
    c128 = model_df[model_df["context_length"] == 128]

    plt.barh(
        y - bar_height / 2,
        c256["mse_loss"],
        height=bar_height,
        label="256"
    )

    plt.barh(
        y + bar_height / 2,
        c128["mse_loss"],
        height=bar_height,
        label="128"
    )
    plt.yticks(y, files, rotation=45, ha="right")
    plt.title("MSE loss for different context lengths")
    plt.xlabel("MSE loss")
    plt.ylabel("Filename")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"./graphs/{model_type}_loss.png")