feat: Graphs checkpoint

2025-12-19 00:06:14 +01:00 · 2025-12-19 00:06:14 +01:00 · b62f06018d
commit b62f06018d
parent 15062d8884
1 changed file with 201 additions and 138 deletions
--- a/results/make_graphs.py
+++ b/results/make_graphs.py
@ -3,7 +3,6 @@ import os
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import scipy
 import seaborn as sns
 from matplotlib.figure import Figure

@ -18,21 +17,18 @@ RATE_COL = 'compression_ratio'
 DISTORTION_COL = 'mse_loss'


-def original_v_compressed_filesize(
-        df: pd.DataFrame,
-        unique_labels: list[str],
-        palette_dict,
-        markers_dict
-) -> Figure:
+def original_v_compressed_filesize(df: pd.DataFrame,
+                                   unique_labels: list[str], palette_dict, markers_dict
+                                   ) -> Figure:
    """The "rate" graph"""
    plt.figure()

    break_point = 0.1

-    ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
-                                     OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
-                                     break_point, 'Compressor', 'upper left', LABEL_COL,
-                                     unique_labels, palette_dict, markers_dict)
+    _, ax_small, ax_large = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
+                                        OUTPUT_SIZE_COL, 'Compressed size (log, MB)',
+                                        break_point, 'Compressor', 'upper left', LABEL_COL,
+                                        unique_labels, palette_dict, markers_dict)

    # Add Baseline (y=x)
    df_small, df_large = df[df[INPUT_SIZE_COL] < break_point], df[df[INPUT_SIZE_COL] > break_point]
@ -50,51 +46,136 @@ def original_v_compressed_filesize(
    return plt.gcf()


-def filesize_v_compression_time(
-        df: pd.DataFrame,
-        unique_labels: list[str],
-        palette_dict,
-        markers_dict
-) -> Figure:
+def compression_ratios(df: pd.DataFrame, unique_labels, palette_dict) -> Figure:
+    """The "rate" graph"""
+    plt.figure()
+
+    fig, ax = plt.subplots()
+    sns.boxplot(
+        data=df,
+        x=RATE_COL,
+        y=LABEL_COL,
+        hue=LABEL_COL,
+        hue_order=unique_labels,
+        palette=palette_dict,
+        ax=ax,
+        fill=False
+    )
+
+    ax.set_xlabel('Compression ratio')
+    ax.set_ylabel('Compressor')
+
+    plt.yticks(rotation=45, ha="right")
+
+    ax.grid(True)
+
+    return plt.gcf()
+
+
+def filesize_v_compression_time(df: pd.DataFrame,
+                                unique_labels: list[str], palette_dict, markers_dict
+                                ) -> Figure:
    """The "execution time" graph"""
    plt.figure()

-    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
-                COMPRESS_TIME_COL, 'Compression time (log, s)',
-                0.1, 'Compressor', 'center left', LABEL_COL,
-                unique_labels, palette_dict, markers_dict)
+    f, _, _ = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
+                          COMPRESS_TIME_COL, 'Runtime (log, s)',
+                          0.1, 'Compressor', 'center left', LABEL_COL,
+                          unique_labels, palette_dict, markers_dict)

+    f.text(0.5, 1, 'Compression runtime for different filesizes using each compressor', va='center', ha='center')
    plt.yscale('log')

    return plt.gcf()


-def filesize_v_decompression_time(
-        df: pd.DataFrame,
-        unique_labels: list[str],
-        palette_dict,
-        markers_dict
-) -> Figure:
+def filesize_v_decompression_time(df: pd.DataFrame,
+                                  unique_labels: list[str], palette_dict, markers_dict
+                                  ) -> Figure:
    """The "execution time" graph"""
    plt.figure()

-    split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
-                DECOMPRESS_TIME_COL, 'Decompression time (log, s)',
-                0.1, 'Compressor', 'center left', LABEL_COL,
-                unique_labels, palette_dict, markers_dict)
+    f, _, _ = split_graph(df, INPUT_SIZE_COL, 'Input size (MB)',
+                          DECOMPRESS_TIME_COL, 'Runtime (log, s)',
+                          0.1, 'Compressor', 'center left', LABEL_COL,
+                          unique_labels, palette_dict, markers_dict)

+    f.text(0.5, 1, 'Decompression runtime for different filesizes using each compressor', va='center', ha='center')
    plt.yscale('log')

    return plt.gcf()


+def filesize_v_mse(df: pd.DataFrame) -> Figure:
+    """The "distortion" graph"""
+    plt.figure()
+
+    df = df[df[DISTORTION_COL] != 0]
+    df = df[df[ALGORITHM_COL] == 'Autoencoder']
+
+    df.sort_values(by=INPUT_SIZE_COL, inplace=True)
+
+    def filename_and_size(row):
+        filename = row['input_filename']
+        size = row[INPUT_SIZE_COL]
+        return f"{filename} ({size:.4f} MB)"
+
+    df['input_filename_size'] = df.apply(filename_and_size, axis=1)
+
+    fig, ax = plt.subplots()
+    sns.barplot(
+        data=df,
+        y='input_filename',
+        x=DISTORTION_COL,
+        hue=CONTEXT_COL,
+        ax=ax,
+        palette='Set2'
+    )
+
+    plt.title('MSE for autoencoder')
+    plt.xlabel('MSE')
+    plt.ylabel('Filename')
+    plt.yticks(rotation=45, ha="right")
+    plt.legend(title='Context size')
+
+    plt.grid(True)
+
+    return plt.gcf()
+
+
+def mse_losses(df: pd.DataFrame, unique_labels, palette_dict) -> Figure:
+    """The "distortion" graph"""
+    plt.figure()
+
+    fig, ax = plt.subplots()
+    sns.boxplot(
+        data=df,
+        x=DISTORTION_COL,
+        y=LABEL_COL,
+        hue=LABEL_COL,
+        hue_order=unique_labels,
+        palette=palette_dict,
+        ax=ax,
+        fill=False
+    )
+
+    ax.set_xlabel('MSE')
+    ax.set_ylabel('Compressor')
+
+    plt.yticks(rotation=45, ha="right")
+
+    ax.grid(True)
+
+    return plt.gcf()
+
+
 def split_graph(
        df, x, x_axis_label, y, y_axis_label,
        break_point, legend_title, legend_loc, hue, unique_labels, palette_dict, markers_dict
 ) -> tuple:
    df = df.sort_values(by=x)

-    f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(10, 5))
+    f, (ax_left, ax_right) = plt.subplots(1, 2, sharey=True, figsize=(8, 4))

    df_left = df[df[x] < break_point]
    sns.scatterplot(
@ -107,7 +188,8 @@ def split_graph(
        palette=palette_dict,
        style=hue,
        style_order=unique_labels,
-        markers=markers_dict
+        markers=markers_dict,
+        # s=150
    )
    ax_left.set_xlabel('')

@ -122,7 +204,8 @@ def split_graph(
        palette=palette_dict,
        style=hue,
        style_order=unique_labels,
-        markers=markers_dict
+        markers=markers_dict,
+        # s=150
    )
    ax_right.set_xlabel('')
    ax_right.set_ylabel('')
@ -159,48 +242,7 @@ def split_graph(
    ax_right.grid(True)

    plt.tight_layout()
-    return ax_left, ax_right
-
-
-def compression_v_mse_scatter(df: pd.DataFrame) -> Figure:
-    """The "distortion" graph"""
-    plt.figure()
-
-    sns.scatterplot(
-        data=df,
-        x=RATE_COL,
-        y=DISTORTION_COL
-    )
-
-    plt.xscale('log')
-    plt.xlabel('Compression ratio (log)')
-
-    # TODO This does not work properly
-
-    plt.yscale('log')
-    plt.ylabel('MSE (log)')
-
-    return plt.gcf()
-
-
-def compression_ratios(df: pd.DataFrame) -> Figure:
-    """The "distortion" graph"""
-    plt.figure()
-
-    fig, ax = plt.subplots()
-    sns.boxplot(
-        data=df,
-        x=RATE_COL,
-        y=LABEL_COL,
-        ax=ax
-    )
-
-    ax.set_xlabel('Compression ratio')
-    ax.set_ylabel('')
-
-    ax.grid(True)
-
-    return plt.gcf()
+    return f, ax_left, ax_right


 def generate(
@ -213,22 +255,29 @@ def generate(
    original_v_compressed_filesize(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'original_v_compressed_filesize.png'),
        bbox_inches='tight',
-        dpi=dpi
    )

    filesize_v_compression_time(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'filesize_v_compression_time.png'),
        bbox_inches='tight',
-        dpi=dpi
    )
    filesize_v_decompression_time(df, unique_labels, palette_dict, markers_dict).savefig(
        os.path.join(tgt_dir, 'filesize_v_decompression_time.png'),
        bbox_inches='tight',
-        dpi=dpi
    )

-    # compression_v_mse_scatter(df).savefig(os.path.join(tgt_dir, 'compression_v_mse.png'), bbox_inches='tight')
-    compression_ratios(df).savefig(os.path.join(tgt_dir, 'compression_ratios.png'), bbox_inches='tight')
+    compression_ratios(df, unique_labels, palette_dict).savefig(
+        os.path.join(tgt_dir, 'compression_ratios.png'),
+        bbox_inches='tight'
+    )
+    filesize_v_mse(df).savefig(
+        os.path.join(tgt_dir, 'filesize_mse.png'),
+        bbox_inches='tight'
+    )
+    mse_losses(df, unique_labels, palette_dict).savefig(
+        os.path.join(tgt_dir, 'mse_losses.png'),
+        bbox_inches='tight'
+    )


 def setup(tgt_dir):
@ -239,6 +288,7 @@ def setup(tgt_dir):
    params = {'text.usetex': True,
              'font.size': 11,
              'font.family': 'serif',
+              'figure.dpi': 300,
              }
    plt.rcParams.update(params)

@ -266,8 +316,8 @@ def preprocessing(df: pd.DataFrame) -> tuple:
    n_labels = len(unique_labels)

    # Create fixed palette and marker mapping
-    palette_dict = dict(zip(unique_labels, sns.color_palette("tab10", n_labels)))
-    markers_dict = dict(zip(unique_labels, ['x', '+', '1', '2', '3', '4']))
+    palette_dict = dict(zip(unique_labels, sns.color_palette("Set2", n_labels)))
+    markers_dict = dict(zip(unique_labels, ['o', '^', 'v', 's', 'D', 'H', 'X']))

    return df, unique_labels, palette_dict, markers_dict

@ -281,20 +331,26 @@ def main():
    generate(*preprocessing(df), tgt_dir=tgt_dir, dpi=150)


-if __name__ == "__main__":
-    main()
-    exit()
-
+def old_results():
    # read in the csv
    df = pd.read_csv("compression_results.csv")

+    # Make compatible with new code
+    df[INPUT_SIZE_COL] = df['original_file_size']
+    df[OUTPUT_SIZE_COL] = df['compressed_file_size']
+    df['compressor'] = df['model_type']
+    df[CONTEXT_COL] = df['context_length']
+    #
+
+    df, unique_labels, palette_dict, markers_dict = preprocessing(df)
+
    for dataset_type in df["dataset_type"].unique():
        for model_type in df["model_type"].unique():
            dataset_df = df[df["dataset_type"] == dataset_type]
            model_df = dataset_df[dataset_df["model_type"] == model_type].copy()

            # execution time
-            plt.figure()
+            plt.figure(figsize=(4, 3))
            model_df["original_file_size_mb"] = model_df["original_file_size"] / 1e6
            model_df["compression_time_s"] = model_df["compression_time"] / 1e9
            model_df["decompression_time_s"] = model_df["decompression_time"] / 1e9
@ -304,7 +360,7 @@ if __name__ == "__main__":
                x="original_file_size_mb",
                y="compression_time_s",
                hue="context_length",
-                palette="Set1",
+                palette="Set2",
                markers=True,
                legend="brief",
                linestyle="-"
@ -315,14 +371,14 @@ if __name__ == "__main__":
                x="original_file_size_mb",
                y="decompression_time_s",
                hue="context_length",
-                palette="Set1",
+                palette="Set2",
                markers=True,
                legend=False,
                linestyle="--"
            )
-            plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}")
-            plt.xlabel("file size [MB]")
-            plt.ylabel("Time [s]")
+            # plt.title(f"{model_type.capitalize()} compression and decompression time: {dataset_type}")
+            plt.xlabel("File size (MB)")
+            plt.ylabel("Time (log, s)")
            plt.yscale("log")
            plt.legend(
                [f"{style}, {c_type}" for style, c_type in zip(["Solid", "Dashed"], ["compression", "decompression"])])
@ -330,56 +386,57 @@ if __name__ == "__main__":
            plt.savefig(f"./graphs/{model_type}_{dataset_type}_execution_time.png")

            # compression ratio
-            plt.figure()
+            plt.figure(figsize=(4, 3))
            c256 = model_df[model_df["context_length"] == 256]
            c128 = model_df[model_df["context_length"] == 128]

            plt.plot(c256["original_file_size"] / 1e6, c256["compressed_file_size"] / 1e6, label="256")
            plt.plot(c128["original_file_size"] / 1e6, c128["compressed_file_size"] / 1e6, label="128")
-            plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}")
-            plt.xlabel("Original file size [MB]")
-            plt.ylabel("Compressed file size [MB]")
-            plt.legend()
+            # plt.title(f"{model_type.capitalize()} compressed file evolution: {dataset_type}")
+            plt.xlabel("Original file size (MB)")
+            plt.ylabel("Compressed file size (MB)")
+            plt.ylim(0, model_df["compressed_file_size"].max() / 1e6)
+            plt.legend(title="Context size")
+            plt.tight_layout()
            plt.savefig(f"./graphs/{model_type}_{dataset_type}_compression_ratio.png")

-            # if model_type == "cnn":
-            #     import numpy as np
-            #
-            #     plt.figure()
-            #     for length, linestyle in [(128, '-'), (256, '--')]:
-            #         # extrapolate execution time to larger files
-            #         x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6
-            #         y = model_df[model_df["context_length"] == length]["compression_time"]
-            #         y_decom = model_df[model_df["context_length"] == length]["decompression_time"]
-            #
-            #         b1, loga1 = np.polyfit(x, np.log(y), 1)
-            #         b2, loga2 = np.polyfit(x, np.log(y_decom), 1)
-            #
-            #         x_comp = np.linspace(0, 40, 1000)
-            #         x_decomp = np.linspace(0, 40, 1000)
-            #         a1 = np.exp(loga1)
-            #         a2 = np.exp(loga2)
-            #
-            #
-            #         plt.plot(
-            #             x_comp, a1 * np.exp(x_comp),
-            #             label=f"{length} compression",
-            #             linestyle=linestyle
-            #         )
-            #         plt.plot(
-            #             x_decomp, a2 * np.exp(x_decomp),
-            #             label=f"{length} decompression",
-            #             linestyle=linestyle
-            #         )
-            #
-            #
-            #
-            #     plt.legend()
-            #     plt.title(f"Extrapolated execution time for CNN compression and decompression")
-            #     plt.xlabel("File size [MB]")
-            #     plt.ylabel("Time [s]")
-            #     plt.tight_layout()
-            #     plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png")
+            if model_type == "cnn":
+
+                plt.figure()
+                for length, linestyle in [(128, '-'), (256, '--')]:
+                    # extrapolate execution time to larger files
+                    x = model_df[model_df["context_length"] == length]["original_file_size"] / 1e6
+                    y = model_df[model_df["context_length"] == length]["compression_time"]
+                    y_decom = model_df[model_df["context_length"] == length]["decompression_time"]
+
+                    b1, loga1 = np.polyfit(x, np.log(y), 1)
+                    b2, loga2 = np.polyfit(x, np.log(y_decom), 1)
+
+                    x_comp = np.linspace(0, 40, 1000)
+                    x_decomp = np.linspace(0, 40, 1000)
+                    a1 = np.exp(loga1)
+                    a2 = np.exp(loga2)
+
+                    plt.plot(
+                        x_comp, a1 * np.exp(x_comp),
+                        label=f"{length} compression",
+                        linestyle=linestyle
+                    )
+                    plt.plot(
+                        x_decomp, a2 * np.exp(x_decomp),
+                        label=f"{length} decompression",
+                        linestyle=linestyle
+                    )
+
+                plt.grid(True)
+                plt.legend()
+                plt.title(f"(Log-linear) Extrapolated execution time for CNN")
+                # plt.xscale('log')
+                plt.xlabel("File size (MB)")
+                plt.yscale('log')
+                plt.ylabel("Time (log, s)")
+                plt.tight_layout()
+                plt.savefig(f"./graphs/{model_type}_{dataset_type}_extrapolated_execution_time.png")

    for model_type in df["model_type"].unique():
        model_df = df[df["model_type"] == model_type]
@ -395,19 +452,25 @@ if __name__ == "__main__":
            y - bar_height / 2,
            c256["mse_loss"],
            height=bar_height,
-            label="256"
+            label="256",
        )

        plt.barh(
            y + bar_height / 2,
            c128["mse_loss"],
            height=bar_height,
-            label="128"
+            label="128",
        )
        plt.yticks(y, files, rotation=45, ha="right")
        plt.title(f"MSE loss for different context lengths")
        plt.xlabel("MSE loss")
        plt.ylabel("Filename")
        plt.legend()
+        plt.grid(True)
        plt.tight_layout()
        plt.savefig(f"./graphs/{model_type}_loss.png")
+
+
+if __name__ == "__main__":
+    main()
+    old_results()