feat: CNN model start + crude concept of training
This commit is contained in:
parent
d6c8bf4a13
commit
947aba31ee
6 changed files with 383 additions and 1 deletions
46
CNN-model/cnn.py
Normal file
46
CNN-model/cnn.py
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
from torch import Tensor
|
||||
from torch.nn.functional import softmax
|
||||
|
||||
|
||||
class CausalConv1d(nn.Conv1d):
    """1-D convolution whose output at step ``t`` only sees inputs at steps ``<= t``.

    ``nn.Conv1d`` applies ``padding`` to both ends of the sequence, so the raw
    result carries extra trailing steps that overlap the right-hand zero
    padding. Those steps are trimmed in ``forward`` so the output has the same
    length as the input and never looks ahead.

    Args:
        input_channels: Number of channels of the input signal.
        output_channels: Number of channels produced by the convolution.
        kernel_size: Width of the convolution kernel (an ``int``).
        **kwargs: Forwarded to ``nn.Conv1d`` (e.g. ``dilation``, ``bias``).
            The length-preserving guarantee assumes the default ``stride=1``.
    """

    def __init__(self, input_channels, output_channels, kernel_size, **kwargs):
        # The receptive field spans (kernel_size - 1) * dilation past steps.
        dilation = kwargs.get("dilation", 1)
        if isinstance(dilation, (tuple, list)):
            dilation = dilation[0]
        causal_padding = (kernel_size - 1) * dilation
        super().__init__(
            input_channels,
            output_channels,
            kernel_size,
            padding=causal_padding,
            **kwargs,
        )
        self._causal_padding = causal_padding

    def forward(self, input: Tensor) -> Tensor:
        """Convolve ``input`` of shape (batch, channels, length) causally."""
        output = super().forward(input)
        if self._causal_padding == 0:
            # kernel_size == 1: nothing was padded, and ``[:-0]`` would be empty.
            return output
        # Drop the trailing steps produced by the right-hand padding.
        return output[..., :-self._causal_padding]
|
||||
|
||||
class CNNPredictor(nn.Module):
    """Next-token predictor: embedding -> stacked causal convolutions -> linear head.

    Args:
        vocab_size: Number of distinct tokens; also the size of the output
            distribution.
        context_length: Intended length of the input window. Stored on the
            instance for reference; the layers themselves do not depend on it.
        num_layers: Number of convolution blocks
            (conv -> optional batch norm -> ReLU -> dropout).
        hidden_dim: Embedding width and channel count of every convolution.
        kernel_size: Kernel width of every convolution.
        dropout_prob: Dropout probability applied after each block.
        use_batchnorm: Insert ``nn.BatchNorm1d`` after each convolution.
    """

    def __init__(
        self,
        vocab_size=256,
        context_length=128,
        num_layers=3,
        hidden_dim=128,
        kernel_size=3,
        dropout_prob=0.1,
        use_batchnorm=False
    ):
        super().__init__()
        self.context_length = context_length
        self.embedding = nn.Embedding(vocab_size, hidden_dim)

        # Every block maps hidden_dim channels to hidden_dim channels.
        layers = []
        for _ in range(num_layers):
            layers.append(CausalConv1d(hidden_dim, hidden_dim, kernel_size))
            if use_batchnorm:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_prob))

        self.network = nn.Sequential(*layers)
        self.output_layer = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return a probability distribution over the next token.

        Args:
            x: Integer token ids of shape (batch, length).

        Returns:
            Tensor of shape (batch, vocab_size) whose rows sum to 1. These are
            probabilities, not logits, so a loss that applies its own
            (log-)softmax must not be fed this output directly.
        """
        # nn.Embedding only accepts int/long indices; byte data arrives as uint8.
        embedding = self.embedding(x.long())  # B, L, H
        embedding = embedding.transpose(1, 2)  # B, H, L
        prediction = self.network(embedding)
        # Select the step aligned with the final input token. Indexing by the
        # input length (rather than -1) stays correct even if the convolution
        # stack returns extra trailing positions produced by padding.
        last_prediction = prediction[:, :, x.size(-1) - 1]
        return softmax(self.output_layer(last_prediction), dim=-1)
|
||||
|
||||
11
CNN-model/data_utils.py
Normal file
11
CNN-model/data_utils.py
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
import torch
|
||||
from torch.utils.data import TensorDataset
|
||||
|
||||
|
||||
def make_context_pairs(data: bytes, context_length: int) -> TensorDataset:
    """Slice a byte stream into (context window, next byte) training pairs.

    Sample ``i`` pairs the ``context_length`` bytes starting at offset ``i``
    with the single byte that immediately follows that window.

    Args:
        data: Raw byte sequence to slice.
        context_length: Number of bytes in each context window.

    Returns:
        A ``TensorDataset`` of ``len(data) - context_length`` samples holding
        ``uint8`` tensors: contexts of shape (context_length,) and scalar
        targets.
    """
    stream = torch.tensor(data, dtype=torch.uint8)
    targets = stream[context_length:]
    # unfold yields one window more than there are targets; drop the last one.
    windows = stream.unfold(0, context_length, 1)
    contexts = windows[: stream.shape[0] - context_length]
    return TensorDataset(contexts, targets)
|
||||
|
||||
35
CNN-model/main_cnn.py
Normal file
35
CNN-model/main_cnn.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.functional as F
|
||||
import optuna.trial as tr
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from optuna_trial import create_model
|
||||
from data_utils import make_context_pairs
|
||||
|
||||
# hyper parameters
|
||||
# Window size passed to make_context_pairs when train_and_eval builds its data loaders.
context_length = 128
|
||||
|
||||
def train_and_eval(
    model: nn.Module,
    training_data: bytes,
    validation_data: bytes,
    batch_size: int,
    epochs: int = 100,
    learning_rate: float = 1e-3,
    device: torch.device = torch.device("cpu")
):
    """Set up optimisation and data loading for ``model`` and run the epoch loop.

    The per-batch training step and the validation pass are not implemented
    yet; each epoch currently only switches the model to training mode.

    Args:
        model: Network to train; moved to ``device`` in place.
        training_data: Raw bytes turned into training (context, target) pairs.
        validation_data: Raw bytes turned into validation pairs.
        batch_size: Number of samples per batch for both loaders.
        epochs: Number of passes over the training data.
        learning_rate: Step size for the AdamW optimiser.
        device: Device the model is moved to.
    """
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # batch_size must be forwarded explicitly: DataLoader defaults to 1.
    training_loader = DataLoader(
        make_context_pairs(training_data, context_length=context_length),
        batch_size=batch_size,
    )
    validation_loader = DataLoader(
        make_context_pairs(validation_data, context_length=context_length),
        batch_size=batch_size,
    )

    for epoch in range(epochs):
        model.train()
        # TODO: iterate training_loader, step the optimizer, then evaluate
        # on validation_loader.
|
||||
|
||||
|
||||
def objective_function(trial: tr.Trial):
    """Build the candidate model for ``trial``.

    Stub: the model is created from the trial's sampled hyperparameters, but
    it is not trained or scored yet, so nothing is returned.
    """
    candidate = create_model(trial)
|
||||
|
||||
if __name__ == "__main__":
    # Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
|
||||
19
CNN-model/optuna_trial.py
Normal file
19
CNN-model/optuna_trial.py
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
import optuna.trial as tr
|
||||
from cnn import CNNPredictor
|
||||
|
||||
def create_model(trial: tr.Trial, vocab_size: int = 256, context_length: int = 128):
    """Sample architecture hyperparameters from ``trial`` and build a ``CNNPredictor``.

    Sampled values: ``num_layers`` in [1, 6], ``hidden_dim`` in [64, 512]
    (log scale), ``kernel_size`` in [2, 7], ``dropout_prob`` in [0.1, 0.5]
    and ``use_batchnorm`` in {True, False}.

    Args:
        trial: Optuna trial that supplies the hyperparameter suggestions.
        vocab_size: Passed through to the model unchanged.
        context_length: Passed through to the model unchanged.

    Returns:
        A freshly constructed ``CNNPredictor``.
    """
    # Dict literals evaluate in order, so the suggestions are requested in
    # the same sequence as they are listed here.
    sampled = {
        "num_layers": trial.suggest_int("num_layers", 1, 6),
        "hidden_dim": trial.suggest_int("hidden_dim", 64, 512, log=True),
        "kernel_size": trial.suggest_int("kernel_size", 2, 7),
        "dropout_prob": trial.suggest_float("dropout_prob", 0.1, 0.5),
        "use_batchnorm": trial.suggest_categorical("use_batchnorm", [True, False]),
    }
    return CNNPredictor(
        vocab_size=vocab_size,
        context_length=context_length,
        **sampled,
    )
|
||||
Reference in a new issue