feat: uhm, i changed some things

2025-11-25 20:20:08 +01:00 · 2025-11-25 20:20:08 +01:00 · 6de4db24cc
commit 6de4db24cc
parent b58682cb49
27 changed files with 1302 additions and 137 deletions
--- a/CNN-model/datasets/LoremIpsumDataset.py
+++ b/CNN-model/datasets/LoremIpsumDataset.py
@ -0,0 +1,5 @@
+import lorem
+
+class LoremIpsumDataset:
+    def __init__(self):
+        self.data = lorem.text(paragraphs=100)
--- a/CNN-model/datasets/init.py
+++ b/CNN-model/datasets/init.py
@ -0,0 +1,2 @@
+from EnWik9 import EnWik9DataSet
+from LoremIpsumDataset import LoremIpsumDataset
--- a/CNN-model/main_cnn.py
+++ b/CNN-model/main_cnn.py
@ -1,93 +1,55 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import optuna.trial as tr
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-import argparse
+from argparse import ArgumentParser
+from math import ceil

-from optuna_trial import create_model
-from utils import make_context_pairs, load_data
-import optuna
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+
+from datasets import EnWik9DataSet, LoremIpsumDataset
+from trainers import OptunaTrainer, Trainer
+
+BATCH_SIZE = 64
+DEVICE = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"

 # hyper parameters
 context_length = 128

-def train_and_eval(
-        model: nn.Module,
-        training_data: bytes,
-        validation_data: bytes,
-        batch_size: int,
-        epochs: int = 100,
-        learning_rate: float = 1e-3,
-        device: torch.device = torch.device("cpu")
-) -> dict:
-    model.to(device)
-    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
-    training_loader = DataLoader(make_context_pairs(training_data, context_length=context_length), batch_size=batch_size)
-    validation_loader= DataLoader(make_context_pairs(validation_data, context_length=context_length), batch_size=batch_size)
-
-    training_losses = []
-    validation_losses = []
-    best_val_loss = float("inf")
-
-    for epoch in range(epochs):
-        model.train()
-        train_loss = 0
-        for x, y in tqdm(training_loader, desc=f"Epoch {epoch}"):
-            x, y = x.to(device), y.to(device)
-            prediction = model(x)
-            loss = F.cross_entropy(prediction, y)
-            train_loss += loss.item()
-
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-        training_losses.append(train_loss / len(training_loader))
-
-        model.eval()
-
-        with torch.no_grad():
-            val_loss = 0
-            for x, y in validation_loader:
-                x, y = x.to(device), y.to(device)
-                prediction = model(x)
-                loss = F.cross_entropy(prediction, y)
-                val_loss += loss.item()
-            validation_losses.append(val_loss / len(validation_loader))
-            if validation_losses[-1] < best_val_loss:
-                best_val_loss = validation_losses[-1]
-
-    return {
-        "training_losses": training_losses,
-        "validation_losses": validation_losses,
-        "best_validation_loss": best_val_loss
-    }
-
-
-
-def objective_function(trial: tr.Trial, train_data: bytes, validation_data: bytes, batch_size: int):
-    model = create_model(trial)
-    result = train_and_eval(model, train_data, validation_data, batch_size)
-    return result["best_validation_loss"]
-
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--train-data", type=str, required=True)
-    parser.add_argument("--validation-data", type=str, required=True)
-    parser.add_argument("--batch-size", type=int, default=128)
-
+    print(f"Running on device: {DEVICE}...")
+    parser = ArgumentParser()
+    parser.add_argument("--method", choices=["optuna", "train"], required=True)
+    parser.add_argument("--model-path", type=str, required=False)
    args = parser.parse_args()
-    print(args)

-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    train_data = load_data(args.train_data)
-    validation_data = load_data(args.validation_data)
-    batch_size = args.batch_size
+    if args.method == "train":
+        dataset = EnWik9DataSet()
+    elif args.method == "optuna":
+        dataset = LoremIpsumDataset()
+    else:
+        raise ValueError(f"Unknown method: {args.method}")

-    print(f"training data length: {len(train_data)}")
-    print(f"validation data length: {len(validation_data)}")
-    print(f"batch size: {batch_size}")
+    dataset_length = len(dataset)
+    training_size = ceil(0.8 * dataset_length)

-    study = optuna.create_study(study_name="CNN network",direction="minimize")
-    study.optimize(lambda trial: objective_function(trial, train_data, validation_data, batch_size), n_trials=10)
+    print(f"training set size = {training_size}, validation set size {dataset_length - training_size}")
+    data = dataset.data["text"]
+
+    train_set, validate_set = torch.utils.data.random_split(TensorDataset(data),
+                                                            [training_size, dataset_length - training_size])
+    training_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
+    validation_loader = DataLoader(validate_set, batch_size=BATCH_SIZE, shuffle=False)
+    loss_fn = torch.nn.CrossEntropyLoss()
+
+    model = None
+    if args.model_path is not None:
+        model = torch.load(args.model_path)
+
+    trainer: Trainer = OptunaTrainer() if args.method == "optuna" else None
+
+    trainer.execute(
+        model=model,
+        train_loader=training_loader,
+        validation_loader=validation_loader,
+        loss_fn=loss_fn,
+        n_epochs=200,
+        device=DEVICE
+    )
--- a/CNN-model/model/init.py
+++ b/CNN-model/model/init.py
@ -0,0 +1 @@
+from .cnn import CNNPredictor
--- a/CNN-model/model/cnn.py
+++ b/CNN-model/model/cnn.py
@ -8,7 +8,7 @@ class CausalConv1d(nn.Conv1d):
    def __init__(self, input_channels, output_channels, kernel_size, **kwargs):
        super().__init__(input_channels, output_channels, kernel_size, padding=kernel_size-1, **kwargs)
    def forward(self, input: Tensor) -> Tensor:
-        return super().forward(input)
+        return super().forward(input)[:, :, :input.size(-1)]

 class CNNPredictor(nn.Module):
    def __init__(
@ -41,5 +41,5 @@ class CNNPredictor(nn.Module):
        emdedding = emdedding.transpose(1, 2)   # B, H, L
        prediction = self.network(emdedding)
        last_prediction = prediction[:, :, -1]
-        return softmax(self.output_layer(last_prediction), dim=-1) # convert output of linear layer to prob. distr.
+        return self.output_layer(last_prediction)

--- a/CNN-model/optuna_trial.py
+++ b/CNN-model/optuna_trial.py
@ -1,18 +0,0 @@
-import optuna.trial as tr
-from cnn import CNNPredictor
-
-def create_model(trial: tr.Trial, vocab_size: int = 256, context_length: int = 128):
-    num_layers = trial.suggest_int("num_layers", 1, 6)
-    hidden_dim = trial.suggest_int("hidden_dim", 64, 512, log=True)
-    kernel_size = trial.suggest_int("kernel_size", 2, 7)
-    dropout_prob = trial.suggest_float("dropout_prob", 0.1, 0.5)
-    use_batchnorm = trial.suggest_categorical("use_batchnorm", [True, False])
-
-    return CNNPredictor(
-        vocab_size=vocab_size,
-        num_layers=num_layers,
-        hidden_dim=hidden_dim,
-        kernel_size=kernel_size,
-        dropout_prob=dropout_prob,
-        use_batchnorm=use_batchnorm
-    )
--- a/CNN-model/trainers/FullTrainer.py
+++ b/CNN-model/trainers/FullTrainer.py
@ -0,0 +1,26 @@
+from typing import Callable
+
+import torch
+from torch import nn as nn
+from torch.utils.data import DataLoader
+
+from trainer import Trainer
+from train import train
+from ..utils import print_losses
+
+class FullTrainer(Trainer):
+    def execute(
+            self,
+            model: nn.Module | None,
+            train_loader: DataLoader,
+            validation_loader: DataLoader,
+            loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+            n_epochs: int,
+            device: str
+    ) -> None:
+        if model is None:
+            raise ValueError("Model must be provided: run optuna optimizations first")
+
+        model.to(device)
+        train_loss, val_loss = train(model, train_loader, validation_loader, loss_fn, n_epochs)
+        print_losses(train_loss, val_loss)
--- a/CNN-model/trainers/OptunaTrainer.py
+++ b/CNN-model/trainers/OptunaTrainer.py
@ -0,0 +1,63 @@
+from typing import Callable
+
+import optuna
+import optuna.trial as tr
+import torch
+from torch import nn as nn
+from torch.utils.data import DataLoader
+
+from trainer import Trainer
+from ..model.cnn import CNNPredictor
+from train import train
+
+
+def create_model(trial: tr.Trial, vocab_size: int = 256):
+    num_layers = trial.suggest_int("num_layers", 1, 6)
+    hidden_dim = trial.suggest_int("hidden_dim", 64, 512, log=True)
+    kernel_size = trial.suggest_int("kernel_size", 2, 7)
+    dropout_prob = trial.suggest_float("dropout_prob", 0.1, 0.5)
+    use_batchnorm = trial.suggest_categorical("use_batchnorm", [True, False])
+
+    return CNNPredictor(
+        vocab_size=vocab_size,
+        num_layers=num_layers,
+        hidden_dim=hidden_dim,
+        kernel_size=kernel_size,
+        dropout_prob=dropout_prob,
+        use_batchnorm=use_batchnorm
+    )
+
+
+def objective_function(
+        trial: tr.Trial,
+        training_loader: DataLoader,
+        validation_loader: DataLoader,
+        loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+        device: str
+):
+    model = create_model(trial).to(device)
+    _, validation_loss = train(model, training_loader, validation_loader, loss_fn)
+    return min(validation_loss)
+
+
+class OptunaTrainer(Trainer):
+    def execute(
+            self,
+            model: nn.Module | None,
+            train_loader: DataLoader,
+            validation_loader: DataLoader,
+            loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+            n_epochs: int,
+            device: str
+    ) -> None:
+        study = optuna.create_study(study_name="CNN network", direction="minimize")
+        study.optimize(
+            lambda trial: objective_function(trial, train_loader, validation_loader, loss_fn, device),
+            n_trials=20
+        )
+
+        best_params = study.best_trial.params
+        best_model = CNNPredictor(
+            **best_params
+        )
+        torch.save(best_model, "models/final_model.pt")
--- a/CNN-model/trainers/init.py
+++ b/CNN-model/trainers/init.py
@ -0,0 +1,2 @@
+from OptunaTrainer import OptunaTrainer
+from trainer import Trainer
--- a/CNN-model/trainers/train.py
+++ b/CNN-model/trainers/train.py
@ -0,0 +1,50 @@
+import torch
+import torch.nn as nn
+from torch.utils.data.dataloader import DataLoader
+from tqdm import tqdm
+
+from typing import Callable
+
+
+def train(
+        model: nn.Module,
+        training_loader: DataLoader,
+        validation_loader: DataLoader,
+        loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+        epochs: int = 100,
+        learning_rate: float = 1e-3,
+        weight_decay: float = 1e-8
+) -> tuple[list[float], list[float]]:
+    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+    avg_training_losses = []
+    avg_validation_losses = []
+
+    for epoch in range(epochs):
+        model.train()
+        total_loss = []
+
+        for data in tqdm(training_loader):
+            optimizer.zero_grad()
+
+            x_hat = model(data)
+
+            loss = loss_fn(x_hat, data)
+            loss.backward()
+            optimizer.step()
+
+            total_loss.append(loss.item())
+
+        avg_training_losses.append(sum(total_loss) / len(total_loss))
+
+        with torch.no_grad():
+            losses = []
+            for data in validation_loader:
+                x_hat = model(data)
+                loss = loss_fn(x_hat, data)
+                losses.append(loss.item())
+            avg_loss = sum(losses) / len(losses)
+            avg_validation_losses.append(avg_loss)
+        tqdm.write(f"epoch: {epoch + 1}, avg loss = {avg_loss}")
+
+    return avg_training_losses, avg_validation_losses
+
--- a/CNN-model/trainers/trainer.py
+++ b/CNN-model/trainers/trainer.py
@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from typing import Callable
+
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+
+
+class Trainer(ABC):
+    """Abstract class for trainers."""
+
+    @abstractmethod
+    def execute(
+            self,
+            model: nn.Module | None,
+            train_loader: DataLoader,
+            validation_loader: DataLoader,
+            loss_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
+            n_epochs: int,
+            device: str
+    ) -> None:
+        pass
--- a/CNN-model/utils/init.py
+++ b/CNN-model/utils/init.py
@ -0,0 +1 @@
+from .utils import *
--- a/CNN-model/utils/utils.py
+++ b/CNN-model/utils/utils.py
@ -14,6 +14,17 @@ def print_distribution(from_to: tuple[int, int], probabilities: list[float]):
    plt.hist(range(from_to[0], from_to[1]), weights=probabilities)
    plt.show()

+def print_losses(train_losses: list[float], validation_losses: list[float], show=False):
+    plt.plot(train_losses, label="Training loss")
+    plt.plot(validation_losses, label="Validation loss")
+    plt.xlabel("Epoch")
+    plt.ylabel("Loss (cross entropy)")
+    plt.legend()
+
+    if show:
+        plt.show()
+    plt.savefig("losses.png")
+

 def load_data(path: str) -> bytes:
    with open(path, "rb") as f:
--- a/integer_discrete_flows/coding/coder.py
+++ b/integer_discrete_flows/coding/coder.py
@ -36,7 +36,7 @@ def CDF_fn(pz, bin_width, variable_type, distribution_type):

    bin_locations = torch.arange(-n_bins // 2, n_bins // 2)[None, None, None, None, :] + MEAN.cpu()[..., None]
    bin_locations = bin_locations.float() * bin_width
-    bin_locations = bin_locations.to(device=pz[0].device)
+    bin_locations = bin_locations.to(device=pz[0].DEVICE)

    pz = [param[:, :, :, :, None] for param in pz]
    cdf = cdf_fn(
@ -86,7 +86,7 @@ def decode_sample(
        state, pz, variable_type, distribution_type, bin_width=1./256):
    state = rans.unflatten(state)

-    device = pz[0].device
+    device = pz[0].DEVICE
    size = pz[0].size()[0:4]

    CDFs, MEAN = CDF_fn(pz, bin_width, variable_type, distribution_type)
--- a/integer_discrete_flows/main_experiment.py
+++ b/integer_discrete_flows/main_experiment.py
@ -190,7 +190,7 @@ def run(args, kwargs):
    import models.Model as Model

    model = Model.Model(args)
-    args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    args.DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.set_temperature(args.temperature)
    model.enable_hard_round(args.hard_round)

@ -208,7 +208,7 @@ def run(args, kwargs):
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.DataParallel(model, dim=0)

-    model.to(args.device)
+    model.to(args.DEVICE)

    def lr_lambda(epoch):
        return min(1., (epoch+1) / args.warmup) * np.power(args.lr_decay, epoch)
--- a/integer_discrete_flows/models/backround.py
+++ b/integer_discrete_flows/models/backround.py
@ -34,7 +34,7 @@ def _stacked_sigmoid(x, temperature, n_approx=3):
    x_remainder = x_remainder.view(size + (1,))

    translation = torch.arange(n_approx) - n_approx // 2
-    translation = translation.to(device=x.device, dtype=x.dtype)
+    translation = translation.to(device=x.DEVICE, dtype=x.dtype)
    translation = translation.view([1] * len(size) + [len(translation)])
    out = torch.sigmoid((x_remainder - translation) / temperature).sum(dim=-1)

--- a/integer_discrete_flows/optimization/training.py
+++ b/integer_discrete_flows/optimization/training.py
@ -17,7 +17,7 @@ def train(epoch, train_loader, model, opt, args):
    for batch_idx, (data, _) in enumerate(train_loader):
        data = data.view(-1, *args.input_size)

-        data = data.to(args.device)
+        data = data.to(args.DEVICE)

        opt.zero_grad()
        loss, bpd, bpd_per_prior, pz, z, pys, py, ldj = model(data)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -5,8 +5,12 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
+    "datasets>=4.4.1",
+    "lorem>=0.1.1",
    "matplotlib>=3.10.7",
    "numpy>=2.3.4",
    "optuna>=4.5.0",
    "torch>=2.9.0",
+    "torchdata>=0.11.0",
+    "torchvision>=0.24.0",
 ]
--- a/transformer-xl/pytorch/mem_transformer.py
+++ b/transformer-xl/pytorch/mem_transformer.py
@ -176,9 +176,9 @@ class RelMultiHeadAttn(nn.Module):
    def _shift(self, x, qlen, klen, mask, left=False):
        if qlen > 1:
            zero_pad = torch.zeros((x.size(0), qlen-1, x.size(2), x.size(3)),
-                                    device=x.device, dtype=x.dtype)
+                                   device=x.DEVICE, dtype=x.dtype)
        else:
-            zero_pad = torch.zeros(0, device=x.device, dtype=x.dtype)
+            zero_pad = torch.zeros(0, device=x.DEVICE, dtype=x.dtype)

        if left:
            mask = mask.flip(1)
@ -193,7 +193,7 @@ class RelMultiHeadAttn(nn.Module):

    def _rel_shift(self, x, zero_triu=False):
        zero_pad = torch.zeros((x.size(0), 1, *x.size()[2:]),
-                               device=x.device, dtype=x.dtype)
+                               device=x.DEVICE, dtype=x.dtype)
        x_padded = torch.cat([zero_pad, x], dim=1)

        x_padded = x_padded.view(x.size(1) + 1, x.size(0), *x.size()[2:])
@ -661,7 +661,7 @@ class MemTransformerLM(nn.Module):

        hids = []
        if self.attn_type == 0: # default
-            pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, 
+            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.DEVICE,
                                   dtype=word_emb.dtype)
            if self.clamp_len > 0:
                pos_seq.clamp_(max=self.clamp_len)
@ -691,7 +691,7 @@ class MemTransformerLM(nn.Module):
                        r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
                hids.append(core_out)
        elif self.attn_type == 2: # absolute
-            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
+            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.DEVICE,
                                   dtype=word_emb.dtype)
            if self.clamp_len > 0:
                pos_seq.clamp_(max=self.clamp_len)
--- a/transformer-xl/pytorch/train.py
+++ b/transformer-xl/pytorch/train.py
@ -160,7 +160,7 @@ np.random.seed(args.seed)
 torch.manual_seed(args.seed)
 if torch.cuda.is_available():
    if not args.cuda:
-        print('WARNING: You have a CUDA device, so you should probably run with --cuda')
+        print('WARNING: You have a CUDA DEVICE, so you should probably run with --cuda')
    else:
        torch.cuda.manual_seed_all(args.seed)

--- a/transformer-xl/pytorch/utils/adaptive_softmax.py
+++ b/transformer-xl/pytorch/utils/adaptive_softmax.py
@ -50,7 +50,7 @@ class AdaptiveLogSoftmax(nn.Module):
        head_logprob = F.log_softmax(head_logit, dim=1)

        nll = torch.zeros_like(target,
-                dtype=hidden.dtype, device=hidden.device)
+                               dtype=hidden.dtype, device=hidden.DEVICE)

        offset = 0
        cutoff_values = [0] + self.cutoffs
--- a/transformer-xl/pytorch/utils/log_uniform_sampler.py
+++ b/transformer-xl/pytorch/utils/log_uniform_sampler.py
@ -38,7 +38,7 @@ class LogUniformSampler(object):

        with torch.no_grad():
            neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()
-            device = labels.device
+            device = labels.DEVICE
            neg_samples = neg_samples.to(device)
            true_log_probs = self.log_q[labels].to(device)
            samp_log_probs = self.log_q[neg_samples].to(device)
--- a/transformer-xl/pytorch/utils/proj_adaptive_softmax.py
+++ b/transformer-xl/pytorch/utils/proj_adaptive_softmax.py
@ -112,7 +112,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
            head_logprob = F.log_softmax(head_logit, dim=1)

            nll = torch.zeros_like(target,
-                    dtype=hidden.dtype, device=hidden.device)
+                                   dtype=hidden.dtype, device=hidden.DEVICE)

            offset = 0
            cutoff_values = [0] + self.cutoffs
--- a/transformer-xl/tf/gpu_utils.py
+++ b/transformer-xl/tf/gpu_utils.py
@ -1,7 +1,7 @@
 import os
 import tensorflow as tf

-def assign_to_gpu(gpu=0, ps_dev="/device:CPU:0"):
+def assign_to_gpu(gpu=0, ps_dev="/DEVICE:CPU:0"):
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op == "Variable":
--- a/transformer-xl/tf/tpu_estimator.py
+++ b/transformer-xl/tf/tpu_estimator.py
@ -724,7 +724,7 @@ def generate_per_host_enqueue_ops_fn_for_host(

  hooks = []

-  with ops.device(device):
+  with ops.DEVICE(device):
    user_context = tpu_context.TPUContext(
        internal_ctx=ctx,
        input_device=device,
@ -758,7 +758,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
    Returns:
      list of dict of ops.
    """
-    with ops.device(device):
+    with ops.DEVICE(device):
      num_of_replicas_per_host = ctx.num_of_replicas_per_host
      # Convert user input to features and labels.  If the user returns a
      # dataset, it is initialized and the features and labels extracted via
@ -799,7 +799,7 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
  captured_infeed_queue = _CapturedObject()
  hooks = []

-  with ops.device(device):
+  with ops.DEVICE(device):
    user_context = tpu_context.TPUContext(
        internal_ctx=ctx,
        input_device=device,
@ -827,7 +827,7 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
    per_host_sharded_inputs = []
    num_replicas_per_host = ctx.num_of_replicas_per_host
    cached_signals = None
-    with ops.device(device):
+    with ops.DEVICE(device):
      if not inputs.is_dataset:
        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
      for _ in range(num_replicas_per_host):
@ -888,7 +888,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
  captured_infeed_queue = _CapturedObject()
  hooks = []
  device_0 = ctx.tpu_host_placement_function(host_id=0)
-  with ops.device(device_0):
+  with ops.DEVICE(device_0):
    user_context = tpu_context.TPUContext(
        internal_ctx=ctx, input_device=device_0, invocation_index=0)
    inputs = _Inputs.from_input_fn(input_fn(user_context))
@ -924,7 +924,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
    flattened_inputs = None  # Cache result from input_fn.
    signals = None
    for host_id in xrange(num_hosts):
-      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
+      with ops.DEVICE(ctx.tpu_host_placement_function(host_id=host_id)):
        for _ in xrange(ctx.num_of_replicas_per_host):
          # Note: input_fn is only called once at host 0 for the first replica.
          # The features and labels returned from that invocation are
@ -1147,7 +1147,7 @@ class _InputPipeline(object):

    def dequeue_fn():
      """dequeue_fn is used by TPU to retrieve the tensors."""
-      # In the model-parallel case, both the host-side and device-side
+      # In the model-parallel case, both the host-side and DEVICE-side
      # computations must agree on the core on which infeed takes place. We
      # choose to perform infeed on logical core 0 of each replica.
      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
@ -1173,7 +1173,7 @@ class _InputPipeline(object):
      # host.
      for host_id in range(num_hosts):
        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
+        with ops.DEVICE(host_device):
          with ops.name_scope('input_pipeline_task%d' % (host_id)):
            enqueue_ops_fn, captured_infeed_queue = (
                generate_per_core_enqueue_ops_fn_for_host(
@ -1211,7 +1211,7 @@ class _InputPipeline(object):
    else:
      for host_id in range(num_hosts):
        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
+        with ops.DEVICE(host_device):
          with ops.name_scope('input_pipeline_task%d' % (host_id)):
            if self._ctx.is_input_per_host_with_iterators():
              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
@ -1712,7 +1712,7 @@ class _OutfeedHostCall(object):
    for name in self._names:
      tensors.extend(self._tensors[name])

-    with ops.device(tpu.core(0)):
+    with ops.DEVICE(tpu.core(0)):
      return [tpu_ops.outfeed_enqueue_tuple(tensors)]

  def create_tpu_hostcall(self):
@ -1751,7 +1751,7 @@ class _OutfeedHostCall(object):
    # per replica.
    for i in xrange(self._ctx.num_replicas):
      host_device, ordinal_id = self._ctx.device_for_replica(i)
-      with ops.device(host_device):
+      with ops.DEVICE(host_device):
        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
            dtypes=tensor_dtypes,
            shapes=tensor_shapes,
@ -1770,7 +1770,7 @@ class _OutfeedHostCall(object):
    # place all ops on tpu host if possible.
    #
    # TODO(jhseu): Evaluate whether this is right for summaries.
-    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
+    with ops.DEVICE(self._ctx.tpu_host_placement_function(replica_id=0)):
      for name in self._names:
        dequeue_ops = dequeue_ops_by_name[name]
        for i, item in enumerate(dequeue_ops):
@ -2426,7 +2426,7 @@ class TPUEstimator(estimator_lib.Estimator):
      # For export_savedmodel, input_fn is never passed to Estimator. So,
      # `is_export_mode` must be False.
      if ctx.is_running_on_cpu(is_export_mode=False):
-        with ops.device('/device:CPU:0'):
+        with ops.DEVICE('/DEVICE:CPU:0'):
          return input_fn(**kwargs)

      # For TPU computation, input_fn should be invoked in a tf.while_loop for
@ -2971,7 +2971,7 @@ def _wrap_computation_in_while_loop(device, op_fn):
  iterations_per_loop_var = _create_or_get_iterations_per_loop()
  # By setting parallel_iterations=1, the parallel execution in while_loop is
  # basically turned off.
-  with ops.device(device):
+  with ops.DEVICE(device):
    iterations = array_ops.identity(iterations_per_loop_var)
    return control_flow_ops.while_loop(
        lambda i: i < iterations,
@ -2995,7 +2995,7 @@ def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):

  # By setting parallel_iterations=1, the parallel execution in while_loop is
  # basically turned off.
-  with ops.device(device):
+  with ops.DEVICE(device):
    return control_flow_ops.while_loop(
        cond,
        computation, [_StopSignals.NON_STOPPING_SIGNAL],
@ -3006,7 +3006,7 @@ def _validate_tpu_training_graph():
  """Validate graph before running distributed training.

  Raises:
-    ValueError: If the graph seems invalid for running on device
+    ValueError: If the graph seems invalid for running on DEVICE
  """
  operations = ops.get_default_graph().get_operations()

--- a/transformer-xl/tf/train_gpu.py
+++ b/transformer-xl/tf/train_gpu.py
@ -249,7 +249,7 @@ def train(n_token, cutoffs, ps_device):

  for i in range(FLAGS.num_core_per_host):
    reuse = True if i > 0 else None
-    with tf.device(assign_to_gpu(i, ps_device)), \
+    with tf.DEVICE(assign_to_gpu(i, ps_device)), \
        tf.variable_scope(tf.get_variable_scope(), reuse=reuse):

      mems_i = [tf.placeholder(tf.float32,
@ -384,7 +384,7 @@ def evaluate(n_token, cutoffs, ps_device):
  tower_mems, tower_losses, tower_new_mems = [], [], []

  for i in range(FLAGS.num_core_per_host):
-    with tf.device(assign_to_gpu(i, ps_device)), \
+    with tf.DEVICE(assign_to_gpu(i, ps_device)), \
        tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):

      mems_i = [tf.placeholder(tf.float32,
--- a/uv.lock
+++ b/uv.lock