feat: uhm, i changed some things

2025-11-25 20:20:08 +01:00 · 2025-11-25 20:20:08 +01:00 · 6de4db24cc
commit 6de4db24cc
parent b58682cb49
27 changed files with 1302 additions and 137 deletions
--- a/transformer-xl/tf/gpu_utils.py
+++ b/transformer-xl/tf/gpu_utils.py
@ -1,7 +1,7 @@
 import os
 import tensorflow as tf

-def assign_to_gpu(gpu=0, ps_dev="/device:CPU:0"):
+def assign_to_gpu(gpu=0, ps_dev="/DEVICE:CPU:0"):
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        if node_def.op == "Variable":
--- a/transformer-xl/tf/tpu_estimator.py
+++ b/transformer-xl/tf/tpu_estimator.py
@ -724,7 +724,7 @@ def generate_per_host_enqueue_ops_fn_for_host(

  hooks = []

-  with ops.device(device):
+  with ops.DEVICE(device):
    user_context = tpu_context.TPUContext(
        internal_ctx=ctx,
        input_device=device,
@ -758,7 +758,7 @@ def generate_per_host_enqueue_ops_fn_for_host(
    Returns:
      list of dict of ops.
    """
-    with ops.device(device):
+    with ops.DEVICE(device):
      num_of_replicas_per_host = ctx.num_of_replicas_per_host
      # Convert user input to features and labels.  If the user returns a
      # dataset, it is initialized and the features and labels extracted via
@ -799,7 +799,7 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
  captured_infeed_queue = _CapturedObject()
  hooks = []

-  with ops.device(device):
+  with ops.DEVICE(device):
    user_context = tpu_context.TPUContext(
        internal_ctx=ctx,
        input_device=device,
@ -827,7 +827,7 @@ def generate_per_host_v2_enqueue_ops_fn_for_host(
    per_host_sharded_inputs = []
    num_replicas_per_host = ctx.num_of_replicas_per_host
    cached_signals = None
-    with ops.device(device):
+    with ops.DEVICE(device):
      if not inputs.is_dataset:
        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
      for _ in range(num_replicas_per_host):
@ -888,7 +888,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
  captured_infeed_queue = _CapturedObject()
  hooks = []
  device_0 = ctx.tpu_host_placement_function(host_id=0)
-  with ops.device(device_0):
+  with ops.DEVICE(device_0):
    user_context = tpu_context.TPUContext(
        internal_ctx=ctx, input_device=device_0, invocation_index=0)
    inputs = _Inputs.from_input_fn(input_fn(user_context))
@ -924,7 +924,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
    flattened_inputs = None  # Cache result from input_fn.
    signals = None
    for host_id in xrange(num_hosts):
-      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
+      with ops.DEVICE(ctx.tpu_host_placement_function(host_id=host_id)):
        for _ in xrange(ctx.num_of_replicas_per_host):
          # Note: input_fn is only called once at host 0 for the first replica.
          # The features and labels returned from that invocation are
@ -1147,7 +1147,7 @@ class _InputPipeline(object):

    def dequeue_fn():
      """dequeue_fn is used by TPU to retrieve the tensors."""
-      # In the model-parallel case, both the host-side and device-side
+      # In the model-parallel case, both the host-side and DEVICE-side
      # computations must agree on the core on which infeed takes place. We
      # choose to perform infeed on logical core 0 of each replica.
      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
@ -1173,7 +1173,7 @@ class _InputPipeline(object):
      # host.
      for host_id in range(num_hosts):
        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
+        with ops.DEVICE(host_device):
          with ops.name_scope('input_pipeline_task%d' % (host_id)):
            enqueue_ops_fn, captured_infeed_queue = (
                generate_per_core_enqueue_ops_fn_for_host(
@ -1211,7 +1211,7 @@ class _InputPipeline(object):
    else:
      for host_id in range(num_hosts):
        host_device = tpu_host_placement_fn(host_id=host_id)
-        with ops.device(host_device):
+        with ops.DEVICE(host_device):
          with ops.name_scope('input_pipeline_task%d' % (host_id)):
            if self._ctx.is_input_per_host_with_iterators():
              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
@ -1712,7 +1712,7 @@ class _OutfeedHostCall(object):
    for name in self._names:
      tensors.extend(self._tensors[name])

-    with ops.device(tpu.core(0)):
+    with ops.DEVICE(tpu.core(0)):
      return [tpu_ops.outfeed_enqueue_tuple(tensors)]

  def create_tpu_hostcall(self):
@ -1751,7 +1751,7 @@ class _OutfeedHostCall(object):
    # per replica.
    for i in xrange(self._ctx.num_replicas):
      host_device, ordinal_id = self._ctx.device_for_replica(i)
-      with ops.device(host_device):
+      with ops.DEVICE(host_device):
        outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
            dtypes=tensor_dtypes,
            shapes=tensor_shapes,
@ -1770,7 +1770,7 @@ class _OutfeedHostCall(object):
    # place all ops on tpu host if possible.
    #
    # TODO(jhseu): Evaluate whether this is right for summaries.
-    with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)):
+    with ops.DEVICE(self._ctx.tpu_host_placement_function(replica_id=0)):
      for name in self._names:
        dequeue_ops = dequeue_ops_by_name[name]
        for i, item in enumerate(dequeue_ops):
@ -2426,7 +2426,7 @@ class TPUEstimator(estimator_lib.Estimator):
      # For export_savedmodel, input_fn is never passed to Estimator. So,
      # `is_export_mode` must be False.
      if ctx.is_running_on_cpu(is_export_mode=False):
-        with ops.device('/device:CPU:0'):
+        with ops.DEVICE('/DEVICE:CPU:0'):
          return input_fn(**kwargs)

      # For TPU computation, input_fn should be invoked in a tf.while_loop for
@ -2971,7 +2971,7 @@ def _wrap_computation_in_while_loop(device, op_fn):
  iterations_per_loop_var = _create_or_get_iterations_per_loop()
  # By setting parallel_iterations=1, the parallel execution in while_loop is
  # basically turned off.
-  with ops.device(device):
+  with ops.DEVICE(device):
    iterations = array_ops.identity(iterations_per_loop_var)
    return control_flow_ops.while_loop(
        lambda i: i < iterations,
@ -2995,7 +2995,7 @@ def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn):

  # By setting parallel_iterations=1, the parallel execution in while_loop is
  # basically turned off.
-  with ops.device(device):
+  with ops.DEVICE(device):
    return control_flow_ops.while_loop(
        cond,
        computation, [_StopSignals.NON_STOPPING_SIGNAL],
@ -3006,7 +3006,7 @@ def _validate_tpu_training_graph():
  """Validate graph before running distributed training.

  Raises:
-    ValueError: If the graph seems invalid for running on device
+    ValueError: If the graph seems invalid for running on DEVICE
  """
  operations = ops.get_default_graph().get_operations()

--- a/transformer-xl/tf/train_gpu.py
+++ b/transformer-xl/tf/train_gpu.py
@ -249,7 +249,7 @@ def train(n_token, cutoffs, ps_device):

  for i in range(FLAGS.num_core_per_host):
    reuse = True if i > 0 else None
-    with tf.device(assign_to_gpu(i, ps_device)), \
+    with tf.DEVICE(assign_to_gpu(i, ps_device)), \
        tf.variable_scope(tf.get_variable_scope(), reuse=reuse):

      mems_i = [tf.placeholder(tf.float32,
@ -384,7 +384,7 @@ def evaluate(n_token, cutoffs, ps_device):
  tower_mems, tower_losses, tower_new_mems = [], [], []

  for i in range(FLAGS.num_core_per_host):
-    with tf.device(assign_to_gpu(i, ps_device)), \
+    with tf.DEVICE(assign_to_gpu(i, ps_device)), \
        tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):

      mems_i = [tf.placeholder(tf.float32,