Merge pull request #13 from sugarme/pytorch-1.7

Update to PyTorch 1.7
Sugarme 2020-11-03 00:29:26 +11:00 committed by GitHub
commit bea0e28542
28 changed files with 136212 additions and 15918 deletions


@ -41,3 +41,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Added drawing image label at `example/yolo` example
- Added some example images and README files for `example/yolo` and `example/neural-style-transfer`
## [0.3.0]
### Changed
- Updated to PyTorch C++ APIs v1.7.0
- Switched back to `lib.AtoAddParametersOld` because `ato_add_parameters` has not been implemented correctly; using the updated API causes the optimizer to stop working.
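For context, here is a minimal sketch of the backward-compatible call path this entry refers to, based on the `libtch` wrapper signatures changed later in this PR; the optimizer and parameter slice are placeholders and the surrounding setup is assumed:

```go
package train

import (
	lib "github.com/sugarme/gotch/libtch"
)

// addParamsOld registers all trainable tensors with the optimizer's first
// parameter group via the pre-1.7-style API this release switches back to.
// opt and params are assumed to come from the usual variable-store setup.
func addParamsOld(opt lib.Coptimizer, params []lib.Ctensor) {
	lib.AtoAddParametersOld(opt, params, len(params))
}
```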


@ -5,32 +5,32 @@
- **GoTch** is a Go binding for the C++ Libtorch library, for developing and implementing deep learning projects in Go.
- This package provides a thin wrapper over Libtorch to make use of its tensor APIs and CUDA support while staying as close to idiomatic Go as possible.
- There are about **1129** auto-generated tensor APIs.
- There are about **1404** auto-generated tensor APIs.
## Dependencies
- **Libtorch** C++ v1.5.0 library of [Pytorch](https://pytorch.org/)
- **Libtorch** C++ v1.7.0 library of [Pytorch](https://pytorch.org/)
## Installation
- **CPU**
Default values: `LIBTORCH_VER=1.5.1` and `GOTCH_VER=v0.1.7`
Default values: `LIBTORCH_VER=1.7.0` and `GOTCH_VER=v0.3.0`
```bash
go get -u github.com/sugarme/gotch@v0.1.7
bash ${GOPATH}/pkg/mod/github.com/sugarme/gotch@v0.1.7/setup-cpu.sh
go get -u github.com/sugarme/gotch@v0.3.0
bash ${GOPATH}/pkg/mod/github.com/sugarme/gotch@v0.3.0/setup-cpu.sh
```
- **GPU**
Default values: `LIBTORCH_VER=1.5.1`, `CUDA_VER=10.1` and `GOTCH_VER=v0.1.7`
Default values: `LIBTORCH_VER=1.7.0`, `CUDA_VER=10.1` and `GOTCH_VER=v0.3.0`
```bash
go get -u github.com/sugarme/gotch@v0.1.7
bash ${GOPATH}/pkg/mod/github.com/sugarme/gotch@v0.1.7/setup-gpu.sh
go get -u github.com/sugarme/gotch@v0.3.0
bash ${GOPATH}/pkg/mod/github.com/sugarme/gotch@v0.3.0/setup-gpu.sh
```
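After either setup script completes, a quick smoke test confirms the binding builds and links against Libtorch. This is only a sketch: it assumes the `tensor` package path and a `MustOfSlice` constructor, so adjust the names to the actual v0.3.0 API if they differ (`Float64Values` and `MustDrop` appear in the examples changed in this PR).

```go
package main

import (
	"fmt"

	ts "github.com/sugarme/gotch/tensor"
)

func main() {
	// Build a small CPU tensor and read its values back.
	// MustOfSlice is assumed here; check the tensor package for the exact constructor.
	x := ts.MustOfSlice([]float64{1, 2, 3})
	fmt.Println(x.Float64Values()) // [1 2 3]

	// Tensors wrap C memory, so drop them explicitly when done.
	x.MustDrop()
}
```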


@ -116,9 +116,9 @@ func main() {
sumLoss += loss.Float64Values()[0]
cntLoss += 1.0
batchTs.MustDrop()
batchNarrow.MustDrop()
xsOnehotTmp.MustDrop()
// batchTs.MustDrop()
// batchNarrow.MustDrop()
// xsOnehotTmp.MustDrop()
xsOnehot.MustDrop()
ys.MustDrop()
lstmOut.MustDrop()


@ -117,21 +117,21 @@ func runCNN1() {
logits := net.ForwardT(bImages, true)
loss := logits.CrossEntropyForLogits(bLabels)
// loss = loss.MustSetRequiresGrad(true)
// loss = loss.MustSetRequiresGrad(true, false)
opt.BackwardStep(loss)
epocLoss = loss.MustShallowClone()
epocLoss.Detach_()
// fmt.Printf("completed \t %v batches\t %.2f\n", i, loss.Values()[0])
// fmt.Printf("completed \t %v batches\t %.2f\n", i, loss.Float64Values()[0])
bImages.MustDrop()
bLabels.MustDrop()
}
vs.Freeze()
// vs.Freeze()
testAccuracy := nn.BatchAccuracyForLogits(vs, net, testImages, testLabels, vs.Device(), 1024)
vs.Unfreeze()
// vs.Unfreeze()
fmt.Printf("Epoch: %v\t Loss: %.2f \t Test accuracy: %.2f%%\n", epoch, epocLoss.Float64Values()[0], testAccuracy*100.0)
if testAccuracy > bestAccuracy {
bestAccuracy = testAccuracy


@ -45,7 +45,7 @@ func runLinear() {
})
testLogits := ds.TestImages.MustMm(ws, false).MustAdd(bs, true)
testAccuracy := testLogits.MustArgmax(-1, false, true).MustEq1(ds.TestLabels, true).MustTotype(gotch.Float, true).MustMean(gotch.Float, true).MustView([]int64{-1}, true).MustFloat64Value([]int64{0})
testAccuracy := testLogits.MustArgmax([]int64{-1}, false, true).MustEq1(ds.TestLabels, true).MustTotype(gotch.Float, true).MustMean(gotch.Float, true).MustView([]int64{-1}, true).MustFloat64Value([]int64{0})
fmt.Printf("Epoch: %v - Loss: %.3f - Test accuracy: %.2f%%\n", epoch, loss.Float64Values()[0], testAccuracy*100)

Binary file not shown.

Binary file not shown.


@ -271,7 +271,7 @@ func upsample(prevChannels int64) (retVal1 int64, retVal2 interface{}) {
h := res[2]
w := res[3]
return xs.MustUpsampleNearest2d([]int64{h * 2, w * 2}, 2.0, 2.0, false)
return xs.MustUpsampleNearest2d([]int64{h * 2, w * 2}, []float64{2.0}, []float64{2.0}, false)
})
return prevChannels, Layer{Val: layer}


@ -28,7 +28,8 @@ let excluded_functions =
; "_amp_non_finite_check_and_unscale_"
; "_cummin_helper"
; "_cummax_helper"
; "retain_grad" ]
; "retain_grad"
; "_validate_sparse_coo_tensor_args" ]
let no_tensor_options =
Set.of_list
@ -47,7 +48,7 @@ let no_tensor_options =
* (module String)
* ["add"; "add_"; "div"; "div_"; "mul"; "mul_"; "sub"; "sub_"; "nll_loss"]
* *)
let excluded_prefixes = ["_thnn_"; "_th_"; "thnn_"; "th_"]
let excluded_prefixes = ["_thnn_"; "_th_"; "thnn_"; "th_"; "_foreach"]
let excluded_suffixes = ["_forward"; "_forward_out"]
@ -79,7 +80,9 @@ module Func = struct
type arg_type =
| Bool
| Int64
| Int64Option
| Double
| DoubleOption
| Tensor
| TensorOption
| IntList
@ -104,8 +107,8 @@ module Func = struct
let arg_type_of_string str ~is_nullable =
match String.lowercase str with
| "bool" -> Some Bool
| "int64_t" -> Some Int64
| "double" -> Some Double
| "int64_t" -> Some (if is_nullable then Int64Option else Int64)
| "double" -> Some (if is_nullable then DoubleOption else Double)
| "booltensor" | "indextensor" | "tensor" ->
Some (if is_nullable then TensorOption else Tensor)
| "tensoroptions" -> Some TensorOptions
@ -127,6 +130,10 @@ module Func = struct
| TensorOptions ->
Printf.sprintf "int %s_kind, int %s_device" arg_name arg_name
| String -> Printf.sprintf "char* %s_ptr, int %s_len" arg_name arg_name
| Int64Option ->
Printf.sprintf "int64_t %s_v, uint8_t %s_null" arg_name arg_name
| DoubleOption ->
Printf.sprintf "double %s_v, uint8_t %s_null" arg_name arg_name
| otherwise ->
let simple_type_cstring =
match otherwise with
@ -138,7 +145,9 @@ module Func = struct
| ScalarType -> "int"
| Device -> "int"
| Scalar -> "scalar"
| String | IntList | TensorList | TensorOptions -> assert false
| Int64Option | DoubleOption | String | IntList | TensorList
|TensorOptions ->
assert false
in
Printf.sprintf "%s %s" simple_type_cstring arg_name )
|> String.concat ~sep:", "
@ -162,6 +171,14 @@ module Func = struct
Printf.sprintf
"at::device(device_of_int(%s_device)).dtype(at::ScalarType(%s_kind))"
arg_name arg_name
| Int64Option ->
Printf.sprintf
"%s_null ? c10::nullopt : c10::optional<int64_t>(%s_v)" arg_name
arg_name
| DoubleOption ->
Printf.sprintf
"%s_null ? c10::nullopt : c10::optional<double>(%s_v)" arg_name
arg_name
| ScalarType -> Printf.sprintf "at::ScalarType(%s)" arg_name
| Device -> Printf.sprintf "device_of_int(%s)" arg_name
| _ -> arg_name )
@ -229,6 +246,8 @@ module Func = struct
| String -> single_param "string"
| IntList -> Printf.sprintf "%sData []int64, %sLen int" an an
| TensorList -> Printf.sprintf "%sData []Ctensor, %sLen int" an an
| Int64Option -> Printf.sprintf "%sVal int64, %sNull int" an an
| DoubleOption -> Printf.sprintf "%sVal float64, %sNull int" an an
| TensorOptions -> Printf.sprintf "%sKind int32, %sDevice int32" an an
)
|> String.concat ~sep:", "
@ -250,6 +269,8 @@ module Func = struct
| String -> Printf.sprintf "c%s, c%sLen" an an
| IntList -> Printf.sprintf "c%sDataPtr, c%sLen" an an
| TensorList -> Printf.sprintf "c%sDataPtr, c%sLen" an an
| Int64Option -> Printf.sprintf "c%sVal, c%sNull" an an
| DoubleOption -> Printf.sprintf "c%sVal, c%sNull" an an
| TensorOptions -> Printf.sprintf "c%sKind, c%sDevice" an an )
|> String.concat ~sep:", "
@ -291,6 +312,18 @@ module Func = struct
c%sDataPtr := (*Ctensor)(unsafe.Pointer(&%sData[0]))\n\
c%sLen := *(*C.int)(unsafe.Pointer(&%sLen))"
an an an an
| Int64Option ->
Printf.sprintf
"\n\
c%sVal := *(*C.int64_t)(unsafe.Pointer(&%sVal))\n\
c%sNull := *(*C.uint8_t)(unsafe.Pointer(&%sNull))"
an an an an
| DoubleOption ->
Printf.sprintf
"\n\
c%sVal := *(*C.double)(unsafe.Pointer(&%sVal))\n\
c%sNull := *(*C.uint8_t)(unsafe.Pointer(&%sNull))"
an an an an
| TensorOptions ->
Printf.sprintf
"\n\
@ -356,6 +389,8 @@ module Func = struct
| TensorOptions -> "gotch.KindDevice"
| Scalar -> "*Scalar"
| ScalarType -> "gotch.DType"
| Int64Option -> "[]int64"
| DoubleOption -> "[]float64"
| Device -> "gotch.Device"
in
match arg.arg_type with
@ -436,6 +471,8 @@ module Func = struct
| String -> Printf.sprintf "%s" name
| IntList -> Printf.sprintf "%s, len(%s)" name name
| TensorList -> Printf.sprintf "c%s, len(c%s)" name name
| Int64Option -> Printf.sprintf "c%sVal, c%sNull" name name
| DoubleOption -> Printf.sprintf "c%sVal, c%sNull" name name
| TensorOption -> Printf.sprintf "%s.ctensor" name
| _ -> name )
|> String.concat ~sep:", "
@ -456,6 +493,24 @@ module Func = struct
| Device -> ""
| String -> ""
| IntList -> ""
| Int64Option ->
Printf.sprintf
"var c%sVal int64 = 0\n\
\ var c%sNull int = 1\n\
\ if len(%s) > 0 {\n\
\ c%sVal = %s[0]\n\
\ c%sNull = 0\n\
\ }\n"
an an an an an an
| DoubleOption ->
Printf.sprintf
"var c%sVal float64 = 0.0\n\
\ var c%sNull int = 1\n\
\ if len(%s) > 0 {\n\
\ c%sVal = %s[0]\n\
\ c%sNull = 0\n\
\ }\n"
an an an an an an
| TensorList ->
Printf.sprintf
" var c%s []lib.Ctensor\n\
@ -687,7 +742,16 @@ let write_wrapper funcs filename =
; "Split"
; "SplitWithSizes"
; "Unbind"
; "Where" ]
; "Where"
; "Atleast1d1"
; "Atleast2d1"
; "Atleast3d1"
; "Dequantize1"
; "QuantizePerTensor1"
; "UnsafeChunk"
; "UnsafeSplit"
; "UnsafeSplitWithSizes"
; "AlignTensors" ]
in
if
List.exists excluded_funcs ~f:(fun name ->
@ -793,7 +857,16 @@ let write_must_wrapper funcs filename =
; "Split"
; "SplitWithSizes"
; "Unbind"
; "Where" ]
; "Where"
; "Atleast1d1"
; "Atleast2d1"
; "Atleast3d1"
; "Dequantize1"
; "QuantizePerTensor1"
; "UnsafeChunk"
; "UnsafeSplit"
; "UnsafeSplitWithSizes"
; "AlignTensors" ]
in
if
List.exists excluded_funcs ~f:(fun name ->
@ -943,7 +1016,7 @@ let run ~yaml_filename ~cpp_filename ~ffi_filename ~must_wrapper_filename
write_wrapper funcs wrapper_filename
let () =
run ~yaml_filename:"gen/pytorch/Declarations-v1.5.0.yaml"
run ~yaml_filename:"gen/pytorch/Declarations-v1.7.0.yaml"
~cpp_filename:"libtch/torch_api_generated"
~ffi_filename:"libtch/c-generated.go"
~must_wrapper_filename:"tensor/must-tensor-generated.go"
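In short, the `Int64Option`/`DoubleOption` plumbing above works in three layers: the Go-facing type is a slice (`[]int64` / `[]float64`), the FFI layer splits it into a value plus a null flag, and the C++ shim converts that pair into a `c10::optional`. A small, runnable sketch of just the Go-side convention (the names are illustrative, not the generated code):

```go
package main

import "fmt"

// toOptionalInt64 mirrors the template the generator emits for a nullable
// int64 argument: an empty (or nil) slice means "not set" (null flag = 1),
// otherwise only the first element is used as the value.
func toOptionalInt64(v []int64) (val int64, null int) {
	val, null = 0, 1
	if len(v) > 0 {
		val = v[0]
		null = 0
	}
	return
}

func main() {
	fmt.Println(toOptionalInt64(nil))         // 0 1  -> c10::nullopt on the C++ side
	fmt.Println(toOptionalInt64([]int64{-1})) // -1 0 -> c10::optional<int64_t>(-1)
}
```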

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -401,8 +401,9 @@ func AtoSgd(learningRate, momentum, dampening, weightDecay float64, nesterov int
return C.ato_sgd(clearningRate, cmomentum, cdampening, cweightDecay, cnesterov)
}
// NOTE. Backward compat for param group not updated (#261)
// void ato_add_parameters(optimizer, tensor *, int ntensors);
func AtoAddParameters(coptimizer Coptimizer, tensors []Ctensor, ntensors int) {
func AtoAddParametersOld(coptimizer Coptimizer, tensors []Ctensor, ntensors int) {
var ctensors []C.tensor
for i := 0; i < len(tensors); i++ {
@ -412,7 +413,23 @@ func AtoAddParameters(coptimizer Coptimizer, tensors []Ctensor, ntensors int) {
cntensors := *(*C.int)(unsafe.Pointer(&ntensors))
// Just give pointer to the first element of ctensors slice
C.ato_add_parameters(coptimizer, &ctensors[0], cntensors)
C.ato_add_parameters_old(coptimizer, &ctensors[0], cntensors)
}
// NOTE. This function is not working correctly and needs to be updated.
// DO NOT USE!
// TODO: update
func AtoAddParameters(coptimizer Coptimizer, tensors []Ctensor, ntensors int) {
var ctensors []C.tensor
for i := 0; i < len(tensors); i++ {
ctensors = append(ctensors, (C.tensor)(tensors[i]))
}
cntensors := *(*C.size_t)(unsafe.Pointer(&ntensors))
// Just give pointer to the first element of ctensors slice
C.ato_add_parameters(coptimizer, ctensors[0], cntensors)
}
// void ato_set_learning_rate(optimizer, double learning_rate);
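As the comments above warn, the new `AtoAddParameters` wrapper does not yet match the updated C signature `void ato_add_parameters(optimizer, tensor, size_t group)` in `torch_api.h`: it passes only the first tensor and reuses the tensor count as the group index. A hypothetical corrected wrapper, assuming it lives in the same cgo package as the code above, might look like this (the name is illustrative, not part of the repository):

```go
// AtoAddParameter adds a single tensor to parameter group `group`,
// following the new one-tensor-per-call C API. Sketch only.
func AtoAddParameter(coptimizer Coptimizer, tensor Ctensor, group uint) {
	ctensor := (C.tensor)(tensor)
	cgroup := *(*C.size_t)(unsafe.Pointer(&group))
	C.ato_add_parameters(coptimizer, ctensor, cgroup)
}
```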


@ -1,5 +1,7 @@
#include<torch/csrc/autograd/engine.h>
#include<torch/csrc/jit/runtime/graph_executor.h>
#include<torch/torch.h>
#include<ATen/autocast_mode.h>
#include<torch/script.h>
#include<stdexcept>
#include<vector>
@ -43,6 +45,15 @@ tensor at_new_tensor() {
return nullptr;
}
tensor at_tensor_of_blob(void *data, int64_t *dims, size_t ndims, int64_t *strides, size_t nstrides, int type, int device) {
PROTECT(
at::TensorOptions blobOptions = at::TensorOptions().device(device_of_int(device)).dtype(torch::ScalarType(type));
return new torch::Tensor(torch::from_blob(data, torch::IntArrayRef(dims, ndims), torch::IntArrayRef(strides, nstrides), blobOptions));
)
return nullptr;
}
tensor at_tensor_of_data(void *vs, int64_t *dims, size_t ndims, size_t element_size_in_bytes, int type) {
PROTECT(
torch::Tensor tensor = torch::zeros(torch::IntArrayRef(dims, ndims), torch::ScalarType(type));
@ -90,6 +101,11 @@ int at_defined(tensor t) {
return -1;
}
int at_is_mkldnn(tensor t) {
PROTECT(return t->is_mkldnn();)
return -1;
}
int at_is_sparse(tensor t) {
PROTECT(return t->is_sparse();)
return -1;
@ -107,6 +123,13 @@ void at_shape(tensor t, int64_t *dims) {
)
}
void at_stride(tensor t, int64_t *dims) {
PROTECT(
int i = 0;
for (int64_t dim: t->strides()) dims[i++] = dim;
)
}
int at_scalar_type(tensor t) {
PROTECT(
return static_cast<int>(t->scalar_type());
@ -114,6 +137,46 @@ int at_scalar_type(tensor t) {
return -1;
}
void at__amp_non_finite_check_and_unscale(tensor t, tensor found_inf, tensor inf_scale) {
PROTECT(
at::_amp_non_finite_check_and_unscale_(*t, *found_inf, *inf_scale);
)
}
void at_autocast_clear_cache() {
at::autocast::clear_cache();
}
int at_autocast_decrement_nesting() {
PROTECT(
return at::autocast::decrement_nesting();
)
return -1;
}
int at_autocast_increment_nesting() {
PROTECT(
return at::autocast::increment_nesting();
)
return -1;
}
bool at_autocast_is_enabled() {
PROTECT(
return at::autocast::is_enabled();
)
return -1;
}
bool at_autocast_set_enabled(bool b) {
PROTECT(
bool is_enabled = at::autocast::is_enabled();
at::autocast::set_enabled(b);
return is_enabled;
)
return -1;
}
int at_device(tensor t) {
PROTECT(
auto device = t->device();
@ -417,6 +480,20 @@ optimizer ato_adam(double learning_rate,
return nullptr;
}
optimizer ato_adamw(double learning_rate,
double beta1,
double beta2,
double weight_decay) {
PROTECT(
auto options =
torch::optim::AdamWOptions(learning_rate)
.betas(std::tuple<double, double>(beta1, beta2))
.weight_decay(weight_decay);
return new torch::optim::AdamW(vector<torch::Tensor>(), options);
)
return nullptr;
}
optimizer ato_rms_prop(double learning_rate,
double alpha,
double eps,
@ -453,24 +530,63 @@ optimizer ato_sgd(double learning_rate,
return nullptr;
}
void ato_add_parameters(optimizer t, tensor *tensors, int ntensors) {
// NOTE. backward compat as param group (#261) not updated yet.
void ato_add_parameters_old(optimizer t, tensor *tensors, int ntensors) {
PROTECT(
for (int i = 0; i < ntensors; ++i)
t->param_groups()[0].params().push_back(*(tensors[i]));
)
}
void ato_add_parameters(optimizer t, tensor tensor, size_t group) {
PROTECT(
auto &groups = t->param_groups();
while (groups.size() <= group) {
groups.push_back(torch::optim::OptimizerParamGroup({}, t->defaults().clone()));
}
groups[group].params().push_back(*tensor);
)
}
template <class T>
void set_lr(optimizer t, double learning_rate) {
torch::optim::OptimizerOptions* d = &(t->defaults());
if (auto p = dynamic_cast<T*>(d)) {
p->lr(learning_rate);
for (auto &param_group: t->param_groups()) {
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto p2 = dynamic_cast<T*>(d)) {
p2->lr(learning_rate);
}
else throw std::invalid_argument("unexpected param group type");
}
}
}
void ato_set_learning_rate(optimizer t, double learning_rate) {
PROTECT(
torch::optim::OptimizerOptions* d = &(t->defaults());
if (auto adam = dynamic_cast<torch::optim::AdamOptions*>(d))
adam->lr(learning_rate);
else if (auto rms = dynamic_cast<torch::optim::RMSpropOptions*>(d))
rms->lr(learning_rate);
else if (auto sgd = dynamic_cast<torch::optim::SGDOptions*>(d))
sgd->lr(learning_rate);
else
throw std::invalid_argument("unexpected optimizer");
set_lr<torch::optim::AdamOptions>(t, learning_rate);
set_lr<torch::optim::AdamWOptions>(t, learning_rate);
set_lr<torch::optim::RMSpropOptions>(t, learning_rate);
set_lr<torch::optim::SGDOptions>(t, learning_rate);
)
}
template <class T>
void set_lr_group(optimizer t, size_t group, double learning_rate) {
auto &param_group = t->param_groups().at(group);
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto p = dynamic_cast<T*>(d)) {
p->lr(learning_rate);
}
}
void ato_set_learning_rate_group(optimizer t, size_t group, double learning_rate) {
PROTECT(
set_lr_group<torch::optim::AdamOptions>(t, group, learning_rate);
set_lr_group<torch::optim::AdamWOptions>(t, group, learning_rate);
set_lr_group<torch::optim::RMSpropOptions>(t, group, learning_rate);
set_lr_group<torch::optim::SGDOptions>(t, group, learning_rate);
)
}
@ -480,16 +596,115 @@ void ato_set_momentum(optimizer t, double momentum) {
if (auto adam = dynamic_cast<torch::optim::AdamOptions*>(d)) {
auto betas = adam->betas();
adam->betas(std::tuple<double, double>(momentum, get<1>(betas)));
for (auto &param_group: t->param_groups()) {
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto adam2 = dynamic_cast<torch::optim::AdamOptions*>(d)) {
adam2->betas(std::tuple<double, double>(momentum, get<1>(betas)));
}
else throw std::invalid_argument("unexpected param group type");
}
}
else if (auto rms = dynamic_cast<torch::optim::RMSpropOptions*>(d))
rms->momentum(momentum);
else if (auto sgd = dynamic_cast<torch::optim::SGDOptions*>(d))
else if (auto adamw = dynamic_cast<torch::optim::AdamWOptions*>(d)) {
auto betas = adamw->betas();
adamw->betas(std::tuple<double, double>(momentum, get<1>(betas)));
for (auto &param_group: t->param_groups()) {
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto adamw2 = dynamic_cast<torch::optim::AdamWOptions*>(d)) {
adamw2->betas(std::tuple<double, double>(momentum, get<1>(betas)));
}
else throw std::invalid_argument("unexpected param group type");
}
}
else if (auto rms = dynamic_cast<torch::optim::RMSpropOptions*>(d)) {
for (auto &param_group: t->param_groups()) {
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto rms2 = dynamic_cast<torch::optim::RMSpropOptions*>(d)) {
rms2->momentum(momentum);
}
else throw std::invalid_argument("unexpected param group type");
}
}
else if (auto sgd = dynamic_cast<torch::optim::SGDOptions*>(d)) {
sgd->momentum(momentum);
for (auto &param_group: t->param_groups()) {
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto sgd2 = dynamic_cast<torch::optim::SGDOptions*>(d)) {
sgd2->momentum(momentum);
}
else throw std::invalid_argument("unexpected param group type");
}
}
else
throw std::invalid_argument("unexpected optimizer");
)
}
void ato_set_momentum_group(optimizer t, size_t group, double momentum) {
PROTECT(
auto &param_group = t->param_groups().at(group);
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto adam = dynamic_cast<torch::optim::AdamOptions*>(d)) {
auto betas = adam->betas();
adam->betas(std::tuple<double, double>(momentum, get<1>(betas)));
}
else if (auto adamw = dynamic_cast<torch::optim::AdamWOptions*>(d)) {
auto betas = adamw->betas();
adamw->betas(std::tuple<double, double>(momentum, get<1>(betas)));
}
else if (auto rms = dynamic_cast<torch::optim::RMSpropOptions*>(d)) {
rms->momentum(momentum);
}
if (auto sgd = dynamic_cast<torch::optim::SGDOptions*>(d)) {
sgd->momentum(momentum);
}
else
throw std::invalid_argument("unexpected optimizer");
)
}
template <class T>
void set_weight_decay(optimizer t, double weight_decay) {
torch::optim::OptimizerOptions* d = &(t->defaults());
if (auto p = dynamic_cast<T*>(d)) {
p->weight_decay(weight_decay);
for (auto &param_group: t->param_groups()) {
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto p2 = dynamic_cast<T*>(d)) {
p2->weight_decay(weight_decay);
}
else throw std::invalid_argument("unexpected param group type");
}
}
}
void ato_set_weight_decay(optimizer t, double weight_decay) {
PROTECT(
set_weight_decay<torch::optim::AdamOptions>(t, weight_decay);
set_weight_decay<torch::optim::AdamWOptions>(t, weight_decay);
set_weight_decay<torch::optim::RMSpropOptions>(t, weight_decay);
set_weight_decay<torch::optim::SGDOptions>(t, weight_decay);
)
}
template <class T>
void set_weight_decay_group(optimizer t, size_t group, double weight_decay) {
auto &param_group = t->param_groups().at(group);
torch::optim::OptimizerOptions* d = &(param_group.options());
if (auto p = dynamic_cast<T*>(d)) {
p->weight_decay(weight_decay);
}
}
void ato_set_weight_decay_group(optimizer t, size_t group, double weight_decay) {
PROTECT(
set_weight_decay_group<torch::optim::AdamOptions>(t, group, weight_decay);
set_weight_decay_group<torch::optim::AdamWOptions>(t, group, weight_decay);
set_weight_decay_group<torch::optim::RMSpropOptions>(t, group, weight_decay);
set_weight_decay_group<torch::optim::SGDOptions>(t, group, weight_decay);
)
}
void ato_zero_grad(optimizer t) {
PROTECT(t->zero_grad();)
}
@ -590,7 +805,7 @@ tensor atm_forward(module m, tensor *tensors, int ntensors) {
std::vector<torch::jit::IValue> inputs;
for (int i = 0; i < ntensors; ++i)
inputs.push_back(*(tensors[i]));
torch::jit::IValue output = m->forward(inputs);
torch::jit::IValue output = m->forward(std::move(inputs));
if (!output.isTensor())
throw std::invalid_argument("forward did not return a tensor");
return new torch::Tensor(output.toTensor());
@ -605,7 +820,31 @@ ivalue atm_forward_(module m,
std::vector<torch::jit::IValue> inputs;
for (int i = 0; i < nivalues; ++i)
inputs.push_back(*(ivalues[i]));
torch::jit::IValue output = m->forward(inputs);
torch::jit::IValue output = m->forward(std::move(inputs));
return new torch::jit::IValue(output);
)
return nullptr;
}
tensor atm_method(module m, char *method_name, tensor *tensors, int ntensors) {
PROTECT(
std::vector<torch::jit::IValue> inputs;
for (int i = 0; i < ntensors; ++i)
inputs.push_back(*(tensors[i]));
torch::jit::IValue output = m->get_method(method_name)(std::move(inputs));
if (!output.isTensor())
throw std::invalid_argument("method did not return a tensor");
return new torch::Tensor(output.toTensor());
)
return nullptr;
}
ivalue atm_method_(module m, char *method_name, ivalue *ivalues, int nivalues) {
PROTECT(
std::vector<torch::jit::IValue> inputs;
for (int i = 0; i < nivalues; ++i)
inputs.push_back(*(ivalues[i]));
torch::jit::IValue output = m->get_method(method_name)(std::move(inputs));
return new torch::jit::IValue(output);
)
return nullptr;
@ -615,12 +854,40 @@ void atm_free(module m) {
delete(m);
}
void atm_save(module m, char *filename) {
PROTECT(
m->save(filename);
)
}
void atm_to(module m, int device, int dtype, bool non_blocking) {
PROTECT(
m->to(device_of_int(device), at::ScalarType(dtype), non_blocking);
)
}
int atm_get_profiling_mode() {
PROTECT(
return torch::jit::getProfilingMode();
)
return 0;
}
void atm_set_profiling_mode(int b) {
PROTECT(
torch::jit::getProfilingMode() = (bool)b;
)
}
void atm_named_parameters(module m, void *data, void (*f)(void *, char *, tensor)) {
PROTECT(
for (const auto &p : m->named_parameters()) {
auto v = p.value;
f(data, (char*)p.name.c_str(), new torch::Tensor(v));
}
)
}
ivalue ati_tensor(tensor t) {
PROTECT(
return new torch::jit::IValue(*t);
@ -718,6 +985,15 @@ ivalue ati_bool_list(char *is, int nvalues) {
return nullptr;
}
ivalue ati_string_list(char **is, int nvalues) {
PROTECT(
c10::List<string> vec;
for (int i = 0; i < nvalues; ++i) vec.push_back(string(is[i]));
return new torch::jit::IValue(vec);
)
return nullptr;
}
ivalue ati_tensor_list(tensor *is, int nvalues) {
PROTECT(
c10::List<at::Tensor> vec;
@ -855,7 +1131,7 @@ void ati_to_int_list(ivalue i,
PROTECT(
auto vec = i->toIntList();
if (vec.size() != noutputs) {
throw std::invalid_argument("unexpected list size");
throw std::invalid_argument("unexpected list<int> size");
}
for (int i = 0; i < noutputs; ++i)
outputs[i] = vec[i];
@ -868,7 +1144,7 @@ void ati_to_double_list(ivalue i,
PROTECT(
auto vec = i->toDoubleList();
if (vec.size() != noutputs) {
throw std::invalid_argument("unexpected list size");
throw std::invalid_argument("unexpected list<double> size");
}
for (int i = 0; i < noutputs; ++i)
outputs[i] = vec[i];
@ -881,7 +1157,7 @@ void ati_to_bool_list(ivalue i,
PROTECT(
auto vec = i->toBoolList();
if (vec.size() != noutputs) {
throw std::invalid_argument("unexpected list size");
throw std::invalid_argument("unexpected list<bool> size");
}
for (int i = 0; i < noutputs; ++i)
outputs[i] = vec[i];
@ -894,7 +1170,7 @@ void ati_to_tensor_list(ivalue i,
PROTECT(
auto vec = i->toTensorList();
if (vec.size() != noutputs) {
throw std::invalid_argument("unexpected tuple size");
throw std::invalid_argument("unexpected list<tensor> size");
}
for (int i = 0; i < noutputs; ++i)
outputs[i] = new torch::Tensor(vec[i]);


@ -1,6 +1,6 @@
#ifndef __TORCH_API_H__
#define __TORCH_API_H__
#include<stdint.h>
#include <stdint.h>
#ifdef __cplusplus
thread_local char *torch_last_err = nullptr;
@ -11,11 +11,11 @@ typedef torch::Scalar *scalar;
typedef torch::optim::Optimizer *optimizer;
typedef torch::jit::script::Module *module;
typedef torch::jit::IValue *ivalue;
#define PROTECT(x) \
try { \
x \
} catch (const exception& e) { \
torch_last_err = strdup(e.what()); \
#define PROTECT(x) \
try { \
x \
} catch (const exception &e) { \
torch_last_err = strdup(e.what()); \
}
#else
typedef void *tensor;
@ -28,18 +28,33 @@ typedef void *ivalue;
char *get_and_reset_last_err(); // thread-local
void at_manual_seed(int64_t);
tensor at_new_tensor();
tensor at_tensor_of_data(void *vs, int64_t *dims, size_t ndims, size_t element_size_in_bytes, int type);
void at_copy_data(tensor tensor, void *vs, size_t numel, size_t element_size_in_bytes);
tensor at_tensor_of_blob(void *data, int64_t *dims, size_t ndims,
int64_t *strides, size_t nstrides, int type,
int device);
tensor at_tensor_of_data(void *vs, int64_t *dims, size_t ndims,
size_t element_size_in_bytes, int type);
void at_copy_data(tensor tensor, void *vs, size_t numel,
size_t element_size_in_bytes);
tensor at_shallow_clone(tensor);
void *at_data_ptr(tensor);
int at_defined(tensor);
int at_is_mkldnn(tensor);
int at_is_sparse(tensor);
int at_device(tensor);
size_t at_dim(tensor);
void at_shape(tensor, int64_t *);
void at_stride(tensor, int64_t *);
int at_scalar_type(tensor);
void at__amp_non_finite_check_and_unscale(tensor, tensor, tensor);
void at_autocast_clear_cache();
int at_autocast_decrement_nesting();
int at_autocast_increment_nesting();
bool at_autocast_is_enabled();
bool at_autocast_set_enabled(bool b);
void at_backward(tensor, int, int);
int at_requires_grad(tensor);
int at_grad_set_enabled(int);
@ -50,8 +65,10 @@ void at_fill_int64(tensor, int64_t);
double at_double_value_at_indexes(tensor, int64_t *indexes, int indexes_len);
int64_t at_int64_value_at_indexes(tensor, int64_t *indexes, int indexes_len);
void at_set_double_value_at_indexes(tensor, int *indexes, int indexes_len, double v);
void at_set_int64_value_at_indexes(tensor, int *indexes, int indexes_len, int64_t v);
void at_set_double_value_at_indexes(tensor, int *indexes, int indexes_len,
double v);
void at_set_int64_value_at_indexes(tensor, int *indexes, int indexes_len,
int64_t v);
void at_copy_(tensor dst, tensor src);
@ -63,14 +80,20 @@ tensor at_load_image(char *filename);
int at_save_image(tensor, char *filename);
tensor at_resize_image(tensor, int w, int h);
void at_save_multi(tensor *tensors, char **tensor_names, int ntensors, char *filename);
void at_save_multi(tensor *tensors, char **tensor_names, int ntensors,
char *filename);
/* [at_load_multi] takes as input an array of nullptr for [tensors]. */
void at_load_multi(tensor *tensors, char **tensor_names, int ntensors, char *filename);
void at_load_multi(tensor *tensors, char **tensor_names, int ntensors,
char *filename);
/* [at_load_multi_] takes as input an array of allocation [tensors]. */
void at_load_multi_(tensor *tensors, char **tensor_names, int ntensors, char *filename);
void at_load_multi_(tensor *tensors, char **tensor_names, int ntensors,
char *filename);
void at_load_callback(char *filename, void *data, void (*f)(void *, char *, tensor));
void at_load_callback_with_device(char *filename, void *data, void (*f)(void *, char *, tensor), int device_id);
void at_load_callback(char *filename, void *data,
void (*f)(void *, char *, tensor));
void at_load_callback_with_device(char *filename, void *data,
void (*f)(void *, char *, tensor),
int device_id);
int at_get_num_interop_threads();
@ -82,32 +105,27 @@ void at_set_num_threads(int n_threads);
void at_free(tensor);
void at_run_backward(tensor *tensors,
int ntensors,
tensor *inputs,
int ninputs,
tensor *outputs,
int keep_graph,
int create_graph);
void at_run_backward(tensor *tensors, int ntensors, tensor *inputs, int ninputs,
tensor *outputs, int keep_graph, int create_graph);
optimizer ato_adam(double learning_rate,
double beta1,
double beta2,
optimizer ato_adam(double learning_rate, double beta1, double beta2,
double weight_decay);
optimizer ato_rms_prop(double learning_rate,
double alpha,
double eps,
double weight_decay,
double momentum,
int centered);
optimizer ato_sgd(double learning_rate,
double momentum,
double dampening,
double weight_decay,
int nesterov);
void ato_add_parameters(optimizer, tensor *, int ntensors);
optimizer ato_adamw(double learning_rate, double beta1, double beta2,
double weight_decay);
optimizer ato_rms_prop(double learning_rate, double alpha, double eps,
double weight_decay, double momentum, int centered);
optimizer ato_sgd(double learning_rate, double momentum, double dampening,
double weight_decay, int nesterov);
// NOTE. switch back as param group #261 not updated yet.
// Backward compat
void ato_add_parameters_old(optimizer, tensor *, int ntensors);
void ato_add_parameters(optimizer, tensor, size_t group);
void ato_set_learning_rate(optimizer, double learning_rate);
void ato_set_momentum(optimizer, double momentum);
void ato_set_learning_rate_group(optimizer, size_t group, double learning_rate);
void ato_set_momentum_group(optimizer, size_t group, double momentum);
void ato_set_weight_decay(optimizer t, double weight_decay);
void ato_set_weight_decay_group(optimizer t, size_t group, double weight_decay);
void ato_zero_grad(optimizer);
void ato_step(optimizer);
void ato_free(optimizer);
@ -129,11 +147,16 @@ module atm_load_on_device(char *, int device);
module atm_load_str(char *, size_t sz);
module atm_load_str_on_device(char *, size_t sz, int device);
tensor atm_forward(module, tensor *tensors, int ntensors);
ivalue atm_forward_(module,
ivalue *ivalues,
int nivalues);
ivalue atm_forward_(module, ivalue *ivalues, int nivalues);
tensor atm_method(module, char *method_name, tensor *tensors, int ntensors);
ivalue atm_method_(module, char *method_name, ivalue *ivalues, int nivalues);
void atm_free(module);
void atm_to(module m, int device, int dtype, bool non_blocking);
void atm_save(module m, char *);
int atm_get_profiling_mode();
void atm_set_profiling_mode(int);
void atm_named_parameters(module, void *data,
void (*f)(void *, char *, tensor));
ivalue ati_none();
ivalue ati_tensor(tensor);
@ -147,6 +170,7 @@ ivalue ati_generic_dict(ivalue *, int);
ivalue ati_int_list(int64_t *, int);
ivalue ati_double_list(double *, int);
ivalue ati_bool_list(char *, int);
ivalue ati_string_list(char **, int);
ivalue ati_tensor_list(tensor *, int);
tensor ati_to_tensor(ivalue);

File diff suppressed because it is too large


@ -22,11 +22,17 @@ void atg___xor__(tensor *, tensor self, scalar other);
void atg___xor__1(tensor *, tensor self, tensor other);
void atg__adaptive_avg_pool2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len);
void atg__adaptive_avg_pool2d_backward(tensor *, tensor grad_output, tensor self);
void atg__addr(tensor *, tensor self, tensor vec1, tensor vec2);
void atg__addr_(tensor *, tensor self, tensor vec1, tensor vec2);
void atg__addr_out(tensor *, tensor out, tensor self, tensor vec1, tensor vec2);
void atg__add_batch_dim(tensor *, tensor self, int64_t batch_dim, int64_t level);
void atg__add_relu(tensor *, tensor self, tensor other);
void atg__add_relu_(tensor *, tensor self, tensor other);
void atg__add_relu_out(tensor *, tensor out, tensor self, tensor other);
void atg__addmv_impl_(tensor *, tensor self, tensor self2, tensor mat, tensor vec);
void atg__aminmax(tensor *, tensor self);
void atg__aminmax1(tensor *, tensor self, int64_t dim, int keepdim);
void atg__amp_update_scale(tensor *, tensor growth_tracker, tensor current_scale, tensor found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval);
void atg__baddbmm_mkl_(tensor *, tensor self, tensor batch1, tensor batch2);
void atg__bmm(tensor *, tensor self, tensor mat2, int deterministic);
void atg__bmm_out(tensor *, tensor out, tensor self, tensor mat2, int deterministic);
void atg__cast_byte(tensor *, tensor self, int non_blocking);
void atg__cast_char(tensor *, tensor self, int non_blocking);
void atg__cast_double(tensor *, tensor self, int non_blocking);
@ -41,7 +47,11 @@ void atg__cdist_backward(tensor *, tensor grad, tensor x1, tensor x2, double p,
void atg__cholesky_helper(tensor *, tensor self, int upper);
void atg__cholesky_solve_helper(tensor *, tensor self, tensor A, int upper);
void atg__coalesced_(tensor *, tensor self, int coalesced);
void atg__compute_linear_combination(tensor *, tensor input, tensor coefficients);
void atg__compute_linear_combination_out(tensor *, tensor out, tensor input, tensor coefficients);
void atg__conj(tensor *, tensor self);
void atg__convolution(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len, int64_t groups, int benchmark, int deterministic, int cudnn_enabled);
void atg__convolution1(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len, int64_t groups, int benchmark, int deterministic, int cudnn_enabled, int allow_tf32);
void atg__convolution_nogroup(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len);
void atg__copy_from(tensor *, tensor self, tensor dst, int non_blocking);
void atg__ctc_loss(tensor *, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int zero_infinity);
@ -59,28 +69,35 @@ void atg__dirichlet_grad(tensor *, tensor x, tensor alpha, tensor total);
void atg__embedding_bag(tensor *, tensor weight, tensor indices, tensor offsets, int scale_grad_by_freq, int64_t mode, int sparse, tensor per_sample_weights, int include_last_offset);
void atg__embedding_bag_backward(tensor *, tensor grad, tensor indices, tensor offsets, tensor offset2bag, tensor bag_size, tensor maximum_indices, int64_t num_weights, int scale_grad_by_freq, int64_t mode, int sparse, tensor per_sample_weights);
void atg__embedding_bag_dense_backward(tensor *, tensor grad, tensor indices, tensor offsets, tensor offset2bag, tensor bag_size, tensor maximum_indices, int64_t num_weights, int scale_grad_by_freq, int64_t mode, tensor per_sample_weights);
void atg__embedding_bag_forward_only(tensor *, tensor weight, tensor indices, tensor offsets, int scale_grad_by_freq, int64_t mode, int sparse, tensor per_sample_weights, int include_last_offset);
void atg__embedding_bag_per_sample_weights_backward(tensor *, tensor grad, tensor weight, tensor indices, tensor offsets, tensor offset2bag, int64_t mode);
void atg__embedding_bag_sparse_backward(tensor *, tensor grad, tensor indices, tensor offsets, tensor offset2bag, tensor bag_size, int64_t num_weights, int scale_grad_by_freq, int64_t mode, tensor per_sample_weights);
void atg__empty_affine_quantized(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device, double scale, int64_t zero_point);
void atg__empty_per_channel_affine_quantized(tensor *, int64_t *size_data, int size_len, tensor scales, tensor zero_points, int64_t axis, int options_kind, int options_device);
void atg__euclidean_dist(tensor *, tensor x1, tensor x2);
void atg__fake_quantize_learnable_per_channel_affine(tensor *, tensor self, tensor scale, tensor zero_point, int64_t axis, int64_t quant_min, int64_t quant_max);
void atg__fake_quantize_learnable_per_channel_affine_backward(tensor *, tensor grad, tensor self, tensor scale, tensor zero_point, int64_t axis, int64_t quant_min, int64_t quant_max);
void atg__fake_quantize_learnable_per_tensor_affine(tensor *, tensor self, tensor scale, tensor zero_point, int64_t quant_min, int64_t quant_max);
void atg__fake_quantize_learnable_per_tensor_affine_backward(tensor *, tensor grad, tensor self, tensor scale, tensor zero_point, int64_t quant_min, int64_t quant_max);
void atg__fft_with_size(tensor *, tensor self, int64_t signal_ndim, int complex_input, int complex_output, int inverse, int64_t *checked_signal_sizes_data, int checked_signal_sizes_len, int normalized, int onesided, int64_t *output_sizes_data, int output_sizes_len);
void atg__fft_with_size1(tensor *, tensor self, int64_t signal_ndim, int complex_input, int complex_output, int inverse, int64_t *checked_signal_sizes_data, int checked_signal_sizes_len, int64_t normalization, int onesided, int64_t *output_sizes_data, int output_sizes_len);
void atg__fused_dropout(tensor *, tensor self, double p);
void atg__gather_sparse_backward(tensor *, tensor self, int64_t dim, tensor index, tensor grad);
void atg__grid_sampler_2d_cpu_fallback(tensor *, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg__grid_sampler_2d_cpu_fallback_backward(tensor *, tensor grad_output, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg__index_copy_(tensor *, tensor self, int64_t dim, tensor index, tensor source);
void atg__index_put_impl_(tensor *, tensor self, tensor *indices_data, int indices_len, tensor values, int accumulate, int unsafe);
void atg__indices(tensor *, tensor self);
void atg__inverse_helper(tensor *, tensor self);
void atg__log_softmax(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__log_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__logcumsumexp(tensor *, tensor self, int64_t dim);
void atg__logcumsumexp_out(tensor *, tensor out, tensor self, int64_t dim);
void atg__lu_solve_helper(tensor *, tensor self, tensor LU_data, tensor LU_pivots);
void atg__lu_with_info(tensor *, tensor self, int pivot, int check_errors);
void atg__make_per_channel_quantized_tensor(tensor *, tensor self, tensor scale, tensor zero_point, int64_t axis);
void atg__make_per_tensor_quantized_tensor(tensor *, tensor self, double scale, int64_t zero_point);
void atg__masked_scale(tensor *, tensor self, tensor mask, double scale);
void atg__max(tensor *, tensor self, int64_t dim, int keepdim);
void atg__max_out(tensor *, tensor max, tensor max_indices, tensor self, int64_t dim, int keepdim);
void atg__min(tensor *, tensor self, int64_t dim, int keepdim);
void atg__min_out(tensor *, tensor min, tensor min_indices, tensor self, int64_t dim, int keepdim);
void atg__mkldnn_reshape(tensor *, tensor self, int64_t *shape_data, int shape_len);
void atg__mkldnn_transpose(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg__mkldnn_transpose_(tensor *, tensor self, int64_t dim0, int64_t dim1);
@ -96,9 +113,11 @@ void atg__pack_padded_sequence_backward(tensor *, tensor grad, int64_t *input_si
void atg__pad_packed_sequence(tensor *, tensor data, tensor batch_sizes, int batch_first, scalar padding_value, int64_t total_length);
void atg__pdist_backward(tensor *, tensor grad, tensor self, double p, tensor pdist);
void atg__qr_helper(tensor *, tensor self, int some);
void atg__remove_batch_dim(tensor *, tensor self, int64_t level, int64_t batch_size, int64_t out_dim);
void atg__reshape_from_tensor(tensor *, tensor self, tensor shape);
void atg__s_where(tensor *, tensor condition, tensor self, tensor other);
void atg__sample_dirichlet(tensor *, tensor self);
void atg__saturate_weight_to_fp16(tensor *, tensor weight);
void atg__shape_as_tensor(tensor *, tensor self);
void atg__sobol_engine_draw(tensor *, tensor quasi, int64_t n, tensor sobolstate, int64_t dimension, int64_t num_generated, int dtype);
void atg__sobol_engine_ff_(tensor *, tensor self, int64_t n, tensor sobolstate, int64_t dimension, int64_t num_generated);
@ -111,7 +130,13 @@ void atg__sparse_addmm(tensor *, tensor self, tensor sparse, tensor dense);
void atg__sparse_coo_tensor_unsafe(tensor *, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_with_dims(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_with_dims_and_tensors(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int options_kind, int options_device);
void atg__sparse_log_softmax(tensor *, tensor self, int64_t dim, int dtype);
void atg__sparse_log_softmax1(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__sparse_log_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__sparse_mm(tensor *, tensor sparse, tensor dense);
void atg__sparse_softmax(tensor *, tensor self, int64_t dim, int dtype);
void atg__sparse_softmax1(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__sparse_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__sparse_sum(tensor *, tensor self);
void atg__sparse_sum1(tensor *, tensor self, int dtype);
void atg__sparse_sum2(tensor *, tensor self, int64_t *dim_data, int dim_len);
@ -122,6 +147,9 @@ void atg__standard_gamma_grad(tensor *, tensor self, tensor output);
void atg__std(tensor *, tensor self, int unbiased);
void atg__svd_helper(tensor *, tensor self, int some, int compute_uv);
void atg__symeig_helper(tensor *, tensor self, int eigenvectors, int upper);
void atg__test_optional_filled_intlist(tensor *, tensor values, int64_t *addends_data, int addends_len);
void atg__test_optional_intlist(tensor *, tensor values, int64_t *addends_data, int addends_len);
void atg__test_serialization_subcmul(tensor *, tensor self, tensor other);
void atg__triangular_solve_helper(tensor *, tensor self, tensor A, int upper, int transpose, int unitriangular);
void atg__trilinear(tensor *, tensor i1, tensor i2, tensor i3, int64_t *expand1_data, int expand1_len, int64_t *expand2_data, int expand2_len, int64_t *expand3_data, int expand3_len, int64_t *sumdim_data, int sumdim_len, int64_t unroll_dim);
void atg__unique(tensor *, tensor self, int sorted, int return_inverse);
@ -136,9 +164,15 @@ void atg__weight_norm_differentiable_backward(tensor *, tensor grad_w, tensor sa
void atg_abs(tensor *, tensor self);
void atg_abs_(tensor *, tensor self);
void atg_abs_out(tensor *, tensor out, tensor self);
void atg_absolute(tensor *, tensor self);
void atg_absolute_(tensor *, tensor self);
void atg_absolute_out(tensor *, tensor out, tensor self);
void atg_acos(tensor *, tensor self);
void atg_acos_(tensor *, tensor self);
void atg_acos_out(tensor *, tensor out, tensor self);
void atg_acosh(tensor *, tensor self);
void atg_acosh_(tensor *, tensor self);
void atg_acosh_out(tensor *, tensor out, tensor self);
void atg_adaptive_avg_pool1d(tensor *, tensor self, int64_t *output_size_data, int output_size_len);
void atg_adaptive_avg_pool2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len);
void atg_adaptive_avg_pool2d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len);
@ -188,6 +222,10 @@ void atg_all1(tensor *, tensor self, int64_t dim, int keepdim);
void atg_all_out(tensor *, tensor out, tensor self, int64_t dim, int keepdim);
void atg_alpha_dropout(tensor *, tensor input, double p, int train);
void atg_alpha_dropout_(tensor *, tensor self, double p, int train);
void atg_amax(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_amax_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_amin(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_amin_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_angle(tensor *, tensor self);
void atg_angle_out(tensor *, tensor out, tensor self);
void atg_any(tensor *, tensor self);
@ -198,29 +236,59 @@ void atg_arange1(tensor *, scalar start, scalar end, int options_kind, int optio
void atg_arange2(tensor *, scalar start, scalar end, scalar step, int options_kind, int options_device);
void atg_arange_out(tensor *, tensor out, scalar end);
void atg_arange_out1(tensor *, tensor out, scalar start, scalar end);
void atg_argmax(tensor *, tensor self, int64_t dim, int keepdim);
void atg_argmin(tensor *, tensor self, int64_t dim, int keepdim);
void atg_arccos(tensor *, tensor self);
void atg_arccos_(tensor *, tensor self);
void atg_arccos_out(tensor *, tensor out, tensor self);
void atg_arccosh(tensor *, tensor self);
void atg_arccosh_(tensor *, tensor self);
void atg_arccosh_out(tensor *, tensor out, tensor self);
void atg_arcsin(tensor *, tensor self);
void atg_arcsin_(tensor *, tensor self);
void atg_arcsin_out(tensor *, tensor out, tensor self);
void atg_arcsinh(tensor *, tensor self);
void atg_arcsinh_(tensor *, tensor self);
void atg_arcsinh_out(tensor *, tensor out, tensor self);
void atg_arctan(tensor *, tensor self);
void atg_arctan_(tensor *, tensor self);
void atg_arctan_out(tensor *, tensor out, tensor self);
void atg_arctanh(tensor *, tensor self);
void atg_arctanh_(tensor *, tensor self);
void atg_arctanh_out(tensor *, tensor out, tensor self);
void atg_argmax(tensor *, tensor self, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_argmin(tensor *, tensor self, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_argsort(tensor *, tensor self, int64_t dim, int descending);
void atg_as_strided(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int64_t storage_offset);
void atg_as_strided_(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int64_t storage_offset);
void atg_as_strided(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int64_t storage_offset_v, uint8_t storage_offset_null);
void atg_as_strided_(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int64_t storage_offset_v, uint8_t storage_offset_null);
void atg_asin(tensor *, tensor self);
void atg_asin_(tensor *, tensor self);
void atg_asin_out(tensor *, tensor out, tensor self);
void atg_asinh(tensor *, tensor self);
void atg_asinh_(tensor *, tensor self);
void atg_asinh_out(tensor *, tensor out, tensor self);
void atg_atan(tensor *, tensor self);
void atg_atan2(tensor *, tensor self, tensor other);
void atg_atan2_(tensor *, tensor self, tensor other);
void atg_atan2_out(tensor *, tensor out, tensor self, tensor other);
void atg_atan_(tensor *, tensor self);
void atg_atan_out(tensor *, tensor out, tensor self);
void atg_atanh(tensor *, tensor self);
void atg_atanh_(tensor *, tensor self);
void atg_atanh_out(tensor *, tensor out, tensor self);
void atg_atleast_1d(tensor *, tensor self);
tensor *atg_atleast_1d1(tensor *tensors_data, int tensors_len);
void atg_atleast_2d(tensor *, tensor self);
tensor *atg_atleast_2d1(tensor *tensors_data, int tensors_len);
void atg_atleast_3d(tensor *, tensor self);
tensor *atg_atleast_3d1(tensor *tensors_data, int tensors_len);
void atg_avg_pool1d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad);
void atg_avg_pool2d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override);
void atg_avg_pool2d_backward(tensor *, tensor grad_output, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override);
void atg_avg_pool2d_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override);
void atg_avg_pool2d_out(tensor *, tensor out, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override);
void atg_avg_pool3d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override);
void atg_avg_pool3d_backward(tensor *, tensor grad_output, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override);
void atg_avg_pool3d_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override);
void atg_avg_pool3d_out(tensor *, tensor out, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override);
void atg_avg_pool2d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override_v, uint8_t divisor_override_null);
void atg_avg_pool2d_backward(tensor *, tensor grad_output, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override_v, uint8_t divisor_override_null);
void atg_avg_pool2d_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override_v, uint8_t divisor_override_null);
void atg_avg_pool2d_out(tensor *, tensor out, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override_v, uint8_t divisor_override_null);
void atg_avg_pool3d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override_v, uint8_t divisor_override_null);
void atg_avg_pool3d_backward(tensor *, tensor grad_output, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override_v, uint8_t divisor_override_null);
void atg_avg_pool3d_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override_v, uint8_t divisor_override_null);
void atg_avg_pool3d_out(tensor *, tensor out, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int ceil_mode, int count_include_pad, int64_t divisor_override_v, uint8_t divisor_override_null);
void atg_baddbmm(tensor *, tensor self, tensor batch1, tensor batch2);
void atg_baddbmm_(tensor *, tensor self, tensor batch1, tensor batch2);
void atg_baddbmm_out(tensor *, tensor out, tensor self, tensor batch1, tensor batch2);
@ -232,7 +300,7 @@ void atg_batch_norm_backward_reduce(tensor *, tensor grad_out, tensor input, ten
void atg_batch_norm_elemt(tensor *, tensor input, tensor weight, tensor bias, tensor mean, tensor invstd, double eps);
void atg_batch_norm_elemt_out(tensor *, tensor out, tensor input, tensor weight, tensor bias, tensor mean, tensor invstd, double eps);
void atg_batch_norm_gather_stats(tensor *, tensor input, tensor mean, tensor invstd, tensor running_mean, tensor running_var, double momentum, double eps, int64_t count);
void atg_batch_norm_gather_stats_with_counts(tensor *, tensor input, tensor mean, tensor invstd, tensor running_mean, tensor running_var, double momentum, double eps, int64_t *counts_data, int counts_len);
void atg_batch_norm_gather_stats_with_counts(tensor *, tensor input, tensor mean, tensor invstd, tensor running_mean, tensor running_var, double momentum, double eps, tensor counts);
void atg_batch_norm_stats(tensor *, tensor input, double eps);
void atg_batch_norm_update_stats(tensor *, tensor input, tensor running_mean, tensor running_var, double momentum);
void atg_bernoulli(tensor *, tensor self);
@ -248,6 +316,7 @@ void atg_binary_cross_entropy_out(tensor *, tensor out, tensor self, tensor targ
void atg_binary_cross_entropy_with_logits(tensor *, tensor self, tensor target, tensor weight, tensor pos_weight, int64_t reduction);
void atg_binary_cross_entropy_with_logits_backward(tensor *, tensor grad_output, tensor self, tensor target, tensor weight, tensor pos_weight, int64_t reduction);
void atg_bincount(tensor *, tensor self, tensor weights, int64_t minlength);
void atg_binomial(tensor *, tensor count, tensor prob);
void atg_bitwise_and(tensor *, tensor self, scalar other);
void atg_bitwise_and1(tensor *, tensor self, tensor other);
void atg_bitwise_and_(tensor *, tensor self, scalar other);
@ -271,20 +340,25 @@ void atg_bitwise_xor_out(tensor *, tensor out, tensor self, tensor other);
void atg_bitwise_xor_out1(tensor *, tensor out, tensor self, scalar other);
void atg_blackman_window(tensor *, int64_t window_length, int options_kind, int options_device);
void atg_blackman_window1(tensor *, int64_t window_length, int periodic, int options_kind, int options_device);
void atg_block_diag(tensor *, tensor *tensors_data, int tensors_len);
void atg_bmm(tensor *, tensor self, tensor mat2);
void atg_bmm_out(tensor *, tensor out, tensor self, tensor mat2);
tensor *atg_broadcast_tensors(tensor *tensors_data, int tensors_len);
void atg_bucketize(tensor *, tensor self, tensor boundaries, int out_int32, int right);
void atg_bucketize1(tensor *, scalar self_scalar, tensor boundaries, int out_int32, int right);
void atg_bucketize_out(tensor *, tensor out, tensor self, tensor boundaries, int out_int32, int right);
void atg_cartesian_prod(tensor *, tensor *tensors_data, int tensors_len);
void atg_cat(tensor *, tensor *tensors_data, int tensors_len, int64_t dim);
void atg_cat_out(tensor *, tensor out, tensor *tensors_data, int tensors_len, int64_t dim);
void atg_cauchy_(tensor *, tensor self, double median, double sigma);
void atg_cdist(tensor *, tensor x1, tensor x2, double p, int64_t compute_mode);
void atg_cdist(tensor *, tensor x1, tensor x2, double p, int64_t compute_mode_v, uint8_t compute_mode_null);
void atg_ceil(tensor *, tensor self);
void atg_ceil_(tensor *, tensor self);
void atg_ceil_out(tensor *, tensor out, tensor self);
void atg_celu(tensor *, tensor self);
void atg_celu_(tensor *, tensor self);
void atg_chain_matmul(tensor *, tensor *matrices_data, int matrices_len);
void atg_channel_shuffle(tensor *, tensor self, int64_t groups);
void atg_cholesky(tensor *, tensor self, int upper);
void atg_cholesky_inverse(tensor *, tensor self, int upper);
void atg_cholesky_inverse_out(tensor *, tensor out, tensor self, int upper);
@ -301,12 +375,17 @@ void atg_clamp_min(tensor *, tensor self, scalar min);
void atg_clamp_min_(tensor *, tensor self, scalar min);
void atg_clamp_min_out(tensor *, tensor out, tensor self, scalar min);
void atg_clamp_out(tensor *, tensor out, tensor self, scalar min, scalar max);
void atg_clip(tensor *, tensor self, scalar min, scalar max);
void atg_clip_(tensor *, tensor self, scalar min, scalar max);
void atg_clip_out(tensor *, tensor out, tensor self, scalar min, scalar max);
void atg_coalesce(tensor *, tensor self);
void atg_col2im(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int64_t *kernel_size_data, int kernel_size_len, int64_t *dilation_data, int dilation_len, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len);
void atg_col2im_backward(tensor *, tensor grad_output, int64_t *kernel_size_data, int kernel_size_len, int64_t *dilation_data, int dilation_len, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len);
void atg_col2im_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *kernel_size_data, int kernel_size_len, int64_t *dilation_data, int dilation_len, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len);
void atg_col2im_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int64_t *kernel_size_data, int kernel_size_len, int64_t *dilation_data, int dilation_len, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len);
void atg_combinations(tensor *, tensor self, int64_t r, int with_replacement);
void atg_complex(tensor *, tensor real, tensor imag);
void atg_complex_out(tensor *, tensor out, tensor real, tensor imag);
void atg_conj(tensor *, tensor self);
void atg_conj_out(tensor *, tensor out, tensor self);
void atg_constant_pad_nd(tensor *, tensor self, int64_t *pad_data, int pad_len);
@ -330,8 +409,10 @@ void atg_cosh_(tensor *, tensor self);
void atg_cosh_out(tensor *, tensor out, tensor self);
void atg_cosine_embedding_loss(tensor *, tensor input1, tensor input2, tensor target, double margin, int64_t reduction);
void atg_cosine_similarity(tensor *, tensor x1, tensor x2, int64_t dim, double eps);
void atg_cross(tensor *, tensor self, tensor other, int64_t dim);
void atg_cross_out(tensor *, tensor out, tensor self, tensor other, int64_t dim);
void atg_count_nonzero(tensor *, tensor self, int64_t *dim_data, int dim_len);
void atg_count_nonzero1(tensor *, tensor self, int64_t dim_v, uint8_t dim_null);
void atg_cross(tensor *, tensor self, tensor other, int64_t dim_v, uint8_t dim_null);
void atg_cross_out(tensor *, tensor out, tensor self, tensor other, int64_t dim_v, uint8_t dim_null);
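/* Editor's note (sketch): several v1.7.0 signatures replace a plain int64_t with a
   (value, null-flag) pair such as dim_v / dim_null; a non-zero flag is assumed to
   mean "optional argument omitted", in which case the value is ignored. */
static inline void example_atg_cross_nullable_dim(tensor self, tensor other) {
  tensor out__[1];
  atg_cross(out__, self, other, 0, 1);  /* dim omitted: dim_v ignored, dim_null = 1 */
}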
void atg_ctc_loss(tensor *, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int64_t reduction, int zero_infinity);
void atg_ctc_loss1(tensor *, tensor log_probs, tensor targets, tensor input_lengths, tensor target_lengths, int64_t blank, int64_t reduction, int zero_infinity);
void atg_cudnn_affine_grid_generator(tensor *, tensor theta, int64_t n, int64_t C, int64_t H, int64_t W);
@ -340,32 +421,42 @@ void atg_cudnn_batch_norm(tensor *, tensor input, tensor weight, tensor bias, te
void atg_cudnn_batch_norm_backward(tensor *, tensor input, tensor grad_output, tensor weight, tensor running_mean, tensor running_var, tensor save_mean, tensor save_var, double epsilon, tensor reserveSpace);
void atg_cudnn_convolution(tensor *, tensor self, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution1(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_backward_input(tensor *, int64_t *self_size_data, int self_size_len, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution2(tensor *, tensor self, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_backward_input(tensor *, int64_t *self_size_data, int self_size_len, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_transpose(tensor *, tensor self, tensor weight, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_transpose1(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_transpose_backward_input(tensor *, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_transpose_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_transpose2(tensor *, tensor self, tensor weight, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_transpose_backward_input(tensor *, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_transpose_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_grid_sampler(tensor *, tensor self, tensor grid);
void atg_cudnn_grid_sampler_backward(tensor *, tensor self, tensor grid, tensor grad_output);
void atg_cummax(tensor *, tensor self, int64_t dim);
void atg_cummax_out(tensor *, tensor values, tensor indices, tensor self, int64_t dim);
void atg_cummaxmin_backward(tensor *, tensor grad, tensor input, tensor indices, int64_t dim);
void atg_cummin(tensor *, tensor self, int64_t dim);
void atg_cummin_out(tensor *, tensor values, tensor indices, tensor self, int64_t dim);
void atg_cumprod(tensor *, tensor self, int64_t dim, int dtype);
void atg_cumprod_backward(tensor *, tensor grad, tensor input, int64_t dim);
void atg_cumprod_out(tensor *, tensor out, tensor self, int64_t dim, int dtype);
void atg_cumsum(tensor *, tensor self, int64_t dim, int dtype);
void atg_cumsum_out(tensor *, tensor out, tensor self, int64_t dim, int dtype);
void atg_data(tensor *, tensor self);
void atg_deg2rad(tensor *, tensor self);
void atg_deg2rad_(tensor *, tensor self);
void atg_deg2rad_out(tensor *, tensor out, tensor self);
void atg_dequantize(tensor *, tensor self);
tensor *atg_dequantize1(tensor *tensors_data, int tensors_len);
void atg_det(tensor *, tensor self);
void atg_detach(tensor *, tensor self);
void atg_detach_(tensor *, tensor self);
void atg_diag(tensor *, tensor self, int64_t diagonal);
void atg_diag_backward(tensor *, tensor grad, int64_t *input_sizes_data, int input_sizes_len, int64_t diagonal);
void atg_diag_embed(tensor *, tensor self, int64_t offset, int64_t dim1, int64_t dim2);
void atg_diag_out(tensor *, tensor out, tensor self, int64_t diagonal);
void atg_diagflat(tensor *, tensor self, int64_t offset);
void atg_diagonal(tensor *, tensor self, int64_t offset, int64_t dim1, int64_t dim2);
void atg_diagonal_backward(tensor *, tensor grad, int64_t *input_sizes_data, int input_sizes_len, int64_t offset, int64_t dim1, int64_t dim2);
void atg_digamma(tensor *, tensor self);
void atg_digamma_(tensor *, tensor self);
void atg_digamma_out(tensor *, tensor out, tensor self);
@ -375,10 +466,17 @@ void atg_div1(tensor *, tensor self, scalar other);
void atg_div_(tensor *, tensor self, tensor other);
void atg_div_1(tensor *, tensor self, scalar other);
void atg_div_out(tensor *, tensor out, tensor self, tensor other);
void atg_divide(tensor *, tensor self, tensor other);
void atg_divide1(tensor *, tensor self, scalar other);
void atg_divide_(tensor *, tensor self, tensor other);
void atg_divide_1(tensor *, tensor self, scalar other);
void atg_divide_out(tensor *, tensor out, tensor self, tensor other);
void atg_dot(tensor *, tensor self, tensor tensor);
void atg_dot_out(tensor *, tensor out, tensor self, tensor tensor);
void atg_dropout(tensor *, tensor input, double p, int train);
void atg_dropout_(tensor *, tensor self, double p, int train);
void atg_dstack(tensor *, tensor *tensors_data, int tensors_len);
void atg_dstack_out(tensor *, tensor out, tensor *tensors_data, int tensors_len);
void atg_eig(tensor *, tensor self, int eigenvectors);
void atg_eig_out(tensor *, tensor e, tensor v, tensor self, int eigenvectors);
void atg_einsum(tensor *, char* equation_ptr, int equation_len, tensor *tensors_data, int tensors_len);
@ -395,7 +493,9 @@ void atg_embedding_renorm_(tensor *, tensor self, tensor indices, double max_nor
void atg_embedding_sparse_backward(tensor *, tensor grad, tensor indices, int64_t num_weights, int64_t padding_idx, int scale_grad_by_freq);
void atg_empty(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_empty_like(tensor *, tensor self);
void atg_empty_meta(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_empty_out(tensor *, tensor out, int64_t *size_data, int size_len);
void atg_empty_quantized(tensor *, int64_t *size_data, int size_len, tensor qtensor);
void atg_empty_strided(tensor *, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int options_kind, int options_device);
void atg_eq(tensor *, tensor self, scalar other);
void atg_eq1(tensor *, tensor self, tensor other);
@ -413,6 +513,9 @@ void atg_erfinv(tensor *, tensor self);
void atg_erfinv_(tensor *, tensor self);
void atg_erfinv_out(tensor *, tensor out, tensor self);
void atg_exp(tensor *, tensor self);
void atg_exp2(tensor *, tensor self);
void atg_exp2_(tensor *, tensor self);
void atg_exp2_out(tensor *, tensor out, tensor self);
void atg_exp_(tensor *, tensor self);
void atg_exp_out(tensor *, tensor out, tensor self);
void atg_expand(tensor *, tensor self, int64_t *size_data, int size_len, int implicit);
@ -441,11 +544,26 @@ void atg_feature_alpha_dropout_(tensor *, tensor self, double p, int train);
void atg_feature_dropout(tensor *, tensor input, double p, int train);
void atg_feature_dropout_(tensor *, tensor self, double p, int train);
void atg_fft(tensor *, tensor self, int64_t signal_ndim, int normalized);
void atg_fft_fft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_fftn(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_hfft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_ifft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_ifftn(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_ihfft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_irfft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_irfftn(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_rfft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_rfftn(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
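/* Editor's note (sketch): the new torch.fft bindings combine both conventions:
   the optional length n comes as n_v / n_null, and the optional `norm` string as a
   pointer plus byte length; a null pointer with length 0 is assumed to select the
   default normalization. */
static inline void example_atg_fft_fft_defaults(tensor self) {
  tensor out__[1];
  atg_fft_fft(out__, self, 0, 1, -1, (char *)0, 0);  /* n omitted, dim = -1, norm = NULL */
}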
void atg_fill_(tensor *, tensor self, scalar value);
void atg_fill_1(tensor *, tensor self, tensor value);
void atg_fill_diagonal_(tensor *, tensor self, scalar fill_value, int wrap);
void atg_fix(tensor *, tensor self);
void atg_fix_(tensor *, tensor self);
void atg_fix_out(tensor *, tensor out, tensor self);
void atg_flatten(tensor *, tensor self, int64_t start_dim, int64_t end_dim);
void atg_flip(tensor *, tensor self, int64_t *dims_data, int dims_len);
void atg_fliplr(tensor *, tensor self);
void atg_flipud(tensor *, tensor self);
void atg_floor(tensor *, tensor self);
void atg_floor_(tensor *, tensor self);
void atg_floor_divide(tensor *, tensor self, tensor other);
@ -474,12 +592,16 @@ void atg_fractional_max_pool3d_out(tensor *, tensor output, tensor indices, tens
void atg_frobenius_norm(tensor *, tensor self);
void atg_frobenius_norm1(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_frobenius_norm_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_from_file(tensor *, char* filename_ptr, int filename_len, int shared, int64_t size, int options_kind, int options_device);
void atg_from_file(tensor *, char* filename_ptr, int filename_len, int shared, int64_t size_v, uint8_t size_null, int options_kind, int options_device);
void atg_full(tensor *, int64_t *size_data, int size_len, scalar fill_value, int options_kind, int options_device);
void atg_full_like(tensor *, tensor self, scalar fill_value);
void atg_full_out(tensor *, tensor out, int64_t *size_data, int size_len, scalar fill_value);
void atg_gather(tensor *, tensor self, int64_t dim, tensor index, int sparse_grad);
void atg_gather_backward(tensor *, tensor grad, tensor self, int64_t dim, tensor index, int sparse_grad);
void atg_gather_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, int sparse_grad);
void atg_gcd(tensor *, tensor self, tensor other);
void atg_gcd_(tensor *, tensor self, tensor other);
void atg_gcd_out(tensor *, tensor out, tensor self, tensor other);
void atg_ge(tensor *, tensor self, scalar other);
void atg_ge1(tensor *, tensor self, tensor other);
void atg_ge_(tensor *, tensor self, scalar other);
@ -498,6 +620,18 @@ void atg_glu_backward(tensor *, tensor grad_output, tensor self, int64_t dim);
void atg_glu_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t dim);
void atg_glu_out(tensor *, tensor out, tensor self, int64_t dim);
void atg_grad(tensor *, tensor self);
void atg_greater(tensor *, tensor self, scalar other);
void atg_greater1(tensor *, tensor self, tensor other);
void atg_greater_(tensor *, tensor self, scalar other);
void atg_greater_1(tensor *, tensor self, tensor other);
void atg_greater_equal(tensor *, tensor self, scalar other);
void atg_greater_equal1(tensor *, tensor self, tensor other);
void atg_greater_equal_(tensor *, tensor self, scalar other);
void atg_greater_equal_1(tensor *, tensor self, tensor other);
void atg_greater_equal_out(tensor *, tensor out, tensor self, scalar other);
void atg_greater_equal_out1(tensor *, tensor out, tensor self, tensor other);
void atg_greater_out(tensor *, tensor out, tensor self, scalar other);
void atg_greater_out1(tensor *, tensor out, tensor self, tensor other);
void atg_grid_sampler(tensor *, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg_grid_sampler_2d(tensor *, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg_grid_sampler_2d_backward(tensor *, tensor grad_output, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
@ -525,16 +659,31 @@ void atg_hardsigmoid(tensor *, tensor self);
void atg_hardsigmoid_(tensor *, tensor self);
void atg_hardsigmoid_backward(tensor *, tensor grad_output, tensor self);
void atg_hardsigmoid_out(tensor *, tensor out, tensor self);
void atg_hardswish(tensor *, tensor self);
void atg_hardswish_(tensor *, tensor self);
void atg_hardswish_backward(tensor *, tensor grad_output, tensor self);
void atg_hardswish_out(tensor *, tensor out, tensor self);
void atg_hardtanh(tensor *, tensor self);
void atg_hardtanh_(tensor *, tensor self);
void atg_hardtanh_backward(tensor *, tensor grad_output, tensor self, scalar min_val, scalar max_val);
void atg_hardtanh_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, scalar min_val, scalar max_val);
void atg_hardtanh_out(tensor *, tensor out, tensor self);
void atg_heaviside(tensor *, tensor self, tensor values);
void atg_heaviside_(tensor *, tensor self, tensor values);
void atg_heaviside_out(tensor *, tensor out, tensor self, tensor values);
void atg_hinge_embedding_loss(tensor *, tensor self, tensor target, double margin, int64_t reduction);
void atg_histc(tensor *, tensor self, int64_t bins);
void atg_histc_out(tensor *, tensor out, tensor self, int64_t bins);
void atg_hspmm(tensor *, tensor mat1, tensor mat2);
void atg_hspmm_out(tensor *, tensor out, tensor mat1, tensor mat2);
void atg_hstack(tensor *, tensor *tensors_data, int tensors_len);
void atg_hstack_out(tensor *, tensor out, tensor *tensors_data, int tensors_len);
void atg_hypot(tensor *, tensor self, tensor other);
void atg_hypot_(tensor *, tensor self, tensor other);
void atg_hypot_out(tensor *, tensor out, tensor self, tensor other);
void atg_i0(tensor *, tensor self);
void atg_i0_(tensor *, tensor self);
void atg_i0_out(tensor *, tensor out, tensor self);
void atg_ifft(tensor *, tensor self, int64_t signal_ndim, int normalized);
void atg_im2col(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *dilation_data, int dilation_len, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len);
void atg_im2col_backward(tensor *, tensor grad_output, int64_t *input_size_data, int input_size_len, int64_t *kernel_size_data, int kernel_size_len, int64_t *dilation_data, int dilation_len, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len);
@ -553,8 +702,10 @@ void atg_index_fill_1(tensor *, tensor self, int64_t dim, tensor index, tensor v
void atg_index_put(tensor *, tensor self, tensor *indices_data, int indices_len, tensor values, int accumulate);
void atg_index_put_(tensor *, tensor self, tensor *indices_data, int indices_len, tensor values, int accumulate);
void atg_index_select(tensor *, tensor self, int64_t dim, tensor index);
void atg_index_select_backward(tensor *, tensor grad, int64_t *self_sizes_data, int self_sizes_len, int64_t dim, tensor index);
void atg_index_select_out(tensor *, tensor out, tensor self, int64_t dim, tensor index);
void atg_indices(tensor *, tensor self);
void atg_infinitely_differentiable_gelu_backward(tensor *, tensor grad, tensor self);
void atg_instance_norm(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int use_input_stats, double momentum, double eps, int cudnn_enabled);
void atg_int_repr(tensor *, tensor self);
void atg_inverse(tensor *, tensor self);
@ -564,8 +715,17 @@ void atg_isclose(tensor *, tensor self, tensor other, double rtol, double atol,
void atg_isfinite(tensor *, tensor self);
void atg_isinf(tensor *, tensor self);
void atg_isnan(tensor *, tensor self);
void atg_kl_div(tensor *, tensor self, tensor target, int64_t reduction);
void atg_kl_div_backward(tensor *, tensor grad_output, tensor self, tensor target, int64_t reduction);
void atg_isneginf(tensor *, tensor self);
void atg_isneginf_out(tensor *, tensor out, tensor self);
void atg_isposinf(tensor *, tensor self);
void atg_isposinf_out(tensor *, tensor out, tensor self);
void atg_isreal(tensor *, tensor self);
void atg_istft(tensor *, tensor self, int64_t n_fft, int64_t hop_length_v, uint8_t hop_length_null, int64_t win_length_v, uint8_t win_length_null, tensor window, int center, int normalized, int onesided, int64_t length_v, uint8_t length_null, int return_complex);
void atg_kaiser_window(tensor *, int64_t window_length, int options_kind, int options_device);
void atg_kaiser_window1(tensor *, int64_t window_length, int periodic, int options_kind, int options_device);
void atg_kaiser_window2(tensor *, int64_t window_length, int periodic, double beta, int options_kind, int options_device);
void atg_kl_div(tensor *, tensor self, tensor target, int64_t reduction, int log_target);
void atg_kl_div_backward(tensor *, tensor grad_output, tensor self, tensor target, int64_t reduction, int log_target);
void atg_kthvalue(tensor *, tensor self, int64_t k, int64_t dim, int keepdim);
void atg_kthvalue_out(tensor *, tensor values, tensor indices, tensor self, int64_t k, int64_t dim, int keepdim);
void atg_l1_loss(tensor *, tensor self, tensor target, int64_t reduction);
@ -573,6 +733,9 @@ void atg_l1_loss_backward(tensor *, tensor grad_output, tensor self, tensor targ
void atg_l1_loss_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, tensor target, int64_t reduction);
void atg_l1_loss_out(tensor *, tensor out, tensor self, tensor target, int64_t reduction);
void atg_layer_norm(tensor *, tensor input, int64_t *normalized_shape_data, int normalized_shape_len, tensor weight, tensor bias, double eps, int cudnn_enable);
void atg_lcm(tensor *, tensor self, tensor other);
void atg_lcm_(tensor *, tensor self, tensor other);
void atg_lcm_out(tensor *, tensor out, tensor self, tensor other);
void atg_le(tensor *, tensor self, scalar other);
void atg_le1(tensor *, tensor self, tensor other);
void atg_le_(tensor *, tensor self, scalar other);
@ -589,12 +752,29 @@ void atg_lerp_(tensor *, tensor self, tensor end, scalar weight);
void atg_lerp_1(tensor *, tensor self, tensor end, tensor weight);
void atg_lerp_out(tensor *, tensor out, tensor self, tensor end, scalar weight);
void atg_lerp_out1(tensor *, tensor out, tensor self, tensor end, tensor weight);
void atg_less(tensor *, tensor self, scalar other);
void atg_less1(tensor *, tensor self, tensor other);
void atg_less_(tensor *, tensor self, scalar other);
void atg_less_1(tensor *, tensor self, tensor other);
void atg_less_equal(tensor *, tensor self, scalar other);
void atg_less_equal1(tensor *, tensor self, tensor other);
void atg_less_equal_(tensor *, tensor self, scalar other);
void atg_less_equal_1(tensor *, tensor self, tensor other);
void atg_less_equal_out(tensor *, tensor out, tensor self, scalar other);
void atg_less_equal_out1(tensor *, tensor out, tensor self, tensor other);
void atg_less_out(tensor *, tensor out, tensor self, scalar other);
void atg_less_out1(tensor *, tensor out, tensor self, tensor other);
void atg_lgamma(tensor *, tensor self);
void atg_lgamma_(tensor *, tensor self);
void atg_lgamma_out(tensor *, tensor out, tensor self);
void atg_linalg_det(tensor *, tensor self);
void atg_linalg_norm(tensor *, tensor self, scalar ord, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_norm1(tensor *, tensor self, char* ord_ptr, int ord_len, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_norm_out(tensor *, tensor out, tensor self, scalar ord, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_norm_out1(tensor *, tensor out, tensor self, char* ord_ptr, int ord_len, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linear(tensor *, tensor input, tensor weight, tensor bias);
void atg_linspace(tensor *, scalar start, scalar end, int64_t steps, int options_kind, int options_device);
void atg_linspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps);
void atg_linspace(tensor *, scalar start, scalar end, int64_t steps_v, uint8_t steps_null, int options_kind, int options_device);
void atg_linspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps_v, uint8_t steps_null);
void atg_log(tensor *, tensor self);
void atg_log10(tensor *, tensor self);
void atg_log10_(tensor *, tensor self);
@ -613,6 +793,12 @@ void atg_log_sigmoid_backward(tensor *, tensor grad_output, tensor self, tensor
void atg_log_sigmoid_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, tensor buffer);
void atg_log_sigmoid_out(tensor *, tensor out, tensor self);
void atg_log_softmax(tensor *, tensor self, int64_t dim, int dtype);
void atg_logaddexp(tensor *, tensor self, tensor other);
void atg_logaddexp2(tensor *, tensor self, tensor other);
void atg_logaddexp2_out(tensor *, tensor out, tensor self, tensor other);
void atg_logaddexp_out(tensor *, tensor out, tensor self, tensor other);
void atg_logcumsumexp(tensor *, tensor self, int64_t dim);
void atg_logcumsumexp_out(tensor *, tensor out, tensor self, int64_t dim);
void atg_logdet(tensor *, tensor self);
void atg_logical_and(tensor *, tensor self, tensor other);
void atg_logical_and_(tensor *, tensor self, tensor other);
@ -626,8 +812,13 @@ void atg_logical_or_out(tensor *, tensor out, tensor self, tensor other);
void atg_logical_xor(tensor *, tensor self, tensor other);
void atg_logical_xor_(tensor *, tensor self, tensor other);
void atg_logical_xor_out(tensor *, tensor out, tensor self, tensor other);
void atg_logspace(tensor *, scalar start, scalar end, int64_t steps, double base, int options_kind, int options_device);
void atg_logspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps, double base);
void atg_logit(tensor *, tensor self, double eps_v, uint8_t eps_null);
void atg_logit_(tensor *, tensor self, double eps_v, uint8_t eps_null);
void atg_logit_backward(tensor *, tensor grad_output, tensor self, double eps_v, uint8_t eps_null);
void atg_logit_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, double eps_v, uint8_t eps_null);
void atg_logit_out(tensor *, tensor out, tensor self, double eps_v, uint8_t eps_null);
void atg_logspace(tensor *, scalar start, scalar end, int64_t steps_v, uint8_t steps_null, double base, int options_kind, int options_device);
void atg_logspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps_v, uint8_t steps_null, double base);
void atg_logsumexp(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_logsumexp_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_lstm(tensor *, tensor input, tensor *hx_data, int hx_len, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional, int batch_first);
@ -651,9 +842,12 @@ void atg_masked_fill_1(tensor *, tensor self, tensor mask, tensor value);
void atg_masked_scatter(tensor *, tensor self, tensor mask, tensor source);
void atg_masked_scatter_(tensor *, tensor self, tensor mask, tensor source);
void atg_masked_select(tensor *, tensor self, tensor mask);
void atg_masked_select_backward(tensor *, tensor grad, tensor input, tensor mask);
void atg_masked_select_out(tensor *, tensor out, tensor self, tensor mask);
void atg_matmul(tensor *, tensor self, tensor other);
void atg_matmul_out(tensor *, tensor out, tensor self, tensor other);
void atg_matrix_exp(tensor *, tensor self);
void atg_matrix_exp_backward(tensor *, tensor self, tensor grad);
void atg_matrix_power(tensor *, tensor self, int64_t n);
void atg_matrix_rank(tensor *, tensor self, int symmetric);
void atg_matrix_rank1(tensor *, tensor self, double tol, int symmetric);
@ -682,7 +876,8 @@ void atg_max_unpool3d(tensor *, tensor self, tensor indices, int64_t *output_siz
void atg_max_unpool3d_backward(tensor *, tensor grad_output, tensor self, tensor indices, int64_t *output_size_data, int output_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
void atg_max_unpool3d_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, tensor indices, int64_t *output_size_data, int output_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
void atg_max_unpool3d_out(tensor *, tensor out, tensor self, tensor indices, int64_t *output_size_data, int output_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
void atg_max_values(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_maximum(tensor *, tensor self, tensor other);
void atg_maximum_out(tensor *, tensor out, tensor self, tensor other);
void atg_mean(tensor *, tensor self, int dtype);
void atg_mean1(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_mean_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
@ -695,7 +890,8 @@ void atg_min1(tensor *, tensor self, tensor other);
void atg_min2(tensor *, tensor self, int64_t dim, int keepdim);
void atg_min_out(tensor *, tensor out, tensor self, tensor other);
void atg_min_out1(tensor *, tensor min, tensor min_indices, tensor self, int64_t dim, int keepdim);
void atg_min_values(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_minimum(tensor *, tensor self, tensor other);
void atg_minimum_out(tensor *, tensor out, tensor self, tensor other);
void atg_miopen_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double exponential_average_factor, double epsilon);
void atg_miopen_batch_norm_backward(tensor *, tensor input, tensor grad_output, tensor weight, tensor running_mean, tensor running_var, tensor save_mean, tensor save_var, double epsilon);
void atg_miopen_convolution(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
@ -715,11 +911,15 @@ void atg_mkldnn_convolution_backward_input(tensor *, int64_t *self_size_data, in
void atg_mkldnn_convolution_backward_weights(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int bias_defined);
void atg_mkldnn_linear(tensor *, tensor input, tensor weight, tensor bias);
void atg_mkldnn_max_pool2d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_mkldnn_max_pool3d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_mkldnn_reorder_conv2d_weight(tensor *, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg_mkldnn_reorder_conv3d_weight(tensor *, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg_mm(tensor *, tensor self, tensor mat2);
void atg_mm_out(tensor *, tensor out, tensor self, tensor mat2);
void atg_mode(tensor *, tensor self, int64_t dim, int keepdim);
void atg_mode_out(tensor *, tensor values, tensor indices, tensor self, int64_t dim, int keepdim);
void atg_movedim(tensor *, tensor self, int64_t *source_data, int source_len, int64_t *destination_data, int destination_len);
void atg_movedim1(tensor *, tensor self, int64_t source, int64_t destination);
void atg_mse_loss(tensor *, tensor self, tensor target, int64_t reduction);
void atg_mse_loss_backward(tensor *, tensor grad_output, tensor self, tensor target, int64_t reduction);
void atg_mse_loss_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, tensor target, int64_t reduction);
@ -737,17 +937,31 @@ void atg_multilabel_margin_loss_backward_out(tensor *, tensor grad_input, tensor
void atg_multilabel_margin_loss_out(tensor *, tensor out, tensor self, tensor target, int64_t reduction);
void atg_multinomial(tensor *, tensor self, int64_t num_samples, int replacement);
void atg_multinomial_out(tensor *, tensor out, tensor self, int64_t num_samples, int replacement);
void atg_multiply(tensor *, tensor self, tensor other);
void atg_multiply1(tensor *, tensor self, scalar other);
void atg_multiply_(tensor *, tensor self, tensor other);
void atg_multiply_1(tensor *, tensor self, scalar other);
void atg_multiply_out(tensor *, tensor out, tensor self, tensor other);
void atg_mv(tensor *, tensor self, tensor vec);
void atg_mv_out(tensor *, tensor out, tensor self, tensor vec);
void atg_mvlgamma(tensor *, tensor self, int64_t p);
void atg_mvlgamma_(tensor *, tensor self, int64_t p);
void atg_nanquantile(tensor *, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_nanquantile1(tensor *, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_nanquantile_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_nanquantile_out1(tensor *, tensor out, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_nansum(tensor *, tensor self, int dtype);
void atg_nansum1(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_nansum_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_narrow(tensor *, tensor self, int64_t dim, int64_t start, int64_t length);
void atg_narrow1(tensor *, tensor self, int64_t dim, tensor start, int64_t length);
void atg_narrow_copy(tensor *, tensor self, int64_t dim, int64_t start, int64_t length);
void atg_native_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double momentum, double eps);
void atg_native_batch_norm_out(tensor *, tensor out, tensor save_mean, tensor save_invstd, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double momentum, double eps);
void atg_native_group_norm(tensor *, tensor input, tensor weight, tensor bias, int64_t n, int64_t C, int64_t HxW, int64_t group, double eps);
void atg_native_layer_norm(tensor *, tensor input, tensor weight, tensor bias, int64_t M, int64_t n, double eps);
void atg_native_norm(tensor *, tensor self);
void atg_native_norm1(tensor *, tensor self, scalar p, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_ne(tensor *, tensor self, scalar other);
void atg_ne1(tensor *, tensor self, tensor other);
void atg_ne_(tensor *, tensor self, scalar other);
@ -757,9 +971,15 @@ void atg_ne_out1(tensor *, tensor out, tensor self, tensor other);
void atg_neg(tensor *, tensor self);
void atg_neg_(tensor *, tensor self);
void atg_neg_out(tensor *, tensor out, tensor self);
void atg_negative(tensor *, tensor self);
void atg_negative_(tensor *, tensor self);
void atg_negative_out(tensor *, tensor out, tensor self);
void atg_new_empty(tensor *, tensor self, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_new_full(tensor *, tensor self, int64_t *size_data, int size_len, scalar fill_value, int options_kind, int options_device);
void atg_new_zeros(tensor *, tensor self, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_nextafter(tensor *, tensor self, tensor other);
void atg_nextafter_(tensor *, tensor self, tensor other);
void atg_nextafter_out(tensor *, tensor out, tensor self, tensor other);
void atg_nll_loss(tensor *, tensor self, tensor target, tensor weight, int64_t reduction, int64_t ignore_index);
void atg_nll_loss2d(tensor *, tensor self, tensor target, tensor weight, int64_t reduction, int64_t ignore_index);
void atg_nll_loss2d_backward(tensor *, tensor grad_output, tensor self, tensor target, tensor weight, int64_t reduction, int64_t ignore_index, tensor total_weight);
@ -783,6 +1003,12 @@ void atg_normal_out(tensor *, tensor out, tensor mean, double std);
void atg_normal_out1(tensor *, tensor out, double mean, tensor std);
void atg_normal_out2(tensor *, tensor out, tensor mean, tensor std);
void atg_normal_out3(tensor *, tensor out, double mean, double std, int64_t *size_data, int size_len);
void atg_not_equal(tensor *, tensor self, scalar other);
void atg_not_equal1(tensor *, tensor self, tensor other);
void atg_not_equal_(tensor *, tensor self, scalar other);
void atg_not_equal_1(tensor *, tensor self, tensor other);
void atg_not_equal_out(tensor *, tensor out, tensor self, scalar other);
void atg_not_equal_out1(tensor *, tensor out, tensor self, tensor other);
void atg_nuclear_norm(tensor *, tensor self, int keepdim);
void atg_nuclear_norm1(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_nuclear_norm_out(tensor *, tensor out, tensor self, int keepdim);
@ -796,6 +1022,8 @@ void atg_orgqr(tensor *, tensor self, tensor input2);
void atg_orgqr_out(tensor *, tensor out, tensor self, tensor input2);
void atg_ormqr(tensor *, tensor self, tensor input2, tensor input3, int left, int transpose);
void atg_ormqr_out(tensor *, tensor out, tensor self, tensor input2, tensor input3, int left, int transpose);
void atg_outer(tensor *, tensor self, tensor vec2);
void atg_outer_out(tensor *, tensor out, tensor self, tensor vec2);
void atg_pairwise_distance(tensor *, tensor x1, tensor x2, double p, double eps, int keepdim);
void atg_pdist(tensor *, tensor self, double p);
void atg_permute(tensor *, tensor self, int64_t *dims_data, int dims_len);
@ -804,6 +1032,8 @@ void atg_pinverse(tensor *, tensor self, double rcond);
void atg_pixel_shuffle(tensor *, tensor self, int64_t upscale_factor);
void atg_poisson(tensor *, tensor self);
void atg_poisson_nll_loss(tensor *, tensor input, tensor target, int log_input, int full, double eps, int64_t reduction);
void atg_polar(tensor *, tensor abs, tensor angle);
void atg_polar_out(tensor *, tensor out, tensor abs, tensor angle);
void atg_polygamma(tensor *, int64_t n, tensor self);
void atg_polygamma_(tensor *, tensor self, int64_t n);
void atg_polygamma_out(tensor *, tensor out, int64_t n, tensor self);
@ -825,18 +1055,23 @@ void atg_q_per_channel_scales(tensor *, tensor self);
void atg_q_per_channel_zero_points(tensor *, tensor self);
void atg_qr(tensor *, tensor self, int some);
void atg_qr_out(tensor *, tensor Q, tensor R, tensor self, int some);
void atg_quantile(tensor *, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantile1(tensor *, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantile_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantile_out1(tensor *, tensor out, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantize_per_channel(tensor *, tensor self, tensor scales, tensor zero_points, int64_t axis, int dtype);
void atg_quantize_per_tensor(tensor *, tensor self, double scale, int64_t zero_point, int dtype);
tensor *atg_quantize_per_tensor1(tensor *tensors_data, int tensors_len, tensor scales, tensor zero_points, int dtype);
void atg_quantized_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor mean, tensor var, double eps, double output_scale, int64_t output_zero_point);
void atg_quantized_gru(tensor *, tensor input, tensor hx, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional, int batch_first);
void atg_quantized_gru1(tensor *, tensor data, tensor batch_sizes, tensor hx, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional);
void atg_quantized_gru_cell(tensor *, tensor input, tensor hx, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh, tensor packed_ih, tensor packed_hh, tensor col_offsets_ih, tensor col_offsets_hh, scalar scale_ih, scalar scale_hh, scalar zero_point_ih, scalar zero_point_hh);
void atg_quantized_lstm(tensor *, tensor input, tensor *hx_data, int hx_len, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional, int batch_first, int dtype, int use_dynamic);
void atg_quantized_lstm1(tensor *, tensor data, tensor batch_sizes, tensor *hx_data, int hx_len, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional, int dtype, int use_dynamic);
void atg_quantized_lstm_cell(tensor *, tensor input, tensor *hx_data, int hx_len, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh, tensor packed_ih, tensor packed_hh, tensor col_offsets_ih, tensor col_offsets_hh, scalar scale_ih, scalar scale_hh, scalar zero_point_ih, scalar zero_point_hh);
void atg_quantized_max_pool1d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_quantized_max_pool2d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_quantized_rnn_relu_cell(tensor *, tensor input, tensor hx, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh, tensor packed_ih, tensor packed_hh, tensor col_offsets_ih, tensor col_offsets_hh, scalar scale_ih, scalar scale_hh, scalar zero_point_ih, scalar zero_point_hh);
void atg_quantized_rnn_tanh_cell(tensor *, tensor input, tensor hx, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh, tensor packed_ih, tensor packed_hh, tensor col_offsets_ih, tensor col_offsets_hh, scalar scale_ih, scalar scale_hh, scalar zero_point_ih, scalar zero_point_hh);
void atg_rad2deg(tensor *, tensor self);
void atg_rad2deg_(tensor *, tensor self);
void atg_rad2deg_out(tensor *, tensor out, tensor self);
void atg_rand(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_rand_like(tensor *, tensor self);
void atg_rand_out(tensor *, tensor out, int64_t *size_data, int size_len);
@ -851,7 +1086,7 @@ void atg_randn_like(tensor *, tensor self);
void atg_randn_out(tensor *, tensor out, int64_t *size_data, int size_len);
void atg_random_(tensor *, tensor self);
void atg_random_1(tensor *, tensor self, int64_t to);
void atg_random_2(tensor *, tensor self, int64_t from, int64_t to);
void atg_random_2(tensor *, tensor self, int64_t from, int64_t to_v, uint8_t to_null);
void atg_randperm(tensor *, int64_t n, int options_kind, int options_device);
void atg_randperm_out(tensor *, tensor out, int64_t n);
void atg_range(tensor *, scalar start, scalar end, int options_kind, int options_device);
@ -882,8 +1117,8 @@ void atg_renorm_(tensor *, tensor self, scalar p, int64_t dim, scalar maxnorm);
void atg_renorm_out(tensor *, tensor out, tensor self, scalar p, int64_t dim, scalar maxnorm);
void atg_repeat(tensor *, tensor self, int64_t *repeats_data, int repeats_len);
void atg_repeat_interleave(tensor *, tensor repeats);
void atg_repeat_interleave1(tensor *, tensor self, tensor repeats, int64_t dim);
void atg_repeat_interleave2(tensor *, tensor self, int64_t repeats, int64_t dim);
void atg_repeat_interleave1(tensor *, tensor self, tensor repeats, int64_t dim_v, uint8_t dim_null);
void atg_repeat_interleave2(tensor *, tensor self, int64_t repeats, int64_t dim_v, uint8_t dim_null);
void atg_replication_pad1d(tensor *, tensor self, int64_t *padding_data, int padding_len);
void atg_replication_pad1d_backward(tensor *, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
void atg_replication_pad1d_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
@ -896,7 +1131,7 @@ void atg_replication_pad3d(tensor *, tensor self, int64_t *padding_data, int pad
void atg_replication_pad3d_backward(tensor *, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
void atg_replication_pad3d_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
void atg_replication_pad3d_out(tensor *, tensor out, tensor self, int64_t *padding_data, int padding_len);
void atg_requires_grad_(tensor *, tensor self, int _requires_grad);
void atg_requires_grad_(tensor *, tensor self, int requires_grad);
void atg_reshape(tensor *, tensor self, int64_t *shape_data, int shape_len);
void atg_reshape_as(tensor *, tensor self, tensor other);
void atg_resize_(tensor *, tensor self, int64_t *size_data, int size_len);
@ -929,14 +1164,23 @@ void atg_scatter(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter1(tensor *, tensor self, int64_t dim, tensor index, scalar value);
void atg_scatter_(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_1(tensor *, tensor self, int64_t dim, tensor index, scalar value);
void atg_scatter_2(tensor *, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len);
void atg_scatter_3(tensor *, tensor self, int64_t dim, tensor index, scalar value, char* reduce_ptr, int reduce_len);
void atg_scatter_add(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_add_(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_searchsorted(tensor *, tensor sorted_sequence, tensor self, int out_int32, int right);
void atg_searchsorted1(tensor *, tensor sorted_sequence, scalar self_scalar, int out_int32, int right);
void atg_searchsorted_out(tensor *, tensor out, tensor sorted_sequence, tensor self, int out_int32, int right);
void atg_select(tensor *, tensor self, int64_t dim, int64_t index);
void atg_select_backward(tensor *, tensor grad, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t index);
void atg_selu(tensor *, tensor self);
void atg_selu_(tensor *, tensor self);
void atg_set_(tensor *, tensor self);
void atg_set_1(tensor *, tensor self, tensor source);
void atg_set_requires_grad(tensor *, tensor self, int r);
void atg_sgn(tensor *, tensor self);
void atg_sgn_(tensor *, tensor self);
void atg_sgn_out(tensor *, tensor out, tensor self);
void atg_sigmoid(tensor *, tensor self);
void atg_sigmoid_(tensor *, tensor self);
void atg_sigmoid_backward(tensor *, tensor grad_output, tensor output);
@ -945,6 +1189,12 @@ void atg_sigmoid_out(tensor *, tensor out, tensor self);
void atg_sign(tensor *, tensor self);
void atg_sign_(tensor *, tensor self);
void atg_sign_out(tensor *, tensor out, tensor self);
void atg_signbit(tensor *, tensor self);
void atg_signbit_out(tensor *, tensor out, tensor self);
void atg_silu(tensor *, tensor self);
void atg_silu_(tensor *, tensor self);
void atg_silu_backward(tensor *, tensor grad_output, tensor self);
void atg_silu_out(tensor *, tensor out, tensor self);
void atg_sin(tensor *, tensor self);
void atg_sin_(tensor *, tensor self);
void atg_sin_out(tensor *, tensor out, tensor self);
@ -952,6 +1202,7 @@ void atg_sinh(tensor *, tensor self);
void atg_sinh_(tensor *, tensor self);
void atg_sinh_out(tensor *, tensor out, tensor self);
void atg_slice(tensor *, tensor self, int64_t dim, int64_t start, int64_t end, int64_t step);
void atg_slice_backward(tensor *, tensor grad, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t start, int64_t end, int64_t step);
void atg_slogdet(tensor *, tensor self);
void atg_slow_conv3d(tensor *, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
void atg_slow_conv3d_out(tensor *, tensor out, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
@ -962,10 +1213,10 @@ void atg_slow_conv_transpose2d_out(tensor *, tensor out, tensor self, tensor wei
void atg_slow_conv_transpose3d(tensor *, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *dilation_data, int dilation_len);
void atg_slow_conv_transpose3d_out(tensor *, tensor out, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *dilation_data, int dilation_len);
void atg_smm(tensor *, tensor self, tensor mat2);
void atg_smooth_l1_loss(tensor *, tensor self, tensor target, int64_t reduction);
void atg_smooth_l1_loss_backward(tensor *, tensor grad_output, tensor self, tensor target, int64_t reduction);
void atg_smooth_l1_loss_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, tensor target, int64_t reduction);
void atg_smooth_l1_loss_out(tensor *, tensor out, tensor self, tensor target, int64_t reduction);
void atg_smooth_l1_loss(tensor *, tensor self, tensor target, int64_t reduction, double beta);
void atg_smooth_l1_loss_backward(tensor *, tensor grad_output, tensor self, tensor target, int64_t reduction, double beta);
void atg_smooth_l1_loss_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, tensor target, int64_t reduction, double beta);
void atg_smooth_l1_loss_out(tensor *, tensor out, tensor self, tensor target, int64_t reduction, double beta);
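/* Editor's note (sketch): smooth_l1_loss gains an explicit beta in v1.7.0; passing
   1.0 is assumed to match the previous fixed behaviour, and reduction is assumed to
   follow the usual 0 = none / 1 = mean / 2 = sum encoding. */
static inline void example_atg_smooth_l1_loss(tensor self, tensor target) {
  tensor out__[1];
  atg_smooth_l1_loss(out__, self, target, 1, 1.0);  /* mean reduction, beta = 1.0 */
}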
void atg_soft_margin_loss(tensor *, tensor self, tensor target, int64_t reduction);
void atg_soft_margin_loss_backward(tensor *, tensor grad_output, tensor self, tensor target, int64_t reduction);
void atg_soft_margin_loss_backward_out(tensor *, tensor grad_input, tensor grad_output, tensor self, tensor target, int64_t reduction);
@ -1009,12 +1260,17 @@ void atg_std1(tensor *, tensor self, int64_t *dim_data, int dim_len, int unbiase
void atg_std_mean(tensor *, tensor self, int unbiased);
void atg_std_mean1(tensor *, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_std_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_stft(tensor *, tensor self, int64_t n_fft, int64_t hop_length, int64_t win_length, tensor window, int normalized, int onesided);
void atg_stft(tensor *, tensor self, int64_t n_fft, int64_t hop_length_v, uint8_t hop_length_null, int64_t win_length_v, uint8_t win_length_null, tensor window, int normalized, int onesided, int return_complex);
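/* Editor's note (sketch): stft now takes nullable hop_length / win_length and a
   return_complex flag; leaving both lengths omitted and return_complex = 0 is
   assumed to reproduce the pre-1.7 real-valued output layout. */
static inline void example_atg_stft_defaults(tensor self, tensor window) {
  tensor out__[1];
  atg_stft(out__, self, 400, 0, 1, 0, 1, window,
           /*normalized=*/0, /*onesided=*/1, /*return_complex=*/0);
}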
void atg_sub(tensor *, tensor self, tensor other);
void atg_sub1(tensor *, tensor self, scalar other);
void atg_sub_(tensor *, tensor self, tensor other);
void atg_sub_1(tensor *, tensor self, scalar other);
void atg_sub_out(tensor *, tensor out, tensor self, tensor other);
void atg_subtract(tensor *, tensor self, tensor other);
void atg_subtract1(tensor *, tensor self, scalar other);
void atg_subtract_(tensor *, tensor self, tensor other);
void atg_subtract_1(tensor *, tensor self, scalar other);
void atg_subtract_out(tensor *, tensor out, tensor self, tensor other);
void atg_sum(tensor *, tensor self, int dtype);
void atg_sum1(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_sum_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
@ -1026,6 +1282,7 @@ void atg_symeig_out(tensor *, tensor e, tensor V, tensor self, int eigenvectors,
void atg_t(tensor *, tensor self);
void atg_t_(tensor *, tensor self);
void atg_take(tensor *, tensor self, tensor index);
void atg_take_backward(tensor *, tensor grad, tensor input, tensor index);
void atg_take_out(tensor *, tensor out, tensor self, tensor index);
void atg_tan(tensor *, tensor self);
void atg_tan_(tensor *, tensor self);
@ -1055,6 +1312,7 @@ void atg_topk(tensor *, tensor self, int64_t k, int64_t dim, int largest, int so
void atg_topk_out(tensor *, tensor values, tensor indices, tensor self, int64_t k, int64_t dim, int largest, int sorted);
void atg_totype(tensor *, tensor self, int scalar_type);
void atg_trace(tensor *, tensor self);
void atg_trace_backward(tensor *, tensor grad, int64_t *sizes_data, int sizes_len);
void atg_transpose(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg_transpose_(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg_trapz(tensor *, tensor y, tensor x, int64_t dim);
@ -1080,51 +1338,67 @@ void atg_trunc_(tensor *, tensor self);
void atg_trunc_out(tensor *, tensor out, tensor self);
void atg_type_as(tensor *, tensor self, tensor other);
tensor *atg_unbind(tensor self, int64_t dim);
void atg_unflatten(tensor *, tensor self, int64_t dim, int64_t *sizes_data, int sizes_len);
void atg_unfold(tensor *, tensor self, int64_t dimension, int64_t size, int64_t step);
void atg_unfold_backward(tensor *, tensor grad_in, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t size, int64_t step);
void atg_uniform_(tensor *, tensor self, double from, double to);
void atg_unique_consecutive(tensor *, tensor self, int return_inverse, int return_counts, int64_t dim);
void atg_unique_consecutive(tensor *, tensor self, int return_inverse, int return_counts, int64_t dim_v, uint8_t dim_null);
void atg_unique_dim(tensor *, tensor self, int64_t dim, int sorted, int return_inverse, int return_counts);
void atg_unique_dim_consecutive(tensor *, tensor self, int64_t dim, int return_inverse, int return_counts);
tensor *atg_unsafe_chunk(tensor self, int64_t chunks, int64_t dim);
tensor *atg_unsafe_split(tensor self, int64_t split_size, int64_t dim);
tensor *atg_unsafe_split_with_sizes(tensor self, int64_t *split_sizes_data, int split_sizes_len, int64_t dim);
void atg_unsqueeze(tensor *, tensor self, int64_t dim);
void atg_unsqueeze_(tensor *, tensor self, int64_t dim);
void atg_upsample_bicubic2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h, double scales_w);
void atg_upsample_bicubic2d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h, double scales_w);
void atg_upsample_bicubic2d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h, double scales_w);
void atg_upsample_bicubic2d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h, double scales_w);
void atg_upsample_bilinear2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h, double scales_w);
void atg_upsample_bilinear2d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h, double scales_w);
void atg_upsample_bilinear2d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h, double scales_w);
void atg_upsample_bilinear2d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h, double scales_w);
void atg_upsample_linear1d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales);
void atg_upsample_linear1d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales);
void atg_upsample_linear1d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales);
void atg_upsample_linear1d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales);
void atg_upsample_nearest1d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales);
void atg_upsample_nearest1d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales);
void atg_upsample_nearest1d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales);
void atg_upsample_nearest1d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales);
void atg_upsample_nearest2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales_h, double scales_w);
void atg_upsample_nearest2d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_h, double scales_w);
void atg_upsample_nearest2d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_h, double scales_w);
void atg_upsample_nearest2d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales_h, double scales_w);
void atg_upsample_nearest3d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales_d, double scales_h, double scales_w);
void atg_upsample_nearest3d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_d, double scales_h, double scales_w);
void atg_upsample_nearest3d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_d, double scales_h, double scales_w);
void atg_upsample_nearest3d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales_d, double scales_h, double scales_w);
void atg_upsample_trilinear3d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_d, double scales_h, double scales_w);
void atg_upsample_trilinear3d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_d, double scales_h, double scales_w);
void atg_upsample_trilinear3d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_d, double scales_h, double scales_w);
void atg_upsample_trilinear3d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_d, double scales_h, double scales_w);
void atg_upsample_bicubic2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_bicubic2d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_bicubic2d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_bicubic2d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_bilinear2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_bilinear2d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_bilinear2d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_bilinear2d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_linear1d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_v, uint8_t scales_null);
void atg_upsample_linear1d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_v, uint8_t scales_null);
void atg_upsample_linear1d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_v, uint8_t scales_null);
void atg_upsample_linear1d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_v, uint8_t scales_null);
void atg_upsample_nearest1d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales_v, uint8_t scales_null);
void atg_upsample_nearest1d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_v, uint8_t scales_null);
void atg_upsample_nearest1d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_v, uint8_t scales_null);
void atg_upsample_nearest1d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales_v, uint8_t scales_null);
void atg_upsample_nearest2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_nearest2d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_nearest2d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_nearest2d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_nearest3d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_nearest3d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_nearest3d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_nearest3d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_trilinear3d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_trilinear3d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_trilinear3d_backward_out(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_upsample_trilinear3d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg_value_selecting_reduction_backward(tensor *, tensor grad, int64_t dim, tensor indices, int64_t *sizes_data, int sizes_len, int keepdim);
void atg_values(tensor *, tensor self);
void atg_vander(tensor *, tensor x, int64_t n_v, uint8_t n_null, int increasing);
void atg_var(tensor *, tensor self, int unbiased);
void atg_var1(tensor *, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_var_mean(tensor *, tensor self, int unbiased);
void atg_var_mean1(tensor *, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_var_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_vdot(tensor *, tensor self, tensor other);
void atg_vdot_out(tensor *, tensor out, tensor self, tensor other);
void atg_view(tensor *, tensor self, int64_t *size_data, int size_len);
void atg_view_as(tensor *, tensor self, tensor other);
void atg_view_as_complex(tensor *, tensor self);
void atg_view_as_real(tensor *, tensor self);
void atg_vstack(tensor *, tensor *tensors_data, int tensors_len);
void atg_vstack_out(tensor *, tensor out, tensor *tensors_data, int tensors_len);
tensor *atg_where(tensor condition);
void atg_where1(tensor *, tensor condition, tensor self, tensor other);
void atg_where2(tensor *, tensor condition, scalar self_scalar, tensor other);
void atg_where3(tensor *, tensor condition, tensor self, scalar other);
void atg_where4(tensor *, tensor condition, scalar self_scalar, scalar other);
void atg_zero_(tensor *, tensor self);
void atg_zeros(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_zeros_like(tensor *, tensor self);

View File

@ -254,6 +254,40 @@ func BatchAccuracyForLogits(vs *VarStore, m ts.ModuleT, xs, ys *ts.Tensor, d got
return sumAccuracy / sampleCount
}
func BatchAccuracyForLogitsOld(vs *VarStore, m ts.ModuleT, xs, ys *ts.Tensor, d gotch.Device, batchSize int) (retVal float64) {
    var (
        sumAccuracy float64 = 0.0
        sampleCount float64 = 0.0
    )

    vs.Freeze()
    defer vs.Unfreeze()

    iter2 := ts.MustNewIter2(xs, ys, int64(batchSize))
    for {
        item, ok := iter2.Next()
        if !ok {
            break
        }

        size := float64(item.Data.MustSize()[0])
        bImages := item.Data.MustTo(d, true)
        bLabels := item.Label.MustTo(d, true)

        logits := m.ForwardT(bImages, false)
        acc := logits.AccuracyForLogits(bLabels)
        sumAccuracy += acc.Float64Values()[0] * size
        sampleCount += size

        bImages.MustDrop()
        bLabels.MustDrop()
        acc.MustDrop()
    }

    return sumAccuracy / sampleCount
}
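
For orientation, a minimal usage sketch of the helper defined above; apart from `nn.BatchAccuracyForLogitsOld` itself, the function and variable names below are assumptions, not part of this diff:

```go
// evalAccuracy is a hypothetical wrapper around the helper above. It expects the
// VarStore holding the weights, a ts.ModuleT model, the evaluation tensors, and a
// target device, and evaluates in batches of 256.
func evalAccuracy(vs *nn.VarStore, model ts.ModuleT, xTest, yTest *ts.Tensor, device gotch.Device) float64 {
    // The helper freezes the VarStore, iterates (xTest, yTest) with Iter2, moves each
    // batch to `device`, and drops the batch tensors once their accuracy is accumulated.
    return nn.BatchAccuracyForLogitsOld(vs, model, xTest, yTest, device, 256)
}
```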
// BatchAccuracyForLogitIdx is an alternative to BatchAccuracyForLogits for
// calculating accuracy on a specified batch using the module's weights. It uses
// tensor indexing instead of Iter2.

View File

@ -1,8 +1,8 @@
#!/bin/bash
# Env
GOTCH_VERSION="${GOTCH_VER:-v0.2.0}"
LIBTORCH_VERSION="${LIBTORCH_VER:-1.5.1}"
GOTCH_VERSION="${GOTCH_VER:-v0.3.0}"
LIBTORCH_VERSION="${LIBTORCH_VER:-1.7.0}"
GOTCH="$GOPATH/pkg/mod/github.com/sugarme/gotch@$GOTCH_VERSION"
LIBTORCH="$GOPATH/pkg/mod/github.com/sugarme/gotch@$GOTCH_VERSION/libtch/libtorch"

View File

@ -1,7 +1,7 @@
#!/bin/bash
GOTCH_VERSION="${GOTCH_VER:-v0.2.0}"
LIBTORCH_VERSION="${LIBTORCH_VER:-1.5.1}"
GOTCH_VERSION="${GOTCH_VER:-v0.3.0}"
LIBTORCH_VERSION="${LIBTORCH_VER:-1.7.0}"
CUDA_VERSION="${CUDA_VER:-10.1}"
CU_VERSION="${CUDA_VERSION//./}"

setup.sh Executable file
View File

@ -0,0 +1,18 @@
#!/bin/bash
export GOTCH_VERSION="${GOTCH_VER:-v0.3.0}"
export LIBTORCH_VERSION="${LIBTORCH_VER:-1.7.0}"
export CUDA_VERSION="${CUDA_VER:-10.1}"
export CU_VERSION="${CUDA_VERSION//./}"
export GOTCH="$HOME/projects/sugarme/gotch"
export LIBTORCH="$HOME/projects/sugarme/gotch/libtch/libtorch"
export LIBRARY_PATH="$LIBTORCH/lib"
export CPATH="$LIBTORCH/lib:$LIBTORCH/include:$LIBTORCH/include/torch/csrc/api/include"
export LD_LIBRARY_PATH="$LIBTORCH/lib:/usr/lib64-nvidia:/usr/local/cuda-${CUDA_VERSION}/lib64"
sudo rm -rf $LIBTORCH
sudo mkdir -p $LIBTORCH
wget -O /tmp/libtorch-cxx11-abi-shared-with-deps-${LIBTORCH_VERSION}%2Bcu${CU_VERSION}.zip https://download.pytorch.org/libtorch/cu${CU_VERSION}/libtorch-cxx11-abi-shared-with-deps-${LIBTORCH_VERSION}%2Bcu${CU_VERSION}.zip
sudo unzip /tmp/libtorch-cxx11-abi-shared-with-deps-${LIBTORCH_VERSION}%2Bcu${CU_VERSION}.zip -d $GOTCH/libtch

File diff suppressed because it is too large

View File

@ -67,7 +67,8 @@ func (co *COptimizer) AddParameters(tensors []Tensor) error {
ntensors := len(tensors)
lib.AtoAddParameters(co.coptimizer, ctensors, ntensors)
// NOTE: temporarily switch back, as the param-group API has not been updated yet.
lib.AtoAddParametersOld(co.coptimizer, ctensors, ntensors)
return TorchErr()
}

View File

@ -19,7 +19,7 @@ func (ts *Tensor) CrossEntropyForLogits(targets *Tensor) (retVal *Tensor) {
// AccuracyForLogits returns the average accuracy for some given logits assuming that
// targets represent ground-truth.
func (ts *Tensor) AccuracyForLogits(targets *Tensor) (retVal *Tensor) {
argmax := ts.MustArgmax(-1, false, true)
argmax := ts.MustArgmax([]int64{-1}, false, true)
eq1 := argmax.MustEq1(targets, true)
return eq1.MustTotype(gotch.Float, true).MustMean(gotch.Float, true)
}
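
A small hedged sketch of calling this method (the helper and variable names are assumptions). Judging from the `del` flags in the body above, the receiver is consumed while `targets` stays alive:

```go
// reportBatchAccuracy is a hypothetical helper. `logits` is [batchSize, nClasses] and
// `labels` holds the ground-truth class indices for the batch.
func reportBatchAccuracy(logits, labels *ts.Tensor) float64 {
    acc := logits.AccuracyForLogits(labels) // `logits` is dropped inside (del = true)
    v := acc.Float64Values()[0]
    acc.MustDrop() // the returned scalar tensor still has to be freed explicitly
    return v       // `labels` remains usable by the caller
}
```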

File diff suppressed because it is too large

View File

@ -1171,7 +1171,7 @@ func (ts *Tensor) Swish() *Tensor {
}
func (ts *Tensor) AvgPool2DDefault(ksize int64, del bool) *Tensor {
return ts.MustAvgPool2d([]int64{ksize, ksize}, []int64{ksize, ksize}, []int64{0, 0}, false, true, 1, del)
return ts.MustAvgPool2d([]int64{ksize, ksize}, []int64{ksize, ksize}, []int64{0, 0}, false, true, []int64{1}, del)
}
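
A hedged sketch showing the convenience wrapper next to the equivalent direct call under the v1.7 signature; the helper and variable names are assumptions:

```go
// poolTwice is a hypothetical helper. `featureMap` is an NCHW tensor.
func poolTwice(featureMap *ts.Tensor) *ts.Tensor {
    // Convenience wrapper: square 2x2 kernel and stride; del = false keeps the input alive.
    pooled := featureMap.AvgPool2DDefault(2, false)

    // Direct call: same pooling again, but note the divisor-override argument is now a
    // slice ([]int64{1}); del = true drops the intermediate `pooled` tensor.
    return pooled.MustAvgPool2d([]int64{2, 2}, []int64{2, 2}, []int64{0, 0}, false, true, []int64{1}, true)
}
```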
// SaveMultiNew saves a slice of named tensors to the given file path.
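
A heavily hedged sketch of how this saver might be used; the `NamedTensor` fields, the error return, and the exact `SaveMultiNew` signature are assumptions and do not appear in this diff:

```go
// Assumed API shape: SaveMultiNew(namedTensors []ts.NamedTensor, path string) error.
// `w1` and `b1` are placeholder tensors created elsewhere.
named := []ts.NamedTensor{
    {Name: "fc1.weight", Tensor: w1},
    {Name: "fc1.bias", Tensor: b1},
}
if err := ts.SaveMultiNew(named, "model-weights.gt"); err != nil {
    log.Fatal(err)
}
```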

View File

@ -103,7 +103,7 @@ func densenet(p *nn.Path, cIn, cOut, bnSize int64, blockConfig []int64, growth i
seq.AddFn(nn.NewFunc(func(xs *ts.Tensor) *ts.Tensor {
tmp1 := xs.MustRelu(false)
tmp2 := tmp1.MustAvgPool2d([]int64{7, 7}, []int64{1, 1}, []int64{0, 0}, false, true, 1, true)
tmp2 := tmp1.MustAvgPool2d([]int64{7, 7}, []int64{1, 1}, []int64{0, 0}, false, true, []int64{1}, true)
res := tmp2.FlatView()
tmp2.MustDrop()
return res

View File

@ -78,7 +78,7 @@ func inceptionA(p *nn.Path, cIn, cPool int64) ts.ModuleT {
b3Ts := b3Tmp2.ApplyT(b33, train)
b3Tmp2.MustDrop()
bpoolTmp := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, 9, false)
bpoolTmp := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, []int64{9}, false)
bpoolTs := bpoolTmp.ApplyT(bpool, train)
res := ts.MustCat([]ts.Tensor{*b1Ts, *b2Ts, *b3Ts, *bpoolTs}, 1)
@ -145,7 +145,7 @@ func inceptionC(p *nn.Path, cIn int64, c7 int64) ts.ModuleT {
b3Ts := b3Tmp4.ApplyT(b35, train)
b3Tmp4.MustDrop()
bpTmp1 := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, 9, false)
bpTmp1 := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, []int64{9}, false)
bpoolTs := bpTmp1.ApplyT(bpool, train)
return ts.MustCat([]ts.Tensor{*b1Ts, *b2Ts, *b3Ts, *bpoolTs}, 1)
@ -211,7 +211,7 @@ func inceptionE(p *nn.Path, cIn int64) ts.ModuleT {
b3bTs := b3Tmp2.ApplyT(b33b, train)
b3Ts := ts.MustCat([]ts.Tensor{*b3aTs, *b3bTs}, 1)
bpTmp1 := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, 9, false)
bpTmp1 := xs.MustAvgPool2d([]int64{3, 3}, []int64{1, 1}, []int64{1, 1}, false, true, []int64{9}, false)
bpoolTs := bpTmp1.ApplyT(bpool, train)
return ts.MustCat([]ts.Tensor{*b1Ts, *b2Ts, *b3Ts, *bpoolTs}, 1)