upgrade to libtorch 1.10

sugarme 2021-11-06 14:44:27 +11:00
parent 494f063642
commit 6c4ce7f55f
13 changed files with 143988 additions and 596 deletions


@@ -6,11 +6,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
- [#58] Fixed incorrect conversion of IValue from CIValue, case 1 (Tensor).
## [Nofix]
- ctype `long` caused a compile error on macOS, as noted in [#44]. Not working on a Linux box.
## [0.5.0]
- Upgraded to libtorch 1.10
- [#58] Fixed incorrect conversion of IValue from CIValue, case 1 (Tensor).
## [0.4.5]
- Added Conv3DConfig and Conv3DConfig Option
- Added missing Tensor method APIs that return multiple tensors (e.g. `tensor.Svd`).
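For reference, those multi-return wrappers come back as multiple Go values. A minimal sketch, assuming v0.5.0's generated `MustRand`/`MustSvd` wrappers (names per `tensor/must-tensor-generated.go`; exact signatures hedged, not verified against every release):

```go
package main

import (
	"fmt"

	"github.com/sugarme/gotch"
	ts "github.com/sugarme/gotch/tensor"
)

func main() {
	// Svd is one of the generated APIs that return multiple tensors.
	x := ts.MustRand([]int64{3, 3}, gotch.Float, gotch.CPU)
	u, s, v := x.MustSvd(true, true, true) // some=true, computeUv=true, del=true
	fmt.Println(u.MustSize(), s.MustSize(), v.MustSize())
}
```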


@@ -3,10 +3,10 @@
## Overview
Gotch creates a thin wrapper around the Pytorch C++ API (Libtorch) to make use of its already optimized C++ tensor APIs (over 1,700) and dynamic graph computation with CUDA support, and provides idiomatic Go APIs for developing and implementing deep learning in Go.
Gotch creates a thin wrapper around the Pytorch C++ API (Libtorch) to make use of its already optimized C++ tensor APIs (~2,169) and dynamic graph computation with CUDA support, and provides idiomatic Go APIs for developing and implementing deep learning in Go.
**Some features are**
- [x] Comprehensive Pytorch tensor APIs (~ 1716)
- [x] Comprehensive Pytorch tensor APIs (~ 1844)
- [x] Fully featured Pytorch dynamic graph computation
- [x] JIT interface to run model trained/saved using PyTorch Python API
- [x] Load pretrained Pytorch models and run inference
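A short taste of what those generated tensor APIs look like from Go. A sketch, assuming the `MustOfSlice`/`MustView` wrappers from the generated tensor package (the pow/sum calls appear in this commit's test diff; the rest is hedged):

```go
package main

import (
	"github.com/sugarme/gotch"
	ts "github.com/sugarme/gotch/tensor"
)

func main() {
	// Build a 2x3 tensor from a Go slice, then chain two of the
	// generated libtorch ops (square elementwise, then sum).
	x := ts.MustOfSlice([]float64{1, 2, 3, 4, 5, 6}).MustView([]int64{2, 3}, true)
	sq := x.MustPowTensorScalar(ts.IntScalar(2), false)
	sum := sq.MustSum(gotch.Float, true) // 91
	x.Print()
	sum.Print()
}
```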
@@ -18,12 +18,12 @@ Gotch is in active development mode and may have API breaking changes. Feel free
## Dependencies
- **Libtorch** C++ v1.9.0 library of [Pytorch](https://pytorch.org/)
- **Libtorch** C++ v1.10.0 library of [Pytorch](https://pytorch.org/)
## Installation
- Default CUDA version is `11.1` if CUDA is available; otherwise the CPU version is used.
- Default Pytorch C++ API version is `1.9.0`
- Default Pytorch C++ API version is `1.10.0`
**NOTE**: `libtorch` will be installed at **`/usr/local/lib`**
@@ -51,7 +51,7 @@ Gotch is in active development mode and may have API breaking changes. Feel free
```bash
wget https://raw.githubusercontent.com/sugarme/gotch/master/setup-gotch.sh
chmod +x setup-gotch.sh
export CUDA_VER=cpu && export GOTCH_VER=v0.4.5 && bash setup-gotch.sh
export CUDA_VER=cpu && export GOTCH_VER=v0.5.0 && bash setup-gotch.sh
```
### GPU
@@ -89,9 +89,9 @@ Gotch is in active development mode and may have API breaking changes. Feel free
wget https://raw.githubusercontent.com/sugarme/gotch/master/setup-gotch.sh
chmod +x setup-gotch.sh
# CUDA 10.2
export CUDA_VER=10.2 && export GOTCH_VER=v0.4.5 && bash setup-gotch.sh
export CUDA_VER=10.2 && export GOTCH_VER=v0.5.0 && bash setup-gotch.sh
# CUDA 11.1
export CUDA_VER=11.1 && export GOTCH_VER=v0.4.5 && bash setup-gotch.sh
export CUDA_VER=11.1 && export GOTCH_VER=v0.5.0 && bash setup-gotch.sh
```
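Whichever variant is installed, a one-file smoke test confirms that cgo can find and link libtorch. A sketch using the generated wrappers (swap `gotch.CPU` for a CUDA device on a GPU install):

```go
package main

import (
	"fmt"

	"github.com/sugarme/gotch"
	ts "github.com/sugarme/gotch/tensor"
)

func main() {
	// If this builds and runs, the libtorch 1.10 install above was found.
	x := ts.MustOfSlice([]int64{1, 2, 3}).MustTo(gotch.CPU, true)
	fmt.Printf("shape: %v\n", x.MustSize())
}
```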
## Examples


@@ -30,13 +30,18 @@ let excluded_functions =
; "_cummax_helper"
; "retain_grad"
; "_validate_sparse_coo_tensor_args"
; "_validate_sparse_csr_tensor_args"
; "_backward"
; "size"
; "stride"
; "histogram_out"
; "histogram"
; "_assert_async"
; "gradient"
; "linalg_vector_norm"
; "linalg_vector_norm_out" ]
; "linalg_vector_norm_out"
; "linalg_matrix_norm"
; "linalg_matrix_norm_out"]
let no_tensor_options =
Set.of_list
@@ -129,7 +134,7 @@ module Func = struct
| "at::device" -> Some Device
| "const at::scalar &" | "at::scalar" -> Some Scalar
| "at::scalartype" -> Some ScalarType
| "std::string" -> Some String
| "c10::string_view" -> Some String
| _ -> None
let c_typed_args_list t =
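The `c10::string_view` change only affects this mapping table; on the wire a String argument still crosses cgo as a pointer plus an explicit length, matching shims such as `atg__linalg_qr_helper(..., char* mode_ptr, int mode_len)` in the generated header. A hypothetical pure-Go sketch of that pairing (illustrative only, not the actual generated marshalling code):

```go
// stringArg mirrors the (ptr, len) convention the generated C shims
// expect for string parameters; gotch's real bindings do the
// equivalent through cgo.
func stringArg(s string) (ptr *byte, n int) {
	if len(s) == 0 {
		return nil, 0 // empty string: no data pointer, zero length
	}
	b := []byte(s)
	return &b[0], len(b)
}
```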
@@ -801,6 +806,8 @@ let write_wrapper funcs filename =
; "AlignTensors"
; "BroadcastTensors"
; "Meshgrid"
; "MeshgridIndexing"
; "_ToCpu"
; "NonzeroNumpy"
; "Split"
; "SplitWithSizes"
@@ -1023,6 +1030,8 @@ let write_must_wrapper funcs filename =
; "AlignTensors"
; "BroadcastTensors"
; "Meshgrid"
; "MeshgridIndexing"
; "_ToCpu"
; "NonzeroNumpy"
; "Split"
; "SplitWithSizes"
@@ -1335,7 +1344,7 @@ let run ~yaml_filename ~cpp_filename ~ffi_filename ~must_wrapper_filename
write_wrapper funcs wrapper_filename
let () =
run ~yaml_filename:"gen/pytorch/Declarations-v1.9.0.yaml"
run ~yaml_filename:"gen/pytorch/Declarations-v1.10.0.yaml"
~cpp_filename:"libtch/torch_api_generated"
~ffi_filename:"libtch/c-generated.go"
~must_wrapper_filename:"tensor/must-tensor-generated.go"
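Entries added to the wrapper lists above, such as `Meshgrid`/`MeshgridIndexing`, surface in Go as functions returning `[]Tensor` rather than a fixed tuple. A sketch of calling one of them, assuming v0.5.0's value-style generated API (receiver and slice element types have shifted across gotch versions):

```go
package main

import (
	"fmt"

	"github.com/sugarme/gotch"
	ts "github.com/sugarme/gotch/tensor"
)

func main() {
	// Meshgrid takes a slice of tensors and returns one grid per input.
	a := ts.MustArange(ts.IntScalar(3), gotch.Int64, gotch.CPU)
	b := ts.MustArange(ts.IntScalar(2), gotch.Int64, gotch.CPU)
	for _, g := range ts.MustMeshgrid([]ts.Tensor{a, b}) {
		fmt.Println(g.MustSize()) // [3 2], twice
	}
}
```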

gen/gen.ml.1.9 (new file, 1342 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -28,12 +28,12 @@ void atg__add_batch_dim(tensor *, tensor self, int64_t batch_dim, int64_t level)
void atg__add_relu(tensor *, tensor self, tensor other);
void atg__add_relu_(tensor *, tensor self, tensor other);
void atg__add_relu_out(tensor *, tensor out, tensor self, tensor other);
void atg__add_relu_scalar(tensor *, tensor self, scalar other);
void atg__add_relu_scalar_(tensor *, tensor self, scalar other);
void atg__aminmax(tensor *, tensor self);
void atg__aminmax_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg__amp_update_scale_(tensor *, tensor self, tensor growth_tracker, tensor found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval);
void atg__baddbmm_mkl_(tensor *, tensor self, tensor batch1, tensor batch2);
void atg__bmm(tensor *, tensor self, tensor mat2, int deterministic);
void atg__bmm_out(tensor *, tensor out, tensor self, tensor mat2, int deterministic);
void atg__cast_byte(tensor *, tensor self, int non_blocking);
void atg__cast_char(tensor *, tensor self, int non_blocking);
void atg__cast_double(tensor *, tensor self, int non_blocking);
@@ -51,11 +51,18 @@ void atg__coalesced_(tensor *, tensor self, int coalesced);
void atg__compute_linear_combination(tensor *, tensor input, tensor coefficients);
void atg__compute_linear_combination_out(tensor *, tensor out, tensor input, tensor coefficients);
void atg__conj(tensor *, tensor self);
void atg__conj_physical(tensor *, tensor self);
void atg__conv_depthwise2d(tensor *, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len);
void atg__conv_depthwise2d_backward(tensor *, tensor grad_input, tensor grad_weight, tensor grad_output, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len);
void atg__conv_depthwise2d_out(tensor *, tensor out, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len);
void atg__convert_indices_from_coo_to_csr(tensor *, tensor self, int64_t size, int out_int32);
void atg__convert_indices_from_coo_to_csr_out(tensor *, tensor out, tensor self, int64_t size, int out_int32);
void atg__convolution(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len, int64_t groups, int benchmark, int deterministic, int cudnn_enabled, int allow_tf32);
void atg__convolution_deprecated(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len, int64_t groups, int benchmark, int deterministic, int cudnn_enabled);
void atg__convolution_mode(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, char* padding_ptr, int padding_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg__convolution_nogroup(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len);
void atg__copy_from(tensor *, tensor self, tensor dst, int non_blocking);
void atg__copy_from_and_resize(tensor *, tensor self, tensor dst);
void atg__ctc_loss(tensor *, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int zero_infinity);
void atg__ctc_loss_backward(tensor *, tensor grad, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, tensor neg_log_likelihood, tensor log_alpha, int64_t blank, int zero_infinity);
void atg__cudnn_ctc_loss(tensor *, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int deterministic, int zero_infinity);
@@ -64,11 +71,9 @@ void atg__cudnn_rnn(tensor *, tensor input, tensor *weight_data, int weight_len,
void atg__cudnn_rnn_flatten_weight(tensor *, tensor *weight_arr_data, int weight_arr_len, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, int batch_first, int bidirectional);
int64_t atg__cufft_get_plan_cache_max_size(int64_t device_index);
int64_t atg__cufft_get_plan_cache_size(int64_t device_index);
void atg__cumprod(tensor *, tensor self, int64_t dim);
void atg__cumprod_out(tensor *, tensor out, tensor self, int64_t dim);
void atg__cumsum(tensor *, tensor self, int64_t dim);
void atg__cumsum_out(tensor *, tensor out, tensor self, int64_t dim);
int64_t atg__debug_has_internal_overlap(tensor self);
void atg__det_lu_based_helper(tensor *, tensor self);
void atg__det_lu_based_helper_backward_helper(tensor *, tensor det_grad, tensor det, tensor self, tensor lu, tensor pivs);
void atg__dim_arange(tensor *, tensor like, int64_t dim);
int64_t atg__dimi(tensor self);
int64_t atg__dimv(tensor self);
@@ -86,6 +91,7 @@ void atg__fake_quantize_learnable_per_channel_affine(tensor *, tensor self, tens
void atg__fake_quantize_learnable_per_channel_affine_backward(tensor *, tensor grad, tensor self, tensor scale, tensor zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor);
void atg__fake_quantize_learnable_per_tensor_affine(tensor *, tensor self, tensor scale, tensor zero_point, int64_t quant_min, int64_t quant_max, double grad_factor);
void atg__fake_quantize_learnable_per_tensor_affine_backward(tensor *, tensor grad, tensor self, tensor scale, tensor zero_point, int64_t quant_min, int64_t quant_max, double grad_factor);
void atg__fake_quantize_per_tensor_affine_cachemask_tensor_qparams(tensor *, tensor self, tensor scale, tensor zero_point, tensor fake_quant_enabled, int64_t quant_min, int64_t quant_max);
void atg__fft_c2c(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t normalization, int forward);
void atg__fft_c2c_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int64_t normalization, int forward);
void atg__fft_c2r(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t normalization, int64_t last_dim_size);
@@ -93,6 +99,7 @@ void atg__fft_c2r_out(tensor *, tensor out, tensor self, int64_t *dim_data, int
void atg__fft_r2c(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t normalization, int onesided);
void atg__fft_r2c_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int64_t normalization, int onesided);
void atg__fused_dropout(tensor *, tensor self, double p);
void atg__fused_moving_avg_obs_fq_helper(tensor *, tensor self, tensor observer_on, tensor fake_quant_on, tensor running_min, tensor running_max, tensor scale, tensor zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int per_row_fake_quant, int symmetric_quant);
void atg__fw_primal(tensor *, tensor self, int64_t level);
void atg__gather_sparse_backward(tensor *, tensor self, int64_t dim, tensor index, tensor grad);
void atg__grid_sampler_2d_cpu_fallback(tensor *, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
@@ -104,9 +111,10 @@ void atg__indices(tensor *, tensor self);
void atg__inverse_helper(tensor *, tensor self);
void atg__linalg_inv_out_helper_(tensor *, tensor self, tensor infos_lu, tensor infos_getri);
void atg__linalg_qr_helper(tensor *, tensor self, char* mode_ptr, int mode_len);
void atg__linalg_solve_out_helper_(tensor *, tensor self, tensor other, tensor infos);
void atg__log_softmax(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__log_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__log_softmax_backward_data_out(tensor *, tensor out, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__log_softmax_out(tensor *, tensor out, tensor self, int64_t dim, int half_to_float);
void atg__logcumsumexp(tensor *, tensor self, int64_t dim);
void atg__logcumsumexp_out(tensor *, tensor out, tensor self, int64_t dim);
void atg__lu_with_info(tensor *, tensor self, int pivot, int check_errors);
@@ -117,6 +125,7 @@ void atg__masked_scale(tensor *, tensor self, tensor mask, double scale);
void atg__mkldnn_reshape(tensor *, tensor self, int64_t *shape_data, int shape_len);
void atg__mkldnn_transpose(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg__mkldnn_transpose_(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg__neg_view(tensor *, tensor self);
int atg__nnpack_available();
void atg__nnpack_spatial_convolution(tensor *, tensor input, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len);
void atg__nnpack_spatial_convolution_backward_input(tensor *, tensor input, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len);
@@ -126,12 +135,15 @@ void atg__pack_padded_sequence(tensor *, tensor input, tensor lengths, int batch
void atg__pack_padded_sequence_backward(tensor *, tensor grad, int64_t *input_size_data, int input_size_len, tensor batch_sizes, int batch_first);
void atg__pad_packed_sequence(tensor *, tensor data, tensor batch_sizes, int batch_first, scalar padding_value, int64_t total_length);
void atg__pdist_backward(tensor *, tensor grad, tensor self, double p, tensor pdist);
void atg__pin_memory(tensor *, tensor self, int device);
void atg__remove_batch_dim(tensor *, tensor self, int64_t level, int64_t batch_size, int64_t out_dim);
void atg__reshape_alias(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len);
void atg__reshape_from_tensor(tensor *, tensor self, tensor shape);
void atg__rowwise_prune(tensor *, tensor weight, tensor mask, int compressed_indices_dtype);
void atg__s_where(tensor *, tensor condition, tensor self, tensor other);
void atg__sample_dirichlet(tensor *, tensor self);
void atg__saturate_weight_to_fp16(tensor *, tensor weight);
void atg__segment_reduce_backward(tensor *, tensor grad, tensor output, tensor data, char* reduce_ptr, int reduce_len, tensor lengths, int64_t axis);
void atg__shape_as_tensor(tensor *, tensor self);
void atg__sobol_engine_draw(tensor *, tensor quasi, int64_t n, tensor sobolstate, int64_t dimension, int64_t num_generated, int dtype);
void atg__sobol_engine_ff_(tensor *, tensor self, int64_t n, tensor sobolstate, int64_t dimension, int64_t num_generated);
@@ -139,13 +151,14 @@ void atg__sobol_engine_initialize_state_(tensor *, tensor self, int64_t dimensio
void atg__sobol_engine_scramble_(tensor *, tensor self, tensor ltm, int64_t dimension);
void atg__softmax(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__softmax_backward_data_out(tensor *, tensor grad_input, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__softmax_out(tensor *, tensor out, tensor self, int64_t dim, int half_to_float);
void atg__solve_helper(tensor *, tensor self, tensor A);
void atg__sparse_addmm(tensor *, tensor self, tensor sparse, tensor dense);
void atg__sparse_coo_tensor_unsafe(tensor *, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_with_dims(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_with_dims_and_tensors(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int options_kind, int options_device);
void atg__sparse_csr_tensor(tensor *, tensor crow_indices, tensor col_indices, tensor values, int options_kind, int options_device);
void atg__sparse_csr_tensor_crow_col_value_size(tensor *, tensor crow_indices, tensor col_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_csr_tensor_unsafe(tensor *, tensor crow_indices, tensor col_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_log_softmax(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__sparse_log_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__sparse_log_softmax_int(tensor *, tensor self, int64_t dim, int dtype);
@@ -172,6 +185,8 @@ void atg__test_optional_filled_intlist(tensor *, tensor values, int64_t *addends
void atg__test_optional_intlist(tensor *, tensor values, int64_t *addends_data, int addends_len);
void atg__test_serialization_subcmul(tensor *, tensor self, tensor other);
void atg__test_string_default(tensor *, tensor dummy, char* a_ptr, int a_len, char* b_ptr, int b_len);
void atg__to_copy(tensor *, tensor self, int options_kind, int options_device, int non_blocking);
tensor *atg__to_cpu(tensor *tensors_data, int tensors_len);
void atg__trilinear(tensor *, tensor i1, tensor i2, tensor i3, int64_t *expand1_data, int expand1_len, int64_t *expand2_data, int expand2_len, int64_t *expand3_data, int expand3_len, int64_t *sumdim_data, int sumdim_len, int64_t unroll_dim);
void atg__unique(tensor *, tensor self, int sorted, int return_inverse);
void atg__unique2(tensor *, tensor self, int sorted, int return_inverse, int return_counts);
@@ -241,6 +256,7 @@ void atg_alias(tensor *, tensor self);
void atg_align_as(tensor *, tensor self, tensor other);
tensor *atg_align_tensors(tensor *tensors_data, int tensors_len);
void atg_all(tensor *, tensor self);
void atg_all_all_out(tensor *, tensor out, tensor self);
void atg_all_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_all_out(tensor *, tensor out, tensor self, int64_t dim, int keepdim);
int atg_allclose(tensor self, tensor other, double rtol, double atol, int equal_nan);
@@ -250,9 +266,12 @@ void atg_amax(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim
void atg_amax_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_amin(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_amin_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_aminmax(tensor *, tensor self, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_aminmax_out(tensor *, tensor min, tensor max, tensor self, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_angle(tensor *, tensor self);
void atg_angle_out(tensor *, tensor out, tensor self);
void atg_any(tensor *, tensor self);
void atg_any_all_out(tensor *, tensor out, tensor self);
void atg_any_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_any_out(tensor *, tensor out, tensor self, int64_t dim, int keepdim);
void atg_arange(tensor *, scalar end, int options_kind, int options_device);
@@ -349,6 +368,13 @@ void atg_bitwise_and_scalar_out(tensor *, tensor out, tensor self, scalar other)
void atg_bitwise_and_tensor(tensor *, tensor self, tensor other);
void atg_bitwise_and_tensor_(tensor *, tensor self, tensor other);
void atg_bitwise_and_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_bitwise_left_shift(tensor *, tensor self, tensor other);
void atg_bitwise_left_shift_(tensor *, tensor self, tensor other);
void atg_bitwise_left_shift_scalar_tensor(tensor *, scalar self_scalar, tensor other);
void atg_bitwise_left_shift_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_bitwise_left_shift_tensor_scalar(tensor *, tensor self, scalar other);
void atg_bitwise_left_shift_tensor_scalar_(tensor *, tensor self, scalar other);
void atg_bitwise_left_shift_tensor_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_bitwise_not(tensor *, tensor self);
void atg_bitwise_not_(tensor *, tensor self);
void atg_bitwise_not_out(tensor *, tensor out, tensor self);
@@ -358,6 +384,13 @@ void atg_bitwise_or_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_bitwise_or_tensor(tensor *, tensor self, tensor other);
void atg_bitwise_or_tensor_(tensor *, tensor self, tensor other);
void atg_bitwise_or_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_bitwise_right_shift(tensor *, tensor self, tensor other);
void atg_bitwise_right_shift_(tensor *, tensor self, tensor other);
void atg_bitwise_right_shift_scalar_tensor(tensor *, scalar self_scalar, tensor other);
void atg_bitwise_right_shift_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_bitwise_right_shift_tensor_scalar(tensor *, tensor self, scalar other);
void atg_bitwise_right_shift_tensor_scalar_(tensor *, tensor self, scalar other);
void atg_bitwise_right_shift_tensor_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_bitwise_xor(tensor *, tensor self, scalar other);
void atg_bitwise_xor_(tensor *, tensor self, scalar other);
void atg_bitwise_xor_scalar_out(tensor *, tensor out, tensor self, scalar other);
@@ -431,8 +464,12 @@ void atg_column_stack_out(tensor *, tensor out, tensor *tensors_data, int tensor
void atg_combinations(tensor *, tensor self, int64_t r, int with_replacement);
void atg_complex(tensor *, tensor real, tensor imag);
void atg_complex_out(tensor *, tensor out, tensor real, tensor imag);
void atg_concat(tensor *, tensor *tensors_data, int tensors_len, int64_t dim);
void atg_concat_out(tensor *, tensor out, tensor *tensors_data, int tensors_len, int64_t dim);
void atg_conj(tensor *, tensor self);
void atg_conj_out(tensor *, tensor out, tensor self);
void atg_conj_physical(tensor *, tensor self);
void atg_conj_physical_(tensor *, tensor self);
void atg_conj_physical_out(tensor *, tensor out, tensor self);
void atg_constant_pad_nd(tensor *, tensor self, int64_t *pad_data, int pad_len);
void atg_contiguous(tensor *, tensor self);
void atg_conv1d(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int64_t groups);
@@ -457,6 +494,7 @@ void atg_copysign_out(tensor *, tensor out, tensor self, tensor other);
void atg_copysign_scalar(tensor *, tensor self, scalar other);
void atg_copysign_scalar_(tensor *, tensor self, scalar other);
void atg_copysign_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_corrcoef(tensor *, tensor self);
void atg_cos(tensor *, tensor self);
void atg_cos_(tensor *, tensor self);
void atg_cos_out(tensor *, tensor out, tensor self);
@@ -467,8 +505,9 @@ void atg_cosine_embedding_loss(tensor *, tensor input1, tensor input2, tensor ta
void atg_cosine_similarity(tensor *, tensor x1, tensor x2, int64_t dim, double eps);
void atg_count_nonzero(tensor *, tensor self, int64_t dim_v, uint8_t dim_null);
void atg_count_nonzero_dim_intlist(tensor *, tensor self, int64_t *dim_data, int dim_len);
void atg_cov(tensor *, tensor self, int64_t correction, tensor fweights, tensor aweights);
void atg_cross(tensor *, tensor self, tensor other, int64_t dim_v, uint8_t dim_null);
void atg_cross_entropy_loss(tensor *, tensor self, tensor target, tensor weight, int64_t reduction, int64_t ignore_index);
void atg_cross_entropy_loss(tensor *, tensor self, tensor target, tensor weight, int64_t reduction, int64_t ignore_index, double label_smoothing);
void atg_cross_out(tensor *, tensor out, tensor self, tensor other, int64_t dim_v, uint8_t dim_null);
void atg_crow_indices(tensor *, tensor self);
void atg_ctc_loss(tensor *, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int64_t reduction, int zero_infinity);
@@ -504,6 +543,8 @@ void atg_cumprod_out(tensor *, tensor out, tensor self, int64_t dim, int dtype);
void atg_cumsum(tensor *, tensor self, int64_t dim, int dtype);
void atg_cumsum_(tensor *, tensor self, int64_t dim, int dtype);
void atg_cumsum_out(tensor *, tensor out, tensor self, int64_t dim, int dtype);
void atg_cumulative_trapezoid(tensor *, tensor y, int64_t dim);
void atg_cumulative_trapezoid_x(tensor *, tensor y, tensor x, int64_t dim);
void atg_data(tensor *, tensor self);
void atg_deg2rad(tensor *, tensor self);
void atg_deg2rad_(tensor *, tensor self);
@@ -520,7 +561,7 @@ void atg_diag_embed(tensor *, tensor self, int64_t offset, int64_t dim1, int64_t
void atg_diag_out(tensor *, tensor out, tensor self, int64_t diagonal);
void atg_diagflat(tensor *, tensor self, int64_t offset);
void atg_diagonal(tensor *, tensor self, int64_t offset, int64_t dim1, int64_t dim2);
void atg_diagonal_backward(tensor *, tensor grad, int64_t *input_sizes_data, int input_sizes_len, int64_t offset, int64_t dim1, int64_t dim2);
void atg_diagonal_backward(tensor *, tensor grad_output, int64_t *input_sizes_data, int input_sizes_len, int64_t offset, int64_t dim1, int64_t dim2);
void atg_diff(tensor *, tensor self, int64_t n, int64_t dim, tensor prepend, tensor append);
void atg_diff_out(tensor *, tensor out, tensor self, int64_t n, int64_t dim, tensor prepend, tensor append);
void atg_digamma(tensor *, tensor self);
@@ -561,6 +602,7 @@ void atg_einsum(tensor *, char* equation_ptr, int equation_len, tensor *tensors_
void atg_elu(tensor *, tensor self);
void atg_elu_(tensor *, tensor self);
void atg_elu_backward(tensor *, tensor grad_output, scalar alpha, scalar scale, scalar input_scale, int is_result, tensor self_or_result);
void atg_elu_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, scalar alpha, scalar scale, scalar input_scale, int is_result, tensor self_or_result);
void atg_elu_out(tensor *, tensor out, tensor self);
void atg_embedding(tensor *, tensor weight, tensor indices, int64_t padding_idx, int scale_grad_by_freq, int sparse);
void atg_embedding_backward(tensor *, tensor grad, tensor indices, int64_t num_weights, int64_t padding_idx, int scale_grad_by_freq, int sparse);
@@ -572,7 +614,7 @@ void atg_embedding_sparse_backward(tensor *, tensor grad, tensor indices, int64_
void atg_empty(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_empty_like(tensor *, tensor self);
void atg_empty_out(tensor *, tensor out, int64_t *size_data, int size_len);
void atg_empty_quantized(tensor *, int64_t *size_data, int size_len, tensor qtensor);
void atg_empty_quantized(tensor *, int64_t *size_data, int size_len, tensor qtensor, int options_kind, int options_device);
void atg_empty_strided(tensor *, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int options_kind, int options_device);
void atg_eq(tensor *, tensor self, scalar other);
void atg_eq_(tensor *, tensor self, scalar other);
@@ -612,6 +654,7 @@ void atg_fake_quantize_per_channel_affine_cachemask_backward(tensor *, tensor gr
void atg_fake_quantize_per_tensor_affine(tensor *, tensor self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max);
void atg_fake_quantize_per_tensor_affine_cachemask(tensor *, tensor self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max);
void atg_fake_quantize_per_tensor_affine_cachemask_backward(tensor *, tensor grad, tensor mask);
void atg_fake_quantize_per_tensor_affine_tensor_qparams(tensor *, tensor self, tensor scale, tensor zero_point, int64_t quant_min, int64_t quant_max);
void atg_fbgemm_linear_fp16_weight(tensor *, tensor input, tensor packed_weight, tensor bias);
void atg_fbgemm_linear_fp16_weight_fp32_activation(tensor *, tensor input, tensor packed_weight, tensor bias);
void atg_fbgemm_linear_int8_weight(tensor *, tensor input, tensor weight, tensor packed, tensor col_offsets, scalar weight_scale, scalar weight_zero_point, tensor bias);
@@ -714,6 +757,7 @@ void atg_from_file(tensor *, char* filename_ptr, int filename_len, int shared, i
void atg_full(tensor *, int64_t *size_data, int size_len, scalar fill_value, int options_kind, int options_device);
void atg_full_like(tensor *, tensor self, scalar fill_value);
void atg_full_out(tensor *, tensor out, int64_t *size_data, int size_len, scalar fill_value);
void atg_fused_moving_avg_obs_fake_quant(tensor *, tensor self, tensor observer_on, tensor fake_quant_on, tensor running_min, tensor running_max, tensor scale, tensor zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int per_row_fake_quant, int symmetric_quant);
void atg_gather(tensor *, tensor self, int64_t dim, tensor index, int sparse_grad);
void atg_gather_backward(tensor *, tensor grad, tensor self, int64_t dim, tensor index, int sparse_grad);
void atg_gather_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, int sparse_grad);
@@ -728,6 +772,8 @@ void atg_ge_tensor_(tensor *, tensor self, tensor other);
void atg_ge_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_gelu(tensor *, tensor self);
void atg_gelu_backward(tensor *, tensor grad, tensor self);
void atg_gelu_backward_grad_input(tensor *, tensor grad_input, tensor grad, tensor self);
void atg_gelu_out(tensor *, tensor out, tensor self);
void atg_geometric_(tensor *, tensor self, double p);
void atg_geqrf(tensor *, tensor self);
void atg_geqrf_a(tensor *, tensor a, tensor tau, tensor self);
@@ -773,9 +819,12 @@ void atg_hann_window(tensor *, int64_t window_length, int options_kind, int opti
void atg_hann_window_periodic(tensor *, int64_t window_length, int periodic, int options_kind, int options_device);
void atg_hardshrink(tensor *, tensor self);
void atg_hardshrink_backward(tensor *, tensor grad_out, tensor self, scalar lambd);
void atg_hardshrink_backward_grad_input(tensor *, tensor grad_input, tensor grad_out, tensor self, scalar lambd);
void atg_hardshrink_out(tensor *, tensor out, tensor self);
void atg_hardsigmoid(tensor *, tensor self);
void atg_hardsigmoid_(tensor *, tensor self);
void atg_hardsigmoid_backward(tensor *, tensor grad_output, tensor self);
void atg_hardsigmoid_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self);
void atg_hardsigmoid_out(tensor *, tensor out, tensor self);
void atg_hardswish(tensor *, tensor self);
void atg_hardswish_(tensor *, tensor self);
@@ -845,17 +894,26 @@ void atg_inverse(tensor *, tensor self);
void atg_inverse_out(tensor *, tensor out, tensor self);
int atg_is_coalesced(tensor self);
int atg_is_complex(tensor self);
int atg_is_conj(tensor self);
int atg_is_distributed(tensor self);
int atg_is_floating_point(tensor self);
int atg_is_inference(tensor self);
int atg_is_leaf(tensor self);
int atg_is_neg(tensor self);
int atg_is_nonzero(tensor self);
int atg_is_pinned(tensor self);
int atg_is_pinned(tensor self, int device);
int atg_is_same_size(tensor self, tensor other);
int atg_is_set_to(tensor self, tensor tensor);
int atg_is_signed(tensor self);
int atg_is_vulkan_available();
void atg_isclose(tensor *, tensor self, tensor other, double rtol, double atol, int equal_nan);
void atg_isfinite(tensor *, tensor self);
void atg_isin(tensor *, tensor elements, tensor test_elements, int assume_unique, int invert);
void atg_isin_scalar_tensor(tensor *, scalar element, tensor test_elements, int assume_unique, int invert);
void atg_isin_scalar_tensor_out(tensor *, tensor out, scalar element, tensor test_elements, int assume_unique, int invert);
void atg_isin_tensor_scalar(tensor *, tensor elements, scalar test_element, int assume_unique, int invert);
void atg_isin_tensor_scalar_out(tensor *, tensor out, tensor elements, scalar test_element, int assume_unique, int invert);
void atg_isin_tensor_tensor_out(tensor *, tensor out, tensor elements, tensor test_elements, int assume_unique, int invert);
void atg_isinf(tensor *, tensor self);
void atg_isnan(tensor *, tensor self);
void atg_isneginf(tensor *, tensor self);
@@ -893,6 +951,7 @@ void atg_le_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_leaky_relu(tensor *, tensor self);
void atg_leaky_relu_(tensor *, tensor self);
void atg_leaky_relu_backward(tensor *, tensor grad_output, tensor self, scalar negative_slope, int self_is_result);
void atg_leaky_relu_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self, scalar negative_slope, int self_is_result);
void atg_leaky_relu_out(tensor *, tensor out, tensor self);
void atg_lerp(tensor *, tensor self, tensor end, scalar weight);
void atg_lerp_(tensor *, tensor self, tensor end, scalar weight);
@@ -915,10 +974,10 @@ void atg_less_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_lgamma(tensor *, tensor self);
void atg_lgamma_(tensor *, tensor self);
void atg_lgamma_out(tensor *, tensor out, tensor self);
void atg_linalg_cholesky(tensor *, tensor self);
void atg_linalg_cholesky_ex(tensor *, tensor self, int check_errors);
void atg_linalg_cholesky_ex_l(tensor *, tensor L, tensor info, tensor self, int check_errors);
void atg_linalg_cholesky_out(tensor *, tensor out, tensor self);
void atg_linalg_cholesky(tensor *, tensor self, int upper);
void atg_linalg_cholesky_ex(tensor *, tensor self, int upper, int check_errors);
void atg_linalg_cholesky_ex_l(tensor *, tensor L, tensor info, tensor self, int upper, int check_errors);
void atg_linalg_cholesky_out(tensor *, tensor out, tensor self, int upper);
void atg_linalg_cond(tensor *, tensor self, scalar p);
void atg_linalg_cond_out(tensor *, tensor out, tensor self, scalar p);
void atg_linalg_cond_p_str(tensor *, tensor self, char* p_ptr, int p_len);
@@ -941,10 +1000,8 @@ void atg_linalg_inv_ex_inverse(tensor *, tensor inverse, tensor info, tensor sel
void atg_linalg_inv_out(tensor *, tensor out, tensor self);
void atg_linalg_lstsq(tensor *, tensor self, tensor b, double rcond_v, uint8_t rcond_null, char* driver_ptr, int driver_len);
void atg_linalg_lstsq_out(tensor *, tensor solution, tensor residuals, tensor rank, tensor singular_values, tensor self, tensor b, double rcond_v, uint8_t rcond_null, char* driver_ptr, int driver_len);
void atg_linalg_matrix_norm(tensor *, tensor self, scalar ord, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_matrix_norm_out(tensor *, tensor out, tensor self, scalar ord, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_matrix_norm_str_ord(tensor *, tensor self, char* ord_ptr, int ord_len, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_matrix_norm_str_ord_out(tensor *, tensor out, tensor self, char* ord_ptr, int ord_len, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_matmul(tensor *, tensor self, tensor other);
void atg_linalg_matmul_out(tensor *, tensor out, tensor self, tensor other);
void atg_linalg_matrix_power(tensor *, tensor self, int64_t n);
void atg_linalg_matrix_power_out(tensor *, tensor out, tensor self, int64_t n);
void atg_linalg_matrix_rank(tensor *, tensor self, double tol_v, uint8_t tol_null, int hermitian);
@@ -976,6 +1033,7 @@ void atg_linalg_tensorinv_out(tensor *, tensor out, tensor self, int64_t ind);
void atg_linalg_tensorsolve(tensor *, tensor self, tensor other, int64_t *dims_data, int dims_len);
void atg_linalg_tensorsolve_out(tensor *, tensor out, tensor self, tensor other, int64_t *dims_data, int dims_len);
void atg_linear(tensor *, tensor input, tensor weight, tensor bias);
void atg_linear_out(tensor *, tensor out, tensor input, tensor weight, tensor bias);
void atg_linspace(tensor *, scalar start, scalar end, int64_t steps_v, uint8_t steps_null, int options_kind, int options_device);
void atg_linspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps_v, uint8_t steps_null);
void atg_log(tensor *, tensor self);
@@ -1091,6 +1149,7 @@ void atg_median(tensor *, tensor self);
void atg_median_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_median_dim_values(tensor *, tensor values, tensor indices, tensor self, int64_t dim, int keepdim);
tensor *atg_meshgrid(tensor *tensors_data, int tensors_len);
tensor *atg_meshgrid_indexing(tensor *tensors_data, int tensors_len, char* indexing_ptr, int indexing_len);
void atg_min(tensor *, tensor self);
void atg_min_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_min_dim_min(tensor *, tensor min, tensor min_indices, tensor self, int64_t dim, int keepdim);
@@ -1165,9 +1224,12 @@ void atg_mv(tensor *, tensor self, tensor vec);
void atg_mv_out(tensor *, tensor out, tensor self, tensor vec);
void atg_mvlgamma(tensor *, tensor self, int64_t p);
void atg_mvlgamma_(tensor *, tensor self, int64_t p);
void atg_mvlgamma_out(tensor *, tensor out, tensor self, int64_t p);
void atg_nan_to_num(tensor *, tensor self, double nan_v, uint8_t nan_null, double posinf_v, uint8_t posinf_null, double neginf_v, uint8_t neginf_null);
void atg_nan_to_num_(tensor *, tensor self, double nan_v, uint8_t nan_null, double posinf_v, uint8_t posinf_null, double neginf_v, uint8_t neginf_null);
void atg_nan_to_num_out(tensor *, tensor out, tensor self, double nan_v, uint8_t nan_null, double posinf_v, uint8_t posinf_null, double neginf_v, uint8_t neginf_null);
void atg_nanmean(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_nanmean_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_nanmedian(tensor *, tensor self);
void atg_nanmedian_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_nanmedian_dim_values(tensor *, tensor values, tensor indices, tensor self, int64_t dim, int keepdim);
@@ -1207,6 +1269,7 @@ void atg_negative_out(tensor *, tensor out, tensor self);
void atg_new_empty(tensor *, tensor self, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_new_empty_strided(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int options_kind, int options_device);
void atg_new_full(tensor *, tensor self, int64_t *size_data, int size_len, scalar fill_value, int options_kind, int options_device);
void atg_new_ones(tensor *, tensor self, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_new_zeros(tensor *, tensor self, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_nextafter(tensor *, tensor self, tensor other);
void atg_nextafter_(tensor *, tensor self, tensor other);
@@ -1261,7 +1324,7 @@ void atg_pad_sequence(tensor *, tensor *sequences_data, int sequences_len, int b
void atg_pairwise_distance(tensor *, tensor x1, tensor x2, double p, double eps, int keepdim);
void atg_pdist(tensor *, tensor self, double p);
void atg_permute(tensor *, tensor self, int64_t *dims_data, int dims_len);
void atg_pin_memory(tensor *, tensor self);
void atg_pin_memory(tensor *, tensor self, int device);
void atg_pinverse(tensor *, tensor self, double rcond);
void atg_pixel_shuffle(tensor *, tensor self, int64_t upscale_factor);
void atg_pixel_unshuffle(tensor *, tensor self, int64_t downscale_factor);
@@ -1273,13 +1336,13 @@ void atg_polygamma(tensor *, int64_t n, tensor self);
void atg_polygamma_(tensor *, tensor self, int64_t n);
void atg_polygamma_out(tensor *, tensor out, int64_t n, tensor self);
void atg_positive(tensor *, tensor self);
void atg_pow(tensor *, tensor self, scalar exponent);
void atg_pow(tensor *, tensor self, tensor exponent);
void atg_pow_(tensor *, tensor self, scalar exponent);
void atg_pow_scalar(tensor *, scalar self_scalar, tensor exponent);
void atg_pow_scalar_out(tensor *, tensor out, scalar self_scalar, tensor exponent);
void atg_pow_tensor_(tensor *, tensor self, tensor exponent);
void atg_pow_tensor_scalar(tensor *, tensor self, scalar exponent);
void atg_pow_tensor_scalar_out(tensor *, tensor out, tensor self, scalar exponent);
void atg_pow_tensor_tensor(tensor *, tensor self, tensor exponent);
void atg_pow_tensor_tensor_out(tensor *, tensor out, tensor self, tensor exponent);
void atg_prelu(tensor *, tensor self, tensor weight);
void atg_prelu_backward(tensor *, tensor grad_output, tensor self, tensor weight);
@@ -1305,6 +1368,7 @@ void atg_quantile_scalar(tensor *, tensor self, double q, int64_t dim_v, uint8_t
void atg_quantile_scalar_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantize_per_channel(tensor *, tensor self, tensor scales, tensor zero_points, int64_t axis, int dtype);
void atg_quantize_per_tensor(tensor *, tensor self, double scale, int64_t zero_point, int dtype);
void atg_quantize_per_tensor_tensor_qparams(tensor *, tensor self, tensor scale, tensor zero_point, int dtype);
tensor *atg_quantize_per_tensor_tensors(tensor *tensors_data, int tensors_len, tensor scales, tensor zero_points, int dtype);
void atg_quantized_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor mean, tensor var, double eps, double output_scale, int64_t output_zero_point);
void atg_quantized_gru_cell(tensor *, tensor input, tensor hx, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh, tensor packed_ih, tensor packed_hh, tensor col_offsets_ih, tensor col_offsets_hh, scalar scale_ih, scalar scale_hh, scalar zero_point_ih, scalar zero_point_hh);
@@ -1349,6 +1413,10 @@ void atg_reflection_pad2d(tensor *, tensor self, int64_t *padding_data, int padd
void atg_reflection_pad2d_backward(tensor *, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
void atg_reflection_pad2d_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
void atg_reflection_pad2d_out(tensor *, tensor out, tensor self, int64_t *padding_data, int padding_len);
void atg_reflection_pad3d(tensor *, tensor self, int64_t *padding_data, int padding_len);
void atg_reflection_pad3d_backward(tensor *, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
void atg_reflection_pad3d_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
void atg_reflection_pad3d_out(tensor *, tensor out, tensor self, int64_t *padding_data, int padding_len);
void atg_relu(tensor *, tensor self);
void atg_relu6(tensor *, tensor self);
void atg_relu6_(tensor *, tensor self);
@@ -1356,6 +1424,7 @@ void atg_relu_(tensor *, tensor self);
void atg_remainder(tensor *, tensor self, scalar other);
void atg_remainder_(tensor *, tensor self, scalar other);
void atg_remainder_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_remainder_scalar_tensor(tensor *, scalar self_scalar, tensor other);
void atg_remainder_tensor(tensor *, tensor self, tensor other);
void atg_remainder_tensor_(tensor *, tensor self, tensor other);
void atg_remainder_tensor_out(tensor *, tensor out, tensor self, tensor other);
@@ -1363,9 +1432,9 @@ void atg_renorm(tensor *, tensor self, scalar p, int64_t dim, scalar maxnorm);
void atg_renorm_(tensor *, tensor self, scalar p, int64_t dim, scalar maxnorm);
void atg_renorm_out(tensor *, tensor out, tensor self, scalar p, int64_t dim, scalar maxnorm);
void atg_repeat(tensor *, tensor self, int64_t *repeats_data, int repeats_len);
void atg_repeat_interleave(tensor *, tensor repeats);
void atg_repeat_interleave_self_int(tensor *, tensor self, int64_t repeats, int64_t dim_v, uint8_t dim_null);
void atg_repeat_interleave_self_tensor(tensor *, tensor self, tensor repeats, int64_t dim_v, uint8_t dim_null);
void atg_repeat_interleave(tensor *, tensor repeats, int64_t output_size_v, uint8_t output_size_null);
void atg_repeat_interleave_self_int(tensor *, tensor self, int64_t repeats, int64_t dim_v, uint8_t dim_null, int64_t output_size_v, uint8_t output_size_null);
void atg_repeat_interleave_self_tensor(tensor *, tensor self, tensor repeats, int64_t dim_v, uint8_t dim_null, int64_t output_size_v, uint8_t output_size_null);
void atg_replication_pad1d(tensor *, tensor self, int64_t *padding_data, int padding_len);
void atg_replication_pad1d_backward(tensor *, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
void atg_replication_pad1d_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self, int64_t *padding_data, int padding_len);
@@ -1384,6 +1453,9 @@ void atg_reshape_as(tensor *, tensor self, tensor other);
void atg_resize_(tensor *, tensor self, int64_t *size_data, int size_len);
void atg_resize_as_(tensor *, tensor self, tensor the_template);
void atg_resize_as_sparse_(tensor *, tensor self, tensor the_template);
void atg_resolve_conj(tensor *, tensor self);
void atg_resolve_neg(tensor *, tensor self);
int atg_retains_grad(tensor self);
void atg_rnn_relu(tensor *, tensor input, tensor hx, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional, int batch_first);
void atg_rnn_relu_cell(tensor *, tensor input, tensor hx, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh);
void atg_rnn_relu_data(tensor *, tensor data, tensor batch_sizes, tensor hx, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional);
@@ -1413,17 +1485,23 @@ void atg_scatter(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_add(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_add_(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_add_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_reduce(tensor *, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len);
void atg_scatter_reduce_(tensor *, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len);
void atg_scatter_reduce_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len);
void atg_scatter_src_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_value(tensor *, tensor self, int64_t dim, tensor index, scalar value);
void atg_scatter_value_(tensor *, tensor self, int64_t dim, tensor index, scalar value);
void atg_scatter_value_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, scalar value);
void atg_scatter_value_reduce(tensor *, tensor self, int64_t dim, tensor index, scalar value, char* reduce_ptr, int reduce_len);
void atg_scatter_value_reduce_(tensor *, tensor self, int64_t dim, tensor index, scalar value, char* reduce_ptr, int reduce_len);
void atg_scatter_value_reduce_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, scalar value, char* reduce_ptr, int reduce_len);
void atg_searchsorted(tensor *, tensor sorted_sequence, tensor self, int out_int32, int right);
void atg_searchsorted_scalar(tensor *, tensor sorted_sequence, scalar self_scalar, int out_int32, int right);
void atg_searchsorted_tensor_out(tensor *, tensor out, tensor sorted_sequence, tensor self, int out_int32, int right);
void atg_segment_reduce(tensor *, tensor data, char* reduce_ptr, int reduce_len, tensor lengths, tensor indices, int64_t axis, int unsafe, scalar initial);
void atg_segment_reduce_backward(tensor *, tensor grad, tensor output, tensor data, tensor lengths);
void atg_select(tensor *, tensor self, int64_t dim, int64_t index);
void atg_select_backward(tensor *, tensor grad, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t index);
void atg_select_backward(tensor *, tensor grad_output, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t index);
void atg_selu(tensor *, tensor self);
void atg_selu_(tensor *, tensor self);
void atg_set_(tensor *, tensor self);
@@ -1445,6 +1523,7 @@ void atg_signbit_out(tensor *, tensor out, tensor self);
void atg_silu(tensor *, tensor self);
void atg_silu_(tensor *, tensor self);
void atg_silu_backward(tensor *, tensor grad_output, tensor self);
void atg_silu_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self);
void atg_silu_out(tensor *, tensor out, tensor self);
void atg_sin(tensor *, tensor self);
void atg_sin_(tensor *, tensor self);
@@ -1456,7 +1535,7 @@ void atg_sinh(tensor *, tensor self);
void atg_sinh_(tensor *, tensor self);
void atg_sinh_out(tensor *, tensor out, tensor self);
void atg_slice(tensor *, tensor self, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step);
void atg_slice_backward(tensor *, tensor grad, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t start, int64_t end, int64_t step);
void atg_slice_backward(tensor *, tensor grad_output, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t start, int64_t end, int64_t step);
void atg_slogdet(tensor *, tensor self);
void atg_slow_conv3d(tensor *, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
void atg_slow_conv3d_out(tensor *, tensor out, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
@@ -1493,16 +1572,22 @@ void atg_sort_values_stable(tensor *, tensor values, tensor indices, tensor self
void atg_sparse_coo_tensor(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_sparse_coo_tensor_indices(tensor *, tensor indices, tensor values, int options_kind, int options_device);
void atg_sparse_coo_tensor_indices_size(tensor *, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_sparse_csr_tensor(tensor *, tensor crow_indices, tensor col_indices, tensor values, int options_kind, int options_device);
void atg_sparse_csr_tensor_crow_col_value_size(tensor *, tensor crow_indices, tensor col_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
int64_t atg_sparse_dim(tensor self);
void atg_sparse_mask(tensor *, tensor self, tensor mask);
void atg_sparse_resize_(tensor *, tensor self, int64_t *size_data, int size_len, int64_t sparse_dim, int64_t dense_dim);
void atg_sparse_resize_and_clear_(tensor *, tensor self, int64_t *size_data, int size_len, int64_t sparse_dim, int64_t dense_dim);
void atg_special_digamma(tensor *, tensor self);
void atg_special_digamma_out(tensor *, tensor out, tensor self);
void atg_special_entr(tensor *, tensor self);
void atg_special_entr_out(tensor *, tensor out, tensor self);
void atg_special_erf(tensor *, tensor self);
void atg_special_erf_out(tensor *, tensor out, tensor self);
void atg_special_erfc(tensor *, tensor self);
void atg_special_erfc_out(tensor *, tensor out, tensor self);
void atg_special_erfcx(tensor *, tensor self);
void atg_special_erfcx_out(tensor *, tensor out, tensor self);
void atg_special_erfinv(tensor *, tensor self);
void atg_special_erfinv_out(tensor *, tensor out, tensor self);
void atg_special_exp2(tensor *, tensor self);
@@ -1511,18 +1596,59 @@ void atg_special_expit(tensor *, tensor self);
void atg_special_expit_out(tensor *, tensor out, tensor self);
void atg_special_expm1(tensor *, tensor self);
void atg_special_expm1_out(tensor *, tensor out, tensor self);
void atg_special_gammainc(tensor *, tensor self, tensor other);
void atg_special_gammainc_out(tensor *, tensor out, tensor self, tensor other);
void atg_special_gammaincc(tensor *, tensor self, tensor other);
void atg_special_gammaincc_out(tensor *, tensor out, tensor self, tensor other);
void atg_special_gammaln(tensor *, tensor self);
void atg_special_gammaln_out(tensor *, tensor out, tensor self);
void atg_special_i0(tensor *, tensor self);
void atg_special_i0_out(tensor *, tensor out, tensor self);
void atg_special_i0e(tensor *, tensor self);
void atg_special_i0e_out(tensor *, tensor out, tensor self);
void atg_special_i1(tensor *, tensor self);
void atg_special_i1_out(tensor *, tensor out, tensor self);
void atg_special_i1e(tensor *, tensor self);
void atg_special_i1e_out(tensor *, tensor out, tensor self);
void atg_special_log1p(tensor *, tensor self);
void atg_special_log1p_out(tensor *, tensor out, tensor self);
void atg_special_log_softmax(tensor *, tensor self, int64_t dim, int dtype);
void atg_special_logit(tensor *, tensor self, double eps_v, uint8_t eps_null);
void atg_special_logit_out(tensor *, tensor out, tensor self, double eps_v, uint8_t eps_null);
void atg_special_logsumexp(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_special_logsumexp_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_special_multigammaln(tensor *, tensor self, int64_t p);
void atg_special_multigammaln_out(tensor *, tensor out, tensor self, int64_t p);
void atg_special_ndtr(tensor *, tensor self);
void atg_special_ndtr_out(tensor *, tensor out, tensor self);
void atg_special_ndtri(tensor *, tensor self);
void atg_special_ndtri_out(tensor *, tensor out, tensor self);
void atg_special_polygamma(tensor *, int64_t n, tensor self);
void atg_special_polygamma_out(tensor *, tensor out, int64_t n, tensor self);
void atg_special_psi(tensor *, tensor self);
void atg_special_psi_out(tensor *, tensor out, tensor self);
void atg_special_round(tensor *, tensor self);
void atg_special_round_out(tensor *, tensor out, tensor self);
void atg_special_sinc(tensor *, tensor self);
void atg_special_sinc_out(tensor *, tensor out, tensor self);
void atg_special_xlog1py(tensor *, tensor self, tensor other);
void atg_special_xlog1py_other_scalar(tensor *, tensor self, scalar other);
void atg_special_xlog1py_other_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_special_xlog1py_out(tensor *, tensor out, tensor self, tensor other);
void atg_special_xlog1py_self_scalar(tensor *, scalar self_scalar, tensor other);
void atg_special_xlog1py_self_scalar_out(tensor *, tensor out, scalar self_scalar, tensor other);
void atg_special_xlogy(tensor *, tensor self, tensor other);
void atg_special_xlogy_other_scalar(tensor *, tensor self, scalar other);
void atg_special_xlogy_other_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_special_xlogy_out(tensor *, tensor out, tensor self, tensor other);
void atg_special_xlogy_self_scalar(tensor *, scalar self_scalar, tensor other);
void atg_special_xlogy_self_scalar_out(tensor *, tensor out, scalar self_scalar, tensor other);
void atg_special_zeta(tensor *, tensor self, tensor other);
void atg_special_zeta_other_scalar(tensor *, tensor self, scalar other);
void atg_special_zeta_other_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_special_zeta_out(tensor *, tensor out, tensor self, tensor other);
void atg_special_zeta_self_scalar(tensor *, scalar self_scalar, tensor other);
void atg_special_zeta_self_scalar_out(tensor *, tensor out, scalar self_scalar, tensor other);
tensor *atg_split(tensor self, int64_t split_size, int64_t dim);
tensor *atg_split_with_sizes(tensor self, int64_t *split_sizes_data, int split_sizes_len, int64_t dim);
void atg_sqrt(tensor *, tensor self);
@@ -1613,6 +1739,8 @@ void atg_trace(tensor *, tensor self);
void atg_trace_backward(tensor *, tensor grad, int64_t *sizes_data, int sizes_len);
void atg_transpose(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg_transpose_(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg_trapezoid(tensor *, tensor y, int64_t dim);
void atg_trapezoid_x(tensor *, tensor y, tensor x, int64_t dim);
void atg_trapz(tensor *, tensor y, tensor x, int64_t dim);
void atg_trapz_dx(tensor *, tensor y, double dx, int64_t dim);
void atg_triangular_solve(tensor *, tensor self, tensor A, int upper, int transpose, int unitriangular);
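A reading aid for the declarations above: each shim writes its tensor results through the leading `tensor *` out-pointer, integer lists arrive as `*_data`/`*_len` pairs, strings as `*_ptr`/`*_len` pairs, and optional scalars as `*_v`/`*_null` pairs, which 1.10 adds to several signatures (e.g. `atg_aminmax`'s `dim`, `atg_repeat_interleave`'s `output_size`). A hypothetical Go-side sketch of the nullable encoding (illustrative only, not gotch's actual generated code):

```go
package wire

// optInt64 mirrors the (int64_t dim_v, uint8_t dim_null) pairs in the
// shims above: null==1 means "argument omitted" and v is then ignored.
type optInt64 struct {
	v    int64
	null uint8
}

// encodeOptInt64 turns an optional Go value into the C-side pair.
func encodeOptInt64(dim *int64) optInt64 {
	if dim == nil {
		return optInt64{null: 1}
	}
	return optInt64{v: *dim}
}
```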


@@ -1,6 +1,6 @@
#!/bin/bash
GOTCH_VERSION="${GOTCH_VER:-v0.4.5}"
GOTCH_VERSION="${GOTCH_VER:-v0.5.0}"
CUDA_VERSION="${CUDA_VER:-11.1}"
GOTCH_PATH="$GOPATH/pkg/mod/github.com/sugarme/gotch@$GOTCH_VERSION"


@@ -1,6 +1,6 @@
#!/bin/bash
LIBTORCH_VERSION="${LIBTORCH_VER:-1.9.0}"
LIBTORCH_VERSION="${LIBTORCH_VER:-1.10.0}"
CUDA_VERSION="${CUDA_VER:-11.1}"
if [[ "${CUDA_VERSION}" == "cpu" ]]; then


@@ -104,7 +104,7 @@ func TestTextDataIter(t *testing.T) {
col2 := xs.Idx([]ts.TensorIndexer{idxCol, idxCol2})
// t.Errorf("col2 shape: %v\n", col2.MustSize())
pow := col1Fmod.MustSub(col2, true).MustPow(ts.IntScalar(2), true)
pow := col1Fmod.MustSub(col2, true).MustPowTensorScalar(ts.IntScalar(2), true)
sum := pow.MustSum(gotch.Float, true)
// Will pass if there's no panic
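The one-line change above tracks 1.10's overload-based naming: the scalar overload of `pow` now generates `atg_pow_tensor_scalar` (see the header diff), so the Go call site becomes `MustPowTensorScalar`. A sketch of the renamed call, assuming v0.5.0's value-style receivers:

```go
// square replaces the pre-1.10 x.MustPow(ts.IntScalar(2), del) spelling
// with the explicit Tensor_Scalar overload of pow.
func square(x ts.Tensor) ts.Tensor {
	return x.MustPowTensorScalar(ts.IntScalar(2), false)
}
```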

File diff suppressed because it is too large

File diff suppressed because it is too large