upgraded to libtorch-1.11, which requires CUDA 11.3

sugarme 2022-03-13 12:56:11 +11:00
parent 664928551b
commit c24abefcf3
11 changed files with 182979 additions and 47645 deletions


@ -3,10 +3,10 @@
## Overview
`gotch` creates a thin wrapper around the Pytorch C++ API (Libtorch) to make use of its already optimized C++ tensor APIs (~ 2169) and dynamic graph computation with CUDA support, and provides idiomatic Go APIs for developing and implementing Deep Learning in Go.
`gotch` creates a thin wrapper around the Pytorch C++ API (Libtorch) to make use of its already optimized C++ tensor APIs (~ 2209) and dynamic graph computation with CUDA support, and provides idiomatic Go APIs for developing and implementing Deep Learning in Go.
**Some features are**
- [x] Comprehensive Pytorch tensor APIs (~ 2169)
- [x] Comprehensive Pytorch tensor APIs (~ 1893)
- [x] Fully featured Pytorch dynamic graph computation
- [x] JIT interface to run model trained/saved using PyTorch Python API
- [x] Load pretrained Pytorch models and run inference
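
For orientation, a minimal usage sketch (assuming the `gotch` and `ts` packages from this repository; the exact factory and method names, such as `MustOnes` and `Print`, are assumptions and may differ slightly between releases):

```go
package main

import (
	"github.com/sugarme/gotch"
	"github.com/sugarme/gotch/ts"
)

func main() {
	// Create a 2x3 float32 tensor of ones on the CPU and print it.
	// MustOnes/Print come from the generated `ts` API (names assumed).
	x := ts.MustOnes([]int64{2, 3}, gotch.Float, gotch.CPU)
	x.Print()
}
```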


@ -41,7 +41,11 @@ let excluded_functions =
; "linalg_vector_norm"
; "linalg_vector_norm_out"
; "linalg_matrix_norm"
; "linalg_matrix_norm_out"]
; "linalg_matrix_norm_out"
; "_histogramdd_bin_edges"
; "_histogramdd_bin_edges"
; "_histogramdd_from_bin_cts"
; "_linalg_check_errors"]
let no_tensor_options =
Set.of_list
@ -1344,7 +1348,7 @@ let run ~yaml_filename ~cpp_filename ~ffi_filename ~must_wrapper_filename
write_wrapper funcs wrapper_filename
let () =
run ~yaml_filename:"gen/pytorch/Declarations-v1.10.0.yaml"
run ~yaml_filename:"gen/pytorch/Declarations-v1.11.0.yaml"
~cpp_filename:"libtch/torch_api_generated"
~ffi_filename:"libtch/c-generated.go"
~must_wrapper_filename:"ts/must-tensor-generated.go"
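
For readers skimming the generated output: each `atg_*` entry point declared in `libtch/torch_api_generated` gets corresponding exported Go wrappers in `libtch/c-generated.go` and `ts/must-tensor-generated.go`; roughly, the `ts` wrappers camel-case the tail of each `atg_*` symbol. A self-contained sketch of that naming convention — illustrative only, and an assumption about the convention rather than the generator's actual OCaml code:

```go
package main

import (
	"fmt"
	"strings"
)

// goName sketches how a generated C symbol such as "atg_arctan2" maps to
// the CamelCase name used by the Go wrappers (e.g. "Arctan2").
func goName(cSymbol string) string {
	name := strings.TrimPrefix(cSymbol, "atg_")
	parts := strings.Split(name, "_")
	for i, p := range parts {
		if p != "" {
			parts[i] = strings.ToUpper(p[:1]) + p[1:]
		}
	}
	return strings.Join(parts, "")
}

func main() {
	fmt.Println(goName("atg_arctan2"))          // Arctan2
	fmt.Println(goName("atg_linalg_lu_factor")) // LinalgLuFactor
}
```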

gen/gen.ml.1.10 (new file, 1351 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -33,7 +33,8 @@ void atg__add_relu_scalar_(tensor *, tensor self, scalar other);
void atg__aminmax(tensor *, tensor self);
void atg__aminmax_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg__amp_update_scale_(tensor *, tensor self, tensor growth_tracker, tensor found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval);
void atg__baddbmm_mkl_(tensor *, tensor self, tensor batch1, tensor batch2);
void atg__autocast_to_full_precision(tensor *, tensor self, int cuda_enabled, int cpu_enabled);
void atg__autocast_to_reduced_precision(tensor *, tensor self, int cuda_enabled, int cpu_enabled, int cuda_dtype, int cpu_dtype);
void atg__cast_byte(tensor *, tensor self, int non_blocking);
void atg__cast_char(tensor *, tensor self, int non_blocking);
void atg__cast_double(tensor *, tensor self, int non_blocking);
@ -53,14 +54,14 @@ void atg__compute_linear_combination_out(tensor *, tensor out, tensor input, ten
void atg__conj(tensor *, tensor self);
void atg__conj_physical(tensor *, tensor self);
void atg__conv_depthwise2d(tensor *, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len);
void atg__conv_depthwise2d_backward(tensor *, tensor grad_input, tensor grad_weight, tensor grad_output, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len);
void atg__conv_depthwise2d_out(tensor *, tensor out, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len);
void atg__convert_indices_from_coo_to_csr(tensor *, tensor self, int64_t size, int out_int32);
void atg__convert_indices_from_coo_to_csr_out(tensor *, tensor out, tensor self, int64_t size, int out_int32);
void atg__convert_indices_from_csr_to_coo(tensor *, tensor crow_indices, tensor col_indices, int out_int32, int transpose);
void atg__convert_indices_from_csr_to_coo_out(tensor *, tensor out, tensor crow_indices, tensor col_indices, int out_int32, int transpose);
void atg__convolution(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len, int64_t groups, int benchmark, int deterministic, int cudnn_enabled, int allow_tf32);
void atg__convolution_deprecated(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len, int64_t groups, int benchmark, int deterministic, int cudnn_enabled);
void atg__convolution_mode(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, char* padding_ptr, int padding_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg__convolution_nogroup(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int transposed, int64_t *output_padding_data, int output_padding_len);
void atg__copy_from(tensor *, tensor self, tensor dst, int non_blocking);
void atg__copy_from_and_resize(tensor *, tensor self, tensor dst);
void atg__ctc_loss(tensor *, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int zero_infinity);
@ -78,6 +79,7 @@ void atg__dim_arange(tensor *, tensor like, int64_t dim);
int64_t atg__dimi(tensor self);
int64_t atg__dimv(tensor self);
void atg__dirichlet_grad(tensor *, tensor x, tensor alpha, tensor total);
void atg__efficientzerotensor(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__embedding_bag(tensor *, tensor weight, tensor indices, tensor offsets, int scale_grad_by_freq, int64_t mode, int sparse, tensor per_sample_weights, int include_last_offset, int64_t padding_idx);
void atg__embedding_bag_backward(tensor *, tensor grad, tensor indices, tensor offsets, tensor offset2bag, tensor bag_size, tensor maximum_indices, int64_t num_weights, int scale_grad_by_freq, int64_t mode, int sparse, tensor per_sample_weights, int64_t padding_idx);
void atg__embedding_bag_dense_backward(tensor *, tensor grad, tensor indices, tensor offset2bag, tensor bag_size, tensor maximum_indices, int64_t num_weights, int scale_grad_by_freq, int64_t mode, tensor per_sample_weights, int64_t padding_idx);
@ -105,15 +107,19 @@ void atg__gather_sparse_backward(tensor *, tensor self, int64_t dim, tensor inde
void atg__grid_sampler_2d_cpu_fallback(tensor *, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg__grid_sampler_2d_cpu_fallback_backward(tensor *, tensor grad_output, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
int atg__has_compatible_shallow_copy_type(tensor self, tensor from);
int atg__has_same_storage_numel(tensor self, tensor other);
void atg__histogramdd_from_bin_tensors(tensor *, tensor self, tensor *bins_data, int bins_len, tensor weight, int density);
void atg__index_copy_(tensor *, tensor self, int64_t dim, tensor index, tensor source);
void atg__index_put_impl_(tensor *, tensor self, tensor *indices_data, int indices_len, tensor values, int accumulate, int unsafe);
void atg__indices(tensor *, tensor self);
void atg__inverse_helper(tensor *, tensor self);
int atg__is_zerotensor(tensor self);
void atg__linalg_inv_out_helper_(tensor *, tensor self, tensor infos_lu, tensor infos_getri);
void atg__linalg_qr_helper(tensor *, tensor self, char* mode_ptr, int mode_len);
void atg__linalg_svd(tensor *, tensor A, int full_matrices, int compute_uv);
void atg__linalg_svd_u(tensor *, tensor U, tensor S, tensor Vh, tensor A, int full_matrices, int compute_uv);
void atg__log_softmax(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__log_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__log_softmax_backward_data_out(tensor *, tensor out, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__log_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, int input_dtype);
void atg__log_softmax_backward_data_out(tensor *, tensor out, tensor grad_output, tensor output, int64_t dim, int input_dtype);
void atg__log_softmax_out(tensor *, tensor out, tensor self, int64_t dim, int half_to_float);
void atg__logcumsumexp(tensor *, tensor self, int64_t dim);
void atg__logcumsumexp_out(tensor *, tensor out, tensor self, int64_t dim);
@ -122,14 +128,15 @@ void atg__make_dual(tensor *, tensor primal, tensor tangent, int64_t level);
void atg__make_per_channel_quantized_tensor(tensor *, tensor self, tensor scale, tensor zero_point, int64_t axis);
void atg__make_per_tensor_quantized_tensor(tensor *, tensor self, double scale, int64_t zero_point);
void atg__masked_scale(tensor *, tensor self, tensor mask, double scale);
void atg__masked_softmax(tensor *, tensor self, tensor mask);
void atg__mkldnn_reshape(tensor *, tensor self, int64_t *shape_data, int shape_len);
void atg__mkldnn_transpose(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg__mkldnn_transpose_(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg__native_multi_head_self_attention(tensor *, tensor query, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, tensor mask);
void atg__neg_view(tensor *, tensor self);
void atg__new_zeros_with_same_feature_meta(tensor *, tensor self, tensor other, int64_t self_num_batch_dims);
int atg__nnpack_available();
void atg__nnpack_spatial_convolution(tensor *, tensor input, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len);
void atg__nnpack_spatial_convolution_backward_input(tensor *, tensor input, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len);
void atg__nnpack_spatial_convolution_backward_weight(tensor *, tensor input, int64_t *weightsize_data, int weightsize_len, tensor grad_output, int64_t *padding_data, int padding_len);
int64_t atg__nnz(tensor self);
void atg__pack_padded_sequence(tensor *, tensor input, tensor lengths, int batch_first);
void atg__pack_padded_sequence_backward(tensor *, tensor grad, int64_t *input_size_data, int input_size_len, tensor batch_sizes, int batch_first);
@ -145,16 +152,18 @@ void atg__sample_dirichlet(tensor *, tensor self);
void atg__saturate_weight_to_fp16(tensor *, tensor weight);
void atg__segment_reduce_backward(tensor *, tensor grad, tensor output, tensor data, char* reduce_ptr, int reduce_len, tensor lengths, int64_t axis);
void atg__shape_as_tensor(tensor *, tensor self);
void atg__slow_conv2d_backward(tensor *, tensor grad_input, tensor grad_weight, tensor grad_bias, tensor grad_output, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
void atg__sobol_engine_draw(tensor *, tensor quasi, int64_t n, tensor sobolstate, int64_t dimension, int64_t num_generated, int dtype);
void atg__sobol_engine_ff_(tensor *, tensor self, int64_t n, tensor sobolstate, int64_t dimension, int64_t num_generated);
void atg__sobol_engine_initialize_state_(tensor *, tensor self, int64_t dimension);
void atg__sobol_engine_scramble_(tensor *, tensor self, tensor ltm, int64_t dimension);
void atg__softmax(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__softmax_backward_data_out(tensor *, tensor grad_input, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, int input_dtype);
void atg__softmax_backward_data_out(tensor *, tensor grad_input, tensor grad_output, tensor output, int64_t dim, int input_dtype);
void atg__softmax_out(tensor *, tensor out, tensor self, int64_t dim, int half_to_float);
void atg__solve_helper(tensor *, tensor self, tensor A);
void atg__sparse_addmm(tensor *, tensor self, tensor sparse, tensor dense);
void atg__sparse_broadcast_to(tensor *, tensor self, int64_t *size_data, int size_len);
void atg__sparse_coo_tensor_unsafe(tensor *, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_with_dims(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_with_dims_and_tensors(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int options_kind, int options_device);
@ -177,7 +186,6 @@ void atg__stack(tensor *, tensor *tensors_data, int tensors_len, int64_t dim);
void atg__stack_out(tensor *, tensor out, tensor *tensors_data, int tensors_len, int64_t dim);
void atg__standard_gamma(tensor *, tensor self);
void atg__standard_gamma_grad(tensor *, tensor self, tensor output);
void atg__svd_helper(tensor *, tensor self, int some, int compute_uv);
void atg__symeig_helper(tensor *, tensor self, int eigenvectors, int upper);
void atg__test_ambiguous_defaults(tensor *, tensor dummy, int64_t a, int64_t b);
void atg__test_ambiguous_defaults_b(tensor *, tensor dummy, int64_t a, char* b_ptr, int b_len);
@ -185,13 +193,35 @@ void atg__test_optional_filled_intlist(tensor *, tensor values, int64_t *addends
void atg__test_optional_intlist(tensor *, tensor values, int64_t *addends_data, int addends_len);
void atg__test_serialization_subcmul(tensor *, tensor self, tensor other);
void atg__test_string_default(tensor *, tensor dummy, char* a_ptr, int a_len, char* b_ptr, int b_len);
void atg__test_warn_in_autograd(tensor *, tensor self);
void atg__to_copy(tensor *, tensor self, int options_kind, int options_device, int non_blocking);
tensor *atg__to_cpu(tensor *tensors_data, int tensors_len);
void atg__torch_cuda_cu_linker_symbol_op(tensor *, tensor self);
void atg__trilinear(tensor *, tensor i1, tensor i2, tensor i3, int64_t *expand1_data, int expand1_len, int64_t *expand2_data, int expand2_len, int64_t *expand3_data, int expand3_len, int64_t *sumdim_data, int sumdim_len, int64_t unroll_dim);
void atg__unique(tensor *, tensor self, int sorted, int return_inverse);
void atg__unique2(tensor *, tensor self, int sorted, int return_inverse, int return_counts);
void atg__unpack_dual(tensor *, tensor dual, int64_t level);
void atg__unsafe_view(tensor *, tensor self, int64_t *size_data, int size_len);
void atg__upsample_bicubic2d_aa(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_bicubic2d_aa_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_bicubic2d_aa_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_bicubic2d_aa_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_bilinear2d_aa(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_bilinear2d_aa_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_bilinear2d_aa_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_bilinear2d_aa_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_nearest_exact1d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales_v, uint8_t scales_null);
void atg__upsample_nearest_exact1d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_v, uint8_t scales_null);
void atg__upsample_nearest_exact1d_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_v, uint8_t scales_null);
void atg__upsample_nearest_exact1d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales_v, uint8_t scales_null);
void atg__upsample_nearest_exact2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_nearest_exact2d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_nearest_exact2d_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_nearest_exact2d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_nearest_exact3d(tensor *, tensor self, int64_t *output_size_data, int output_size_len, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_nearest_exact3d_backward(tensor *, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_nearest_exact3d_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, int64_t *output_size_data, int output_size_len, int64_t *input_size_data, int input_size_len, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
void atg__upsample_nearest_exact3d_out(tensor *, tensor out, tensor self, int64_t *output_size_data, int output_size_len, double scales_d_v, uint8_t scales_d_null, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
int atg__use_cudnn_ctc_loss(tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank);
int atg__use_cudnn_rnn_flatten_weight();
void atg__values(tensor *, tensor self);
@ -250,6 +280,7 @@ void atg_addmv_out(tensor *, tensor out, tensor self, tensor mat, tensor vec);
void atg_addr(tensor *, tensor self, tensor vec1, tensor vec2);
void atg_addr_(tensor *, tensor self, tensor vec1, tensor vec2);
void atg_addr_out(tensor *, tensor out, tensor self, tensor vec1, tensor vec2);
void atg_adjoint(tensor *, tensor self);
void atg_affine_grid_generator(tensor *, tensor theta, int64_t *size_data, int size_len, int align_corners);
void atg_affine_grid_generator_backward(tensor *, tensor grad, int64_t *size_data, int size_len, int align_corners);
void atg_alias(tensor *, tensor self);
@ -292,6 +323,9 @@ void atg_arcsinh(tensor *, tensor self);
void atg_arcsinh_(tensor *, tensor self);
void atg_arcsinh_out(tensor *, tensor out, tensor self);
void atg_arctan(tensor *, tensor self);
void atg_arctan2(tensor *, tensor self, tensor other);
void atg_arctan2_(tensor *, tensor self, tensor other);
void atg_arctan2_out(tensor *, tensor out, tensor self, tensor other);
void atg_arctan_(tensor *, tensor self);
void atg_arctan_out(tensor *, tensor out, tensor self);
void atg_arctanh(tensor *, tensor self);
@ -302,6 +336,7 @@ void atg_argmax_out(tensor *, tensor out, tensor self, int64_t dim_v, uint8_t di
void atg_argmin(tensor *, tensor self, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_argmin_out(tensor *, tensor out, tensor self, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_argsort(tensor *, tensor self, int64_t dim, int descending);
void atg_argwhere(tensor *, tensor self);
void atg_as_strided(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int64_t storage_offset_v, uint8_t storage_offset_null);
void atg_as_strided_(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int64_t storage_offset_v, uint8_t storage_offset_null);
void atg_asin(tensor *, tensor self);
@ -479,7 +514,6 @@ void atg_conv2d_padding(tensor *, tensor input, tensor weight, tensor bias, int6
void atg_conv3d(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg_conv3d_padding(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, char* padding_ptr, int padding_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg_conv_depthwise3d(tensor *, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len);
void atg_conv_depthwise3d_backward(tensor *, tensor grad_input, tensor grad_weight, tensor grad_bias, tensor grad_output, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len);
void atg_conv_tbc(tensor *, tensor self, tensor weight, tensor bias, int64_t pad);
void atg_conv_tbc_backward(tensor *, tensor self, tensor input, tensor weight, tensor bias, int64_t pad);
void atg_conv_transpose1d(tensor *, tensor input, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t groups, int64_t *dilation_data, int dilation_len);
@ -518,16 +552,8 @@ void atg_cudnn_batch_norm(tensor *, tensor input, tensor weight, tensor bias, te
void atg_cudnn_batch_norm_backward(tensor *, tensor input, tensor grad_output, tensor weight, tensor running_mean, tensor running_var, tensor save_mean, tensor save_var, double epsilon, tensor reserveSpace);
void atg_cudnn_convolution(tensor *, tensor self, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_add_relu(tensor *, tensor self, tensor weight, tensor z, scalar alpha, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg_cudnn_convolution_backward_input(tensor *, int64_t *self_size_data, int self_size_len, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_deprecated(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_deprecated2(tensor *, tensor self, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_relu(tensor *, tensor self, tensor weight, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg_cudnn_convolution_transpose(tensor *, tensor self, tensor weight, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_transpose_backward_input(tensor *, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_transpose_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic, int allow_tf32);
void atg_cudnn_convolution_transpose_deprecated(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_convolution_transpose_deprecated2(tensor *, tensor self, tensor weight, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_cudnn_grid_sampler(tensor *, tensor self, tensor grid);
void atg_cudnn_grid_sampler_backward(tensor *, tensor self, tensor grid, tensor grad_output);
int atg_cudnn_is_acceptable(tensor self);
@ -562,6 +588,7 @@ void atg_diag_out(tensor *, tensor out, tensor self, int64_t diagonal);
void atg_diagflat(tensor *, tensor self, int64_t offset);
void atg_diagonal(tensor *, tensor self, int64_t offset, int64_t dim1, int64_t dim2);
void atg_diagonal_backward(tensor *, tensor grad_output, int64_t *input_sizes_data, int input_sizes_len, int64_t offset, int64_t dim1, int64_t dim2);
void atg_diagonal_scatter(tensor *, tensor self, tensor src, int64_t offset, int64_t dim1, int64_t dim2);
void atg_diff(tensor *, tensor self, int64_t n, int64_t dim, tensor prepend, tensor append);
void atg_diff_out(tensor *, tensor out, tensor self, int64_t n, int64_t dim, tensor prepend, tensor append);
void atg_digamma(tensor *, tensor self);
@ -676,7 +703,11 @@ void atg_fft_fftn(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *di
void atg_fft_fftn_out(tensor *, tensor out, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_fftshift(tensor *, tensor self, int64_t *dim_data, int dim_len);
void atg_fft_hfft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_hfft2(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_hfft2_out(tensor *, tensor out, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_hfft_out(tensor *, tensor out, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_hfftn(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_hfftn_out(tensor *, tensor out, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_ifft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_ifft2(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_ifft2_out(tensor *, tensor out, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
@ -685,7 +716,11 @@ void atg_fft_ifftn(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *d
void atg_fft_ifftn_out(tensor *, tensor out, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_ifftshift(tensor *, tensor self, int64_t *dim_data, int dim_len);
void atg_fft_ihfft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_ihfft2(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_ihfft2_out(tensor *, tensor out, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_ihfft_out(tensor *, tensor out, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_ihfftn(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_ihfftn_out(tensor *, tensor out, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_irfft(tensor *, tensor self, int64_t n_v, uint8_t n_null, int64_t dim, char* norm_ptr, int norm_len);
void atg_fft_irfft2(tensor *, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
void atg_fft_irfft2_out(tensor *, tensor out, tensor self, int64_t *s_data, int s_len, int64_t *dim_data, int dim_len, char* norm_ptr, int norm_len);
@ -798,7 +833,6 @@ void atg_greater_tensor_(tensor *, tensor self, tensor other);
void atg_greater_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_grid_sampler(tensor *, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg_grid_sampler_2d(tensor *, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg_grid_sampler_2d_backward(tensor *, tensor grad_output, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg_grid_sampler_3d(tensor *, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg_grid_sampler_3d_backward(tensor *, tensor grad_output, tensor input, tensor grid, int64_t interpolation_mode, int64_t padding_mode, int align_corners);
void atg_group_norm(tensor *, tensor input, int64_t num_groups, tensor weight, tensor bias, double eps, int cudnn_enabled);
@ -871,8 +905,7 @@ void atg_imag(tensor *, tensor self);
void atg_index(tensor *, tensor self, tensor *indices_data, int indices_len);
void atg_index_add(tensor *, tensor self, int64_t dim, tensor index, tensor source);
void atg_index_add_(tensor *, tensor self, int64_t dim, tensor index, tensor source);
void atg_index_add_alpha(tensor *, tensor self, int64_t dim, tensor index, tensor source, scalar alpha);
void atg_index_add_alpha_(tensor *, tensor self, int64_t dim, tensor index, tensor source, scalar alpha);
void atg_index_add_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, tensor source);
void atg_index_copy(tensor *, tensor self, int64_t dim, tensor index, tensor source);
void atg_index_copy_(tensor *, tensor self, int64_t dim, tensor index, tensor source);
void atg_index_fill(tensor *, tensor self, int64_t dim, tensor index, scalar value);
@ -982,8 +1015,11 @@ void atg_linalg_cond(tensor *, tensor self, scalar p);
void atg_linalg_cond_out(tensor *, tensor out, tensor self, scalar p);
void atg_linalg_cond_p_str(tensor *, tensor self, char* p_ptr, int p_len);
void atg_linalg_cond_p_str_out(tensor *, tensor out, tensor self, char* p_ptr, int p_len);
void atg_linalg_cross(tensor *, tensor self, tensor other, int64_t dim);
void atg_linalg_cross_out(tensor *, tensor out, tensor self, tensor other, int64_t dim);
void atg_linalg_det(tensor *, tensor self);
void atg_linalg_det_out(tensor *, tensor out, tensor self);
void atg_linalg_diagonal(tensor *, tensor A, int64_t offset, int64_t dim1, int64_t dim2);
void atg_linalg_eig(tensor *, tensor self);
void atg_linalg_eig_out(tensor *, tensor eigenvalues, tensor eigenvectors, tensor self);
void atg_linalg_eigh(tensor *, tensor self, char* UPLO_ptr, int UPLO_len);
@ -1000,12 +1036,21 @@ void atg_linalg_inv_ex_inverse(tensor *, tensor inverse, tensor info, tensor sel
void atg_linalg_inv_out(tensor *, tensor out, tensor self);
void atg_linalg_lstsq(tensor *, tensor self, tensor b, double rcond_v, uint8_t rcond_null, char* driver_ptr, int driver_len);
void atg_linalg_lstsq_out(tensor *, tensor solution, tensor residuals, tensor rank, tensor singular_values, tensor self, tensor b, double rcond_v, uint8_t rcond_null, char* driver_ptr, int driver_len);
void atg_linalg_lu_factor(tensor *, tensor A, int pivot);
void atg_linalg_lu_factor_ex(tensor *, tensor A, int pivot, int check_errors);
void atg_linalg_lu_factor_ex_out(tensor *, tensor LU, tensor pivots, tensor info, tensor A, int pivot, int check_errors);
void atg_linalg_lu_factor_out(tensor *, tensor LU, tensor pivots, tensor A, int pivot);
void atg_linalg_matmul(tensor *, tensor self, tensor other);
void atg_linalg_matmul_out(tensor *, tensor out, tensor self, tensor other);
void atg_linalg_matrix_exp(tensor *, tensor self);
void atg_linalg_matrix_power(tensor *, tensor self, int64_t n);
void atg_linalg_matrix_power_out(tensor *, tensor out, tensor self, int64_t n);
void atg_linalg_matrix_rank(tensor *, tensor self, double tol_v, uint8_t tol_null, int hermitian);
void atg_linalg_matrix_rank_out(tensor *, tensor out, tensor self, double tol_v, uint8_t tol_null, int hermitian);
void atg_linalg_matrix_rank(tensor *, tensor self, double tol, int hermitian);
void atg_linalg_matrix_rank_atol_rtol_float(tensor *, tensor self, double atol_v, uint8_t atol_null, double rtol_v, uint8_t rtol_null, int hermitian);
void atg_linalg_matrix_rank_atol_rtol_float_out(tensor *, tensor out, tensor self, double atol_v, uint8_t atol_null, double rtol_v, uint8_t rtol_null, int hermitian);
void atg_linalg_matrix_rank_atol_rtol_tensor(tensor *, tensor input, tensor atol, tensor rtol, int hermitian);
void atg_linalg_matrix_rank_atol_rtol_tensor_out(tensor *, tensor out, tensor input, tensor atol, tensor rtol, int hermitian);
void atg_linalg_matrix_rank_out(tensor *, tensor out, tensor self, double tol, int hermitian);
void atg_linalg_matrix_rank_out_tol_tensor(tensor *, tensor out, tensor input, tensor tol, int hermitian);
void atg_linalg_matrix_rank_tol_tensor(tensor *, tensor input, tensor tol, int hermitian);
void atg_linalg_multi_dot(tensor *, tensor *tensors_data, int tensors_len);
@ -1015,6 +1060,10 @@ void atg_linalg_norm_ord_str(tensor *, tensor self, char* ord_ptr, int ord_len,
void atg_linalg_norm_ord_str_out(tensor *, tensor out, tensor self, char* ord_ptr, int ord_len, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_norm_out(tensor *, tensor out, tensor self, scalar ord, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_linalg_pinv(tensor *, tensor self, double rcond, int hermitian);
void atg_linalg_pinv_atol_rtol_float(tensor *, tensor self, double atol_v, uint8_t atol_null, double rtol_v, uint8_t rtol_null, int hermitian);
void atg_linalg_pinv_atol_rtol_float_out(tensor *, tensor out, tensor self, double atol_v, uint8_t atol_null, double rtol_v, uint8_t rtol_null, int hermitian);
void atg_linalg_pinv_atol_rtol_tensor(tensor *, tensor self, tensor atol, tensor rtol, int hermitian);
void atg_linalg_pinv_atol_rtol_tensor_out(tensor *, tensor out, tensor self, tensor atol, tensor rtol, int hermitian);
void atg_linalg_pinv_out(tensor *, tensor out, tensor self, double rcond, int hermitian);
void atg_linalg_pinv_out_rcond_tensor(tensor *, tensor out, tensor self, tensor rcond, int hermitian);
void atg_linalg_pinv_rcond_tensor(tensor *, tensor self, tensor rcond, int hermitian);
@ -1024,18 +1073,20 @@ void atg_linalg_slogdet(tensor *, tensor self);
void atg_linalg_slogdet_out(tensor *, tensor sign, tensor logabsdet, tensor self);
void atg_linalg_solve(tensor *, tensor input, tensor other);
void atg_linalg_solve_out(tensor *, tensor out, tensor input, tensor other);
void atg_linalg_svd(tensor *, tensor self, int full_matrices);
void atg_linalg_svd_u(tensor *, tensor U, tensor S, tensor Vh, tensor self, int full_matrices);
void atg_linalg_svdvals(tensor *, tensor input);
void atg_linalg_svdvals_out(tensor *, tensor out, tensor input);
void atg_linalg_solve_triangular(tensor *, tensor self, tensor B, int upper, int left, int unitriangular);
void atg_linalg_solve_triangular_out(tensor *, tensor out, tensor self, tensor B, int upper, int left, int unitriangular);
void atg_linalg_svd(tensor *, tensor A, int full_matrices);
void atg_linalg_svd_u(tensor *, tensor U, tensor S, tensor Vh, tensor A, int full_matrices);
void atg_linalg_svdvals(tensor *, tensor A);
void atg_linalg_svdvals_out(tensor *, tensor out, tensor A);
void atg_linalg_tensorinv(tensor *, tensor self, int64_t ind);
void atg_linalg_tensorinv_out(tensor *, tensor out, tensor self, int64_t ind);
void atg_linalg_tensorsolve(tensor *, tensor self, tensor other, int64_t *dims_data, int dims_len);
void atg_linalg_tensorsolve_out(tensor *, tensor out, tensor self, tensor other, int64_t *dims_data, int dims_len);
void atg_linear(tensor *, tensor input, tensor weight, tensor bias);
void atg_linear_out(tensor *, tensor out, tensor input, tensor weight, tensor bias);
void atg_linspace(tensor *, scalar start, scalar end, int64_t steps_v, uint8_t steps_null, int options_kind, int options_device);
void atg_linspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps_v, uint8_t steps_null);
void atg_linspace(tensor *, scalar start, scalar end, int64_t steps, int options_kind, int options_device);
void atg_linspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps);
void atg_log(tensor *, tensor self);
void atg_log10(tensor *, tensor self);
void atg_log10_(tensor *, tensor self);
@ -1078,8 +1129,8 @@ void atg_logit_(tensor *, tensor self, double eps_v, uint8_t eps_null);
void atg_logit_backward(tensor *, tensor grad_output, tensor self, double eps_v, uint8_t eps_null);
void atg_logit_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self, double eps_v, uint8_t eps_null);
void atg_logit_out(tensor *, tensor out, tensor self, double eps_v, uint8_t eps_null);
void atg_logspace(tensor *, scalar start, scalar end, int64_t steps_v, uint8_t steps_null, double base, int options_kind, int options_device);
void atg_logspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps_v, uint8_t steps_null, double base);
void atg_logspace(tensor *, scalar start, scalar end, int64_t steps, double base, int options_kind, int options_device);
void atg_logspace_out(tensor *, tensor out, scalar start, scalar end, int64_t steps, double base);
void atg_logsumexp(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_logsumexp_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg_lstm(tensor *, tensor input, tensor *hx_data, int hx_len, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional, int batch_first);
@ -1111,6 +1162,7 @@ void atg_matmul(tensor *, tensor self, tensor other);
void atg_matmul_out(tensor *, tensor out, tensor self, tensor other);
void atg_matrix_exp(tensor *, tensor self);
void atg_matrix_exp_backward(tensor *, tensor self, tensor grad);
void atg_matrix_h(tensor *, tensor self);
void atg_matrix_power(tensor *, tensor self, int64_t n);
void atg_matrix_power_out(tensor *, tensor out, tensor self, int64_t n);
void atg_matrix_rank(tensor *, tensor self, int symmetric);
@ -1150,6 +1202,7 @@ void atg_median_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_median_dim_values(tensor *, tensor values, tensor indices, tensor self, int64_t dim, int keepdim);
tensor *atg_meshgrid(tensor *tensors_data, int tensors_len);
tensor *atg_meshgrid_indexing(tensor *tensors_data, int tensors_len, char* indexing_ptr, int indexing_len);
void atg_mh(tensor *, tensor self);
void atg_min(tensor *, tensor self);
void atg_min_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_min_dim_min(tensor *, tensor min, tensor min_indices, tensor self, int64_t dim, int keepdim);
@ -1160,15 +1213,8 @@ void atg_minimum_out(tensor *, tensor out, tensor self, tensor other);
void atg_miopen_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double exponential_average_factor, double epsilon);
void atg_miopen_batch_norm_backward(tensor *, tensor input, tensor grad_output, tensor weight, tensor running_mean, tensor running_var, tensor save_mean, tensor save_var, double epsilon);
void atg_miopen_convolution(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_convolution_backward_bias(tensor *, tensor grad_output);
void atg_miopen_convolution_backward_input(tensor *, int64_t *self_size_data, int self_size_len, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_convolution_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_convolution_transpose(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *output_padding_data, int output_padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_convolution_transpose_backward_input(tensor *, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_convolution_transpose_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_depthwise_convolution(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_depthwise_convolution_backward_input(tensor *, int64_t *self_size_data, int self_size_len, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_depthwise_convolution_backward_weight(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int benchmark, int deterministic);
void atg_miopen_rnn(tensor *, tensor input, tensor *weight_data, int weight_len, int64_t weight_stride0, tensor hx, tensor cx, int64_t mode, int64_t hidden_size, int64_t num_layers, int batch_first, double dropout, int train, int bidirectional, int64_t *batch_sizes_data, int batch_sizes_len, tensor dropout_state);
void atg_mish(tensor *, tensor self);
void atg_mish_(tensor *, tensor self);
@ -1177,8 +1223,6 @@ void atg_mish_out(tensor *, tensor out, tensor self);
void atg_mkldnn_adaptive_avg_pool2d(tensor *, tensor self, int64_t *output_size_data, int output_size_len);
void atg_mkldnn_adaptive_avg_pool2d_backward(tensor *, tensor grad_output, tensor self);
void atg_mkldnn_convolution(tensor *, tensor self, tensor weight, tensor bias, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups);
void atg_mkldnn_convolution_backward_input(tensor *, int64_t *self_size_data, int self_size_len, tensor grad_output, tensor weight, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int bias_defined);
void atg_mkldnn_convolution_backward_weights(tensor *, int64_t *weight_size_data, int weight_size_len, tensor grad_output, tensor self, int64_t *padding_data, int padding_len, int64_t *stride_data, int stride_len, int64_t *dilation_data, int dilation_len, int64_t groups, int bias_defined);
void atg_mkldnn_linear(tensor *, tensor self, tensor weight, tensor bias);
void atg_mkldnn_linear_backward_input(tensor *, int64_t *input_size_data, int input_size_len, tensor grad_output, tensor weight);
void atg_mkldnn_linear_backward_weights(tensor *, tensor grad_output, tensor input, tensor weight, int bias_defined);
@ -1202,6 +1246,7 @@ void atg_mse_loss_backward_grad_input(tensor *, tensor grad_input, tensor grad_o
void atg_mse_loss_out(tensor *, tensor out, tensor self, tensor target, int64_t reduction);
void atg_msort(tensor *, tensor self);
void atg_msort_out(tensor *, tensor out, tensor self);
void atg_mt(tensor *, tensor self);
void atg_mul(tensor *, tensor self, tensor other);
void atg_mul_(tensor *, tensor self, tensor other);
void atg_mul_out(tensor *, tensor out, tensor self, tensor other);
@ -1233,14 +1278,10 @@ void atg_nanmean_out(tensor *, tensor out, tensor self, int64_t *dim_data, int d
void atg_nanmedian(tensor *, tensor self);
void atg_nanmedian_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_nanmedian_dim_values(tensor *, tensor values, tensor indices, tensor self, int64_t dim, int keepdim);
void atg_nanquantile(tensor *, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_nanquantile_new(tensor *, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_nanquantile_new_out(tensor *, tensor out, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_nanquantile_new_scalar(tensor *, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_nanquantile_new_scalar_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_nanquantile_out(tensor *, tensor out, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_nanquantile_scalar(tensor *, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_nanquantile_scalar_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_nanquantile(tensor *, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_nanquantile_out(tensor *, tensor out, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_nanquantile_scalar(tensor *, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_nanquantile_scalar_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_nansum(tensor *, tensor self, int dtype);
void atg_nansum_dim_intlist(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_nansum_intlist_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
@ -1250,6 +1291,9 @@ void atg_narrow_copy_out(tensor *, tensor out, tensor self, int64_t dim, int64_t
void atg_narrow_tensor(tensor *, tensor self, int64_t dim, tensor start, int64_t length);
void atg_native_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double momentum, double eps);
void atg_native_batch_norm_out(tensor *, tensor out, tensor save_mean, tensor save_invstd, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double momentum, double eps);
void atg_native_channel_shuffle(tensor *, tensor self, int64_t groups);
void atg_native_dropout(tensor *, tensor input, double p, int train);
void atg_native_dropout_backward(tensor *, tensor grad_output, tensor mask, double scale);
void atg_native_group_norm(tensor *, tensor input, tensor weight, tensor bias, int64_t n, int64_t C, int64_t HxW, int64_t group, double eps);
void atg_native_layer_norm(tensor *, tensor input, int64_t *normalized_shape_data, int normalized_shape_len, tensor weight, tensor bias, double eps);
void atg_native_norm(tensor *, tensor self);
@ -1358,16 +1402,13 @@ double atg_q_scale(tensor self);
int64_t atg_q_zero_point(tensor self);
void atg_qr(tensor *, tensor self, int some);
void atg_qr_q(tensor *, tensor Q, tensor R, tensor self, int some);
void atg_quantile(tensor *, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantile_new(tensor *, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_quantile_new_out(tensor *, tensor out, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_quantile_new_scalar(tensor *, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_quantile_new_scalar_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_quantile_out(tensor *, tensor out, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantile_scalar(tensor *, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantile_scalar_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim);
void atg_quantile(tensor *, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_quantile_out(tensor *, tensor out, tensor self, tensor q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_quantile_scalar(tensor *, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_quantile_scalar_out(tensor *, tensor out, tensor self, double q, int64_t dim_v, uint8_t dim_null, int keepdim, char* interpolation_ptr, int interpolation_len);
void atg_quantize_per_channel(tensor *, tensor self, tensor scales, tensor zero_points, int64_t axis, int dtype);
void atg_quantize_per_tensor(tensor *, tensor self, double scale, int64_t zero_point, int dtype);
void atg_quantize_per_tensor_dynamic(tensor *, tensor self, int dtype, int reduce_range);
void atg_quantize_per_tensor_tensor_qparams(tensor *, tensor self, tensor scale, tensor zero_point, int dtype);
tensor *atg_quantize_per_tensor_tensors(tensor *tensors_data, int tensors_len, tensor scales, tensor zero_points, int dtype);
void atg_quantized_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor mean, tensor var, double eps, double output_scale, int64_t output_zero_point);
@ -1466,6 +1507,9 @@ void atg_roll(tensor *, tensor self, int64_t *shifts_data, int shifts_len, int64
void atg_rot90(tensor *, tensor self, int64_t k, int64_t *dims_data, int dims_len);
void atg_round(tensor *, tensor self);
void atg_round_(tensor *, tensor self);
void atg_round_decimals(tensor *, tensor self, int64_t decimals);
void atg_round_decimals_(tensor *, tensor self, int64_t decimals);
void atg_round_decimals_out(tensor *, tensor out, tensor self, int64_t decimals);
void atg_round_out(tensor *, tensor out, tensor self);
void atg_row_stack(tensor *, tensor *tensors_data, int tensors_len);
void atg_row_stack_out(tensor *, tensor out, tensor *tensors_data, int tensors_len);
@ -1496,12 +1540,13 @@ void atg_scatter_value_out(tensor *, tensor out, tensor self, int64_t dim, tenso
void atg_scatter_value_reduce(tensor *, tensor self, int64_t dim, tensor index, scalar value, char* reduce_ptr, int reduce_len);
void atg_scatter_value_reduce_(tensor *, tensor self, int64_t dim, tensor index, scalar value, char* reduce_ptr, int reduce_len);
void atg_scatter_value_reduce_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, scalar value, char* reduce_ptr, int reduce_len);
void atg_searchsorted(tensor *, tensor sorted_sequence, tensor self, int out_int32, int right);
void atg_searchsorted_scalar(tensor *, tensor sorted_sequence, scalar self_scalar, int out_int32, int right);
void atg_searchsorted_tensor_out(tensor *, tensor out, tensor sorted_sequence, tensor self, int out_int32, int right);
void atg_searchsorted(tensor *, tensor sorted_sequence, tensor self, int out_int32, int right, char* side_ptr, int side_len, tensor sorter);
void atg_searchsorted_scalar(tensor *, tensor sorted_sequence, scalar self_scalar, int out_int32, int right, char* side_ptr, int side_len, tensor sorter);
void atg_searchsorted_tensor_out(tensor *, tensor out, tensor sorted_sequence, tensor self, int out_int32, int right, char* side_ptr, int side_len, tensor sorter);
void atg_segment_reduce(tensor *, tensor data, char* reduce_ptr, int reduce_len, tensor lengths, tensor indices, int64_t axis, int unsafe, scalar initial);
void atg_select(tensor *, tensor self, int64_t dim, int64_t index);
void atg_select_backward(tensor *, tensor grad_output, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t index);
void atg_select_scatter(tensor *, tensor self, tensor src, int64_t dim, int64_t index);
void atg_selu(tensor *, tensor self);
void atg_selu_(tensor *, tensor self);
void atg_set_(tensor *, tensor self);
@ -1536,6 +1581,7 @@ void atg_sinh_(tensor *, tensor self);
void atg_sinh_out(tensor *, tensor out, tensor self);
void atg_slice(tensor *, tensor self, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step);
void atg_slice_backward(tensor *, tensor grad_output, int64_t *input_sizes_data, int input_sizes_len, int64_t dim, int64_t start, int64_t end, int64_t step);
void atg_slice_scatter(tensor *, tensor self, tensor src, int64_t dim, int64_t start_v, uint8_t start_null, int64_t end_v, uint8_t end_null, int64_t step);
void atg_slogdet(tensor *, tensor self);
void atg_slow_conv3d(tensor *, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
void atg_slow_conv3d_out(tensor *, tensor out, tensor self, tensor weight, int64_t *kernel_size_data, int kernel_size_len, tensor bias, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len);
@ -1556,8 +1602,8 @@ void atg_soft_margin_loss_backward_grad_input(tensor *, tensor grad_input, tenso
void atg_soft_margin_loss_out(tensor *, tensor out, tensor self, tensor target, int64_t reduction);
void atg_softmax(tensor *, tensor self, int64_t dim, int dtype);
void atg_softplus(tensor *, tensor self);
void atg_softplus_backward(tensor *, tensor grad_output, tensor self, scalar beta, scalar threshold, tensor output);
void atg_softplus_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self, scalar beta, scalar threshold, tensor output);
void atg_softplus_backward(tensor *, tensor grad_output, tensor self, scalar beta, scalar threshold);
void atg_softplus_backward_grad_input(tensor *, tensor grad_input, tensor grad_output, tensor self, scalar beta, scalar threshold);
void atg_softplus_out(tensor *, tensor out, tensor self);
void atg_softshrink(tensor *, tensor self);
void atg_softshrink_backward(tensor *, tensor grad_output, tensor self, scalar lambd);
@ -1578,6 +1624,8 @@ int64_t atg_sparse_dim(tensor self);
void atg_sparse_mask(tensor *, tensor self, tensor mask);
void atg_sparse_resize_(tensor *, tensor self, int64_t *size_data, int size_len, int64_t sparse_dim, int64_t dense_dim);
void atg_sparse_resize_and_clear_(tensor *, tensor self, int64_t *size_data, int size_len, int64_t sparse_dim, int64_t dense_dim);
void atg_sparse_sampled_addmm(tensor *, tensor self, tensor mat1, tensor mat2);
void atg_sparse_sampled_addmm_out(tensor *, tensor out, tensor self, tensor mat1, tensor mat2);
void atg_special_digamma(tensor *, tensor self);
void atg_special_digamma_out(tensor *, tensor out, tensor self);
void atg_special_entr(tensor *, tensor self);
@ -1627,10 +1675,11 @@ void atg_special_polygamma(tensor *, int64_t n, tensor self);
void atg_special_polygamma_out(tensor *, tensor out, int64_t n, tensor self);
void atg_special_psi(tensor *, tensor self);
void atg_special_psi_out(tensor *, tensor out, tensor self);
void atg_special_round(tensor *, tensor self);
void atg_special_round_out(tensor *, tensor out, tensor self);
void atg_special_round(tensor *, tensor self, int64_t decimals);
void atg_special_round_out(tensor *, tensor out, tensor self, int64_t decimals);
void atg_special_sinc(tensor *, tensor self);
void atg_special_sinc_out(tensor *, tensor out, tensor self);
void atg_special_softmax(tensor *, tensor self, int64_t dim, int dtype);
void atg_special_xlog1py(tensor *, tensor self, tensor other);
void atg_special_xlog1py_other_scalar(tensor *, tensor self, scalar other);
void atg_special_xlog1py_other_scalar_out(tensor *, tensor out, tensor self, scalar other);
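
A note on the calling convention visible throughout these declarations: optional numeric arguments are passed as a value plus a null flag (for example `int64_t dim_v, uint8_t dim_null`, or the `scales_h_v`/`scales_h_null` pairs in the upsample functions), with the flag set when the caller omits the value. A minimal sketch of how a Go caller might flatten an optional `int64` into such a pair (illustrative; the generated wrappers handle this internally):

```go
package main

import "fmt"

// flattenOptInt64 turns an optional int64 into the (value, null-flag) pair
// used by the atg_* declarations above: a nil option yields flag = 1 and the
// value is ignored on the C side.
func flattenOptInt64(v *int64) (int64, uint8) {
	if v == nil {
		return 0, 1
	}
	return *v, 0
}

func main() {
	dim := int64(2)
	fmt.Println(flattenOptInt64(&dim)) // 2 0 -> dim provided
	fmt.Println(flattenOptInt64(nil))  // 0 1 -> dim omitted
}
```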


@ -1,7 +1,7 @@
#!/bin/bash
GOTCH_VERSION="${GOTCH_VER:-v0.6.2}"
CUDA_VERSION="${CUDA_VER:-11.1}"
CUDA_VERSION="${CUDA_VER:-11.3}"
if [ -z "$GOPATH" ]; then
  GOPATH="$HOME/go"


@ -1,7 +1,7 @@
#!/bin/bash
LIBTORCH_VERSION="${LIBTORCH_VER:-1.10.0}"
CUDA_VERSION="${CUDA_VER:-11.1}"
LIBTORCH_VERSION="${LIBTORCH_VER:-1.11.0}"
CUDA_VERSION="${CUDA_VER:-11.3}"
if [ "${CUDA_VERSION}"=="cpu" ]; then
CU_VERSION="cpu"
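
The two variables above determine which libtorch archive the script fetches: `CUDA_VERSION` is collapsed into a `cuXXX` tag (or left as `cpu`) and combined with `LIBTORCH_VERSION` into a download URL on download.pytorch.org. A rough sketch of that resolution (in Go for consistency with the other examples here; the script itself does this in bash, and the exact URL layout is an assumption based on PyTorch's public download scheme):

```go
package main

import (
	"fmt"
	"strings"
)

func libtorchURL(libtorchVer, cudaVer string) string {
	cu := "cpu"
	if cudaVer != "cpu" {
		cu = "cu" + strings.ReplaceAll(cudaVer, ".", "") // "11.3" -> "cu113"
	}
	// Typical Linux cxx11-ABI archive name; "+" is URL-encoded as %2B.
	return fmt.Sprintf(
		"https://download.pytorch.org/libtorch/%s/libtorch-cxx11-abi-shared-with-deps-%s%%2B%s.zip",
		cu, libtorchVer, cu)
}

func main() {
	fmt.Println(libtorchURL("1.11.0", "11.3"))
	fmt.Println(libtorchURL("1.11.0", "cpu"))
}
```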

File diff suppressed because it is too large

File diff suppressed because it is too large