upgrade libtorch2.1.0

sugarme 2023-10-11 14:53:35 +11:00
parent f06e858def
commit bf24e57df9
14 changed files with 206359 additions and 1016 deletions

CHANGELOG.md

@ -10,6 +10,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [Nofix]
- ctype `long` caused a compile error on macOS as noted in [#44]. Not working on a Linux box.
# [0.9.0]
- Upgrade libtorch v2.1.0
# [0.8.0]
- Upgrade libtorch v2.0.0

README.md

@ -3,10 +3,10 @@
## Overview
`gotch` creates a thin wrapper to Pytorch C++ APIs (Libtorch) to make use of its already optimized C++ tensor APIs (>2500) and dynamic graph computation with CUDA support, and it provides idiomatic Go APIs for developing and implementing Deep Learning in Go.
`gotch` creates a thin wrapper to Pytorch C++ APIs (Libtorch) to make use of its already optimized C++ tensor APIs (3039) and dynamic graph computation with CUDA support, and it provides idiomatic Go APIs for developing and implementing Deep Learning in Go.
**Some features are**
- [x] Comprehensive Pytorch tensor APIs
- [x] Comprehensive Pytorch tensor APIs (2525)
- [x] Fully featured Pytorch dynamic graph computation
- [x] JIT interface to run model trained/saved using PyTorch Python API
- [x] Load pretrained Pytorch models and run inference
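
To make the feature list above concrete, here is a minimal usage sketch. It assumes the `gotch` and `ts` packages and the generated `Must*` helpers (`ts.MustRand`, `ts.MustOfSlice`, `MustView`, `MustMatmul`, `MustDrop`); exact signatures can differ between gotch releases, so treat this as an illustration rather than canonical API documentation.

```go
package main

import (
	"github.com/sugarme/gotch"
	"github.com/sugarme/gotch/ts"
)

func main() {
	// A random 2x3 double tensor on CPU. The generated API follows the
	// pattern MustRand(size, dtype, device).
	x := ts.MustRand([]int64{2, 3}, gotch.Double, gotch.CPU)
	defer x.MustDrop()

	// A 3x2 matrix built from a Go slice, then reshaped; the trailing
	// bool on MustView asks gotch to drop the intermediate tensor.
	w := ts.MustOfSlice([]float64{1, 0, 0, 1, 1, 1}).MustView([]int64{3, 2}, true)
	defer w.MustDrop()

	// Matrix multiplication; false keeps the receiver x alive.
	y := x.MustMatmul(w, false)
	defer y.MustDrop()

	y.Print()
}
```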
@ -16,16 +16,17 @@
`gotch` is in active development and may have breaking API changes. Feel free to open a pull request, report issues, or discuss any concerns. All contributions are welcome.
`gotch` current version is **v0.8.0**
`gotch` current version is **v0.9.0**
## Dependencies
- **Libtorch** C++ v2.0.1 library of [Pytorch](https://pytorch.org/)
- **Libtorch** C++ v2.1.0 library of [Pytorch](https://pytorch.org/)
- Clang-17/Clang++-17 compilers
## Installation
- Default CUDA version is `11.8` if CUDA is available; otherwise the CPU version is used.
- Default Pytorch C++ API version is `2.0.1`
- Default Pytorch C++ API version is `2.1.0`
**NOTE**: `libtorch` will be installed at **`/usr/local/lib`**
@ -34,7 +35,7 @@
#### Step 1: Setup libtorch (skip this step if a valid libtorch is already installed on your machine!)
```bash
wget https://github.com/sugarme/gotch/releases/download/v0.8.0/setup-libtorch.sh
wget https://github.com/sugarme/gotch/releases/download/v0.9.0/setup-libtorch.sh
chmod +x setup-libtorch.sh
export CUDA_VER=cpu && bash setup-libtorch.sh
```
@ -51,9 +52,9 @@
#### Step 2: Setup gotch
```bash
wget https://github.com/sugarme/gotch/releases/download/v0.8.0/setup-gotch.sh
wget https://github.com/sugarme/gotch/releases/download/v0.9.0/setup-gotch.sh
chmod +x setup-gotch.sh
export CUDA_VER=cpu && export GOTCH_VER=v0.8.0 && bash setup-gotch.sh
export CUDA_VER=cpu && export GOTCH_VER=v0.9.0 && bash setup-gotch.sh
```
### GPU
@ -67,7 +68,7 @@
#### Step 1: Setup libtorch (skip this step if a valid libtorch is already installed on your machine!)
```bash
wget https://github.com/sugarme/gotch/releases/download/v0.8.0/setup-libtorch.sh
wget https://github.com/sugarme/gotch/releases/download/v0.9.0/setup-libtorch.sh
chmod +x setup-libtorch.sh
export CUDA_VER=11.8 && bash setup-libtorch.sh
@ -85,9 +86,9 @@
#### Step 2: Setup gotch
```bash
wget https://github.com/sugarme/gotch/releases/download/v0.8.0/setup-gotch.sh
wget https://github.com/sugarme/gotch/releases/download/v0.9.0/setup-gotch.sh
chmod +x setup-gotch.sh
export CUDA_VER=11.8 && export GOTCH_VER=v0.8.0 && bash setup-gotch.sh
export CUDA_VER=11.8 && export GOTCH_VER=v0.9.0 && bash setup-gotch.sh
```
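
After the GPU setup finishes, a quick sanity check is to ask gotch for a CUDA device and allocate a tensor on it. A minimal sketch, assuming a device selector along the lines of `gotch.CudaIfAvailable()` (older releases expose it as `gotch.NewCuda().CudaIfAvailable()`), which falls back to CPU when no GPU is visible:

```go
package main

import (
	"fmt"

	"github.com/sugarme/gotch"
	"github.com/sugarme/gotch/ts"
)

func main() {
	// Assumption: CudaIfAvailable returns a CUDA device when libtorch was
	// installed with CUDA support and a GPU is present, otherwise CPU.
	device := gotch.CudaIfAvailable()
	fmt.Printf("selected device: %v\n", device)

	// Allocating a tensor on the selected device exercises the CUDA path.
	x := ts.MustRand([]int64{4, 4}, gotch.Float, device)
	defer x.MustDrop()
	x.Print()
}
```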
## Examples

gen/gen.ml

@ -69,6 +69,10 @@ let excluded_functions =
; "unsafe_split_out"
; "unsafe_split_with_sizes_out"
; "_histogramdd_from_bin_cts"
; "sym_numel"
; "sym_size"
; "sym_stride"
; "sym_storage_offset"
]
let no_tensor_options =
@ -1437,7 +1441,7 @@ let run ~yaml_filename ~cpp_filename ~ffi_filename ~must_wrapper_filename
let () =
run ~yaml_filename:"gen/pytorch/Declarations-v2.0.0.yaml"
run ~yaml_filename:"gen/pytorch/Declarations-v2.1.0.yaml"
~cpp_filename:"libtch/torch_api_generated"
~ffi_filename:"libtch/c-generated.go"
~must_wrapper_filename:"ts/must-tensor-generated.go"

gen/gen.ml.2.0 (new file, 1444 lines)

File diff suppressed because it is too large

File diff suppressed because it is too large

libtch/c-generated.go

@ -237,13 +237,6 @@ func Atg_CholeskySolveHelperOut(ptr *Ctensor, out Ctensor, self Ctensor, a Ctens
cupper := *(*C.int)(unsafe.Pointer(&upper))
C.atg__cholesky_solve_helper_out(ptr, out, self, a, cupper)
}
func Atg_ChunkGradOutputsEfficientAttention(query Ctensor, key Ctensor, value Ctensor, isCausal int32) bool{
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
cResult := C.atg__chunk_grad_outputs_efficient_attention(query, key, value, cisCausal)
cbool := *(*int)(unsafe.Pointer(&cResult))
if cbool == 1{return true}
return false
}
func Atg_Coalesce(ptr *Ctensor, self Ctensor){
C.atg__coalesce(ptr, self)
}
@ -400,6 +393,13 @@ func Atg_CopyFromOut(ptr *Ctensor, out Ctensor, self Ctensor, dst Ctensor, nonBl
cnonBlocking := *(*C.int)(unsafe.Pointer(&nonBlocking))
C.atg__copy_from_out(ptr, out, self, dst, cnonBlocking)
}
func Atg_CsltCompress(ptr *Ctensor, input Ctensor){
C.atg__cslt_compress(ptr, input)
}
func Atg_CsltSparseMm(ptr *Ctensor, compressedA Ctensor, denseB Ctensor, bias Ctensor, transposeResult int32){
ctransposeResult := *(*C.int)(unsafe.Pointer(&transposeResult))
C.atg__cslt_sparse_mm(ptr, compressedA, denseB, bias, ctransposeResult)
}
func Atg_CtcLoss(ptr *Ctensor, logProbs Ctensor, targets Ctensor, inputLengthsData []int64, inputLengthsLen int, targetLengthsData []int64, targetLengthsLen int, blank int64, zeroInfinity int32){
cinputLengthsDataPtr := (*C.int64_t)(unsafe.Pointer(&inputLengthsData[0]))
cinputLengthsLen := *(*C.int)(unsafe.Pointer(&inputLengthsLen))
@ -549,16 +549,6 @@ cbatchSizesDataPtr := (*C.int64_t)(unsafe.Pointer(&batchSizesData[0]))
cbatchSizesLen := *(*C.int)(unsafe.Pointer(&batchSizesLen))
C.atg__cudnn_rnn_out(ptr, out0, out1, out2, out3, out4, input, cweightDataPtr, cweightLen, cweightStride0, weightBuf, hx, cx, cmode, chiddenSize, cprojSize, cnumLayers, cbatchFirst, cdropout, ctrain, cbidirectional, cbatchSizesDataPtr, cbatchSizesLen, dropoutState)
}
func Atg_CufftGetPlanCacheMaxSize(deviceIndex int64) int64{
cdeviceIndex := *(*C.int64_t)(unsafe.Pointer(&deviceIndex))
cResult := C.atg__cufft_get_plan_cache_max_size(cdeviceIndex)
return *(*int64)(unsafe.Pointer(&cResult))
}
func Atg_CufftGetPlanCacheSize(deviceIndex int64) int64{
cdeviceIndex := *(*C.int64_t)(unsafe.Pointer(&deviceIndex))
cResult := C.atg__cufft_get_plan_cache_size(cdeviceIndex)
return *(*int64)(unsafe.Pointer(&cResult))
}
func Atg_DebugHasInternalOverlap(self Ctensor) int64{
cResult := C.atg__debug_has_internal_overlap(self)
return *(*int64)(unsafe.Pointer(&cResult))
@ -581,10 +571,17 @@ func Atg_DirichletGrad(ptr *Ctensor, x Ctensor, alpha Ctensor, total Ctensor){
func Atg_DirichletGradOut(ptr *Ctensor, out Ctensor, x Ctensor, alpha Ctensor, total Ctensor){
C.atg__dirichlet_grad_out(ptr, out, x, alpha, total)
}
func Atg_EfficientAttentionBackward(ptr *Ctensor, gradOut_ Ctensor, query Ctensor, key Ctensor, value Ctensor, out Ctensor, logsumexp Ctensor, isCausal int32, chunkGradOutputs int32){
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
cchunkGradOutputs := *(*C.int)(unsafe.Pointer(&chunkGradOutputs))
C.atg__efficient_attention_backward(ptr, gradOut_, query, key, value, out, logsumexp, cisCausal, cchunkGradOutputs)
func Atg_EfficientAttentionBackward(ptr *Ctensor, gradOut_ Ctensor, query Ctensor, key Ctensor, value Ctensor, bias Ctensor, out Ctensor, cuSeqlensQ Ctensor, cuSeqlensK Ctensor, maxSeqlenK int64, maxSeqlenQ int64, logsumexp Ctensor, dropoutP float64, philoxSeed Ctensor, philoxOffset Ctensor, customMaskType int64, biasRequiresGrad int32, scaleVal float64, scaleNull int, numSplitsKeyVal int64, numSplitsKeyNull int){
cmaxSeqlenK := *(*C.int64_t)(unsafe.Pointer(&maxSeqlenK))
cmaxSeqlenQ := *(*C.int64_t)(unsafe.Pointer(&maxSeqlenQ))
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
ccustomMaskType := *(*C.int64_t)(unsafe.Pointer(&customMaskType))
cbiasRequiresGrad := *(*C.int)(unsafe.Pointer(&biasRequiresGrad))
cscaleVal := *(*C.double)(unsafe.Pointer(&scaleVal))
cscaleNull := *(*C.uint8_t)(unsafe.Pointer(&scaleNull))
cnumSplitsKeyVal := *(*C.int64_t)(unsafe.Pointer(&numSplitsKeyVal))
cnumSplitsKeyNull := *(*C.uint8_t)(unsafe.Pointer(&numSplitsKeyNull))
C.atg__efficient_attention_backward(ptr, gradOut_, query, key, value, bias, out, cuSeqlensQ, cuSeqlensK, cmaxSeqlenK, cmaxSeqlenQ, logsumexp, cdropoutP, philoxSeed, philoxOffset, ccustomMaskType, cbiasRequiresGrad, cscaleVal, cscaleNull, cnumSplitsKeyVal, cnumSplitsKeyNull)
}
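
Several of the regenerated wrappers above and below (for example the `scaleVal`/`scaleNull` and `numSplitsKeyVal`/`numSplitsKeyNull` pairs) use the same convention for optional numeric arguments: the Go side passes a concrete value together with a null flag, and the C layer treats the value as absent when the flag is non-zero. The helper below is a hypothetical illustration of that convention, not part of the generated API:

```go
package main

import "fmt"

// splitOptionalFloat is a hypothetical helper (not part of gotch) mapping a
// Go *float64 onto the (value, nullFlag) pair expected by wrappers such as
// Atg_EfficientAttentionBackward's scaleVal/scaleNull arguments above.
func splitOptionalFloat(v *float64) (val float64, null int) {
	if v == nil {
		return 0, 1 // null flag set: the C layer treats the argument as "None"
	}
	return *v, 0 // concrete value, null flag cleared
}

func main() {
	scale := 0.125
	fmt.Println(splitOptionalFloat(&scale)) // 0.125 0
	fmt.Println(splitOptionalFloat(nil))    // 0 1
}
```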
func Atg_Efficientzerotensor(ptr *Ctensor, sizeData []int64, sizeLen int, optionsKind int32, optionsDevice int32){
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
@ -796,14 +793,20 @@ cnormalization := *(*C.int64_t)(unsafe.Pointer(&normalization))
conesided := *(*C.int)(unsafe.Pointer(&onesided))
C.atg__fft_r2c_out(ptr, out, self, cdimDataPtr, cdimLen, cnormalization, conesided)
}
func Atg_FlashAttentionBackward(ptr *Ctensor, gradOut Ctensor, query Ctensor, key Ctensor, value Ctensor, out Ctensor, logsumexp Ctensor, cumSeqQ Ctensor, cumSeqK Ctensor, maxQ int64, maxK int64, dropoutP float64, isCausal int32, philoxSeed int64, philoxOffset int64){
func Atg_FillMemEffDropoutMask_(ptr *Ctensor, self Ctensor, dropoutP float64, seed int64, offset int64){
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
cseed := *(*C.int64_t)(unsafe.Pointer(&seed))
coffset := *(*C.int64_t)(unsafe.Pointer(&offset))
C.atg__fill_mem_eff_dropout_mask_(ptr, self, cdropoutP, cseed, coffset)
}
func Atg_FlashAttentionBackward(ptr *Ctensor, gradOut Ctensor, query Ctensor, key Ctensor, value Ctensor, out Ctensor, logsumexp Ctensor, cumSeqQ Ctensor, cumSeqK Ctensor, maxQ int64, maxK int64, dropoutP float64, isCausal int32, philoxSeed Ctensor, philoxOffset Ctensor, scaleVal float64, scaleNull int){
cmaxQ := *(*C.int64_t)(unsafe.Pointer(&maxQ))
cmaxK := *(*C.int64_t)(unsafe.Pointer(&maxK))
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
cphiloxSeed := *(*C.int64_t)(unsafe.Pointer(&philoxSeed))
cphiloxOffset := *(*C.int64_t)(unsafe.Pointer(&philoxOffset))
C.atg__flash_attention_backward(ptr, gradOut, query, key, value, out, logsumexp, cumSeqQ, cumSeqK, cmaxQ, cmaxK, cdropoutP, cisCausal, cphiloxSeed, cphiloxOffset)
cscaleVal := *(*C.double)(unsafe.Pointer(&scaleVal))
cscaleNull := *(*C.uint8_t)(unsafe.Pointer(&scaleNull))
C.atg__flash_attention_backward(ptr, gradOut, query, key, value, out, logsumexp, cumSeqQ, cumSeqK, cmaxQ, cmaxK, cdropoutP, cisCausal, philoxSeed, philoxOffset, cscaleVal, cscaleNull)
}
func Atg_Foobar(ptr *Ctensor, self Ctensor, arg1 int32, arg2 int32, arg3 int32){
carg1 := *(*C.int)(unsafe.Pointer(&arg1))
@ -817,6 +820,26 @@ carg2 := *(*C.int)(unsafe.Pointer(&arg2))
carg3 := *(*C.int)(unsafe.Pointer(&arg3))
C.atg__foobar_out(ptr, out, self, carg1, carg2, carg3)
}
func Atg_FunctionalAssertAsync(ptr *Ctensor, self Ctensor, assertMsg string, depToken Ctensor){
cassertMsg := C.CString(assertMsg)
assertMsgLen := len(assertMsg)
cassertMsgLen := *(*C.int)(unsafe.Pointer(&assertMsgLen))
C.atg__functional_assert_async(ptr, self, cassertMsg, cassertMsgLen, depToken)
}
func Atg_FunctionalSymConstrainRange(ptr *Ctensor, size Cscalar, minVal int64, minNull int, maxVal int64, maxNull int, depToken Ctensor){
cminVal := *(*C.int64_t)(unsafe.Pointer(&minVal))
cminNull := *(*C.uint8_t)(unsafe.Pointer(&minNull))
cmaxVal := *(*C.int64_t)(unsafe.Pointer(&maxVal))
cmaxNull := *(*C.uint8_t)(unsafe.Pointer(&maxNull))
C.atg__functional_sym_constrain_range(ptr, size , cminVal, cminNull, cmaxVal, cmaxNull, depToken)
}
func Atg_FunctionalSymConstrainRangeForSize(ptr *Ctensor, size Cscalar, minVal int64, minNull int, maxVal int64, maxNull int, depToken Ctensor){
cminVal := *(*C.int64_t)(unsafe.Pointer(&minVal))
cminNull := *(*C.uint8_t)(unsafe.Pointer(&minNull))
cmaxVal := *(*C.int64_t)(unsafe.Pointer(&maxVal))
cmaxNull := *(*C.uint8_t)(unsafe.Pointer(&maxNull))
C.atg__functional_sym_constrain_range_for_size(ptr, size , cminVal, cminNull, cmaxVal, cmaxNull, depToken)
}
func Atg_FusedDropout(ptr *Ctensor, self Ctensor, p float64){
cp := *(*C.double)(unsafe.Pointer(&p))
C.atg__fused_dropout(ptr, self, cp)
@ -852,10 +875,12 @@ cperRowFakeQuant := *(*C.int)(unsafe.Pointer(&perRowFakeQuant))
csymmetricQuant := *(*C.int)(unsafe.Pointer(&symmetricQuant))
C.atg__fused_moving_avg_obs_fq_helper_out(ptr, out0, out1, self, observerOn, fakeQuantOn, runningMin, runningMax, scale, zeroPoint, caveragingConst, cquantMin, cquantMax, cchAxis, cperRowFakeQuant, csymmetricQuant)
}
func Atg_FusedSdpChoice(query Ctensor, key Ctensor, value Ctensor, attnMask Ctensor, dropoutP float64, isCausal int32) int64{
func Atg_FusedSdpChoice(query Ctensor, key Ctensor, value Ctensor, attnMask Ctensor, dropoutP float64, isCausal int32, scaleVal float64, scaleNull int) int64{
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
cResult := C.atg__fused_sdp_choice(query, key, value, attnMask, cdropoutP, cisCausal)
cscaleVal := *(*C.double)(unsafe.Pointer(&scaleVal))
cscaleNull := *(*C.uint8_t)(unsafe.Pointer(&scaleNull))
cResult := C.atg__fused_sdp_choice(query, key, value, attnMask, cdropoutP, cisCausal, cscaleVal, cscaleNull)
return *(*int64)(unsafe.Pointer(&cResult))
}
func Atg_FwPrimal(ptr *Ctensor, self Ctensor, level int64){
@ -954,6 +979,12 @@ func Atg_IndicesCopy(ptr *Ctensor, self Ctensor){
func Atg_IndicesCopyOut(ptr *Ctensor, out Ctensor, self Ctensor){
C.atg__indices_copy_out(ptr, out, self)
}
func Atg_IntMm(ptr *Ctensor, self Ctensor, mat2 Ctensor){
C.atg__int_mm(ptr, self, mat2)
}
func Atg_IntMmOut(ptr *Ctensor, out Ctensor, self Ctensor, mat2 Ctensor){
C.atg__int_mm_out(ptr, out, self, mat2)
}
func Atg_IsAllTrue(ptr *Ctensor, self Ctensor){
C.atg__is_all_true(ptr, self)
}
@ -1077,6 +1108,11 @@ cpivot := *(*C.int)(unsafe.Pointer(&pivot))
ccheckErrors := *(*C.int)(unsafe.Pointer(&checkErrors))
C.atg__lu_with_info(ptr, self, cpivot, ccheckErrors)
}
func Atg_MakeDepToken(ptr *Ctensor, optionsKind int32, optionsDevice int32){
coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg__make_dep_token(ptr, coptionsKind, coptionsDevice)
}
func Atg_MakeDual(ptr *Ctensor, primal Ctensor, tangent Ctensor, level int64){
clevel := *(*C.int64_t)(unsafe.Pointer(&level))
C.atg__make_dual(ptr, primal, tangent, clevel)
@ -1232,26 +1268,22 @@ cmomentum := *(*C.double)(unsafe.Pointer(&momentum))
ceps := *(*C.double)(unsafe.Pointer(&eps))
C.atg__native_batch_norm_legit_no_stats_out(ptr, out, saveMean, saveInvstd, input, weight, bias, ctraining, cmomentum, ceps)
}
func Atg_NativeBatchNormLegitNoTraining(ptr *Ctensor, input Ctensor, weight Ctensor, bias Ctensor, runningMean Ctensor, runningVar Ctensor, momentum float64, eps float64){
cmomentum := *(*C.double)(unsafe.Pointer(&momentum))
ceps := *(*C.double)(unsafe.Pointer(&eps))
C.atg__native_batch_norm_legit_no_training(ptr, input, weight, bias, runningMean, runningVar, cmomentum, ceps)
}
func Atg_NativeBatchNormLegitNoTrainingOut(ptr *Ctensor, out0 Ctensor, out1 Ctensor, out2 Ctensor, input Ctensor, weight Ctensor, bias Ctensor, runningMean Ctensor, runningVar Ctensor, momentum float64, eps float64){
cmomentum := *(*C.double)(unsafe.Pointer(&momentum))
ceps := *(*C.double)(unsafe.Pointer(&eps))
C.atg__native_batch_norm_legit_no_training_out(ptr, out0, out1, out2, input, weight, bias, runningMean, runningVar, cmomentum, ceps)
}
func Atg_NativeBatchNormLegitOut(ptr *Ctensor, out Ctensor, saveMean Ctensor, saveInvstd Ctensor, input Ctensor, weight Ctensor, bias Ctensor, runningMean Ctensor, runningVar Ctensor, training int32, momentum float64, eps float64){
ctraining := *(*C.int)(unsafe.Pointer(&training))
cmomentum := *(*C.double)(unsafe.Pointer(&momentum))
ceps := *(*C.double)(unsafe.Pointer(&eps))
C.atg__native_batch_norm_legit_out(ptr, out, saveMean, saveInvstd, input, weight, bias, runningMean, runningVar, ctraining, cmomentum, ceps)
}
func Atg_NativeDecoderOnlyMultiHeadAttention(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, embedDim int64, numHead int64, qkvWeight Ctensor, qkvBias Ctensor, projWeight Ctensor, projBias Ctensor, mask Ctensor, incrKey Ctensor, incrValue Ctensor, needWeights int32, averageAttnWeights int32){
cembedDim := *(*C.int64_t)(unsafe.Pointer(&embedDim))
cnumHead := *(*C.int64_t)(unsafe.Pointer(&numHead))
cneedWeights := *(*C.int)(unsafe.Pointer(&needWeights))
caverageAttnWeights := *(*C.int)(unsafe.Pointer(&averageAttnWeights))
C.atg__native_decoder_only_multi_head_attention(ptr, query, key, value, cembedDim, cnumHead, qkvWeight, qkvBias, projWeight, projBias, mask, incrKey, incrValue, cneedWeights, caverageAttnWeights)
}
func Atg_NativeDecoderOnlyMultiHeadAttentionOut(ptr *Ctensor, out0 Ctensor, out1 Ctensor, out2 Ctensor, out3 Ctensor, query Ctensor, key Ctensor, value Ctensor, embedDim int64, numHead int64, qkvWeight Ctensor, qkvBias Ctensor, projWeight Ctensor, projBias Ctensor, mask Ctensor, incrKey Ctensor, incrValue Ctensor, needWeights int32, averageAttnWeights int32){
cembedDim := *(*C.int64_t)(unsafe.Pointer(&embedDim))
cnumHead := *(*C.int64_t)(unsafe.Pointer(&numHead))
cneedWeights := *(*C.int)(unsafe.Pointer(&needWeights))
caverageAttnWeights := *(*C.int)(unsafe.Pointer(&averageAttnWeights))
C.atg__native_decoder_only_multi_head_attention_out(ptr, out0, out1, out2, out3, query, key, value, cembedDim, cnumHead, qkvWeight, qkvBias, projWeight, projBias, mask, incrKey, incrValue, cneedWeights, caverageAttnWeights)
}
func Atg_NativeMultiHeadAttention(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, embedDim int64, numHead int64, qkvWeight Ctensor, qkvBias Ctensor, projWeight Ctensor, projBias Ctensor, mask Ctensor, needWeights int32, averageAttnWeights int32, maskTypeVal int64, maskTypeNull int){
cembedDim := *(*C.int64_t)(unsafe.Pointer(&embedDim))
cnumHead := *(*C.int64_t)(unsafe.Pointer(&numHead))
@ -1304,20 +1336,14 @@ cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg__nested_sum_backward(ptr, grad, self, cdimDataPtr, cdimLen, ckeepdim)
}
func Atg_NestedViewFromBuffer(ptr *Ctensor, self Ctensor, nestedSize Ctensor, nestedStrides Ctensor, offsetsData []int64, offsetsLen int){
coffsetsDataPtr := (*C.int64_t)(unsafe.Pointer(&offsetsData[0]))
coffsetsLen := *(*C.int)(unsafe.Pointer(&offsetsLen))
C.atg__nested_view_from_buffer(ptr, self, nestedSize, nestedStrides, coffsetsDataPtr, coffsetsLen)
func Atg_NestedViewFromBuffer(ptr *Ctensor, self Ctensor, nestedSize Ctensor, nestedStrides Ctensor, offsets Ctensor){
C.atg__nested_view_from_buffer(ptr, self, nestedSize, nestedStrides, offsets)
}
func Atg_NestedViewFromBufferCopy(ptr *Ctensor, self Ctensor, nestedSize Ctensor, nestedStrides Ctensor, offsetsData []int64, offsetsLen int){
coffsetsDataPtr := (*C.int64_t)(unsafe.Pointer(&offsetsData[0]))
coffsetsLen := *(*C.int)(unsafe.Pointer(&offsetsLen))
C.atg__nested_view_from_buffer_copy(ptr, self, nestedSize, nestedStrides, coffsetsDataPtr, coffsetsLen)
func Atg_NestedViewFromBufferCopy(ptr *Ctensor, self Ctensor, nestedSize Ctensor, nestedStrides Ctensor, offsets Ctensor){
C.atg__nested_view_from_buffer_copy(ptr, self, nestedSize, nestedStrides, offsets)
}
func Atg_NestedViewFromBufferCopyOut(ptr *Ctensor, out Ctensor, self Ctensor, nestedSize Ctensor, nestedStrides Ctensor, offsetsData []int64, offsetsLen int){
coffsetsDataPtr := (*C.int64_t)(unsafe.Pointer(&offsetsData[0]))
coffsetsLen := *(*C.int)(unsafe.Pointer(&offsetsLen))
C.atg__nested_view_from_buffer_copy_out(ptr, out, self, nestedSize, nestedStrides, coffsetsDataPtr, coffsetsLen)
func Atg_NestedViewFromBufferCopyOut(ptr *Ctensor, out Ctensor, self Ctensor, nestedSize Ctensor, nestedStrides Ctensor, offsets Ctensor){
C.atg__nested_view_from_buffer_copy_out(ptr, out, self, nestedSize, nestedStrides, offsets)
}
func Atg_NewZerosWithSameFeatureMeta(ptr *Ctensor, self Ctensor, other Ctensor, selfNumBatchDims int64){
cselfNumBatchDims := *(*C.int64_t)(unsafe.Pointer(&selfNumBatchDims))
@ -1405,6 +1431,7 @@ func Atg_PreluKernel(ptr *Ctensor, self Ctensor, weight Ctensor){
func Atg_PreluKernelBackward(ptr *Ctensor, gradOutput Ctensor, self Ctensor, weight Ctensor){
C.atg__prelu_kernel_backward(ptr, gradOutput, self, weight)
}
func Atg_RemoveBatchDim(ptr *Ctensor, self Ctensor, level int64, batchSize int64, outDim int64){
clevel := *(*C.int64_t)(unsafe.Pointer(&level))
cbatchSize := *(*C.int64_t)(unsafe.Pointer(&batchSize))
@ -1471,35 +1498,61 @@ func Atg_SampleDirichletOut(ptr *Ctensor, out Ctensor, self Ctensor){
func Atg_SaturateWeightToFp16(ptr *Ctensor, weight Ctensor){
C.atg__saturate_weight_to_fp16(ptr, weight)
}
func Atg_ScaledDotProductAttention(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, attnMask Ctensor, dropoutP float64, needAttnWeights int32, isCausal int32){
func Atg_ScaledDotProductAttentionMath(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, attnMask Ctensor, dropoutP float64, isCausal int32, dropoutMask Ctensor, scaleVal float64, scaleNull int){
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
cneedAttnWeights := *(*C.int)(unsafe.Pointer(&needAttnWeights))
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
C.atg__scaled_dot_product_attention(ptr, query, key, value, attnMask, cdropoutP, cneedAttnWeights, cisCausal)
}
func Atg_ScaledDotProductAttentionMath(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, attnMask Ctensor, dropoutP float64, isCausal int32, dropoutMask Ctensor){
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
C.atg__scaled_dot_product_attention_math(ptr, query, key, value, attnMask, cdropoutP, cisCausal, dropoutMask)
}
func Atg_ScaledDotProductEfficientAttention(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, computeLogSumexp int32, isCausal int32){
ccomputeLogSumexp := *(*C.int)(unsafe.Pointer(&computeLogSumexp))
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
C.atg__scaled_dot_product_efficient_attention(ptr, query, key, value, ccomputeLogSumexp, cisCausal)
}
func Atg_ScaledDotProductEfficientAttentionBackward(ptr *Ctensor, gradOut_ Ctensor, query Ctensor, key Ctensor, value Ctensor, out Ctensor, logsumexp Ctensor, isCausal int32, chunkGradOutputs int32){
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
cchunkGradOutputs := *(*C.int)(unsafe.Pointer(&chunkGradOutputs))
C.atg__scaled_dot_product_efficient_attention_backward(ptr, gradOut_, query, key, value, out, logsumexp, cisCausal, cchunkGradOutputs)
cscaleVal := *(*C.double)(unsafe.Pointer(&scaleVal))
cscaleNull := *(*C.uint8_t)(unsafe.Pointer(&scaleNull))
C.atg__scaled_dot_product_attention_math(ptr, query, key, value, attnMask, cdropoutP, cisCausal, dropoutMask, cscaleVal, cscaleNull)
}
func Atg_ScaledDotProductFlashAttentionBackward(ptr *Ctensor, gradOut Ctensor, query Ctensor, key Ctensor, value Ctensor, out Ctensor, logsumexp Ctensor, cumSeqQ Ctensor, cumSeqK Ctensor, maxQ int64, maxK int64, dropoutP float64, isCausal int32, philoxSeed int64, philoxOffset int64){
func Atg_ScaledDotProductEfficientAttention(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, attnBias Ctensor, computeLogSumexp int32, dropoutP float64, isCausal int32, scaleVal float64, scaleNull int){
ccomputeLogSumexp := *(*C.int)(unsafe.Pointer(&computeLogSumexp))
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
cscaleVal := *(*C.double)(unsafe.Pointer(&scaleVal))
cscaleNull := *(*C.uint8_t)(unsafe.Pointer(&scaleNull))
C.atg__scaled_dot_product_efficient_attention(ptr, query, key, value, attnBias, ccomputeLogSumexp, cdropoutP, cisCausal, cscaleVal, cscaleNull)
}
func Atg_ScaledDotProductFlashAttentionBackward(ptr *Ctensor, gradOut Ctensor, query Ctensor, key Ctensor, value Ctensor, out Ctensor, logsumexp Ctensor, cumSeqQ Ctensor, cumSeqK Ctensor, maxQ int64, maxK int64, dropoutP float64, isCausal int32, philoxSeed Ctensor, philoxOffset Ctensor, scaleVal float64, scaleNull int){
cmaxQ := *(*C.int64_t)(unsafe.Pointer(&maxQ))
cmaxK := *(*C.int64_t)(unsafe.Pointer(&maxK))
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
cphiloxSeed := *(*C.int64_t)(unsafe.Pointer(&philoxSeed))
cphiloxOffset := *(*C.int64_t)(unsafe.Pointer(&philoxOffset))
C.atg__scaled_dot_product_flash_attention_backward(ptr, gradOut, query, key, value, out, logsumexp, cumSeqQ, cumSeqK, cmaxQ, cmaxK, cdropoutP, cisCausal, cphiloxSeed, cphiloxOffset)
cscaleVal := *(*C.double)(unsafe.Pointer(&scaleVal))
cscaleNull := *(*C.uint8_t)(unsafe.Pointer(&scaleNull))
C.atg__scaled_dot_product_flash_attention_backward(ptr, gradOut, query, key, value, out, logsumexp, cumSeqQ, cumSeqK, cmaxQ, cmaxK, cdropoutP, cisCausal, philoxSeed, philoxOffset, cscaleVal, cscaleNull)
}
func Atg_ScaledMm(ptr *Ctensor, self Ctensor, mat2 Ctensor, bias Ctensor, outDtype int32, scaleA Ctensor, scaleB Ctensor, scaleResult Ctensor){
coutDtype := *(*C.int)(unsafe.Pointer(&outDtype))
C.atg__scaled_mm(ptr, self, mat2, bias, coutDtype, scaleA, scaleB, scaleResult)
}
func Atg_ScaledMmOut(ptr *Ctensor, out Ctensor, outAmax Ctensor, self Ctensor, mat2 Ctensor, bias Ctensor, outDtype int32, scaleA Ctensor, scaleB Ctensor, scaleResult Ctensor){
coutDtype := *(*C.int)(unsafe.Pointer(&outDtype))
C.atg__scaled_mm_out(ptr, out, outAmax, self, mat2, bias, coutDtype, scaleA, scaleB, scaleResult)
}
func Atg_ScatterReduce(ptr *Ctensor, self Ctensor, dim int64, index Ctensor, src Ctensor, reduce string, includeSelf int32){
cdim := *(*C.int64_t)(unsafe.Pointer(&dim))
creduce := C.CString(reduce)
reduceLen := len(reduce)
creduceLen := *(*C.int)(unsafe.Pointer(&reduceLen))
cincludeSelf := *(*C.int)(unsafe.Pointer(&includeSelf))
C.atg__scatter_reduce(ptr, self, cdim, index, src, creduce, creduceLen, cincludeSelf)
}
func Atg_ScatterReduce_(ptr *Ctensor, self Ctensor, dim int64, index Ctensor, src Ctensor, reduce string, includeSelf int32){
cdim := *(*C.int64_t)(unsafe.Pointer(&dim))
creduce := C.CString(reduce)
reduceLen := len(reduce)
creduceLen := *(*C.int)(unsafe.Pointer(&reduceLen))
cincludeSelf := *(*C.int)(unsafe.Pointer(&includeSelf))
C.atg__scatter_reduce_(ptr, self, cdim, index, src, creduce, creduceLen, cincludeSelf)
}
func Atg_ScatterReduceTwoOut(ptr *Ctensor, out Ctensor, self Ctensor, dim int64, index Ctensor, src Ctensor, reduce string, includeSelf int32){
cdim := *(*C.int64_t)(unsafe.Pointer(&dim))
creduce := C.CString(reduce)
reduceLen := len(reduce)
creduceLen := *(*C.int)(unsafe.Pointer(&reduceLen))
cincludeSelf := *(*C.int)(unsafe.Pointer(&includeSelf))
C.atg__scatter_reduce_two_out(ptr, out, self, cdim, index, src, creduce, creduceLen, cincludeSelf)
}
func Atg_SegmentReduceBackward(ptr *Ctensor, grad Ctensor, output Ctensor, data Ctensor, reduce string, lengths Ctensor, offsets Ctensor, axis int64, initial Cscalar){
creduce := C.CString(reduce)
@ -1610,12 +1663,13 @@ coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg__sparse_compressed_tensor_unsafe(ptr, compressedIndices, plainIndices, values, csizeDataPtr, csizeLen, coptionsKind, coptionsDevice)
}
func Atg_SparseCooTensorUnsafe(ptr *Ctensor, indices Ctensor, values Ctensor, sizeData []int64, sizeLen int, optionsKind int32, optionsDevice int32){
func Atg_SparseCooTensorUnsafe(ptr *Ctensor, indices Ctensor, values Ctensor, sizeData []int64, sizeLen int, optionsKind int32, optionsDevice int32, isCoalesced int32){
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg__sparse_coo_tensor_unsafe(ptr, indices, values, csizeDataPtr, csizeLen, coptionsKind, coptionsDevice)
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
cisCoalesced := *(*C.int)(unsafe.Pointer(&isCoalesced))
C.atg__sparse_coo_tensor_unsafe(ptr, indices, values, csizeDataPtr, csizeLen, coptionsKind, coptionsDevice, cisCoalesced)
}
func Atg_SparseCooTensorWithDims(ptr *Ctensor, sparseDim int64, denseDim int64, sizeData []int64, sizeLen int, optionsKind int32, optionsDevice int32){
csparseDim := *(*C.int64_t)(unsafe.Pointer(&sparseDim))
@ -1626,21 +1680,23 @@ coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg__sparse_coo_tensor_with_dims(ptr, csparseDim, cdenseDim, csizeDataPtr, csizeLen, coptionsKind, coptionsDevice)
}
func Atg_SparseCooTensorWithDimsAndTensors(ptr *Ctensor, sparseDim int64, denseDim int64, sizeData []int64, sizeLen int, indices Ctensor, values Ctensor, optionsKind int32, optionsDevice int32){
func Atg_SparseCooTensorWithDimsAndTensors(ptr *Ctensor, sparseDim int64, denseDim int64, sizeData []int64, sizeLen int, indices Ctensor, values Ctensor, optionsKind int32, optionsDevice int32, isCoalesced int32){
csparseDim := *(*C.int64_t)(unsafe.Pointer(&sparseDim))
cdenseDim := *(*C.int64_t)(unsafe.Pointer(&denseDim))
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg__sparse_coo_tensor_with_dims_and_tensors(ptr, csparseDim, cdenseDim, csizeDataPtr, csizeLen, indices, values, coptionsKind, coptionsDevice)
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
cisCoalesced := *(*C.int)(unsafe.Pointer(&isCoalesced))
C.atg__sparse_coo_tensor_with_dims_and_tensors(ptr, csparseDim, cdenseDim, csizeDataPtr, csizeLen, indices, values, coptionsKind, coptionsDevice, cisCoalesced)
}
func Atg_SparseCooTensorWithDimsAndTensorsOut(ptr *Ctensor, out Ctensor, sparseDim int64, denseDim int64, sizeData []int64, sizeLen int, indices Ctensor, values Ctensor){
func Atg_SparseCooTensorWithDimsAndTensorsOut(ptr *Ctensor, out Ctensor, sparseDim int64, denseDim int64, sizeData []int64, sizeLen int, indices Ctensor, values Ctensor, isCoalesced int32){
csparseDim := *(*C.int64_t)(unsafe.Pointer(&sparseDim))
cdenseDim := *(*C.int64_t)(unsafe.Pointer(&denseDim))
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
C.atg__sparse_coo_tensor_with_dims_and_tensors_out(ptr, out, csparseDim, cdenseDim, csizeDataPtr, csizeLen, indices, values)
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
cisCoalesced := *(*C.int)(unsafe.Pointer(&isCoalesced))
C.atg__sparse_coo_tensor_with_dims_and_tensors_out(ptr, out, csparseDim, cdenseDim, csizeDataPtr, csizeLen, indices, values, cisCoalesced)
}
func Atg_SparseCooTensorWithDimsOut(ptr *Ctensor, out Ctensor, sparseDim int64, denseDim int64, sizeData []int64, sizeLen int){
csparseDim := *(*C.int64_t)(unsafe.Pointer(&sparseDim))
@ -1714,6 +1770,14 @@ cdim := *(*C.int64_t)(unsafe.Pointer(&dim))
chalfToFloat := *(*C.int)(unsafe.Pointer(&halfToFloat))
C.atg__sparse_log_softmax_out(ptr, out, self, cdim, chalfToFloat)
}
func Atg_SparseMaskProjection(ptr *Ctensor, self Ctensor, mask Ctensor, accumulateMatches int32){
caccumulateMatches := *(*C.int)(unsafe.Pointer(&accumulateMatches))
C.atg__sparse_mask_projection(ptr, self, mask, caccumulateMatches)
}
func Atg_SparseMaskProjectionOut(ptr *Ctensor, out Ctensor, self Ctensor, mask Ctensor, accumulateMatches int32){
caccumulateMatches := *(*C.int)(unsafe.Pointer(&accumulateMatches))
C.atg__sparse_mask_projection_out(ptr, out, self, mask, caccumulateMatches)
}
func Atg_SparseMm(ptr *Ctensor, sparse Ctensor, dense Ctensor){
C.atg__sparse_mm(ptr, sparse, dense)
}
@ -1729,6 +1793,12 @@ reduceLen := len(reduce)
creduceLen := *(*C.int)(unsafe.Pointer(&reduceLen))
C.atg__sparse_mm_reduce_impl(ptr, self, other, creduce, creduceLen)
}
func Atg_SparseSemiStructuredLinear(ptr *Ctensor, input Ctensor, weight Ctensor, meta Ctensor, bias Ctensor, activation string){
cactivation := C.CString(activation)
activationLen := len(activation)
cactivationLen := *(*C.int)(unsafe.Pointer(&activationLen))
C.atg__sparse_semi_structured_linear(ptr, input, weight, meta, bias, cactivation, cactivationLen)
}
func Atg_SparseSoftmax(ptr *Ctensor, self Ctensor, dim int64, halfToFloat int32){
cdim := *(*C.int64_t)(unsafe.Pointer(&dim))
chalfToFloat := *(*C.int)(unsafe.Pointer(&halfToFloat))
@ -1861,6 +1931,12 @@ func Atg_TestAutogradMultipleDispatchViewCopyOut(ptr *Ctensor, out Ctensor, self
func Atg_TestCheckTensor(ptr *Ctensor, self Ctensor){
C.atg__test_check_tensor(ptr, self)
}
func Atg_TestFunctorchFallback(ptr *Ctensor, self Ctensor, other Ctensor){
C.atg__test_functorch_fallback(ptr, self, other)
}
func Atg_TestFunctorchFallbackOut(ptr *Ctensor, out Ctensor, self Ctensor, other Ctensor){
C.atg__test_functorch_fallback_out(ptr, out, self, other)
}
func Atg_TestOptionalFilledIntlist(ptr *Ctensor, values Ctensor, addendsData []int64, addendsLen int){
caddendsDataPtr := (*C.int64_t)(unsafe.Pointer(&addendsData[0]))
caddendsLen := *(*C.int)(unsafe.Pointer(&addendsLen))
@ -1920,13 +1996,90 @@ cnonBlocking := *(*C.int)(unsafe.Pointer(&nonBlocking))
C.atg__to_copy_out(ptr, out, self, cnonBlocking)
}
func Atg_ToDense(ptr *Ctensor, self Ctensor, dtype int32){
cdtype := *(*C.int)(unsafe.Pointer(&dtype))
C.atg__to_dense(ptr, self, cdtype)
func Atg_ToDense(ptr *Ctensor, self Ctensor, dtype int32, maskedGrad int32){
cdtype := *(*C.int)(unsafe.Pointer(&dtype))
cmaskedGrad := *(*C.int)(unsafe.Pointer(&maskedGrad))
C.atg__to_dense(ptr, self, cdtype, cmaskedGrad)
}
func Atg_ToDenseOut(ptr *Ctensor, out Ctensor, self Ctensor, dtype int32){
cdtype := *(*C.int)(unsafe.Pointer(&dtype))
C.atg__to_dense_out(ptr, out, self, cdtype)
func Atg_ToDenseOut(ptr *Ctensor, out Ctensor, self Ctensor, dtype int32, maskedGrad int32){
cdtype := *(*C.int)(unsafe.Pointer(&dtype))
cmaskedGrad := *(*C.int)(unsafe.Pointer(&maskedGrad))
C.atg__to_dense_out(ptr, out, self, cdtype, cmaskedGrad)
}
func Atg_ToSparse(ptr *Ctensor, self Ctensor, layout int8, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
clayout := *(*C.int8_t)(unsafe.Pointer(&layout))
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse(ptr, self, clayout, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseBsc(ptr *Ctensor, self Ctensor, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_bsc(ptr, self, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseBscOut(ptr *Ctensor, out Ctensor, self Ctensor, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_bsc_out(ptr, out, self, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseBsr(ptr *Ctensor, self Ctensor, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_bsr(ptr, self, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseBsrOut(ptr *Ctensor, out Ctensor, self Ctensor, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_bsr_out(ptr, out, self, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseCsc(ptr *Ctensor, self Ctensor, denseDimVal int64, denseDimNull int){
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_csc(ptr, self, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseCscOut(ptr *Ctensor, out Ctensor, self Ctensor, denseDimVal int64, denseDimNull int){
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_csc_out(ptr, out, self, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseCsr(ptr *Ctensor, self Ctensor, denseDimVal int64, denseDimNull int){
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_csr(ptr, self, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseCsrOut(ptr *Ctensor, out Ctensor, self Ctensor, denseDimVal int64, denseDimNull int){
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_csr_out(ptr, out, self, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseOut(ptr *Ctensor, out Ctensor, self Ctensor, layout int8, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
clayout := *(*C.int8_t)(unsafe.Pointer(&layout))
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg__to_sparse_out(ptr, out, self, clayout, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func Atg_ToSparseSemiStructured(ptr *Ctensor, dense Ctensor){
C.atg__to_sparse_semi_structured(ptr, dense)
}
func Atg_ToSparseSparseDim(ptr *Ctensor, self Ctensor, sparseDim int64){
csparseDim := *(*C.int64_t)(unsafe.Pointer(&sparseDim))
C.atg__to_sparse_sparse_dim(ptr, self, csparseDim)
}
func Atg_ToSparseSparseDimOut(ptr *Ctensor, out Ctensor, self Ctensor, sparseDim int64){
csparseDim := *(*C.int64_t)(unsafe.Pointer(&sparseDim))
C.atg__to_sparse_sparse_dim_out(ptr, out, self, csparseDim)
}
func Atg_TransformBiasRescaleQkv(ptr *Ctensor, qkv Ctensor, qkvBias Ctensor, numHeads int64){
cnumHeads := *(*C.int64_t)(unsafe.Pointer(&numHeads))
@ -1936,22 +2089,6 @@ func Atg_TransformBiasRescaleQkvOut(ptr *Ctensor, out0 Ctensor, out1 Ctensor, ou
cnumHeads := *(*C.int64_t)(unsafe.Pointer(&numHeads))
C.atg__transform_bias_rescale_qkv_out(ptr, out0, out1, out2, qkv, qkvBias, cnumHeads)
}
func Atg_TransformerDecoderOnlyLayerFwd(ptr *Ctensor, src Ctensor, embedDim int64, numHeads int64, qkvWeight Ctensor, qkvBias Ctensor, projWeight Ctensor, projBias Ctensor, useGelu int32, normFirst int32, eps float64, normWeight1 Ctensor, normBias1 Ctensor, normWeight2 Ctensor, normBias2 Ctensor, ffnWeight1 Ctensor, ffnBias1 Ctensor, ffnWeight2 Ctensor, ffnBias2 Ctensor, mask Ctensor, incrKey Ctensor, incrValue Ctensor){
cembedDim := *(*C.int64_t)(unsafe.Pointer(&embedDim))
cnumHeads := *(*C.int64_t)(unsafe.Pointer(&numHeads))
cuseGelu := *(*C.int)(unsafe.Pointer(&useGelu))
cnormFirst := *(*C.int)(unsafe.Pointer(&normFirst))
ceps := *(*C.double)(unsafe.Pointer(&eps))
C.atg__transformer_decoder_only_layer_fwd(ptr, src, cembedDim, cnumHeads, qkvWeight, qkvBias, projWeight, projBias, cuseGelu, cnormFirst, ceps, normWeight1, normBias1, normWeight2, normBias2, ffnWeight1, ffnBias1, ffnWeight2, ffnBias2, mask, incrKey, incrValue)
}
func Atg_TransformerDecoderOnlyLayerFwdOut(ptr *Ctensor, out0 Ctensor, out1 Ctensor, out2 Ctensor, src Ctensor, embedDim int64, numHeads int64, qkvWeight Ctensor, qkvBias Ctensor, projWeight Ctensor, projBias Ctensor, useGelu int32, normFirst int32, eps float64, normWeight1 Ctensor, normBias1 Ctensor, normWeight2 Ctensor, normBias2 Ctensor, ffnWeight1 Ctensor, ffnBias1 Ctensor, ffnWeight2 Ctensor, ffnBias2 Ctensor, mask Ctensor, incrKey Ctensor, incrValue Ctensor){
cembedDim := *(*C.int64_t)(unsafe.Pointer(&embedDim))
cnumHeads := *(*C.int64_t)(unsafe.Pointer(&numHeads))
cuseGelu := *(*C.int)(unsafe.Pointer(&useGelu))
cnormFirst := *(*C.int)(unsafe.Pointer(&normFirst))
ceps := *(*C.double)(unsafe.Pointer(&eps))
C.atg__transformer_decoder_only_layer_fwd_out(ptr, out0, out1, out2, src, cembedDim, cnumHeads, qkvWeight, qkvBias, projWeight, projBias, cuseGelu, cnormFirst, ceps, normWeight1, normBias1, normWeight2, normBias2, ffnWeight1, ffnBias1, ffnWeight2, ffnBias2, mask, incrKey, incrValue)
}
func Atg_TransformerEncoderLayerFwd(ptr *Ctensor, src Ctensor, embedDim int64, numHeads int64, qkvWeight Ctensor, qkvBias Ctensor, projWeight Ctensor, projBias Ctensor, useGelu int32, normFirst int32, eps float64, normWeight1 Ctensor, normBias1 Ctensor, normWeight2 Ctensor, normBias2 Ctensor, ffnWeight1 Ctensor, ffnBias1 Ctensor, ffnWeight2 Ctensor, ffnBias2 Ctensor, mask Ctensor, maskTypeVal int64, maskTypeNull int){
cembedDim := *(*C.int64_t)(unsafe.Pointer(&embedDim))
cnumHeads := *(*C.int64_t)(unsafe.Pointer(&numHeads))
@ -2040,6 +2177,17 @@ func Atg_UnpackDual(ptr *Ctensor, dual Ctensor, level int64){
clevel := *(*C.int64_t)(unsafe.Pointer(&level))
C.atg__unpack_dual(ptr, dual, clevel)
}
func Atg_UnsafeIndex(ptr *Ctensor, self Ctensor, indicesData []Ctensor, indicesLen int){
cindicesDataPtr := (*Ctensor)(unsafe.Pointer(&indicesData[0]))
cindicesLen := *(*C.int)(unsafe.Pointer(&indicesLen))
C.atg__unsafe_index(ptr, self, cindicesDataPtr, cindicesLen)
}
func Atg_UnsafeIndexPut(ptr *Ctensor, self Ctensor, indicesData []Ctensor, indicesLen int, values Ctensor, accumulate int32){
cindicesDataPtr := (*Ctensor)(unsafe.Pointer(&indicesData[0]))
cindicesLen := *(*C.int)(unsafe.Pointer(&indicesLen))
caccumulate := *(*C.int)(unsafe.Pointer(&accumulate))
C.atg__unsafe_index_put(ptr, self, cindicesDataPtr, cindicesLen, values, caccumulate)
}
func Atg_UnsafeView(ptr *Ctensor, self Ctensor, sizeData []int64, sizeLen int){
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
@ -3042,11 +3190,11 @@ ceps := *(*C.double)(unsafe.Pointer(&eps))
ccudnnEnabled := *(*C.int)(unsafe.Pointer(&cudnnEnabled))
C.atg_batch_norm(ptr, input, weight, bias, runningMean, runningVar, ctraining, cmomentum, ceps, ccudnnEnabled)
}
func AtgBatchNormBackwardElemt(ptr *Ctensor, gradOut Ctensor, input Ctensor, mean Ctensor, invstd Ctensor, weight Ctensor, meanDy Ctensor, meanDyXmu Ctensor, count Ctensor){
C.atg_batch_norm_backward_elemt(ptr, gradOut, input, mean, invstd, weight, meanDy, meanDyXmu, count)
func AtgBatchNormBackwardElemt(ptr *Ctensor, gradOut Ctensor, input Ctensor, mean Ctensor, invstd Ctensor, weight Ctensor, sumDy Ctensor, sumDyXmu Ctensor, count Ctensor){
C.atg_batch_norm_backward_elemt(ptr, gradOut, input, mean, invstd, weight, sumDy, sumDyXmu, count)
}
func AtgBatchNormBackwardElemtOut(ptr *Ctensor, out Ctensor, gradOut Ctensor, input Ctensor, mean Ctensor, invstd Ctensor, weight Ctensor, meanDy Ctensor, meanDyXmu Ctensor, count Ctensor){
C.atg_batch_norm_backward_elemt_out(ptr, out, gradOut, input, mean, invstd, weight, meanDy, meanDyXmu, count)
func AtgBatchNormBackwardElemtOut(ptr *Ctensor, out Ctensor, gradOut Ctensor, input Ctensor, mean Ctensor, invstd Ctensor, weight Ctensor, sumDy Ctensor, sumDyXmu Ctensor, count Ctensor){
C.atg_batch_norm_backward_elemt_out(ptr, out, gradOut, input, mean, invstd, weight, sumDy, sumDyXmu, count)
}
func AtgBatchNormBackwardReduce(ptr *Ctensor, gradOut Ctensor, input Ctensor, mean Ctensor, invstd Ctensor, weight Ctensor, inputG int32, weightG int32, biasG int32){
cinputG := *(*C.int)(unsafe.Pointer(&inputG))
@ -4579,6 +4727,22 @@ csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
C.atg_empty_out(ptr, out, csizeDataPtr, csizeLen)
}
func AtgEmptyPermuted(ptr *Ctensor, sizeData []int64, sizeLen int, physicalLayoutData []int64, physicalLayoutLen int, optionsKind int32, optionsDevice int32){
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
cphysicalLayoutDataPtr := (*C.int64_t)(unsafe.Pointer(&physicalLayoutData[0]))
cphysicalLayoutLen := *(*C.int)(unsafe.Pointer(&physicalLayoutLen))
coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg_empty_permuted(ptr, csizeDataPtr, csizeLen, cphysicalLayoutDataPtr, cphysicalLayoutLen, coptionsKind, coptionsDevice)
}
func AtgEmptyPermutedOut(ptr *Ctensor, out Ctensor, sizeData []int64, sizeLen int, physicalLayoutData []int64, physicalLayoutLen int){
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
cphysicalLayoutDataPtr := (*C.int64_t)(unsafe.Pointer(&physicalLayoutData[0]))
cphysicalLayoutLen := *(*C.int)(unsafe.Pointer(&physicalLayoutLen))
C.atg_empty_permuted_out(ptr, out, csizeDataPtr, csizeLen, cphysicalLayoutDataPtr, cphysicalLayoutLen)
}
func AtgEmptyQuantized(ptr *Ctensor, sizeData []int64, sizeLen int, qtensor Ctensor, optionsKind int32, optionsDevice int32){
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
@ -7544,6 +7708,9 @@ func AtgMinOther(ptr *Ctensor, self Ctensor, other Ctensor){
func AtgMinOut(ptr *Ctensor, out Ctensor, self Ctensor, other Ctensor){
C.atg_min_out(ptr, out, self, other)
}
func AtgMinUnaryOut(ptr *Ctensor, out Ctensor, self Ctensor){
C.atg_min_unary_out(ptr, out, self)
}
func AtgMinimum(ptr *Ctensor, self Ctensor, other Ctensor){
C.atg_minimum(ptr, self, other)
}
@ -8491,6 +8658,16 @@ func AtgNonzero(ptr *Ctensor, self Ctensor){
func AtgNonzeroOut(ptr *Ctensor, out Ctensor, self Ctensor){
C.atg_nonzero_out(ptr, out, self)
}
func AtgNonzeroStatic(ptr *Ctensor, self Ctensor, size int64, fillValue int64){
csize := *(*C.int64_t)(unsafe.Pointer(&size))
cfillValue := *(*C.int64_t)(unsafe.Pointer(&fillValue))
C.atg_nonzero_static(ptr, self, csize, cfillValue)
}
func AtgNonzeroStaticOut(ptr *Ctensor, out Ctensor, self Ctensor, size int64, fillValue int64){
csize := *(*C.int64_t)(unsafe.Pointer(&size))
cfillValue := *(*C.int64_t)(unsafe.Pointer(&fillValue))
C.atg_nonzero_static_out(ptr, out, self, csize, cfillValue)
}
func AtgNorm(ptr *Ctensor, self Ctensor){
C.atg_norm(ptr, self)
}
@ -8963,6 +9140,30 @@ cdilationLen := *(*C.int)(unsafe.Pointer(&dilationLen))
cceilMode := *(*C.int)(unsafe.Pointer(&ceilMode))
C.atg_quantized_max_pool2d_out(ptr, out, self, ckernelSizeDataPtr, ckernelSizeLen, cstrideDataPtr, cstrideLen, cpaddingDataPtr, cpaddingLen, cdilationDataPtr, cdilationLen, cceilMode)
}
func AtgQuantizedMaxPool3d(ptr *Ctensor, self Ctensor, kernelSizeData []int64, kernelSizeLen int, strideData []int64, strideLen int, paddingData []int64, paddingLen int, dilationData []int64, dilationLen int, ceilMode int32){
ckernelSizeDataPtr := (*C.int64_t)(unsafe.Pointer(&kernelSizeData[0]))
ckernelSizeLen := *(*C.int)(unsafe.Pointer(&kernelSizeLen))
cstrideDataPtr := (*C.int64_t)(unsafe.Pointer(&strideData[0]))
cstrideLen := *(*C.int)(unsafe.Pointer(&strideLen))
cpaddingDataPtr := (*C.int64_t)(unsafe.Pointer(&paddingData[0]))
cpaddingLen := *(*C.int)(unsafe.Pointer(&paddingLen))
cdilationDataPtr := (*C.int64_t)(unsafe.Pointer(&dilationData[0]))
cdilationLen := *(*C.int)(unsafe.Pointer(&dilationLen))
cceilMode := *(*C.int)(unsafe.Pointer(&ceilMode))
C.atg_quantized_max_pool3d(ptr, self, ckernelSizeDataPtr, ckernelSizeLen, cstrideDataPtr, cstrideLen, cpaddingDataPtr, cpaddingLen, cdilationDataPtr, cdilationLen, cceilMode)
}
func AtgQuantizedMaxPool3dOut(ptr *Ctensor, out Ctensor, self Ctensor, kernelSizeData []int64, kernelSizeLen int, strideData []int64, strideLen int, paddingData []int64, paddingLen int, dilationData []int64, dilationLen int, ceilMode int32){
ckernelSizeDataPtr := (*C.int64_t)(unsafe.Pointer(&kernelSizeData[0]))
ckernelSizeLen := *(*C.int)(unsafe.Pointer(&kernelSizeLen))
cstrideDataPtr := (*C.int64_t)(unsafe.Pointer(&strideData[0]))
cstrideLen := *(*C.int)(unsafe.Pointer(&strideLen))
cpaddingDataPtr := (*C.int64_t)(unsafe.Pointer(&paddingData[0]))
cpaddingLen := *(*C.int)(unsafe.Pointer(&paddingLen))
cdilationDataPtr := (*C.int64_t)(unsafe.Pointer(&dilationData[0]))
cdilationLen := *(*C.int)(unsafe.Pointer(&dilationLen))
cceilMode := *(*C.int)(unsafe.Pointer(&ceilMode))
C.atg_quantized_max_pool3d_out(ptr, out, self, ckernelSizeDataPtr, ckernelSizeLen, cstrideDataPtr, cstrideLen, cpaddingDataPtr, cpaddingLen, cdilationDataPtr, cdilationLen, cceilMode)
}
func AtgQuantizedRnnReluCell(ptr *Ctensor, input Ctensor, hx Ctensor, wIh Ctensor, wHh Ctensor, bIh Ctensor, bHh Ctensor, packedIh Ctensor, packedHh Ctensor, colOffsetsIh Ctensor, colOffsetsHh Ctensor, scaleIh Cscalar, scaleHh Cscalar, zeroPointIh Cscalar, zeroPointHh Cscalar){
C.atg_quantized_rnn_relu_cell(ptr, input, hx, wIh, wHh, bIh, bHh, packedIh, packedHh, colOffsetsIh, colOffsetsHh, scaleIh , scaleHh , zeroPointIh , zeroPointHh )
}
@ -9578,10 +9779,12 @@ coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
func AtgScalarTensorOut(ptr *Ctensor, out Ctensor, s Cscalar){
C.atg_scalar_tensor_out(ptr, out, s )
}
func AtgScaledDotProductAttention(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, attnMask Ctensor, dropoutP float64, isCausal int32){
func AtgScaledDotProductAttention(ptr *Ctensor, query Ctensor, key Ctensor, value Ctensor, attnMask Ctensor, dropoutP float64, isCausal int32, scaleVal float64, scaleNull int){
cdropoutP := *(*C.double)(unsafe.Pointer(&dropoutP))
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
C.atg_scaled_dot_product_attention(ptr, query, key, value, attnMask, cdropoutP, cisCausal)
cisCausal := *(*C.int)(unsafe.Pointer(&isCausal))
cscaleVal := *(*C.double)(unsafe.Pointer(&scaleVal))
cscaleNull := *(*C.uint8_t)(unsafe.Pointer(&scaleNull))
C.atg_scaled_dot_product_attention(ptr, query, key, value, attnMask, cdropoutP, cisCausal, cscaleVal, cscaleNull)
}
func AtgScatter(ptr *Ctensor, self Ctensor, dim int64, index Ctensor, src Ctensor){
cdim := *(*C.int64_t)(unsafe.Pointer(&dim))
@ -10186,17 +10389,19 @@ coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg_sparse_coo_tensor(ptr, csizeDataPtr, csizeLen, coptionsKind, coptionsDevice)
}
func AtgSparseCooTensorIndices(ptr *Ctensor, indices Ctensor, values Ctensor, optionsKind int32, optionsDevice int32){
func AtgSparseCooTensorIndices(ptr *Ctensor, indices Ctensor, values Ctensor, optionsKind int32, optionsDevice int32, isCoalesced int32){
coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg_sparse_coo_tensor_indices(ptr, indices, values, coptionsKind, coptionsDevice)
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
cisCoalesced := *(*C.int)(unsafe.Pointer(&isCoalesced))
C.atg_sparse_coo_tensor_indices(ptr, indices, values, coptionsKind, coptionsDevice, cisCoalesced)
}
func AtgSparseCooTensorIndicesSize(ptr *Ctensor, indices Ctensor, values Ctensor, sizeData []int64, sizeLen int, optionsKind int32, optionsDevice int32){
func AtgSparseCooTensorIndicesSize(ptr *Ctensor, indices Ctensor, values Ctensor, sizeData []int64, sizeLen int, optionsKind int32, optionsDevice int32, isCoalesced int32){
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
csizeLen := *(*C.int)(unsafe.Pointer(&sizeLen))
coptionsKind := *(*C.int)(unsafe.Pointer(&optionsKind))
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
C.atg_sparse_coo_tensor_indices_size(ptr, indices, values, csizeDataPtr, csizeLen, coptionsKind, coptionsDevice)
coptionsDevice := *(*C.int)(unsafe.Pointer(&optionsDevice))
cisCoalesced := *(*C.int)(unsafe.Pointer(&isCoalesced))
C.atg_sparse_coo_tensor_indices_size(ptr, indices, values, csizeDataPtr, csizeLen, coptionsKind, coptionsDevice, cisCoalesced)
}
func AtgSparseCooTensorSizeOut(ptr *Ctensor, out Ctensor, sizeData []int64, sizeLen int){
csizeDataPtr := (*C.int64_t)(unsafe.Pointer(&sizeData[0]))
@ -10908,21 +11113,17 @@ func AtgStd(ptr *Ctensor, self Ctensor, unbiased int32){
cunbiased := *(*C.int)(unsafe.Pointer(&unbiased))
C.atg_std(ptr, self, cunbiased)
}
func AtgStdCorrection(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, correctionVal int64, correctionNull int, keepdim int32){
func AtgStdCorrection(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, correction Cscalar, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ccorrectionVal := *(*C.int64_t)(unsafe.Pointer(&correctionVal))
ccorrectionNull := *(*C.uint8_t)(unsafe.Pointer(&correctionNull))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg_std_correction(ptr, self, cdimDataPtr, cdimLen, ccorrectionVal, ccorrectionNull, ckeepdim)
C.atg_std_correction(ptr, self, cdimDataPtr, cdimLen, correction , ckeepdim)
}
func AtgStdCorrectionOut(ptr *Ctensor, out Ctensor, self Ctensor, dimData []int64, dimLen int, correctionVal int64, correctionNull int, keepdim int32){
func AtgStdCorrectionOut(ptr *Ctensor, out Ctensor, self Ctensor, dimData []int64, dimLen int, correction Cscalar, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ccorrectionVal := *(*C.int64_t)(unsafe.Pointer(&correctionVal))
ccorrectionNull := *(*C.uint8_t)(unsafe.Pointer(&correctionNull))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg_std_correction_out(ptr, out, self, cdimDataPtr, cdimLen, ccorrectionVal, ccorrectionNull, ckeepdim)
C.atg_std_correction_out(ptr, out, self, cdimDataPtr, cdimLen, correction , ckeepdim)
}
func AtgStdDim(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, unbiased int32, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
@ -10935,21 +11136,17 @@ func AtgStdMean(ptr *Ctensor, self Ctensor, unbiased int32){
cunbiased := *(*C.int)(unsafe.Pointer(&unbiased))
C.atg_std_mean(ptr, self, cunbiased)
}
func AtgStdMeanCorrection(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, correctionVal int64, correctionNull int, keepdim int32){
func AtgStdMeanCorrection(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, correction Cscalar, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ccorrectionVal := *(*C.int64_t)(unsafe.Pointer(&correctionVal))
ccorrectionNull := *(*C.uint8_t)(unsafe.Pointer(&correctionNull))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg_std_mean_correction(ptr, self, cdimDataPtr, cdimLen, ccorrectionVal, ccorrectionNull, ckeepdim)
C.atg_std_mean_correction(ptr, self, cdimDataPtr, cdimLen, correction , ckeepdim)
}
func AtgStdMeanCorrectionOut(ptr *Ctensor, out0 Ctensor, out1 Ctensor, self Ctensor, dimData []int64, dimLen int, correctionVal int64, correctionNull int, keepdim int32){
func AtgStdMeanCorrectionOut(ptr *Ctensor, out0 Ctensor, out1 Ctensor, self Ctensor, dimData []int64, dimLen int, correction Cscalar, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ccorrectionVal := *(*C.int64_t)(unsafe.Pointer(&correctionVal))
ccorrectionNull := *(*C.uint8_t)(unsafe.Pointer(&correctionNull))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg_std_mean_correction_out(ptr, out0, out1, self, cdimDataPtr, cdimLen, ccorrectionVal, ccorrectionNull, ckeepdim)
C.atg_std_mean_correction_out(ptr, out0, out1, self, cdimDataPtr, cdimLen, correction , ckeepdim)
}
func AtgStdMeanDim(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, unbiased int32, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
@ -11081,6 +11278,8 @@ cdim0 := *(*C.int64_t)(unsafe.Pointer(&dim0))
cdim1 := *(*C.int64_t)(unsafe.Pointer(&dim1))
C.atg_swapdims_(ptr, self, cdim0, cdim1)
}
func AtgT(ptr *Ctensor, self Ctensor){
C.atg_t(ptr, self)
}
@ -11174,12 +11373,14 @@ func AtgTo(ptr *Ctensor, self Ctensor, device int32){
cdevice := *(*C.int)(unsafe.Pointer(&device))
C.atg_to(ptr, self, cdevice)
}
func AtgToDense(ptr *Ctensor, self Ctensor, dtype int32){
cdtype := *(*C.int)(unsafe.Pointer(&dtype))
C.atg_to_dense(ptr, self, cdtype)
func AtgToDense(ptr *Ctensor, self Ctensor, dtype int32, maskedGrad int32){
cdtype := *(*C.int)(unsafe.Pointer(&dtype))
cmaskedGrad := *(*C.int)(unsafe.Pointer(&maskedGrad))
C.atg_to_dense(ptr, self, cdtype, cmaskedGrad)
}
func AtgToDenseBackward(ptr *Ctensor, grad Ctensor, input Ctensor){
C.atg_to_dense_backward(ptr, grad, input)
func AtgToDenseBackward(ptr *Ctensor, grad Ctensor, input Ctensor, maskedGrad int32){
cmaskedGrad := *(*C.int)(unsafe.Pointer(&maskedGrad))
C.atg_to_dense_backward(ptr, grad, input, cmaskedGrad)
}
func AtgToDevice(ptr *Ctensor, self Ctensor, device int32, dtype int32, nonBlocking int32, copy int32){
cdevice := *(*C.int)(unsafe.Pointer(&device))
@ -11244,13 +11445,6 @@ cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_bsc(ptr, self, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseBscOut(ptr *Ctensor, out Ctensor, self Ctensor, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_bsc_out(ptr, out, self, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseBsr(ptr *Ctensor, self Ctensor, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
@ -11258,49 +11452,20 @@ cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_bsr(ptr, self, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseBsrOut(ptr *Ctensor, out Ctensor, self Ctensor, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_bsr_out(ptr, out, self, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseCsc(ptr *Ctensor, self Ctensor, denseDimVal int64, denseDimNull int){
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_csc(ptr, self, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseCscOut(ptr *Ctensor, out Ctensor, self Ctensor, denseDimVal int64, denseDimNull int){
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_csc_out(ptr, out, self, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseCsr(ptr *Ctensor, self Ctensor, denseDimVal int64, denseDimNull int){
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_csr(ptr, self, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseCsrOut(ptr *Ctensor, out Ctensor, self Ctensor, denseDimVal int64, denseDimNull int){
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_csr_out(ptr, out, self, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseOut(ptr *Ctensor, out Ctensor, self Ctensor, layout int8, blocksizeData []int64, blocksizeLen int, denseDimVal int64, denseDimNull int){
clayout := *(*C.int8_t)(unsafe.Pointer(&layout))
cblocksizeDataPtr := (*C.int64_t)(unsafe.Pointer(&blocksizeData[0]))
cblocksizeLen := *(*C.int)(unsafe.Pointer(&blocksizeLen))
cdenseDimVal := *(*C.int64_t)(unsafe.Pointer(&denseDimVal))
cdenseDimNull := *(*C.uint8_t)(unsafe.Pointer(&denseDimNull))
C.atg_to_sparse_out(ptr, out, self, clayout, cblocksizeDataPtr, cblocksizeLen, cdenseDimVal, cdenseDimNull)
}
func AtgToSparseSparseDim(ptr *Ctensor, self Ctensor, sparseDim int64){
csparseDim := *(*C.int64_t)(unsafe.Pointer(&sparseDim))
C.atg_to_sparse_sparse_dim(ptr, self, csparseDim)
}
func AtgToSparseSparseDimOut(ptr *Ctensor, out Ctensor, self Ctensor, sparseDim int64){
csparseDim := *(*C.int64_t)(unsafe.Pointer(&sparseDim))
C.atg_to_sparse_sparse_dim_out(ptr, out, self, csparseDim)
}
func AtgTopk(ptr *Ctensor, self Ctensor, k int64, dim int64, largest int32, sorted int32){
ck := *(*C.int64_t)(unsafe.Pointer(&k))
cdim := *(*C.int64_t)(unsafe.Pointer(&dim))
@ -11958,21 +12123,17 @@ func AtgVar(ptr *Ctensor, self Ctensor, unbiased int32){
cunbiased := *(*C.int)(unsafe.Pointer(&unbiased))
C.atg_var(ptr, self, cunbiased)
}
func AtgVarCorrection(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, correctionVal int64, correctionNull int, keepdim int32){
func AtgVarCorrection(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, correction Cscalar, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ccorrectionVal := *(*C.int64_t)(unsafe.Pointer(&correctionVal))
ccorrectionNull := *(*C.uint8_t)(unsafe.Pointer(&correctionNull))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg_var_correction(ptr, self, cdimDataPtr, cdimLen, ccorrectionVal, ccorrectionNull, ckeepdim)
C.atg_var_correction(ptr, self, cdimDataPtr, cdimLen, correction, ckeepdim)
}
func AtgVarCorrectionOut(ptr *Ctensor, out Ctensor, self Ctensor, dimData []int64, dimLen int, correctionVal int64, correctionNull int, keepdim int32){
func AtgVarCorrectionOut(ptr *Ctensor, out Ctensor, self Ctensor, dimData []int64, dimLen int, correction Cscalar, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ccorrectionVal := *(*C.int64_t)(unsafe.Pointer(&correctionVal))
ccorrectionNull := *(*C.uint8_t)(unsafe.Pointer(&correctionNull))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg_var_correction_out(ptr, out, self, cdimDataPtr, cdimLen, ccorrectionVal, ccorrectionNull, ckeepdim)
C.atg_var_correction_out(ptr, out, self, cdimDataPtr, cdimLen, correction, ckeepdim)
}
func AtgVarDim(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, unbiased int32, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
@ -11985,21 +12146,17 @@ func AtgVarMean(ptr *Ctensor, self Ctensor, unbiased int32){
cunbiased := *(*C.int)(unsafe.Pointer(&unbiased))
C.atg_var_mean(ptr, self, cunbiased)
}
func AtgVarMeanCorrection(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, correctionVal int64, correctionNull int, keepdim int32){
func AtgVarMeanCorrection(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, correction Cscalar, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ccorrectionVal := *(*C.int64_t)(unsafe.Pointer(&correctionVal))
ccorrectionNull := *(*C.uint8_t)(unsafe.Pointer(&correctionNull))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg_var_mean_correction(ptr, self, cdimDataPtr, cdimLen, ccorrectionVal, ccorrectionNull, ckeepdim)
C.atg_var_mean_correction(ptr, self, cdimDataPtr, cdimLen, correction, ckeepdim)
}
func AtgVarMeanCorrectionOut(ptr *Ctensor, out0 Ctensor, out1 Ctensor, self Ctensor, dimData []int64, dimLen int, correctionVal int64, correctionNull int, keepdim int32){
func AtgVarMeanCorrectionOut(ptr *Ctensor, out0 Ctensor, out1 Ctensor, self Ctensor, dimData []int64, dimLen int, correction Cscalar, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))
cdimLen := *(*C.int)(unsafe.Pointer(&dimLen))
ccorrectionVal := *(*C.int64_t)(unsafe.Pointer(&correctionVal))
ccorrectionNull := *(*C.uint8_t)(unsafe.Pointer(&correctionNull))
ckeepdim := *(*C.int)(unsafe.Pointer(&keepdim))
C.atg_var_mean_correction_out(ptr, out0, out1, self, cdimDataPtr, cdimLen, ccorrectionVal, ccorrectionNull, ckeepdim)
C.atg_var_mean_correction_out(ptr, out0, out1, self, cdimDataPtr, cdimLen, correction, ckeepdim)
}
func AtgVarMeanDim(ptr *Ctensor, self Ctensor, dimData []int64, dimLen int, unbiased int32, keepdim int32){
cdimDataPtr := (*C.int64_t)(unsafe.Pointer(&dimData[0]))

View File

@ -5,6 +5,6 @@ package libtch
// #cgo CFLAGS: -I${SRCDIR} -O3 -Wall -Wno-unused-variable -Wno-deprecated-declarations -Wno-c++11-narrowing -g -Wno-sign-compare -Wno-unused-function
// #cgo CFLAGS: -D_GLIBCXX_USE_CXX11_ABI=0
// #cgo CFLAGS: -I/usr/local/cuda/include
// #cgo CXXFLAGS: -std=c++14 -I${SRCDIR} -g -O3
// #cgo CXXFLAGS: -std=c++17 -I${SRCDIR} -g -O3
// #cgo CXXFLAGS: -I${SRCDIR}/libtorch/lib -I${SRCDIR}/libtorch/include -I${SRCDIR}/libtorch/include/torch/csrc/api/include
import "C"

View File

@ -439,13 +439,6 @@ void atg__cholesky_solve_helper_out(tensor *out__, tensor out, tensor self, tens
)
}
int atg__chunk_grad_outputs_efficient_attention(tensor query, tensor key, tensor value, int is_causal) {
PROTECT(
return torch::_chunk_grad_outputs_efficient_attention(*query, *key, *value, (bool)is_causal);
)
return 0;
}
void atg__coalesce(tensor *out__, tensor self) {
PROTECT(
auto outputs__ = torch::_coalesce(*self);
@ -628,6 +621,20 @@ void atg__copy_from_out(tensor *out__, tensor out, tensor self, tensor dst, int
)
}
void atg__cslt_compress(tensor *out__, tensor input) {
PROTECT(
auto outputs__ = torch::_cslt_compress(*input);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__cslt_sparse_mm(tensor *out__, tensor compressed_A, tensor dense_B, tensor bias, int transpose_result) {
PROTECT(
auto outputs__ = torch::_cslt_sparse_mm(*compressed_A, *dense_B, (bias ? *bias : torch::Tensor()), (bool)transpose_result);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__ctc_loss(tensor *out__, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int zero_infinity) {
PROTECT(
auto outputs__ = torch::_ctc_loss(*log_probs, *targets, torch::IntArrayRef(input_lengths_data, input_lengths_len), torch::IntArrayRef(target_lengths_data, target_lengths_len), blank, (bool)zero_infinity);
@ -755,20 +762,6 @@ void atg__cudnn_rnn_out(tensor *out__, tensor out0, tensor out1, tensor out2, te
)
}
int64_t atg__cufft_get_plan_cache_max_size(int64_t device_index) {
PROTECT(
return torch::_cufft_get_plan_cache_max_size(device_index);
)
return 0;
}
int64_t atg__cufft_get_plan_cache_size(int64_t device_index) {
PROTECT(
return torch::_cufft_get_plan_cache_size(device_index);
)
return 0;
}
int64_t atg__debug_has_internal_overlap(tensor self) {
PROTECT(
return torch::_debug_has_internal_overlap(*self);
@ -811,12 +804,13 @@ void atg__dirichlet_grad_out(tensor *out__, tensor out, tensor x, tensor alpha,
)
}
void atg__efficient_attention_backward(tensor *out__, tensor grad_out_, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, int is_causal, int chunk_grad_outputs) {
void atg__efficient_attention_backward(tensor *out__, tensor grad_out_, tensor query, tensor key, tensor value, tensor bias, tensor out, tensor cu_seqlens_q, tensor cu_seqlens_k, int64_t max_seqlen_k, int64_t max_seqlen_q, tensor logsumexp, double dropout_p, tensor philox_seed, tensor philox_offset, int64_t custom_mask_type, int bias_requires_grad, double scale_v, uint8_t scale_null, int64_t num_splits_key_v, uint8_t num_splits_key_null) {
PROTECT(
auto outputs__ = torch::_efficient_attention_backward(*grad_out_, *query, *key, *value, *out, *logsumexp, (bool)is_causal, (bool)chunk_grad_outputs);
auto outputs__ = torch::_efficient_attention_backward(*grad_out_, *query, *key, *value, (bias ? *bias : torch::Tensor()), *out, (cu_seqlens_q ? *cu_seqlens_q : torch::Tensor()), (cu_seqlens_k ? *cu_seqlens_k : torch::Tensor()), max_seqlen_k, max_seqlen_q, *logsumexp, dropout_p, *philox_seed, *philox_offset, custom_mask_type, (bool)bias_requires_grad, scale_null ? c10::nullopt : c10::optional<double>(scale_v), num_splits_key_null ? c10::nullopt : c10::optional<int64_t>(num_splits_key_v));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
out__[3] = new torch::Tensor(std::get<3>(outputs__));
)
}
@ -1062,9 +1056,16 @@ void atg__fft_r2c_out(tensor *out__, tensor out, tensor self, int64_t *dim_data,
)
}
void atg__flash_attention_backward(tensor *out__, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, int64_t philox_seed, int64_t philox_offset) {
void atg__fill_mem_eff_dropout_mask_(tensor *out__, tensor self, double dropout_p, int64_t seed, int64_t offset) {
PROTECT(
auto outputs__ = torch::_flash_attention_backward(*grad_out, *query, *key, *value, *out, *logsumexp, *cum_seq_q, *cum_seq_k, max_q, max_k, dropout_p, (bool)is_causal, philox_seed, philox_offset);
auto outputs__ = torch::_fill_mem_eff_dropout_mask_(*self, dropout_p, seed, offset);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__flash_attention_backward(tensor *out__, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, tensor philox_seed, tensor philox_offset, double scale_v, uint8_t scale_null) {
PROTECT(
auto outputs__ = torch::_flash_attention_backward(*grad_out, *query, *key, *value, *out, *logsumexp, *cum_seq_q, *cum_seq_k, max_q, max_k, dropout_p, (bool)is_causal, *philox_seed, *philox_offset, scale_null ? c10::nullopt : c10::optional<double>(scale_v));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
@ -1085,6 +1086,27 @@ void atg__foobar_out(tensor *out__, tensor out, tensor self, int arg1, int arg2,
)
}
void atg__functional_assert_async(tensor *out__, tensor self, char* assert_msg_ptr, int assert_msg_len, tensor dep_token) {
PROTECT(
auto outputs__ = torch::_functional_assert_async(*self, std::string(assert_msg_ptr, assert_msg_len), *dep_token);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__functional_sym_constrain_range(tensor *out__, scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null, tensor dep_token) {
PROTECT(
auto outputs__ = torch::_functional_sym_constrain_range(*size, min_null ? c10::nullopt : c10::optional<int64_t>(min_v), max_null ? c10::nullopt : c10::optional<int64_t>(max_v), *dep_token);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__functional_sym_constrain_range_for_size(tensor *out__, scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null, tensor dep_token) {
PROTECT(
auto outputs__ = torch::_functional_sym_constrain_range_for_size(*size, min_null ? c10::nullopt : c10::optional<int64_t>(min_v), max_null ? c10::nullopt : c10::optional<int64_t>(max_v), *dep_token);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__fused_dropout(tensor *out__, tensor self, double p) {
PROTECT(
auto outputs__ = torch::_fused_dropout(*self, p);
@ -1129,9 +1151,9 @@ void atg__fused_moving_avg_obs_fq_helper_out(tensor *out__, tensor out0, tensor
)
}
int64_t atg__fused_sdp_choice(tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal) {
int64_t atg__fused_sdp_choice(tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, double scale_v, uint8_t scale_null) {
PROTECT(
return torch::_fused_sdp_choice(*query, *key, *value, (attn_mask ? *attn_mask : torch::Tensor()), dropout_p, (bool)is_causal);
return torch::_fused_sdp_choice(*query, *key, *value, (attn_mask ? *attn_mask : torch::Tensor()), dropout_p, (bool)is_causal, scale_null ? c10::nullopt : c10::optional<double>(scale_v));
)
return 0;
}
@ -1263,6 +1285,20 @@ void atg__indices_copy_out(tensor *out__, tensor out, tensor self) {
)
}
void atg__int_mm(tensor *out__, tensor self, tensor mat2) {
PROTECT(
auto outputs__ = torch::_int_mm(*self, *mat2);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__int_mm_out(tensor *out__, tensor out, tensor self, tensor mat2) {
PROTECT(
auto outputs__ = torch::_int_mm_out(*out, *self, *mat2);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__is_all_true(tensor *out__, tensor self) {
PROTECT(
auto outputs__ = torch::_is_all_true(*self);
@ -1451,6 +1487,13 @@ void atg__lu_with_info(tensor *out__, tensor self, int pivot, int check_errors)
)
}
void atg__make_dep_token(tensor *out__, int options_kind, int options_device) {
PROTECT(
auto outputs__ = torch::_make_dep_token(at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__make_dual(tensor *out__, tensor primal, tensor tangent, int64_t level) {
PROTECT(
auto outputs__ = torch::_make_dual(*primal, *tangent, level);
@ -1643,6 +1686,24 @@ void atg__native_batch_norm_legit_no_stats_out(tensor *out__, tensor out, tensor
)
}
void atg__native_batch_norm_legit_no_training(tensor *out__, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, double momentum, double eps) {
PROTECT(
auto outputs__ = torch::_native_batch_norm_legit_no_training(*input, (weight ? *weight : torch::Tensor()), (bias ? *bias : torch::Tensor()), *running_mean, *running_var, momentum, eps);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
)
}
void atg__native_batch_norm_legit_no_training_out(tensor *out__, tensor out0, tensor out1, tensor out2, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, double momentum, double eps) {
PROTECT(
auto outputs__ = torch::_native_batch_norm_legit_no_training_out(*out0, *out1, *out2, *input, (weight ? *weight : torch::Tensor()), (bias ? *bias : torch::Tensor()), *running_mean, *running_var, momentum, eps);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
)
}
void atg__native_batch_norm_legit_out(tensor *out__, tensor out, tensor save_mean, tensor save_invstd, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double momentum, double eps) {
PROTECT(
auto outputs__ = torch::_native_batch_norm_legit_out(*out, *save_mean, *save_invstd, *input, (weight ? *weight : torch::Tensor()), (bias ? *bias : torch::Tensor()), *running_mean, *running_var, (bool)training, momentum, eps);
@ -1652,26 +1713,6 @@ void atg__native_batch_norm_legit_out(tensor *out__, tensor out, tensor save_mea
)
}
void atg__native_decoder_only_multi_head_attention(tensor *out__, tensor query, tensor key, tensor value, int64_t embed_dim, int64_t num_head, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, tensor mask, tensor incr_key, tensor incr_value, int need_weights, int average_attn_weights) {
PROTECT(
auto outputs__ = torch::_native_decoder_only_multi_head_attention(*query, *key, *value, embed_dim, num_head, *qkv_weight, *qkv_bias, *proj_weight, *proj_bias, (mask ? *mask : torch::Tensor()), (incr_key ? *incr_key : torch::Tensor()), (incr_value ? *incr_value : torch::Tensor()), (bool)need_weights, (bool)average_attn_weights);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
out__[3] = new torch::Tensor(std::get<3>(outputs__));
)
}
void atg__native_decoder_only_multi_head_attention_out(tensor *out__, tensor out0, tensor out1, tensor out2, tensor out3, tensor query, tensor key, tensor value, int64_t embed_dim, int64_t num_head, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, tensor mask, tensor incr_key, tensor incr_value, int need_weights, int average_attn_weights) {
PROTECT(
auto outputs__ = torch::_native_decoder_only_multi_head_attention_out(*out0, *out1, *out2, *out3, *query, *key, *value, embed_dim, num_head, *qkv_weight, *qkv_bias, *proj_weight, *proj_bias, (mask ? *mask : torch::Tensor()), (incr_key ? *incr_key : torch::Tensor()), (incr_value ? *incr_value : torch::Tensor()), (bool)need_weights, (bool)average_attn_weights);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
out__[3] = new torch::Tensor(std::get<3>(outputs__));
)
}
void atg__native_multi_head_attention(tensor *out__, tensor query, tensor key, tensor value, int64_t embed_dim, int64_t num_head, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, tensor mask, int need_weights, int average_attn_weights, int64_t mask_type_v, uint8_t mask_type_null) {
PROTECT(
auto outputs__ = torch::_native_multi_head_attention(*query, *key, *value, embed_dim, num_head, *qkv_weight, *qkv_bias, *proj_weight, *proj_bias, (mask ? *mask : torch::Tensor()), (bool)need_weights, (bool)average_attn_weights, mask_type_null ? c10::nullopt : c10::optional<int64_t>(mask_type_v));
@ -1751,23 +1792,23 @@ void atg__nested_sum_backward(tensor *out__, tensor grad, tensor self, int64_t *
)
}
void atg__nested_view_from_buffer(tensor *out__, tensor self, tensor nested_size, tensor nested_strides, int64_t *offsets_data, int offsets_len) {
void atg__nested_view_from_buffer(tensor *out__, tensor self, tensor nested_size, tensor nested_strides, tensor offsets) {
PROTECT(
auto outputs__ = torch::_nested_view_from_buffer(*self, *nested_size, *nested_strides, torch::IntArrayRef(offsets_data, offsets_len));
auto outputs__ = torch::_nested_view_from_buffer(*self, *nested_size, *nested_strides, *offsets);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__nested_view_from_buffer_copy(tensor *out__, tensor self, tensor nested_size, tensor nested_strides, int64_t *offsets_data, int offsets_len) {
void atg__nested_view_from_buffer_copy(tensor *out__, tensor self, tensor nested_size, tensor nested_strides, tensor offsets) {
PROTECT(
auto outputs__ = torch::_nested_view_from_buffer_copy(*self, *nested_size, *nested_strides, torch::IntArrayRef(offsets_data, offsets_len));
auto outputs__ = torch::_nested_view_from_buffer_copy(*self, *nested_size, *nested_strides, *offsets);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__nested_view_from_buffer_copy_out(tensor *out__, tensor out, tensor self, tensor nested_size, tensor nested_strides, int64_t *offsets_data, int offsets_len) {
void atg__nested_view_from_buffer_copy_out(tensor *out__, tensor out, tensor self, tensor nested_size, tensor nested_strides, tensor offsets) {
PROTECT(
auto outputs__ = torch::_nested_view_from_buffer_copy_out(*out, *self, *nested_size, *nested_strides, torch::IntArrayRef(offsets_data, offsets_len));
auto outputs__ = torch::_nested_view_from_buffer_copy_out(*out, *self, *nested_size, *nested_strides, *offsets);
out__[0] = new torch::Tensor(outputs__);
)
}
@ -1902,6 +1943,12 @@ void atg__prelu_kernel_backward(tensor *out__, tensor grad_output, tensor self,
)
}
void atg__propagate_xla_data(tensor input, tensor output) {
PROTECT(
torch::_propagate_xla_data(*input, *output);
)
}
void atg__remove_batch_dim(tensor *out__, tensor self, int64_t level, int64_t batch_size, int64_t out_dim) {
PROTECT(
auto outputs__ = torch::_remove_batch_dim(*self, level, batch_size, out_dim);
@ -1994,45 +2041,67 @@ void atg__saturate_weight_to_fp16(tensor *out__, tensor weight) {
)
}
void atg__scaled_dot_product_attention(tensor *out__, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int need_attn_weights, int is_causal) {
void atg__scaled_dot_product_attention_math(tensor *out__, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, tensor dropout_mask, double scale_v, uint8_t scale_null) {
PROTECT(
auto outputs__ = torch::_scaled_dot_product_attention(*query, *key, *value, (attn_mask ? *attn_mask : torch::Tensor()), dropout_p, (bool)need_attn_weights, (bool)is_causal);
auto outputs__ = torch::_scaled_dot_product_attention_math(*query, *key, *value, (attn_mask ? *attn_mask : torch::Tensor()), dropout_p, (bool)is_causal, (dropout_mask ? *dropout_mask : torch::Tensor()), scale_null ? c10::nullopt : c10::optional<double>(scale_v));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
)
}
void atg__scaled_dot_product_attention_math(tensor *out__, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, tensor dropout_mask) {
void atg__scaled_dot_product_efficient_attention(tensor *out__, tensor query, tensor key, tensor value, tensor attn_bias, int compute_log_sumexp, double dropout_p, int is_causal, double scale_v, uint8_t scale_null) {
PROTECT(
auto outputs__ = torch::_scaled_dot_product_attention_math(*query, *key, *value, (attn_mask ? *attn_mask : torch::Tensor()), dropout_p, (bool)is_causal, (dropout_mask ? *dropout_mask : torch::Tensor()));
auto outputs__ = torch::_scaled_dot_product_efficient_attention(*query, *key, *value, (attn_bias ? *attn_bias : torch::Tensor()), (bool)compute_log_sumexp, dropout_p, (bool)is_causal, scale_null ? c10::nullopt : c10::optional<double>(scale_v));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
out__[3] = new torch::Tensor(std::get<3>(outputs__));
)
}
void atg__scaled_dot_product_efficient_attention(tensor *out__, tensor query, tensor key, tensor value, int compute_log_sumexp, int is_causal) {
void atg__scaled_dot_product_flash_attention_backward(tensor *out__, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, tensor philox_seed, tensor philox_offset, double scale_v, uint8_t scale_null) {
PROTECT(
auto outputs__ = torch::_scaled_dot_product_efficient_attention(*query, *key, *value, (bool)compute_log_sumexp, (bool)is_causal);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
)
}
void atg__scaled_dot_product_efficient_attention_backward(tensor *out__, tensor grad_out_, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, int is_causal, int chunk_grad_outputs) {
PROTECT(
auto outputs__ = torch::_scaled_dot_product_efficient_attention_backward(*grad_out_, *query, *key, *value, *out, *logsumexp, (bool)is_causal, (bool)chunk_grad_outputs);
auto outputs__ = torch::_scaled_dot_product_flash_attention_backward(*grad_out, *query, *key, *value, *out, *logsumexp, *cum_seq_q, *cum_seq_k, max_q, max_k, dropout_p, (bool)is_causal, *philox_seed, *philox_offset, scale_null ? c10::nullopt : c10::optional<double>(scale_v));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
)
}
void atg__scaled_dot_product_flash_attention_backward(tensor *out__, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, int64_t philox_seed, int64_t philox_offset) {
void atg__scaled_mm(tensor *out__, tensor self, tensor mat2, tensor bias, int out_dtype, tensor scale_a, tensor scale_b, tensor scale_result) {
PROTECT(
auto outputs__ = torch::_scaled_dot_product_flash_attention_backward(*grad_out, *query, *key, *value, *out, *logsumexp, *cum_seq_q, *cum_seq_k, max_q, max_k, dropout_p, (bool)is_causal, philox_seed, philox_offset);
auto outputs__ = torch::_scaled_mm(*self, *mat2, (bias ? *bias : torch::Tensor()), out_dtype < 0 ? c10::nullopt : c10::optional<at::ScalarType>(at::ScalarType(out_dtype)), (scale_a ? *scale_a : torch::Tensor()), (scale_b ? *scale_b : torch::Tensor()), (scale_result ? *scale_result : torch::Tensor()));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
)
}
void atg__scaled_mm_out(tensor *out__, tensor out, tensor out_amax, tensor self, tensor mat2, tensor bias, int out_dtype, tensor scale_a, tensor scale_b, tensor scale_result) {
PROTECT(
auto outputs__ = torch::_scaled_mm_out(*out, *out_amax, *self, *mat2, (bias ? *bias : torch::Tensor()), out_dtype < 0 ? c10::nullopt : c10::optional<at::ScalarType>(at::ScalarType(out_dtype)), (scale_a ? *scale_a : torch::Tensor()), (scale_b ? *scale_b : torch::Tensor()), (scale_result ? *scale_result : torch::Tensor()));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
)
}
void atg__scatter_reduce(tensor *out__, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len, int include_self) {
PROTECT(
auto outputs__ = torch::scatter_reduce(*self, dim, *index, *src, std::string(reduce_ptr, reduce_len), (bool)include_self);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__scatter_reduce_(tensor *out__, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len, int include_self) {
PROTECT(
auto outputs__ = self->scatter_reduce_(dim, *index, *src, std::string(reduce_ptr, reduce_len), (bool)include_self);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__scatter_reduce_two_out(tensor *out__, tensor out, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len, int include_self) {
PROTECT(
auto outputs__ = torch::scatter_reduce_out(*out, *self, dim, *index, *src, std::string(reduce_ptr, reduce_len), (bool)include_self);
out__[0] = new torch::Tensor(outputs__);
)
}
@ -2179,9 +2248,9 @@ void atg__sparse_compressed_tensor_unsafe(tensor *out__, tensor compressed_indic
)
}
void atg__sparse_coo_tensor_unsafe(tensor *out__, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device) {
void atg__sparse_coo_tensor_unsafe(tensor *out__, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device, int is_coalesced) {
PROTECT(
auto outputs__ = torch::_sparse_coo_tensor_unsafe(*indices, *values, torch::IntArrayRef(size_data, size_len), at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)));
auto outputs__ = torch::_sparse_coo_tensor_unsafe(*indices, *values, torch::IntArrayRef(size_data, size_len), at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)), (bool)is_coalesced);
out__[0] = new torch::Tensor(outputs__);
)
}
@ -2193,16 +2262,16 @@ void atg__sparse_coo_tensor_with_dims(tensor *out__, int64_t sparse_dim, int64_t
)
}
void atg__sparse_coo_tensor_with_dims_and_tensors(tensor *out__, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int options_kind, int options_device) {
void atg__sparse_coo_tensor_with_dims_and_tensors(tensor *out__, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int options_kind, int options_device, int is_coalesced) {
PROTECT(
auto outputs__ = torch::_sparse_coo_tensor_with_dims_and_tensors(sparse_dim, dense_dim, torch::IntArrayRef(size_data, size_len), *indices, *values, at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)));
auto outputs__ = torch::_sparse_coo_tensor_with_dims_and_tensors(sparse_dim, dense_dim, torch::IntArrayRef(size_data, size_len), *indices, *values, at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)), (bool)is_coalesced);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__sparse_coo_tensor_with_dims_and_tensors_out(tensor *out__, tensor out, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values) {
void atg__sparse_coo_tensor_with_dims_and_tensors_out(tensor *out__, tensor out, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int is_coalesced) {
PROTECT(
auto outputs__ = torch::_sparse_coo_tensor_with_dims_and_tensors_out(*out, sparse_dim, dense_dim, torch::IntArrayRef(size_data, size_len), *indices, *values);
auto outputs__ = torch::_sparse_coo_tensor_with_dims_and_tensors_out(*out, sparse_dim, dense_dim, torch::IntArrayRef(size_data, size_len), *indices, *values, (bool)is_coalesced);
out__[0] = new torch::Tensor(outputs__);
)
}
@ -2291,6 +2360,20 @@ void atg__sparse_log_softmax_out(tensor *out__, tensor out, tensor self, int64_t
)
}
void atg__sparse_mask_projection(tensor *out__, tensor self, tensor mask, int accumulate_matches) {
PROTECT(
auto outputs__ = self->_sparse_mask_projection(*mask, (bool)accumulate_matches);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__sparse_mask_projection_out(tensor *out__, tensor out, tensor self, tensor mask, int accumulate_matches) {
PROTECT(
auto outputs__ = torch::_sparse_mask_projection_out(*out, *self, *mask, (bool)accumulate_matches);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__sparse_mm(tensor *out__, tensor sparse, tensor dense) {
PROTECT(
auto outputs__ = torch::_sparse_mm(*sparse, *dense);
@ -2313,6 +2396,13 @@ void atg__sparse_mm_reduce_impl(tensor *out__, tensor self, tensor other, char*
)
}
void atg__sparse_semi_structured_linear(tensor *out__, tensor input, tensor weight, tensor meta, tensor bias, char* activation_ptr, int activation_len) {
PROTECT(
auto outputs__ = torch::_sparse_semi_structured_linear(*input, *weight, *meta, (bias ? *bias : torch::Tensor()), std::string(activation_ptr, activation_len));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__sparse_softmax(tensor *out__, tensor self, int64_t dim, int half_to_float) {
PROTECT(
auto outputs__ = torch::_sparse_softmax(*self, dim, (bool)half_to_float);
@ -2530,6 +2620,20 @@ void atg__test_check_tensor(tensor *out__, tensor self) {
)
}
void atg__test_functorch_fallback(tensor *out__, tensor self, tensor other) {
PROTECT(
auto outputs__ = torch::_test_functorch_fallback(*self, *other);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__test_functorch_fallback_out(tensor *out__, tensor out, tensor self, tensor other) {
PROTECT(
auto outputs__ = torch::_test_functorch_fallback_out(*out, *self, *other);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__test_optional_filled_intlist(tensor *out__, tensor values, int64_t *addends_data, int addends_len) {
PROTECT(
auto outputs__ = torch::_test_optional_filled_intlist(*values, addends_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(addends_data, addends_len)));
@ -2627,16 +2731,108 @@ tensor *atg__to_cpu(tensor *tensors_data, int tensors_len) {
return nullptr;
}
void atg__to_dense(tensor *out__, tensor self, int dtype) {
void atg__to_dense(tensor *out__, tensor self, int dtype, int masked_grad) {
PROTECT(
auto outputs__ = self->_to_dense(dtype < 0 ? c10::nullopt : c10::optional<at::ScalarType>(at::ScalarType(dtype)));
auto outputs__ = self->_to_dense(dtype < 0 ? c10::nullopt : c10::optional<at::ScalarType>(at::ScalarType(dtype)), (bool)masked_grad);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_dense_out(tensor *out__, tensor out, tensor self, int dtype) {
void atg__to_dense_out(tensor *out__, tensor out, tensor self, int dtype, int masked_grad) {
PROTECT(
auto outputs__ = torch::_to_dense_out(*out, *self, dtype < 0 ? c10::nullopt : c10::optional<at::ScalarType>(at::ScalarType(dtype)));
auto outputs__ = torch::_to_dense_out(*out, *self, dtype < 0 ? c10::nullopt : c10::optional<at::ScalarType>(at::ScalarType(dtype)), (bool)masked_grad);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse(tensor *out__, tensor self, int8_t layout, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = self->_to_sparse((layout == -1 ? c10::nullopt : c10::optional<at::Layout>(static_cast<at::Layout>(layout))), blocksize_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(blocksize_data, blocksize_len)), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_bsc(tensor *out__, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = self->_to_sparse_bsc(torch::IntArrayRef(blocksize_data, blocksize_len), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_bsc_out(tensor *out__, tensor out, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::_to_sparse_bsc_out(*out, *self, torch::IntArrayRef(blocksize_data, blocksize_len), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_bsr(tensor *out__, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = self->_to_sparse_bsr(torch::IntArrayRef(blocksize_data, blocksize_len), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_bsr_out(tensor *out__, tensor out, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::_to_sparse_bsr_out(*out, *self, torch::IntArrayRef(blocksize_data, blocksize_len), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_csc(tensor *out__, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = self->_to_sparse_csc(dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_csc_out(tensor *out__, tensor out, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::_to_sparse_csc_out(*out, *self, dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_csr(tensor *out__, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = self->_to_sparse_csr(dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_csr_out(tensor *out__, tensor out, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::_to_sparse_csr_out(*out, *self, dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_out(tensor *out__, tensor out, tensor self, int8_t layout, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::_to_sparse_out(*out, *self, (layout == -1 ? c10::nullopt : c10::optional<at::Layout>(static_cast<at::Layout>(layout))), blocksize_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(blocksize_data, blocksize_len)), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_semi_structured(tensor *out__, tensor dense) {
PROTECT(
auto outputs__ = torch::_to_sparse_semi_structured(*dense);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
)
}
void atg__to_sparse_sparse_dim(tensor *out__, tensor self, int64_t sparse_dim) {
PROTECT(
auto outputs__ = self->_to_sparse(sparse_dim);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__to_sparse_sparse_dim_out(tensor *out__, tensor out, tensor self, int64_t sparse_dim) {
PROTECT(
auto outputs__ = torch::_to_sparse_out(*out, *self, sparse_dim);
out__[0] = new torch::Tensor(outputs__);
)
}
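
The `_to_sparse*` shims added above are internal entry points; the same conversions are reachable through the public `Tensor::to_sparse*` methods that later hunks bind. A minimal C++ sketch, assuming libtorch 2.1 headers and an arbitrary dense input:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto dense = torch::randn({4, 4});
  auto coo = dense.to_sparse();            // COO layout
  auto csr = dense.to_sparse_csr();        // compressed sparse row
  auto bsr = dense.to_sparse_bsr({2, 2});  // block-sparse rows with 2x2 blocks
  std::cout << coo.layout() << " " << csr.layout() << " " << bsr.layout() << "\n";
}
```
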
@ -2659,24 +2855,6 @@ void atg__transform_bias_rescale_qkv_out(tensor *out__, tensor out0, tensor out1
)
}
void atg__transformer_decoder_only_layer_fwd(tensor *out__, tensor src, int64_t embed_dim, int64_t num_heads, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, int use_gelu, int norm_first, double eps, tensor norm_weight_1, tensor norm_bias_1, tensor norm_weight_2, tensor norm_bias_2, tensor ffn_weight_1, tensor ffn_bias_1, tensor ffn_weight_2, tensor ffn_bias_2, tensor mask, tensor incr_key, tensor incr_value) {
PROTECT(
auto outputs__ = torch::_transformer_decoder_only_layer_fwd(*src, embed_dim, num_heads, *qkv_weight, *qkv_bias, *proj_weight, *proj_bias, (bool)use_gelu, (bool)norm_first, eps, *norm_weight_1, *norm_bias_1, *norm_weight_2, *norm_bias_2, *ffn_weight_1, *ffn_bias_1, *ffn_weight_2, *ffn_bias_2, (mask ? *mask : torch::Tensor()), (incr_key ? *incr_key : torch::Tensor()), (incr_value ? *incr_value : torch::Tensor()));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
)
}
void atg__transformer_decoder_only_layer_fwd_out(tensor *out__, tensor out0, tensor out1, tensor out2, tensor src, int64_t embed_dim, int64_t num_heads, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, int use_gelu, int norm_first, double eps, tensor norm_weight_1, tensor norm_bias_1, tensor norm_weight_2, tensor norm_bias_2, tensor ffn_weight_1, tensor ffn_bias_1, tensor ffn_weight_2, tensor ffn_bias_2, tensor mask, tensor incr_key, tensor incr_value) {
PROTECT(
auto outputs__ = torch::_transformer_decoder_only_layer_fwd_out(*out0, *out1, *out2, *src, embed_dim, num_heads, *qkv_weight, *qkv_bias, *proj_weight, *proj_bias, (bool)use_gelu, (bool)norm_first, eps, *norm_weight_1, *norm_bias_1, *norm_weight_2, *norm_bias_2, *ffn_weight_1, *ffn_bias_1, *ffn_weight_2, *ffn_bias_2, (mask ? *mask : torch::Tensor()), (incr_key ? *incr_key : torch::Tensor()), (incr_value ? *incr_value : torch::Tensor()));
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
out__[2] = new torch::Tensor(std::get<2>(outputs__));
)
}
void atg__transformer_encoder_layer_fwd(tensor *out__, tensor src, int64_t embed_dim, int64_t num_heads, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, int use_gelu, int norm_first, double eps, tensor norm_weight_1, tensor norm_bias_1, tensor norm_weight_2, tensor norm_bias_2, tensor ffn_weight_1, tensor ffn_bias_1, tensor ffn_weight_2, tensor ffn_bias_2, tensor mask, int64_t mask_type_v, uint8_t mask_type_null) {
PROTECT(
auto outputs__ = torch::_transformer_encoder_layer_fwd(*src, embed_dim, num_heads, *qkv_weight, *qkv_bias, *proj_weight, *proj_bias, (bool)use_gelu, (bool)norm_first, eps, *norm_weight_1, *norm_bias_1, *norm_weight_2, *norm_bias_2, *ffn_weight_1, *ffn_bias_1, *ffn_weight_2, *ffn_bias_2, (mask ? *mask : torch::Tensor()), mask_type_null ? c10::nullopt : c10::optional<int64_t>(mask_type_v));
@ -2775,6 +2953,20 @@ void atg__unpack_dual(tensor *out__, tensor dual, int64_t level) {
)
}
void atg__unsafe_index(tensor *out__, tensor self, tensor *indices_data, int indices_len) {
PROTECT(
auto outputs__ = torch::_unsafe_index(*self, of_carray_tensor_opt(indices_data, indices_len));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__unsafe_index_put(tensor *out__, tensor self, tensor *indices_data, int indices_len, tensor values, int accumulate) {
PROTECT(
auto outputs__ = torch::_unsafe_index_put(*self, of_carray_tensor_opt(indices_data, indices_len), *values, (bool)accumulate);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg__unsafe_view(tensor *out__, tensor self, int64_t *size_data, int size_len) {
PROTECT(
auto outputs__ = torch::_unsafe_view(*self, torch::IntArrayRef(size_data, size_len));
@ -4176,16 +4368,16 @@ void atg_batch_norm(tensor *out__, tensor input, tensor weight, tensor bias, ten
)
}
void atg_batch_norm_backward_elemt(tensor *out__, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, tensor mean_dy, tensor mean_dy_xmu, tensor count) {
void atg_batch_norm_backward_elemt(tensor *out__, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, tensor sum_dy, tensor sum_dy_xmu, tensor count) {
PROTECT(
auto outputs__ = torch::batch_norm_backward_elemt(*grad_out, *input, *mean, *invstd, (weight ? *weight : torch::Tensor()), *mean_dy, *mean_dy_xmu, *count);
auto outputs__ = torch::batch_norm_backward_elemt(*grad_out, *input, *mean, *invstd, (weight ? *weight : torch::Tensor()), *sum_dy, *sum_dy_xmu, *count);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_batch_norm_backward_elemt_out(tensor *out__, tensor out, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, tensor mean_dy, tensor mean_dy_xmu, tensor count) {
void atg_batch_norm_backward_elemt_out(tensor *out__, tensor out, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, tensor sum_dy, tensor sum_dy_xmu, tensor count) {
PROTECT(
auto outputs__ = torch::batch_norm_backward_elemt_out(*out, *grad_out, *input, *mean, *invstd, (weight ? *weight : torch::Tensor()), *mean_dy, *mean_dy_xmu, *count);
auto outputs__ = torch::batch_norm_backward_elemt_out(*out, *grad_out, *input, *mean, *invstd, (weight ? *weight : torch::Tensor()), *sum_dy, *sum_dy_xmu, *count);
out__[0] = new torch::Tensor(outputs__);
)
}
@ -6569,6 +6761,20 @@ void atg_empty_out(tensor *out__, tensor out, int64_t *size_data, int size_len)
)
}
void atg_empty_permuted(tensor *out__, int64_t *size_data, int size_len, int64_t *physical_layout_data, int physical_layout_len, int options_kind, int options_device) {
PROTECT(
auto outputs__ = torch::empty_permuted(torch::IntArrayRef(size_data, size_len), torch::IntArrayRef(physical_layout_data, physical_layout_len), at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_empty_permuted_out(tensor *out__, tensor out, int64_t *size_data, int size_len, int64_t *physical_layout_data, int physical_layout_len) {
PROTECT(
auto outputs__ = torch::empty_permuted_out(*out, torch::IntArrayRef(size_data, size_len), torch::IntArrayRef(physical_layout_data, physical_layout_len));
out__[0] = new torch::Tensor(outputs__);
)
}
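
`empty_permuted` is newly bound here: it allocates an uninitialized tensor whose logical shape is `size` but whose memory is laid out contiguously in the order given by `physical_layout`. A hedged sketch of the direct libtorch call:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  // physical order {1, 2, 0}: dim 1 gets the largest stride, dim 0 is innermost
  auto t = torch::empty_permuted({2, 3, 4}, /*physical_layout=*/{1, 2, 0},
                                 torch::TensorOptions().dtype(torch::kFloat));
  std::cout << t.sizes() << " strides " << t.strides() << "\n";
}
```
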
void atg_empty_quantized(tensor *out__, int64_t *size_data, int size_len, tensor qtensor, int options_kind, int options_device) {
PROTECT(
auto outputs__ = torch::empty_quantized(torch::IntArrayRef(size_data, size_len), *qtensor, at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)));
@ -11072,6 +11278,13 @@ void atg_min_out(tensor *out__, tensor out, tensor self, tensor other) {
)
}
void atg_min_unary_out(tensor *out__, tensor out, tensor self) {
PROTECT(
auto outputs__ = torch::min_out(*out, *self);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_minimum(tensor *out__, tensor self, tensor other) {
PROTECT(
auto outputs__ = torch::minimum(*self, *other);
@ -12245,6 +12458,20 @@ void atg_nonzero_out(tensor *out__, tensor out, tensor self) {
)
}
void atg_nonzero_static(tensor *out__, tensor self, int64_t size, int64_t fill_value) {
PROTECT(
auto outputs__ = torch::nonzero_static(*self, size, fill_value);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_nonzero_static_out(tensor *out__, tensor out, tensor self, int64_t size, int64_t fill_value) {
PROTECT(
auto outputs__ = torch::nonzero_static_out(*out, *self, size, fill_value);
out__[0] = new torch::Tensor(outputs__);
)
}
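
`nonzero_static`, also new in this binding, behaves like `nonzero` but returns a caller-fixed number of rows, padding any unused rows with `fill_value`, so the output shape stays static (useful for export and compile paths). A small sketch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto t = torch::tensor({0, 7, 0, 5});
  // always `size` rows: the real nonzero indices, padded with fill_value
  auto idx = torch::nonzero_static(t, /*size=*/4, /*fill_value=*/-1);
  std::cout << idx << "\n";
}
```
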
void atg_norm(tensor *out__, tensor self) {
PROTECT(
auto outputs__ = torch::norm(*self);
@ -12954,6 +13181,20 @@ void atg_quantized_max_pool2d_out(tensor *out__, tensor out, tensor self, int64_
)
}
void atg_quantized_max_pool3d(tensor *out__, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode) {
PROTECT(
auto outputs__ = torch::quantized_max_pool3d(*self, torch::IntArrayRef(kernel_size_data, kernel_size_len), torch::IntArrayRef(stride_data, stride_len), torch::IntArrayRef(padding_data, padding_len), torch::IntArrayRef(dilation_data, dilation_len), (bool)ceil_mode);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_quantized_max_pool3d_out(tensor *out__, tensor out, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode) {
PROTECT(
auto outputs__ = torch::quantized_max_pool3d_out(*out, *self, torch::IntArrayRef(kernel_size_data, kernel_size_len), torch::IntArrayRef(stride_data, stride_len), torch::IntArrayRef(padding_data, padding_len), torch::IntArrayRef(dilation_data, dilation_len), (bool)ceil_mode);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_quantized_rnn_relu_cell(tensor *out__, tensor input, tensor hx, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh, tensor packed_ih, tensor packed_hh, tensor col_offsets_ih, tensor col_offsets_hh, scalar scale_ih, scalar scale_hh, scalar zero_point_ih, scalar zero_point_hh) {
PROTECT(
auto outputs__ = torch::quantized_rnn_relu_cell(*input, *hx, *w_ih, *w_hh, *b_ih, *b_hh, *packed_ih, *packed_hh, *col_offsets_ih, *col_offsets_hh, *scale_ih, *scale_hh, *zero_point_ih, *zero_point_hh);
@ -13931,9 +14172,9 @@ void atg_scalar_tensor_out(tensor *out__, tensor out, scalar s) {
)
}
void atg_scaled_dot_product_attention(tensor *out__, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal) {
void atg_scaled_dot_product_attention(tensor *out__, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, double scale_v, uint8_t scale_null) {
PROTECT(
auto outputs__ = torch::scaled_dot_product_attention(*query, *key, *value, (attn_mask ? *attn_mask : torch::Tensor()), dropout_p, (bool)is_causal);
auto outputs__ = torch::scaled_dot_product_attention(*query, *key, *value, (attn_mask ? *attn_mask : torch::Tensor()), dropout_p, (bool)is_causal, scale_null ? c10::nullopt : c10::optional<double>(scale_v));
out__[0] = new torch::Tensor(outputs__);
)
}
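
The public `scaled_dot_product_attention` gains an optional `scale` argument in 2.1, surfaced here as the `scale_v`/`scale_null` pair; when null, libtorch keeps its default of 1/sqrt(head_dim). A hedged sketch of the direct call:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto q = torch::randn({1, 2, 5, 8});  // (batch, heads, seq_len, head_dim)
  auto k = torch::randn({1, 2, 5, 8});
  auto v = torch::randn({1, 2, 5, 8});
  auto out = torch::scaled_dot_product_attention(
      q, k, v, /*attn_mask=*/{}, /*dropout_p=*/0.0, /*is_causal=*/true,
      /*scale=*/0.125);  // overrides the default 1/sqrt(head_dim)
  std::cout << out.sizes() << "\n";  // [1, 2, 5, 8]
}
```
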
@ -14742,16 +14983,16 @@ void atg_sparse_coo_tensor(tensor *out__, int64_t *size_data, int size_len, int
)
}
void atg_sparse_coo_tensor_indices(tensor *out__, tensor indices, tensor values, int options_kind, int options_device) {
void atg_sparse_coo_tensor_indices(tensor *out__, tensor indices, tensor values, int options_kind, int options_device, int is_coalesced) {
PROTECT(
auto outputs__ = torch::sparse_coo_tensor(*indices, *values, at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)));
auto outputs__ = torch::sparse_coo_tensor(*indices, *values, at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)), (bool)is_coalesced);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_sparse_coo_tensor_indices_size(tensor *out__, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device) {
void atg_sparse_coo_tensor_indices_size(tensor *out__, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device, int is_coalesced) {
PROTECT(
auto outputs__ = torch::sparse_coo_tensor(*indices, *values, torch::IntArrayRef(size_data, size_len), at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)));
auto outputs__ = torch::sparse_coo_tensor(*indices, *values, torch::IntArrayRef(size_data, size_len), at::device(device_of_int(options_device)).dtype(at::ScalarType(options_kind)), (bool)is_coalesced);
out__[0] = new torch::Tensor(outputs__);
)
}
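
Both `sparse_coo_tensor` factory shims now forward the new `is_coalesced` hint, letting the caller assert that the indices are already sorted and duplicate-free so libtorch can skip re-coalescing. A minimal sketch:

```cpp
#include <torch/torch.h>
#include <iostream>

int main() {
  auto indices = torch::tensor({{0, 1}, {1, 0}});   // 2 x nnz coordinate matrix
  auto values  = torch::tensor({3.0f, 4.0f});
  auto sp = torch::sparse_coo_tensor(indices, values, {2, 2},
                                     torch::dtype(torch::kFloat),
                                     /*is_coalesced=*/true);
  std::cout << sp.to_dense() << "\n";
}
```
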
@ -16284,16 +16525,16 @@ void atg_std(tensor *out__, tensor self, int unbiased) {
)
}
void atg_std_correction(tensor *out__, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim) {
void atg_std_correction(tensor *out__, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim) {
PROTECT(
auto outputs__ = torch::std(*self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), correction_null ? c10::nullopt : c10::optional<int64_t>(correction_v), (bool)keepdim);
auto outputs__ = torch::std(*self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), *correction, (bool)keepdim);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_std_correction_out(tensor *out__, tensor out, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim) {
void atg_std_correction_out(tensor *out__, tensor out, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim) {
PROTECT(
auto outputs__ = torch::std_out(*out, *self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), correction_null ? c10::nullopt : c10::optional<int64_t>(correction_v), (bool)keepdim);
auto outputs__ = torch::std_out(*out, *self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), *correction, (bool)keepdim);
out__[0] = new torch::Tensor(outputs__);
)
}
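
In 2.1 the `correction` argument of `std`/`var` (and their `_mean` variants) is an optional `Scalar` rather than an optional `int64`, which is why these shims now take a `scalar` handle and the Go wrappers pass a `Cscalar`. A hedged sketch of the underlying call:

```cpp
#include <torch/torch.h>
#include <iostream>
#include <vector>

int main() {
  auto x = torch::randn({4, 8});
  std::vector<int64_t> dims = {1};
  // correction = 1 matches the old unbiased=true behaviour; fractional corrections are accepted too
  auto s = torch::std(x, dims, /*correction=*/c10::Scalar(1), /*keepdim=*/false);
  std::cout << s.sizes() << "\n";  // [4]
}
```
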
@ -16313,17 +16554,17 @@ void atg_std_mean(tensor *out__, tensor self, int unbiased) {
)
}
void atg_std_mean_correction(tensor *out__, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim) {
void atg_std_mean_correction(tensor *out__, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim) {
PROTECT(
auto outputs__ = torch::std_mean(*self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), correction_null ? c10::nullopt : c10::optional<int64_t>(correction_v), (bool)keepdim);
auto outputs__ = torch::std_mean(*self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), *correction, (bool)keepdim);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
)
}
void atg_std_mean_correction_out(tensor *out__, tensor out0, tensor out1, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim) {
void atg_std_mean_correction_out(tensor *out__, tensor out0, tensor out1, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim) {
PROTECT(
auto outputs__ = torch::std_mean_out(*out0, *out1, *self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), correction_null ? c10::nullopt : c10::optional<int64_t>(correction_v), (bool)keepdim);
auto outputs__ = torch::std_mean_out(*out0, *out1, *self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), *correction, (bool)keepdim);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
)
@ -16516,6 +16757,18 @@ void atg_swapdims_(tensor *out__, tensor self, int64_t dim0, int64_t dim1) {
)
}
void atg_sym_constrain_range(scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null) {
PROTECT(
torch::sym_constrain_range(*size, min_null ? c10::nullopt : c10::optional<int64_t>(min_v), max_null ? c10::nullopt : c10::optional<int64_t>(max_v));
)
}
void atg_sym_constrain_range_for_size(scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null) {
PROTECT(
torch::sym_constrain_range_for_size(*size, min_null ? c10::nullopt : c10::optional<int64_t>(min_v), max_null ? c10::nullopt : c10::optional<int64_t>(max_v));
)
}
void atg_t(tensor *out__, tensor self) {
PROTECT(
auto outputs__ = torch::t(*self);
@ -16730,16 +16983,16 @@ void atg_to(tensor *out__, tensor self, int device) {
)
}
void atg_to_dense(tensor *out__, tensor self, int dtype) {
void atg_to_dense(tensor *out__, tensor self, int dtype, int masked_grad) {
PROTECT(
auto outputs__ = self->to_dense(dtype < 0 ? c10::nullopt : c10::optional<at::ScalarType>(at::ScalarType(dtype)));
auto outputs__ = self->to_dense(dtype < 0 ? c10::nullopt : c10::optional<at::ScalarType>(at::ScalarType(dtype)), (bool)masked_grad);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_to_dense_backward(tensor *out__, tensor grad, tensor input) {
void atg_to_dense_backward(tensor *out__, tensor grad, tensor input, int masked_grad) {
PROTECT(
auto outputs__ = torch::to_dense_backward(*grad, *input);
auto outputs__ = torch::to_dense_backward(*grad, *input, (bool)masked_grad);
out__[0] = new torch::Tensor(outputs__);
)
}
@ -16821,13 +17074,6 @@ void atg_to_sparse_bsc(tensor *out__, tensor self, int64_t *blocksize_data, int
)
}
void atg_to_sparse_bsc_out(tensor *out__, tensor out, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::to_sparse_bsc_out(*out, *self, torch::IntArrayRef(blocksize_data, blocksize_len), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_to_sparse_bsr(tensor *out__, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = self->to_sparse_bsr(torch::IntArrayRef(blocksize_data, blocksize_len), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
@ -16835,13 +17081,6 @@ void atg_to_sparse_bsr(tensor *out__, tensor self, int64_t *blocksize_data, int
)
}
void atg_to_sparse_bsr_out(tensor *out__, tensor out, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::to_sparse_bsr_out(*out, *self, torch::IntArrayRef(blocksize_data, blocksize_len), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_to_sparse_csc(tensor *out__, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = self->to_sparse_csc(dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
@ -16849,13 +17088,6 @@ void atg_to_sparse_csc(tensor *out__, tensor self, int64_t dense_dim_v, uint8_t
)
}
void atg_to_sparse_csc_out(tensor *out__, tensor out, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::to_sparse_csc_out(*out, *self, dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_to_sparse_csr(tensor *out__, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = self->to_sparse_csr(dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
@ -16863,20 +17095,6 @@ void atg_to_sparse_csr(tensor *out__, tensor self, int64_t dense_dim_v, uint8_t
)
}
void atg_to_sparse_csr_out(tensor *out__, tensor out, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::to_sparse_csr_out(*out, *self, dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_to_sparse_out(tensor *out__, tensor out, tensor self, int8_t layout, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null) {
PROTECT(
auto outputs__ = torch::to_sparse_out(*out, *self, (layout == -1 ? c10::nullopt : c10::optional<at::Layout>(static_cast<at::Layout>(layout))), blocksize_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(blocksize_data, blocksize_len)), dense_dim_null ? c10::nullopt : c10::optional<int64_t>(dense_dim_v));
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_to_sparse_sparse_dim(tensor *out__, tensor self, int64_t sparse_dim) {
PROTECT(
auto outputs__ = self->to_sparse(sparse_dim);
@ -16884,13 +17102,6 @@ void atg_to_sparse_sparse_dim(tensor *out__, tensor self, int64_t sparse_dim) {
)
}
void atg_to_sparse_sparse_dim_out(tensor *out__, tensor out, tensor self, int64_t sparse_dim) {
PROTECT(
auto outputs__ = torch::to_sparse_out(*out, *self, sparse_dim);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_topk(tensor *out__, tensor self, int64_t k, int64_t dim, int largest, int sorted) {
PROTECT(
auto outputs__ = torch::topk(*self, k, dim, (bool)largest, (bool)sorted);
@ -17657,16 +17868,16 @@ void atg_var(tensor *out__, tensor self, int unbiased) {
)
}
void atg_var_correction(tensor *out__, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim) {
void atg_var_correction(tensor *out__, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim) {
PROTECT(
auto outputs__ = torch::var(*self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), correction_null ? c10::nullopt : c10::optional<int64_t>(correction_v), (bool)keepdim);
auto outputs__ = torch::var(*self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), *correction, (bool)keepdim);
out__[0] = new torch::Tensor(outputs__);
)
}
void atg_var_correction_out(tensor *out__, tensor out, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim) {
void atg_var_correction_out(tensor *out__, tensor out, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim) {
PROTECT(
auto outputs__ = torch::var_out(*out, *self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), correction_null ? c10::nullopt : c10::optional<int64_t>(correction_v), (bool)keepdim);
auto outputs__ = torch::var_out(*out, *self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), *correction, (bool)keepdim);
out__[0] = new torch::Tensor(outputs__);
)
}
@ -17686,17 +17897,17 @@ void atg_var_mean(tensor *out__, tensor self, int unbiased) {
)
}
void atg_var_mean_correction(tensor *out__, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim) {
void atg_var_mean_correction(tensor *out__, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim) {
PROTECT(
auto outputs__ = torch::var_mean(*self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), correction_null ? c10::nullopt : c10::optional<int64_t>(correction_v), (bool)keepdim);
auto outputs__ = torch::var_mean(*self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), *correction, (bool)keepdim);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
)
}
void atg_var_mean_correction_out(tensor *out__, tensor out0, tensor out1, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim) {
void atg_var_mean_correction_out(tensor *out__, tensor out0, tensor out1, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim) {
PROTECT(
auto outputs__ = torch::var_mean_out(*out0, *out1, *self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), correction_null ? c10::nullopt : c10::optional<int64_t>(correction_v), (bool)keepdim);
auto outputs__ = torch::var_mean_out(*out0, *out1, *self, dim_data == nullptr ? c10::nullopt : c10::optional<torch::IntArrayRef>(torch::IntArrayRef(dim_data, dim_len)), *correction, (bool)keepdim);
out__[0] = new torch::Tensor(std::get<0>(outputs__));
out__[1] = new torch::Tensor(std::get<1>(outputs__));
)
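
The `std`/`var` family now takes its `correction` argument as a single `scalar` handle instead of the old `(int64_t correction_v, uint8_t correction_null)` pair, mirroring libtorch 2.1 widening `correction` from an optional integer to an optional `Scalar`. Below is a minimal caller-side sketch (not part of this commit) of the updated `atg_var_correction`, assuming the usual gotch typedefs `typedef torch::Tensor *tensor;` and `typedef torch::Scalar *scalar;` and the `torch_api.h` header name — those assumptions are not shown in this diff.

```cpp
// Minimal sketch of calling the updated atg_var_correction (libtorch 2.1 signature).
// Assumptions (not part of this diff): typedef torch::Tensor *tensor; typedef torch::Scalar *scalar;
#include <torch/torch.h>
#include <iostream>
#include "torch_api.h" // header name assumed; declares atg_var_correction

int main() {
  tensor self = new torch::Tensor(torch::rand({3, 4}));

  int64_t dims[] = {1};        // reduce over the last dimension
  torch::Scalar correction(1); // Bessel's correction, now carried as a Scalar
  tensor out[1];

  // Old: atg_var_correction(out, self, dims, 1, /*correction_v=*/1, /*correction_null=*/0, /*keepdim=*/0);
  // New: the correction is a scalar handle forwarded into c10::optional<Scalar>.
  atg_var_correction(out, self, dims, 1, &correction, /*keepdim=*/0);

  std::cout << *out[0] << std::endl; // per-row variance of the 3x4 input

  delete out[0];
  delete self;
  return 0;
}
```

Because the new wrapper dereferences `*correction` unconditionally, callers that previously relied on `correction_null` to request the default now have to pass a concrete `Scalar`.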

View File

@ -62,7 +62,6 @@ void atg__cdist_backward(tensor *, tensor grad, tensor x1, tensor x2, double p,
void atg__cdist_backward_out(tensor *, tensor out, tensor grad, tensor x1, tensor x2, double p, tensor cdist);
void atg__cholesky_solve_helper(tensor *, tensor self, tensor A, int upper);
void atg__cholesky_solve_helper_out(tensor *, tensor out, tensor self, tensor A, int upper);
int atg__chunk_grad_outputs_efficient_attention(tensor query, tensor key, tensor value, int is_causal);
void atg__coalesce(tensor *, tensor self);
void atg__coalesce_out(tensor *, tensor out, tensor self);
void atg__coalesced(tensor *, tensor self, int coalesced);
@ -89,6 +88,8 @@ void atg__copy_from(tensor *, tensor self, tensor dst, int non_blocking);
void atg__copy_from_and_resize(tensor *, tensor self, tensor dst);
void atg__copy_from_and_resize_out(tensor *, tensor out, tensor self, tensor dst);
void atg__copy_from_out(tensor *, tensor out, tensor self, tensor dst, int non_blocking);
void atg__cslt_compress(tensor *, tensor input);
void atg__cslt_sparse_mm(tensor *, tensor compressed_A, tensor dense_B, tensor bias, int transpose_result);
void atg__ctc_loss(tensor *, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, int64_t blank, int zero_infinity);
void atg__ctc_loss_backward(tensor *, tensor grad, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, tensor neg_log_likelihood, tensor log_alpha, int64_t blank, int zero_infinity);
void atg__ctc_loss_backward_out(tensor *, tensor out, tensor grad, tensor log_probs, tensor targets, int64_t *input_lengths_data, int input_lengths_len, int64_t *target_lengths_data, int target_lengths_len, tensor neg_log_likelihood, tensor log_alpha, int64_t blank, int zero_infinity);
@ -105,15 +106,13 @@ void atg__cudnn_rnn(tensor *, tensor input, tensor *weight_data, int weight_len,
void atg__cudnn_rnn_flatten_weight(tensor *, tensor *weight_arr_data, int weight_arr_len, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, int batch_first, int bidirectional);
void atg__cudnn_rnn_flatten_weight_out(tensor *, tensor out, tensor *weight_arr_data, int weight_arr_len, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, int batch_first, int bidirectional);
void atg__cudnn_rnn_out(tensor *, tensor out0, tensor out1, tensor out2, tensor out3, tensor out4, tensor input, tensor *weight_data, int weight_len, int64_t weight_stride0, tensor weight_buf, tensor hx, tensor cx, int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, int batch_first, double dropout, int train, int bidirectional, int64_t *batch_sizes_data, int batch_sizes_len, tensor dropout_state);
int64_t atg__cufft_get_plan_cache_max_size(int64_t device_index);
int64_t atg__cufft_get_plan_cache_size(int64_t device_index);
int64_t atg__debug_has_internal_overlap(tensor self);
void atg__dim_arange(tensor *, tensor like, int64_t dim);
int64_t atg__dimi(tensor self);
int64_t atg__dimv(tensor self);
void atg__dirichlet_grad(tensor *, tensor x, tensor alpha, tensor total);
void atg__dirichlet_grad_out(tensor *, tensor out, tensor x, tensor alpha, tensor total);
void atg__efficient_attention_backward(tensor *, tensor grad_out_, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, int is_causal, int chunk_grad_outputs);
void atg__efficient_attention_backward(tensor *, tensor grad_out_, tensor query, tensor key, tensor value, tensor bias, tensor out, tensor cu_seqlens_q, tensor cu_seqlens_k, int64_t max_seqlen_k, int64_t max_seqlen_q, tensor logsumexp, double dropout_p, tensor philox_seed, tensor philox_offset, int64_t custom_mask_type, int bias_requires_grad, double scale_v, uint8_t scale_null, int64_t num_splits_key_v, uint8_t num_splits_key_null);
void atg__efficientzerotensor(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__efficientzerotensor_out(tensor *, tensor out, int64_t *size_data, int size_len);
void atg__embedding_bag(tensor *, tensor weight, tensor indices, tensor offsets, int scale_grad_by_freq, int64_t mode, int sparse, tensor per_sample_weights, int include_last_offset, int64_t padding_idx);
@ -146,15 +145,19 @@ void atg__fft_c2r(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t
void atg__fft_c2r_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int64_t normalization, int64_t last_dim_size);
void atg__fft_r2c(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t normalization, int onesided);
void atg__fft_r2c_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int64_t normalization, int onesided);
void atg__flash_attention_backward(tensor *, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, int64_t philox_seed, int64_t philox_offset);
void atg__fill_mem_eff_dropout_mask_(tensor *, tensor self, double dropout_p, int64_t seed, int64_t offset);
void atg__flash_attention_backward(tensor *, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, tensor philox_seed, tensor philox_offset, double scale_v, uint8_t scale_null);
void atg__foobar(tensor *, tensor self, int arg1, int arg2, int arg3);
void atg__foobar_out(tensor *, tensor out, tensor self, int arg1, int arg2, int arg3);
void atg__functional_assert_async(tensor *, tensor self, char* assert_msg_ptr, int assert_msg_len, tensor dep_token);
void atg__functional_sym_constrain_range(tensor *, scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null, tensor dep_token);
void atg__functional_sym_constrain_range_for_size(tensor *, scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null, tensor dep_token);
void atg__fused_dropout(tensor *, tensor self, double p);
void atg__fused_dropout_out(tensor *, tensor out0, tensor out1, tensor self, double p);
void atg__fused_moving_avg_obs_fq_helper(tensor *, tensor self, tensor observer_on, tensor fake_quant_on, tensor running_min, tensor running_max, tensor scale, tensor zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int per_row_fake_quant, int symmetric_quant);
void atg__fused_moving_avg_obs_fq_helper_functional(tensor *, tensor self, tensor observer_on, tensor fake_quant_on, tensor running_min, tensor running_max, tensor scale, tensor zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int per_row_fake_quant, int symmetric_quant);
void atg__fused_moving_avg_obs_fq_helper_out(tensor *, tensor out0, tensor out1, tensor self, tensor observer_on, tensor fake_quant_on, tensor running_min, tensor running_max, tensor scale, tensor zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int per_row_fake_quant, int symmetric_quant);
int64_t atg__fused_sdp_choice(tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal);
int64_t atg__fused_sdp_choice(tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, double scale_v, uint8_t scale_null);
void atg__fw_primal(tensor *, tensor self, int64_t level);
void atg__fw_primal_copy(tensor *, tensor self, int64_t level);
void atg__fw_primal_copy_out(tensor *, tensor out, tensor self, int64_t level);
@ -173,6 +176,8 @@ void atg__index_put_impl_out(tensor *, tensor out, tensor self, tensor *indices_
void atg__indices(tensor *, tensor self);
void atg__indices_copy(tensor *, tensor self);
void atg__indices_copy_out(tensor *, tensor out, tensor self);
void atg__int_mm(tensor *, tensor self, tensor mat2);
void atg__int_mm_out(tensor *, tensor out, tensor self, tensor mat2);
void atg__is_all_true(tensor *, tensor self);
void atg__is_any_true(tensor *, tensor self);
int atg__is_zerotensor(tensor self);
@ -195,6 +200,7 @@ void atg__logcumsumexp_out(tensor *, tensor out, tensor self, int64_t dim);
void atg__lstm_mps(tensor *, tensor input, tensor *hx_data, int hx_len, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional, int batch_first);
void atg__lstm_mps_out(tensor *, tensor out0, tensor out1, tensor out2, tensor out3, tensor out4, tensor out5, tensor input, tensor *hx_data, int hx_len, tensor *params_data, int params_len, int has_biases, int64_t num_layers, double dropout, int train, int bidirectional, int batch_first);
void atg__lu_with_info(tensor *, tensor self, int pivot, int check_errors);
void atg__make_dep_token(tensor *, int options_kind, int options_device);
void atg__make_dual(tensor *, tensor primal, tensor tangent, int64_t level);
void atg__make_dual_copy(tensor *, tensor primal, tensor tangent, int64_t level);
void atg__make_dual_copy_out(tensor *, tensor out, tensor primal, tensor tangent, int64_t level);
@ -221,9 +227,9 @@ void atg__native_batch_norm_legit(tensor *, tensor input, tensor weight, tensor
void atg__native_batch_norm_legit_functional(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double momentum, double eps);
void atg__native_batch_norm_legit_no_stats(tensor *, tensor input, tensor weight, tensor bias, int training, double momentum, double eps);
void atg__native_batch_norm_legit_no_stats_out(tensor *, tensor out, tensor save_mean, tensor save_invstd, tensor input, tensor weight, tensor bias, int training, double momentum, double eps);
void atg__native_batch_norm_legit_no_training(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, double momentum, double eps);
void atg__native_batch_norm_legit_no_training_out(tensor *, tensor out0, tensor out1, tensor out2, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, double momentum, double eps);
void atg__native_batch_norm_legit_out(tensor *, tensor out, tensor save_mean, tensor save_invstd, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double momentum, double eps);
void atg__native_decoder_only_multi_head_attention(tensor *, tensor query, tensor key, tensor value, int64_t embed_dim, int64_t num_head, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, tensor mask, tensor incr_key, tensor incr_value, int need_weights, int average_attn_weights);
void atg__native_decoder_only_multi_head_attention_out(tensor *, tensor out0, tensor out1, tensor out2, tensor out3, tensor query, tensor key, tensor value, int64_t embed_dim, int64_t num_head, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, tensor mask, tensor incr_key, tensor incr_value, int need_weights, int average_attn_weights);
void atg__native_multi_head_attention(tensor *, tensor query, tensor key, tensor value, int64_t embed_dim, int64_t num_head, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, tensor mask, int need_weights, int average_attn_weights, int64_t mask_type_v, uint8_t mask_type_null);
void atg__native_multi_head_attention_out(tensor *, tensor out0, tensor out1, tensor query, tensor key, tensor value, int64_t embed_dim, int64_t num_head, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, tensor mask, int need_weights, int average_attn_weights, int64_t mask_type_v, uint8_t mask_type_null);
void atg__neg_view(tensor *, tensor self);
@ -235,9 +241,9 @@ void atg__nested_from_padded_and_nested_example_out(tensor *, tensor out, tensor
void atg__nested_from_padded_out(tensor *, tensor out, tensor padded, tensor cpu_nested_shape_example, int fuse_transform_0213);
void atg__nested_select_backward(tensor *, tensor grad_output, tensor self, int64_t dim, int64_t index);
void atg__nested_sum_backward(tensor *, tensor grad, tensor self, int64_t *dim_data, int dim_len, int keepdim);
void atg__nested_view_from_buffer(tensor *, tensor self, tensor nested_size, tensor nested_strides, int64_t *offsets_data, int offsets_len);
void atg__nested_view_from_buffer_copy(tensor *, tensor self, tensor nested_size, tensor nested_strides, int64_t *offsets_data, int offsets_len);
void atg__nested_view_from_buffer_copy_out(tensor *, tensor out, tensor self, tensor nested_size, tensor nested_strides, int64_t *offsets_data, int offsets_len);
void atg__nested_view_from_buffer(tensor *, tensor self, tensor nested_size, tensor nested_strides, tensor offsets);
void atg__nested_view_from_buffer_copy(tensor *, tensor self, tensor nested_size, tensor nested_strides, tensor offsets);
void atg__nested_view_from_buffer_copy_out(tensor *, tensor out, tensor self, tensor nested_size, tensor nested_strides, tensor offsets);
void atg__new_zeros_with_same_feature_meta(tensor *, tensor self, tensor other, int64_t self_num_batch_dims);
void atg__new_zeros_with_same_feature_meta_out(tensor *, tensor out, tensor self, tensor other, int64_t self_num_batch_dims);
int atg__nnpack_available();
@ -256,6 +262,7 @@ void atg__pin_memory(tensor *, tensor self, int device);
void atg__pin_memory_out(tensor *, tensor out, tensor self, int device);
void atg__prelu_kernel(tensor *, tensor self, tensor weight);
void atg__prelu_kernel_backward(tensor *, tensor grad_output, tensor self, tensor weight);
void atg__propagate_xla_data(tensor input, tensor output);
void atg__remove_batch_dim(tensor *, tensor self, int64_t level, int64_t batch_size, int64_t out_dim);
void atg__reshape_alias(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len);
void atg__reshape_alias_copy(tensor *, tensor self, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len);
@ -269,11 +276,14 @@ void atg__rowwise_prune(tensor *, tensor weight, tensor mask, int compressed_ind
void atg__sample_dirichlet(tensor *, tensor self);
void atg__sample_dirichlet_out(tensor *, tensor out, tensor self);
void atg__saturate_weight_to_fp16(tensor *, tensor weight);
void atg__scaled_dot_product_attention(tensor *, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int need_attn_weights, int is_causal);
void atg__scaled_dot_product_attention_math(tensor *, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, tensor dropout_mask);
void atg__scaled_dot_product_efficient_attention(tensor *, tensor query, tensor key, tensor value, int compute_log_sumexp, int is_causal);
void atg__scaled_dot_product_efficient_attention_backward(tensor *, tensor grad_out_, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, int is_causal, int chunk_grad_outputs);
void atg__scaled_dot_product_flash_attention_backward(tensor *, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, int64_t philox_seed, int64_t philox_offset);
void atg__scaled_dot_product_attention_math(tensor *, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, tensor dropout_mask, double scale_v, uint8_t scale_null);
void atg__scaled_dot_product_efficient_attention(tensor *, tensor query, tensor key, tensor value, tensor attn_bias, int compute_log_sumexp, double dropout_p, int is_causal, double scale_v, uint8_t scale_null);
void atg__scaled_dot_product_flash_attention_backward(tensor *, tensor grad_out, tensor query, tensor key, tensor value, tensor out, tensor logsumexp, tensor cum_seq_q, tensor cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int is_causal, tensor philox_seed, tensor philox_offset, double scale_v, uint8_t scale_null);
void atg__scaled_mm(tensor *, tensor self, tensor mat2, tensor bias, int out_dtype, tensor scale_a, tensor scale_b, tensor scale_result);
void atg__scaled_mm_out(tensor *, tensor out, tensor out_amax, tensor self, tensor mat2, tensor bias, int out_dtype, tensor scale_a, tensor scale_b, tensor scale_result);
void atg__scatter_reduce(tensor *, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len, int include_self);
void atg__scatter_reduce_(tensor *, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len, int include_self);
void atg__scatter_reduce_two_out(tensor *, tensor out, tensor self, int64_t dim, tensor index, tensor src, char* reduce_ptr, int reduce_len, int include_self);
void atg__segment_reduce_backward(tensor *, tensor grad, tensor output, tensor data, char* reduce_ptr, int reduce_len, tensor lengths, tensor offsets, int64_t axis, scalar initial);
void atg__segment_reduce_backward_out(tensor *, tensor out, tensor grad, tensor output, tensor data, char* reduce_ptr, int reduce_len, tensor lengths, tensor offsets, int64_t axis, scalar initial);
void atg__shape_as_tensor(tensor *, tensor self);
@ -294,10 +304,10 @@ void atg__sparse_broadcast_to_copy_out(tensor *, tensor out, tensor self, int64_
void atg__sparse_bsc_tensor_unsafe(tensor *, tensor ccol_indices, tensor row_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_bsr_tensor_unsafe(tensor *, tensor crow_indices, tensor col_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_compressed_tensor_unsafe(tensor *, tensor compressed_indices, tensor plain_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_unsafe(tensor *, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_unsafe(tensor *, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device, int is_coalesced);
void atg__sparse_coo_tensor_with_dims(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_coo_tensor_with_dims_and_tensors(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int options_kind, int options_device);
void atg__sparse_coo_tensor_with_dims_and_tensors_out(tensor *, tensor out, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values);
void atg__sparse_coo_tensor_with_dims_and_tensors(tensor *, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int options_kind, int options_device, int is_coalesced);
void atg__sparse_coo_tensor_with_dims_and_tensors_out(tensor *, tensor out, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len, tensor indices, tensor values, int is_coalesced);
void atg__sparse_coo_tensor_with_dims_out(tensor *, tensor out, int64_t sparse_dim, int64_t dense_dim, int64_t *size_data, int size_len);
void atg__sparse_csc_tensor_unsafe(tensor *, tensor ccol_indices, tensor row_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg__sparse_csr_prod(tensor *, tensor self, int64_t *dim_data, int dim_len, int keepdim, int dtype);
@ -310,9 +320,12 @@ void atg__sparse_log_softmax_backward_data(tensor *, tensor grad_output, tensor
void atg__sparse_log_softmax_backward_data_out(tensor *, tensor out, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__sparse_log_softmax_int(tensor *, tensor self, int64_t dim, int dtype);
void atg__sparse_log_softmax_out(tensor *, tensor out, tensor self, int64_t dim, int half_to_float);
void atg__sparse_mask_projection(tensor *, tensor self, tensor mask, int accumulate_matches);
void atg__sparse_mask_projection_out(tensor *, tensor out, tensor self, tensor mask, int accumulate_matches);
void atg__sparse_mm(tensor *, tensor sparse, tensor dense);
void atg__sparse_mm_reduce(tensor *, tensor sparse, tensor dense, char* reduce_ptr, int reduce_len);
void atg__sparse_mm_reduce_impl(tensor *, tensor self, tensor other, char* reduce_ptr, int reduce_len);
void atg__sparse_semi_structured_linear(tensor *, tensor input, tensor weight, tensor meta, tensor bias, char* activation_ptr, int activation_len);
void atg__sparse_softmax(tensor *, tensor self, int64_t dim, int half_to_float);
void atg__sparse_softmax_backward_data(tensor *, tensor grad_output, tensor output, int64_t dim, tensor self);
void atg__sparse_softmax_backward_data_out(tensor *, tensor out, tensor grad_output, tensor output, int64_t dim, tensor self);
@ -344,6 +357,8 @@ void atg__test_autograd_multiple_dispatch_view(tensor *, tensor self);
void atg__test_autograd_multiple_dispatch_view_copy(tensor *, tensor self);
void atg__test_autograd_multiple_dispatch_view_copy_out(tensor *, tensor out, tensor self);
void atg__test_check_tensor(tensor *, tensor self);
void atg__test_functorch_fallback(tensor *, tensor self, tensor other);
void atg__test_functorch_fallback_out(tensor *, tensor out, tensor self, tensor other);
void atg__test_optional_filled_intlist(tensor *, tensor values, int64_t *addends_data, int addends_len);
void atg__test_optional_filled_intlist_out(tensor *, tensor out, tensor values, int64_t *addends_data, int addends_len);
void atg__test_optional_floatlist(tensor *, tensor values, double *addends_data, int addends_len);
@ -357,12 +372,23 @@ void atg__test_warn_in_autograd_out(tensor *, tensor out, tensor self);
void atg__to_copy(tensor *, tensor self, int options_kind, int options_device, int non_blocking);
void atg__to_copy_out(tensor *, tensor out, tensor self, int non_blocking);
tensor *atg__to_cpu(tensor *tensors_data, int tensors_len);
void atg__to_dense(tensor *, tensor self, int dtype);
void atg__to_dense_out(tensor *, tensor out, tensor self, int dtype);
void atg__to_dense(tensor *, tensor self, int dtype, int masked_grad);
void atg__to_dense_out(tensor *, tensor out, tensor self, int dtype, int masked_grad);
void atg__to_sparse(tensor *, tensor self, int8_t layout, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_bsc(tensor *, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_bsc_out(tensor *, tensor out, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_bsr(tensor *, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_bsr_out(tensor *, tensor out, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_csc(tensor *, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_csc_out(tensor *, tensor out, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_csr(tensor *, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_csr_out(tensor *, tensor out, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_out(tensor *, tensor out, tensor self, int8_t layout, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg__to_sparse_semi_structured(tensor *, tensor dense);
void atg__to_sparse_sparse_dim(tensor *, tensor self, int64_t sparse_dim);
void atg__to_sparse_sparse_dim_out(tensor *, tensor out, tensor self, int64_t sparse_dim);
void atg__transform_bias_rescale_qkv(tensor *, tensor qkv, tensor qkv_bias, int64_t num_heads);
void atg__transform_bias_rescale_qkv_out(tensor *, tensor out0, tensor out1, tensor out2, tensor qkv, tensor qkv_bias, int64_t num_heads);
void atg__transformer_decoder_only_layer_fwd(tensor *, tensor src, int64_t embed_dim, int64_t num_heads, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, int use_gelu, int norm_first, double eps, tensor norm_weight_1, tensor norm_bias_1, tensor norm_weight_2, tensor norm_bias_2, tensor ffn_weight_1, tensor ffn_bias_1, tensor ffn_weight_2, tensor ffn_bias_2, tensor mask, tensor incr_key, tensor incr_value);
void atg__transformer_decoder_only_layer_fwd_out(tensor *, tensor out0, tensor out1, tensor out2, tensor src, int64_t embed_dim, int64_t num_heads, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, int use_gelu, int norm_first, double eps, tensor norm_weight_1, tensor norm_bias_1, tensor norm_weight_2, tensor norm_bias_2, tensor ffn_weight_1, tensor ffn_bias_1, tensor ffn_weight_2, tensor ffn_bias_2, tensor mask, tensor incr_key, tensor incr_value);
void atg__transformer_encoder_layer_fwd(tensor *, tensor src, int64_t embed_dim, int64_t num_heads, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, int use_gelu, int norm_first, double eps, tensor norm_weight_1, tensor norm_bias_1, tensor norm_weight_2, tensor norm_bias_2, tensor ffn_weight_1, tensor ffn_bias_1, tensor ffn_weight_2, tensor ffn_bias_2, tensor mask, int64_t mask_type_v, uint8_t mask_type_null);
void atg__transformer_encoder_layer_fwd_out(tensor *, tensor out, tensor src, int64_t embed_dim, int64_t num_heads, tensor qkv_weight, tensor qkv_bias, tensor proj_weight, tensor proj_bias, int use_gelu, int norm_first, double eps, tensor norm_weight_1, tensor norm_bias_1, tensor norm_weight_2, tensor norm_bias_2, tensor ffn_weight_1, tensor ffn_bias_1, tensor ffn_weight_2, tensor ffn_bias_2, tensor mask, int64_t mask_type_v, uint8_t mask_type_null);
void atg__trilinear(tensor *, tensor i1, tensor i2, tensor i3, int64_t *expand1_data, int expand1_len, int64_t *expand2_data, int expand2_len, int64_t *expand3_data, int expand3_len, int64_t *sumdim_data, int sumdim_len, int64_t unroll_dim);
@ -376,6 +402,8 @@ void atg__unique2(tensor *, tensor self, int sorted, int return_inverse, int ret
void atg__unique2_out(tensor *, tensor out0, tensor out1, tensor out2, tensor self, int sorted, int return_inverse, int return_counts);
void atg__unique_out(tensor *, tensor out0, tensor out1, tensor self, int sorted, int return_inverse);
void atg__unpack_dual(tensor *, tensor dual, int64_t level);
void atg__unsafe_index(tensor *, tensor self, tensor *indices_data, int indices_len);
void atg__unsafe_index_put(tensor *, tensor self, tensor *indices_data, int indices_len, tensor values, int accumulate);
void atg__unsafe_view(tensor *, tensor self, int64_t *size_data, int size_len);
void atg__unsafe_view_out(tensor *, tensor out, tensor self, int64_t *size_data, int size_len);
void atg__upsample_bicubic2d_aa(tensor *, tensor self, int64_t *output_size_data, int output_size_len, int align_corners, double scales_h_v, uint8_t scales_h_null, double scales_w_v, uint8_t scales_w_null);
@ -571,8 +599,8 @@ void atg_bartlett_window_out(tensor *, tensor out, int64_t window_length);
void atg_bartlett_window_periodic(tensor *, int64_t window_length, int periodic, int options_kind, int options_device);
void atg_bartlett_window_periodic_out(tensor *, tensor out, int64_t window_length, int periodic);
void atg_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double momentum, double eps, int cudnn_enabled);
void atg_batch_norm_backward_elemt(tensor *, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, tensor mean_dy, tensor mean_dy_xmu, tensor count);
void atg_batch_norm_backward_elemt_out(tensor *, tensor out, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, tensor mean_dy, tensor mean_dy_xmu, tensor count);
void atg_batch_norm_backward_elemt(tensor *, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, tensor sum_dy, tensor sum_dy_xmu, tensor count);
void atg_batch_norm_backward_elemt_out(tensor *, tensor out, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, tensor sum_dy, tensor sum_dy_xmu, tensor count);
void atg_batch_norm_backward_reduce(tensor *, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, int input_g, int weight_g, int bias_g);
void atg_batch_norm_backward_reduce_out(tensor *, tensor out0, tensor out1, tensor out2, tensor out3, tensor grad_out, tensor input, tensor mean, tensor invstd, tensor weight, int input_g, int weight_g, int bias_g);
void atg_batch_norm_elemt(tensor *, tensor input, tensor weight, tensor bias, tensor mean, tensor invstd, double eps);
@ -903,6 +931,8 @@ void atg_empty(tensor *, int64_t *size_data, int size_len, int options_kind, int
void atg_empty_like(tensor *, tensor self);
void atg_empty_like_out(tensor *, tensor out, tensor self);
void atg_empty_out(tensor *, tensor out, int64_t *size_data, int size_len);
void atg_empty_permuted(tensor *, int64_t *size_data, int size_len, int64_t *physical_layout_data, int physical_layout_len, int options_kind, int options_device);
void atg_empty_permuted_out(tensor *, tensor out, int64_t *size_data, int size_len, int64_t *physical_layout_data, int physical_layout_len);
void atg_empty_quantized(tensor *, int64_t *size_data, int size_len, tensor qtensor, int options_kind, int options_device);
void atg_empty_quantized_out(tensor *, tensor out, int64_t *size_data, int size_len, tensor qtensor);
void atg_empty_strided(tensor *, int64_t *size_data, int size_len, int64_t *stride_data, int stride_len, int options_kind, int options_device);
@ -1532,6 +1562,7 @@ void atg_min_dim(tensor *, tensor self, int64_t dim, int keepdim);
void atg_min_dim_min(tensor *, tensor min, tensor min_indices, tensor self, int64_t dim, int keepdim);
void atg_min_other(tensor *, tensor self, tensor other);
void atg_min_out(tensor *, tensor out, tensor self, tensor other);
void atg_min_unary_out(tensor *, tensor out, tensor self);
void atg_minimum(tensor *, tensor self, tensor other);
void atg_minimum_out(tensor *, tensor out, tensor self, tensor other);
void atg_miopen_batch_norm(tensor *, tensor input, tensor weight, tensor bias, tensor running_mean, tensor running_var, int training, double exponential_average_factor, double epsilon);
@ -1691,6 +1722,8 @@ void atg_nll_loss_out(tensor *, tensor out, tensor self, tensor target, tensor w
void atg_nonzero(tensor *, tensor self);
tensor *atg_nonzero_numpy(tensor self);
void atg_nonzero_out(tensor *, tensor out, tensor self);
void atg_nonzero_static(tensor *, tensor self, int64_t size, int64_t fill_value);
void atg_nonzero_static_out(tensor *, tensor out, tensor self, int64_t size, int64_t fill_value);
void atg_norm(tensor *, tensor self);
void atg_norm_dtype_out(tensor *, tensor out, tensor self, scalar p, int64_t *dim_data, int dim_len, int keepdim, int dtype);
void atg_norm_except_dim(tensor *, tensor v, int64_t pow, int64_t dim);
@ -1791,6 +1824,8 @@ void atg_quantized_max_pool1d(tensor *, tensor self, int64_t *kernel_size_data,
void atg_quantized_max_pool1d_out(tensor *, tensor out, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_quantized_max_pool2d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_quantized_max_pool2d_out(tensor *, tensor out, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_quantized_max_pool3d(tensor *, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_quantized_max_pool3d_out(tensor *, tensor out, tensor self, int64_t *kernel_size_data, int kernel_size_len, int64_t *stride_data, int stride_len, int64_t *padding_data, int padding_len, int64_t *dilation_data, int dilation_len, int ceil_mode);
void atg_quantized_rnn_relu_cell(tensor *, tensor input, tensor hx, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh, tensor packed_ih, tensor packed_hh, tensor col_offsets_ih, tensor col_offsets_hh, scalar scale_ih, scalar scale_hh, scalar zero_point_ih, scalar zero_point_hh);
void atg_quantized_rnn_tanh_cell(tensor *, tensor input, tensor hx, tensor w_ih, tensor w_hh, tensor b_ih, tensor b_hh, tensor packed_ih, tensor packed_hh, tensor col_offsets_ih, tensor col_offsets_hh, scalar scale_ih, scalar scale_hh, scalar zero_point_ih, scalar zero_point_hh);
void atg_rad2deg(tensor *, tensor self);
@ -1930,7 +1965,7 @@ void atg_rsub_scalar_out(tensor *, tensor out, tensor self, scalar other);
void atg_rsub_tensor_out(tensor *, tensor out, tensor self, tensor other);
void atg_scalar_tensor(tensor *, scalar s, int options_kind, int options_device);
void atg_scalar_tensor_out(tensor *, tensor out, scalar s);
void atg_scaled_dot_product_attention(tensor *, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal);
void atg_scaled_dot_product_attention(tensor *, tensor query, tensor key, tensor value, tensor attn_mask, double dropout_p, int is_causal, double scale_v, uint8_t scale_null);
void atg_scatter(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_(tensor *, tensor self, int64_t dim, tensor index, tensor src);
void atg_scatter_add(tensor *, tensor self, int64_t dim, tensor index, tensor src);
@ -2045,8 +2080,8 @@ void atg_sparse_bsr_tensor_crow_col_value_size(tensor *, tensor crow_indices, te
void atg_sparse_compressed_tensor(tensor *, tensor compressed_indices, tensor plain_indices, tensor values, int options_kind, int options_device);
void atg_sparse_compressed_tensor_comp_plain_value_size(tensor *, tensor compressed_indices, tensor plain_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_sparse_coo_tensor(tensor *, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_sparse_coo_tensor_indices(tensor *, tensor indices, tensor values, int options_kind, int options_device);
void atg_sparse_coo_tensor_indices_size(tensor *, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
void atg_sparse_coo_tensor_indices(tensor *, tensor indices, tensor values, int options_kind, int options_device, int is_coalesced);
void atg_sparse_coo_tensor_indices_size(tensor *, tensor indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device, int is_coalesced);
void atg_sparse_coo_tensor_size_out(tensor *, tensor out, int64_t *size_data, int size_len);
void atg_sparse_csc_tensor(tensor *, tensor ccol_indices, tensor row_indices, tensor values, int options_kind, int options_device);
void atg_sparse_csc_tensor_ccol_row_value_size(tensor *, tensor ccol_indices, tensor row_indices, tensor values, int64_t *size_data, int size_len, int options_kind, int options_device);
@ -2261,12 +2296,12 @@ void atg_sspaddmm_out(tensor *, tensor out, tensor self, tensor mat1, tensor mat
void atg_stack(tensor *, tensor *tensors_data, int tensors_len, int64_t dim);
void atg_stack_out(tensor *, tensor out, tensor *tensors_data, int tensors_len, int64_t dim);
void atg_std(tensor *, tensor self, int unbiased);
void atg_std_correction(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim);
void atg_std_correction_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim);
void atg_std_correction(tensor *, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim);
void atg_std_correction_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim);
void atg_std_dim(tensor *, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_std_mean(tensor *, tensor self, int unbiased);
void atg_std_mean_correction(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim);
void atg_std_mean_correction_out(tensor *, tensor out0, tensor out1, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim);
void atg_std_mean_correction(tensor *, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim);
void atg_std_mean_correction_out(tensor *, tensor out0, tensor out1, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim);
void atg_std_mean_dim(tensor *, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_std_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_stft(tensor *, tensor self, int64_t n_fft, int64_t hop_length_v, uint8_t hop_length_null, int64_t win_length_v, uint8_t win_length_null, tensor window, int normalized, int onesided, int return_complex);
@ -2293,6 +2328,8 @@ void atg_swapaxes(tensor *, tensor self, int64_t axis0, int64_t axis1);
void atg_swapaxes_(tensor *, tensor self, int64_t axis0, int64_t axis1);
void atg_swapdims(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg_swapdims_(tensor *, tensor self, int64_t dim0, int64_t dim1);
void atg_sym_constrain_range(scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null);
void atg_sym_constrain_range_for_size(scalar size, int64_t min_v, uint8_t min_null, int64_t max_v, uint8_t max_null);
void atg_t(tensor *, tensor self);
void atg_t_(tensor *, tensor self);
void atg_t_copy(tensor *, tensor self);
@ -2321,8 +2358,8 @@ void atg_threshold_backward_grad_input(tensor *, tensor grad_input, tensor grad_
void atg_threshold_out(tensor *, tensor out, tensor self, scalar threshold, scalar value);
void atg_tile(tensor *, tensor self, int64_t *dims_data, int dims_len);
void atg_to(tensor *, tensor self, int device);
void atg_to_dense(tensor *, tensor self, int dtype);
void atg_to_dense_backward(tensor *, tensor grad, tensor input);
void atg_to_dense(tensor *, tensor self, int dtype, int masked_grad);
void atg_to_dense_backward(tensor *, tensor grad, tensor input, int masked_grad);
void atg_to_device(tensor *, tensor self, int device, int dtype, int non_blocking, int copy);
void atg_to_dtype(tensor *, tensor self, int dtype, int non_blocking, int copy);
void atg_to_dtype_layout(tensor *, tensor self, int options_kind, int options_device, int non_blocking, int copy);
@ -2334,16 +2371,10 @@ void atg_to_padded_tensor(tensor *, tensor self, double padding, int64_t *output
void atg_to_padded_tensor_out(tensor *, tensor out, tensor self, double padding, int64_t *output_size_data, int output_size_len);
void atg_to_sparse(tensor *, tensor self, int8_t layout, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_bsc(tensor *, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_bsc_out(tensor *, tensor out, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_bsr(tensor *, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_bsr_out(tensor *, tensor out, tensor self, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_csc(tensor *, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_csc_out(tensor *, tensor out, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_csr(tensor *, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_csr_out(tensor *, tensor out, tensor self, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_out(tensor *, tensor out, tensor self, int8_t layout, int64_t *blocksize_data, int blocksize_len, int64_t dense_dim_v, uint8_t dense_dim_null);
void atg_to_sparse_sparse_dim(tensor *, tensor self, int64_t sparse_dim);
void atg_to_sparse_sparse_dim_out(tensor *, tensor out, tensor self, int64_t sparse_dim);
void atg_topk(tensor *, tensor self, int64_t k, int64_t dim, int largest, int sorted);
void atg_topk_values(tensor *, tensor values, tensor indices, tensor self, int64_t k, int64_t dim, int largest, int sorted);
void atg_totype(tensor *, tensor self, int scalar_type);
@ -2446,12 +2477,12 @@ void atg_values_copy(tensor *, tensor self);
void atg_values_copy_out(tensor *, tensor out, tensor self);
void atg_vander(tensor *, tensor x, int64_t n_v, uint8_t n_null, int increasing);
void atg_var(tensor *, tensor self, int unbiased);
void atg_var_correction(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim);
void atg_var_correction_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim);
void atg_var_correction(tensor *, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim);
void atg_var_correction_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim);
void atg_var_dim(tensor *, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_var_mean(tensor *, tensor self, int unbiased);
void atg_var_mean_correction(tensor *, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim);
void atg_var_mean_correction_out(tensor *, tensor out0, tensor out1, tensor self, int64_t *dim_data, int dim_len, int64_t correction_v, uint8_t correction_null, int keepdim);
void atg_var_mean_correction(tensor *, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim);
void atg_var_mean_correction_out(tensor *, tensor out0, tensor out1, tensor self, int64_t *dim_data, int dim_len, scalar correction, int keepdim);
void atg_var_mean_dim(tensor *, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_var_out(tensor *, tensor out, tensor self, int64_t *dim_data, int dim_len, int unbiased, int keepdim);
void atg_vdot(tensor *, tensor self, tensor other);
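
Several attention entry points (`atg_scaled_dot_product_attention`, `atg__fused_sdp_choice`, and the flash/efficient-attention backward paths) gain an optional `scale`, carried across the C boundary with the same `(double scale_v, uint8_t scale_null)` convention this header uses for other optional primitives. A short sketch of calling the updated `atg_scaled_dot_product_attention` — again not part of this commit, and under the same typedef/header assumptions as the earlier sketch:

```cpp
// Sketch of the new scale_v/scale_null convention on scaled_dot_product_attention.
// Assumption (not in this diff): typedef torch::Tensor *tensor; from torch_api.h.
#include <torch/torch.h>
#include "torch_api.h" // header name assumed; declares atg_scaled_dot_product_attention

int main() {
  // (batch, heads, seq_len, head_dim)
  tensor q = new torch::Tensor(torch::rand({2, 4, 8, 16}));
  tensor k = new torch::Tensor(torch::rand({2, 4, 8, 16}));
  tensor v = new torch::Tensor(torch::rand({2, 4, 8, 16}));
  // Boolean mask of shape (L, S), broadcast over batch and heads; all-true here.
  tensor mask = new torch::Tensor(torch::ones({8, 8}, torch::kBool));

  tensor out[1];

  // scale_null = 1 -> pass c10::nullopt, i.e. keep the default 1/sqrt(head_dim).
  atg_scaled_dot_product_attention(out, q, k, v, mask,
                                   /*dropout_p=*/0.0, /*is_causal=*/0,
                                   /*scale_v=*/0.0, /*scale_null=*/1);
  delete out[0];

  // scale_null = 0 -> forward scale_v as an explicit attention scale.
  atg_scaled_dot_product_attention(out, q, k, v, mask,
                                   /*dropout_p=*/0.0, /*is_causal=*/0,
                                   /*scale_v=*/0.125, /*scale_null=*/0);
  delete out[0];

  delete q; delete k; delete v; delete mask;
  return 0;
}
```

Setting `scale_null` to 1 keeps libtorch's default `1/sqrt(head_dim)` scaling, while setting it to 0 forwards `scale_v` as an explicit `c10::optional<double>`.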

View File

@ -38,11 +38,13 @@ func TestOptimizer(t *testing.T) {
// Optimization loop
for i := 0; i < 50; i++ {
logits := model.ForwardT(x, true)
logits := model.Forward(x)
loss := logits.MustMseLoss(y, 1, true)
if i%10 == 0 {
fmt.Printf("Loss: %.3f\n", loss.MustView([]int64{-1}, false).MustFloat64Value([]int64{0}))
}
loss.MustRequiresGrad_(true)
opt.BackwardStep(loss)
loss.MustDrop()
}

View File

@ -1,6 +1,6 @@
#!/bin/bash
GOTCH_VERSION="${GOTCH_VER:-v0.8.0}"
GOTCH_VERSION="${GOTCH_VER:-v0.9.0}"
CUDA_VERSION="${CUDA_VER:-11.8}"
if [ -z $GOPATH ]; then

View File

@ -1,6 +1,6 @@
#!/bin/bash
LIBTORCH_VERSION="${LIBTORCH_VER:-2.0.1}"
LIBTORCH_VERSION="${LIBTORCH_VER:-2.1.0}"
CUDA_VERSION="${CUDA_VER:-11.8}"
if [ "${CUDA_VERSION}" == "cpu" ]; then

View File

@ -505,14 +505,6 @@ func(ts *Tensor) Must_CholeskySolveHelperOut(out *Tensor, a *Tensor, upper bool,
return retVal
}
func Must_ChunkGradOutputsEfficientAttention(query *Tensor, key *Tensor, value *Tensor, isCausal bool)(retVal bool) {
retVal, err := _ChunkGradOutputsEfficientAttention(query, key, value, isCausal)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_Coalesce(del bool)(retVal *Tensor) {
retVal, err := ts._Coalesce(del)
@ -721,6 +713,22 @@ func(ts *Tensor) Must_CopyFromOut(out *Tensor, dst *Tensor, nonBlocking bool, de
return retVal
}
func Must_CsltCompress(input *Tensor)(retVal *Tensor) {
retVal, err := _CsltCompress(input)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_CsltSparseMm(compressedA *Tensor, denseB *Tensor, bias *Tensor, transposeResult bool)(retVal *Tensor) {
retVal, err := _CsltSparseMm(compressedA, denseB, bias, transposeResult)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_CtcLoss(logProbs *Tensor, targets *Tensor, inputLengths []int64, targetLengths []int64, blank int64, zeroInfinity bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := _CtcLoss(logProbs, targets, inputLengths, targetLengths, blank, zeroInfinity)
@ -849,22 +857,6 @@ func Must_CudnnRnnOut(out0 *Tensor, out1 *Tensor, out2 *Tensor, out3 *Tensor, ou
return retVal0, retVal1, retVal2, retVal3, retVal4
}
func Must_CufftGetPlanCacheMaxSize(deviceIndex int64)(retVal int64) {
retVal, err := _CufftGetPlanCacheMaxSize(deviceIndex)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_CufftGetPlanCacheSize(deviceIndex int64)(retVal int64) {
retVal, err := _CufftGetPlanCacheSize(deviceIndex)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_DebugHasInternalOverlap(del bool)(retVal int64) {
retVal, err := ts._DebugHasInternalOverlap(del)
@ -913,12 +905,12 @@ func Must_DirichletGradOut(out *Tensor, x *Tensor, alpha *Tensor, total *Tensor)
return retVal
}
func Must_EfficientAttentionBackward(gradOut_ *Tensor, query *Tensor, key *Tensor, value *Tensor, out *Tensor, logsumexp *Tensor, isCausal bool, chunkGradOutputs bool)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
func Must_EfficientAttentionBackward(gradOut_ *Tensor, query *Tensor, key *Tensor, value *Tensor, bias *Tensor, out *Tensor, cuSeqlensQ *Tensor, cuSeqlensK *Tensor, maxSeqlenK int64, maxSeqlenQ int64, logsumexp *Tensor, dropoutP float64, philoxSeed *Tensor, philoxOffset *Tensor, customMaskType int64, biasRequiresGrad bool, scale []float64, numSplitsKey []int64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor, retVal3 *Tensor) {
retVal0, retVal1, retVal2, err := _EfficientAttentionBackward(gradOut_, query, key, value, out, logsumexp, isCausal, chunkGradOutputs)
retVal0, retVal1, retVal2, retVal3, err := _EfficientAttentionBackward(gradOut_, query, key, value, bias, out, cuSeqlensQ, cuSeqlensK, maxSeqlenK, maxSeqlenQ, logsumexp, dropoutP, philoxSeed, philoxOffset, customMaskType, biasRequiresGrad, scale, numSplitsKey)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2
return retVal0, retVal1, retVal2, retVal3
}
func Must_Efficientzerotensor(size []int64, optionsKind gotch.DType, optionsDevice gotch.Device)(retVal *Tensor) {
@ -1177,9 +1169,17 @@ func(ts *Tensor) Must_FftR2cOut(out *Tensor, dim []int64, normalization int64, o
return retVal
}
func Must_FlashAttentionBackward(gradOut *Tensor, query *Tensor, key *Tensor, value *Tensor, out *Tensor, logsumexp *Tensor, cumSeqQ *Tensor, cumSeqK *Tensor, maxQ int64, maxK int64, dropoutP float64, isCausal bool, philoxSeed int64, philoxOffset int64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
func(ts *Tensor) Must_FillMemEffDropoutMask_(dropoutP float64, seed int64, offset int64)() {
retVal0, retVal1, retVal2, err := _FlashAttentionBackward(gradOut, query, key, value, out, logsumexp, cumSeqQ, cumSeqK, maxQ, maxK, dropoutP, isCausal, philoxSeed, philoxOffset)
err := ts._FillMemEffDropoutMask_(dropoutP, seed, offset)
if err != nil { log.Fatal(err) }
return
}
func Must_FlashAttentionBackward(gradOut *Tensor, query *Tensor, key *Tensor, value *Tensor, out *Tensor, logsumexp *Tensor, cumSeqQ *Tensor, cumSeqK *Tensor, maxQ int64, maxK int64, dropoutP float64, isCausal bool, philoxSeed *Tensor, philoxOffset *Tensor, scale []float64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
retVal0, retVal1, retVal2, err := _FlashAttentionBackward(gradOut, query, key, value, out, logsumexp, cumSeqQ, cumSeqK, maxQ, maxK, dropoutP, isCausal, philoxSeed, philoxOffset, scale)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2
@ -1201,6 +1201,30 @@ func(ts *Tensor) Must_FoobarOut(out *Tensor, arg1 bool, arg2 bool, arg3 bool, de
return retVal
}
func(ts *Tensor) Must_FunctionalAssertAsync(assertMsg string, depToken *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._FunctionalAssertAsync(assertMsg, depToken, del)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_FunctionalSymConstrainRange(size *Scalar, min []int64, max []int64, depToken *Tensor)(retVal *Tensor) {
retVal, err := _FunctionalSymConstrainRange(size, min, max, depToken)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_FunctionalSymConstrainRangeForSize(size *Scalar, min []int64, max []int64, depToken *Tensor)(retVal *Tensor) {
retVal, err := _FunctionalSymConstrainRangeForSize(size, min, max, depToken)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_FusedDropout(p float64, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := ts._FusedDropout(p, del)
@ -1241,9 +1265,9 @@ func(ts *Tensor) Must_FusedMovingAvgObsFqHelperOut(out0 *Tensor, out1 *Tensor, o
return retVal0, retVal1
}
func Must_FusedSdpChoice(query *Tensor, key *Tensor, value *Tensor, attnMask *Tensor, dropoutP float64, isCausal bool)(retVal int64) {
func Must_FusedSdpChoice(query *Tensor, key *Tensor, value *Tensor, attnMask *Tensor, dropoutP float64, isCausal bool, scale []float64)(retVal int64) {
retVal, err := _FusedSdpChoice(query, key, value, attnMask, dropoutP, isCausal)
retVal, err := _FusedSdpChoice(query, key, value, attnMask, dropoutP, isCausal, scale)
if err != nil { log.Fatal(err) }
return retVal
@ -1385,6 +1409,22 @@ func(ts *Tensor) Must_IndicesCopyOut(out *Tensor, del bool)(retVal *Tensor) {
return retVal
}
func(ts *Tensor) Must_IntMm(mat2 *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._IntMm(mat2, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_IntMmOut(out *Tensor, mat2 *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._IntMmOut(out, mat2, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_IsAllTrue(del bool)(retVal *Tensor) {
retVal, err := ts._IsAllTrue(del)
@ -1561,6 +1601,14 @@ func(ts *Tensor) Must_LuWithInfo(pivot bool, checkErrors bool, del bool)(retVal0
return retVal0, retVal1, retVal2
}
func Must_MakeDepToken(optionsKind gotch.DType, optionsDevice gotch.Device)(retVal *Tensor) {
retVal, err := _MakeDepToken(optionsKind, optionsDevice)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_MakeDual(primal *Tensor, tangent *Tensor, level int64)(retVal *Tensor) {
retVal, err := _MakeDual(primal, tangent, level)
@ -1769,6 +1817,22 @@ func Must_NativeBatchNormLegitNoStatsOut(out *Tensor, saveMean *Tensor, saveInvs
return retVal0, retVal1, retVal2
}
func Must_NativeBatchNormLegitNoTraining(input *Tensor, weight *Tensor, bias *Tensor, runningMean *Tensor, runningVar *Tensor, momentum float64, eps float64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
retVal0, retVal1, retVal2, err := _NativeBatchNormLegitNoTraining(input, weight, bias, runningMean, runningVar, momentum, eps)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2
}
func Must_NativeBatchNormLegitNoTrainingOut(out0 *Tensor, out1 *Tensor, out2 *Tensor, input *Tensor, weight *Tensor, bias *Tensor, runningMean *Tensor, runningVar *Tensor, momentum float64, eps float64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
retVal0, retVal1, retVal2, err := _NativeBatchNormLegitNoTrainingOut(out0, out1, out2, input, weight, bias, runningMean, runningVar, momentum, eps)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2
}
func Must_NativeBatchNormLegitOut(out *Tensor, saveMean *Tensor, saveInvstd *Tensor, input *Tensor, weight *Tensor, bias *Tensor, runningMean *Tensor, runningVar *Tensor, training bool, momentum float64, eps float64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
retVal0, retVal1, retVal2, err := _NativeBatchNormLegitOut(out, saveMean, saveInvstd, input, weight, bias, runningMean, runningVar, training, momentum, eps)
@ -1777,22 +1841,6 @@ func Must_NativeBatchNormLegitOut(out *Tensor, saveMean *Tensor, saveInvstd *Ten
return retVal0, retVal1, retVal2
}
func Must_NativeDecoderOnlyMultiHeadAttention(query *Tensor, key *Tensor, value *Tensor, embedDim int64, numHead int64, qkvWeight *Tensor, qkvBias *Tensor, projWeight *Tensor, projBias *Tensor, mask *Tensor, incrKey *Tensor, incrValue *Tensor, needWeights bool, averageAttnWeights bool)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor, retVal3 *Tensor) {
retVal0, retVal1, retVal2, retVal3, err := _NativeDecoderOnlyMultiHeadAttention(query, key, value, embedDim, numHead, qkvWeight, qkvBias, projWeight, projBias, mask, incrKey, incrValue, needWeights, averageAttnWeights)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2, retVal3
}
func Must_NativeDecoderOnlyMultiHeadAttentionOut(out0 *Tensor, out1 *Tensor, out2 *Tensor, out3 *Tensor, query *Tensor, key *Tensor, value *Tensor, embedDim int64, numHead int64, qkvWeight *Tensor, qkvBias *Tensor, projWeight *Tensor, projBias *Tensor, mask *Tensor, incrKey *Tensor, incrValue *Tensor, needWeights bool, averageAttnWeights bool)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor, retVal3 *Tensor) {
retVal0, retVal1, retVal2, retVal3, err := _NativeDecoderOnlyMultiHeadAttentionOut(out0, out1, out2, out3, query, key, value, embedDim, numHead, qkvWeight, qkvBias, projWeight, projBias, mask, incrKey, incrValue, needWeights, averageAttnWeights)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2, retVal3
}
func Must_NativeMultiHeadAttention(query *Tensor, key *Tensor, value *Tensor, embedDim int64, numHead int64, qkvWeight *Tensor, qkvBias *Tensor, projWeight *Tensor, projBias *Tensor, mask *Tensor, needWeights bool, averageAttnWeights bool, maskType []int64)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := _NativeMultiHeadAttention(query, key, value, embedDim, numHead, qkvWeight, qkvBias, projWeight, projBias, mask, needWeights, averageAttnWeights, maskType)
@ -1881,7 +1929,7 @@ func(ts *Tensor) Must_NestedSumBackward(grad *Tensor, dim []int64, keepdim bool,
return retVal
}
func(ts *Tensor) Must_NestedViewFromBuffer(nestedSize *Tensor, nestedStrides *Tensor, offsets []int64, del bool)(retVal *Tensor) {
func(ts *Tensor) Must_NestedViewFromBuffer(nestedSize *Tensor, nestedStrides *Tensor, offsets *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._NestedViewFromBuffer(nestedSize, nestedStrides, offsets, del)
if err != nil { log.Fatal(err) }
@ -1889,7 +1937,7 @@ func(ts *Tensor) Must_NestedViewFromBuffer(nestedSize *Tensor, nestedStrides *Te
return retVal
}
func(ts *Tensor) Must_NestedViewFromBufferCopy(nestedSize *Tensor, nestedStrides *Tensor, offsets []int64, del bool)(retVal *Tensor) {
func(ts *Tensor) Must_NestedViewFromBufferCopy(nestedSize *Tensor, nestedStrides *Tensor, offsets *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._NestedViewFromBufferCopy(nestedSize, nestedStrides, offsets, del)
if err != nil { log.Fatal(err) }
@ -1897,7 +1945,7 @@ func(ts *Tensor) Must_NestedViewFromBufferCopy(nestedSize *Tensor, nestedStrides
return retVal
}
func(ts *Tensor) Must_NestedViewFromBufferCopyOut(out *Tensor, nestedSize *Tensor, nestedStrides *Tensor, offsets []int64, del bool)(retVal *Tensor) {
func(ts *Tensor) Must_NestedViewFromBufferCopyOut(out *Tensor, nestedSize *Tensor, nestedStrides *Tensor, offsets *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._NestedViewFromBufferCopyOut(out, nestedSize, nestedStrides, offsets, del)
if err != nil { log.Fatal(err) }
@ -2153,44 +2201,68 @@ func Must_SaturateWeightToFp16(weight *Tensor)(retVal *Tensor) {
return retVal
}
func Must_ScaledDotProductAttention(query *Tensor, key *Tensor, value *Tensor, attnMask *Tensor, dropoutP float64, needAttnWeights bool, isCausal bool)(retVal0 *Tensor, retVal1 *Tensor) {
func Must_ScaledDotProductAttentionMath(query *Tensor, key *Tensor, value *Tensor, attnMask *Tensor, dropoutP float64, isCausal bool, dropoutMask *Tensor, scale []float64)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := _ScaledDotProductAttention(query, key, value, attnMask, dropoutP, needAttnWeights, isCausal)
retVal0, retVal1, err := _ScaledDotProductAttentionMath(query, key, value, attnMask, dropoutP, isCausal, dropoutMask, scale)
if err != nil { log.Fatal(err) }
return retVal0, retVal1
}
func Must_ScaledDotProductAttentionMath(query *Tensor, key *Tensor, value *Tensor, attnMask *Tensor, dropoutP float64, isCausal bool, dropoutMask *Tensor)(retVal0 *Tensor, retVal1 *Tensor) {
func Must_ScaledDotProductEfficientAttention(query *Tensor, key *Tensor, value *Tensor, attnBias *Tensor, computeLogSumexp bool, dropoutP float64, isCausal bool, scale []float64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor, retVal3 *Tensor) {
retVal0, retVal1, err := _ScaledDotProductAttentionMath(query, key, value, attnMask, dropoutP, isCausal, dropoutMask)
retVal0, retVal1, retVal2, retVal3, err := _ScaledDotProductEfficientAttention(query, key, value, attnBias, computeLogSumexp, dropoutP, isCausal, scale)
if err != nil { log.Fatal(err) }
return retVal0, retVal1
return retVal0, retVal1, retVal2, retVal3
}
func Must_ScaledDotProductEfficientAttention(query *Tensor, key *Tensor, value *Tensor, computeLogSumexp bool, isCausal bool)(retVal0 *Tensor, retVal1 *Tensor) {
func Must_ScaledDotProductFlashAttentionBackward(gradOut *Tensor, query *Tensor, key *Tensor, value *Tensor, out *Tensor, logsumexp *Tensor, cumSeqQ *Tensor, cumSeqK *Tensor, maxQ int64, maxK int64, dropoutP float64, isCausal bool, philoxSeed *Tensor, philoxOffset *Tensor, scale []float64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
retVal0, retVal1, err := _ScaledDotProductEfficientAttention(query, key, value, computeLogSumexp, isCausal)
if err != nil { log.Fatal(err) }
return retVal0, retVal1
}
func Must_ScaledDotProductEfficientAttentionBackward(gradOut_ *Tensor, query *Tensor, key *Tensor, value *Tensor, out *Tensor, logsumexp *Tensor, isCausal bool, chunkGradOutputs bool)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
retVal0, retVal1, retVal2, err := _ScaledDotProductEfficientAttentionBackward(gradOut_, query, key, value, out, logsumexp, isCausal, chunkGradOutputs)
retVal0, retVal1, retVal2, err := _ScaledDotProductFlashAttentionBackward(gradOut, query, key, value, out, logsumexp, cumSeqQ, cumSeqK, maxQ, maxK, dropoutP, isCausal, philoxSeed, philoxOffset, scale)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2
}
func Must_ScaledDotProductFlashAttentionBackward(gradOut *Tensor, query *Tensor, key *Tensor, value *Tensor, out *Tensor, logsumexp *Tensor, cumSeqQ *Tensor, cumSeqK *Tensor, maxQ int64, maxK int64, dropoutP float64, isCausal bool, philoxSeed int64, philoxOffset int64)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
func(ts *Tensor) Must_ScaledMm(mat2 *Tensor, bias *Tensor, outDtype gotch.DType, scaleA *Tensor, scaleB *Tensor, scaleResult *Tensor, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, retVal2, err := _ScaledDotProductFlashAttentionBackward(gradOut, query, key, value, out, logsumexp, cumSeqQ, cumSeqK, maxQ, maxK, dropoutP, isCausal, philoxSeed, philoxOffset)
retVal0, retVal1, err := ts._ScaledMm(mat2, bias, outDtype, scaleA, scaleB, scaleResult, del)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2
return retVal0, retVal1
}
func(ts *Tensor) Must_ScaledMmOut(out *Tensor, outAmax *Tensor, mat2 *Tensor, bias *Tensor, outDtype gotch.DType, scaleA *Tensor, scaleB *Tensor, scaleResult *Tensor, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := ts._ScaledMmOut(out, outAmax, mat2, bias, outDtype, scaleA, scaleB, scaleResult, del)
if err != nil { log.Fatal(err) }
return retVal0, retVal1
}
func(ts *Tensor) Must_ScatterReduce(dim int64, index *Tensor, src *Tensor, reduce string, includeSelf bool, del bool)(retVal *Tensor) {
retVal, err := ts._ScatterReduce(dim, index, src, reduce, includeSelf, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ScatterReduce_(dim int64, index *Tensor, src *Tensor, reduce string, includeSelf bool)() {
err := ts._ScatterReduce_(dim, index, src, reduce, includeSelf)
if err != nil { log.Fatal(err) }
return
}
func(ts *Tensor) Must_ScatterReduceTwoOut(out *Tensor, dim int64, index *Tensor, src *Tensor, reduce string, includeSelf bool, del bool)(retVal *Tensor) {
retVal, err := ts._ScatterReduceTwoOut(out, dim, index, src, reduce, includeSelf, del)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_SegmentReduceBackward(grad *Tensor, output *Tensor, data *Tensor, reduce string, lengths *Tensor, offsets *Tensor, axis int64, initial *Scalar)(retVal *Tensor) {
@ -2353,9 +2425,9 @@ func Must_SparseCompressedTensorUnsafe(compressedIndices *Tensor, plainIndices *
return retVal
}
func Must_SparseCooTensorUnsafe(indices *Tensor, values *Tensor, size []int64, optionsKind gotch.DType, optionsDevice gotch.Device)(retVal *Tensor) {
func Must_SparseCooTensorUnsafe(indices *Tensor, values *Tensor, size []int64, optionsKind gotch.DType, optionsDevice gotch.Device, isCoalesced bool)(retVal *Tensor) {
retVal, err := _SparseCooTensorUnsafe(indices, values, size, optionsKind, optionsDevice)
retVal, err := _SparseCooTensorUnsafe(indices, values, size, optionsKind, optionsDevice, isCoalesced)
if err != nil { log.Fatal(err) }
return retVal
@ -2369,17 +2441,17 @@ func Must_SparseCooTensorWithDims(sparseDim int64, denseDim int64, size []int64,
return retVal
}
func Must_SparseCooTensorWithDimsAndTensors(sparseDim int64, denseDim int64, size []int64, indices *Tensor, values *Tensor, optionsKind gotch.DType, optionsDevice gotch.Device)(retVal *Tensor) {
func Must_SparseCooTensorWithDimsAndTensors(sparseDim int64, denseDim int64, size []int64, indices *Tensor, values *Tensor, optionsKind gotch.DType, optionsDevice gotch.Device, isCoalesced bool)(retVal *Tensor) {
retVal, err := _SparseCooTensorWithDimsAndTensors(sparseDim, denseDim, size, indices, values, optionsKind, optionsDevice)
retVal, err := _SparseCooTensorWithDimsAndTensors(sparseDim, denseDim, size, indices, values, optionsKind, optionsDevice, isCoalesced)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_SparseCooTensorWithDimsAndTensorsOut(out *Tensor, sparseDim int64, denseDim int64, size []int64, indices *Tensor, values *Tensor)(retVal *Tensor) {
func Must_SparseCooTensorWithDimsAndTensorsOut(out *Tensor, sparseDim int64, denseDim int64, size []int64, indices *Tensor, values *Tensor, isCoalesced bool)(retVal *Tensor) {
retVal, err := _SparseCooTensorWithDimsAndTensorsOut(out, sparseDim, denseDim, size, indices, values)
retVal, err := _SparseCooTensorWithDimsAndTensorsOut(out, sparseDim, denseDim, size, indices, values, isCoalesced)
if err != nil { log.Fatal(err) }
return retVal
@ -2481,6 +2553,22 @@ func(ts *Tensor) Must_SparseLogSoftmaxOut(out *Tensor, dim int64, halfToFloat bo
return retVal
}
func(ts *Tensor) Must_SparseMaskProjection(mask *Tensor, accumulateMatches bool, del bool)(retVal *Tensor) {
retVal, err := ts._SparseMaskProjection(mask, accumulateMatches, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_SparseMaskProjectionOut(out *Tensor, mask *Tensor, accumulateMatches bool, del bool)(retVal *Tensor) {
retVal, err := ts._SparseMaskProjectionOut(out, mask, accumulateMatches, del)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_SparseMm(sparse *Tensor, dense *Tensor)(retVal *Tensor) {
retVal, err := _SparseMm(sparse, dense)
@ -2505,6 +2593,14 @@ func(ts *Tensor) Must_SparseMmReduceImpl(other *Tensor, reduce string, del bool)
return retVal0, retVal1
}
func Must_SparseSemiStructuredLinear(input *Tensor, weight *Tensor, meta *Tensor, bias *Tensor, activation string)(retVal *Tensor) {
retVal, err := _SparseSemiStructuredLinear(input, weight, meta, bias, activation)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_SparseSoftmax(dim int64, halfToFloat bool, del bool)(retVal *Tensor) {
retVal, err := ts._SparseSoftmax(dim, halfToFloat, del)
@ -2753,6 +2849,22 @@ func(ts *Tensor) Must_TestCheckTensor(del bool)(retVal *Tensor) {
return retVal
}
func(ts *Tensor) Must_TestFunctorchFallback(other *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._TestFunctorchFallback(other, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_TestFunctorchFallbackOut(out *Tensor, other *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._TestFunctorchFallbackOut(out, other, del)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_TestOptionalFilledIntlist(values *Tensor, addends []int64)(retVal *Tensor) {
retVal, err := _TestOptionalFilledIntlist(values, addends)
@ -2849,17 +2961,121 @@ func(ts *Tensor) Must_ToCopyOut(out *Tensor, nonBlocking bool, del bool)(retVal
return retVal
}
func(ts *Tensor) Must_ToDense(dtype gotch.DType, del bool)(retVal *Tensor) {
func(ts *Tensor) Must_ToDense(dtype gotch.DType, maskedGrad bool, del bool)(retVal *Tensor) {
retVal, err := ts._ToDense(dtype, del)
retVal, err := ts._ToDense(dtype, maskedGrad, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToDenseOut(out *Tensor, dtype gotch.DType, del bool)(retVal *Tensor) {
func(ts *Tensor) Must_ToDenseOut(out *Tensor, dtype gotch.DType, maskedGrad bool, del bool)(retVal *Tensor) {
retVal, err := ts._ToDenseOut(out, dtype, del)
retVal, err := ts._ToDenseOut(out, dtype, maskedGrad, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparse(layout Layout, blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparse(layout, blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseBsc(blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseBsc(blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseBscOut(out *Tensor, blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseBscOut(out, blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseBsr(blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseBsr(blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseBsrOut(out *Tensor, blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseBsrOut(out, blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseCsc(denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseCsc(denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseCscOut(out *Tensor, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseCscOut(out, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseCsr(denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseCsr(denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseCsrOut(out *Tensor, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseCsrOut(out, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseOut(out *Tensor, layout Layout, blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseOut(out, layout, blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func Must_ToSparseSemiStructured(dense *Tensor)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := _ToSparseSemiStructured(dense)
if err != nil { log.Fatal(err) }
return retVal0, retVal1
}
func(ts *Tensor) Must_ToSparseSparseDim(sparseDim int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseSparseDim(sparseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_ToSparseSparseDimOut(out *Tensor, sparseDim int64, del bool)(retVal *Tensor) {
retVal, err := ts._ToSparseSparseDimOut(out, sparseDim, del)
if err != nil { log.Fatal(err) }
return retVal
@ -2881,22 +3097,6 @@ func Must_TransformBiasRescaleQkvOut(out0 *Tensor, out1 *Tensor, out2 *Tensor, q
return retVal0, retVal1, retVal2
}
func Must_TransformerDecoderOnlyLayerFwd(src *Tensor, embedDim int64, numHeads int64, qkvWeight *Tensor, qkvBias *Tensor, projWeight *Tensor, projBias *Tensor, useGelu bool, normFirst bool, eps float64, normWeight1 *Tensor, normBias1 *Tensor, normWeight2 *Tensor, normBias2 *Tensor, ffnWeight1 *Tensor, ffnBias1 *Tensor, ffnWeight2 *Tensor, ffnBias2 *Tensor, mask *Tensor, incrKey *Tensor, incrValue *Tensor)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
retVal0, retVal1, retVal2, err := _TransformerDecoderOnlyLayerFwd(src, embedDim, numHeads, qkvWeight, qkvBias, projWeight, projBias, useGelu, normFirst, eps, normWeight1, normBias1, normWeight2, normBias2, ffnWeight1, ffnBias1, ffnWeight2, ffnBias2, mask, incrKey, incrValue)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2
}
func Must_TransformerDecoderOnlyLayerFwdOut(out0 *Tensor, out1 *Tensor, out2 *Tensor, src *Tensor, embedDim int64, numHeads int64, qkvWeight *Tensor, qkvBias *Tensor, projWeight *Tensor, projBias *Tensor, useGelu bool, normFirst bool, eps float64, normWeight1 *Tensor, normBias1 *Tensor, normWeight2 *Tensor, normBias2 *Tensor, ffnWeight1 *Tensor, ffnBias1 *Tensor, ffnWeight2 *Tensor, ffnBias2 *Tensor, mask *Tensor, incrKey *Tensor, incrValue *Tensor)(retVal0 *Tensor, retVal1 *Tensor, retVal2 *Tensor) {
retVal0, retVal1, retVal2, err := _TransformerDecoderOnlyLayerFwdOut(out0, out1, out2, src, embedDim, numHeads, qkvWeight, qkvBias, projWeight, projBias, useGelu, normFirst, eps, normWeight1, normBias1, normWeight2, normBias2, ffnWeight1, ffnBias1, ffnWeight2, ffnBias2, mask, incrKey, incrValue)
if err != nil { log.Fatal(err) }
return retVal0, retVal1, retVal2
}
func Must_TransformerEncoderLayerFwd(src *Tensor, embedDim int64, numHeads int64, qkvWeight *Tensor, qkvBias *Tensor, projWeight *Tensor, projBias *Tensor, useGelu bool, normFirst bool, eps float64, normWeight1 *Tensor, normBias1 *Tensor, normWeight2 *Tensor, normBias2 *Tensor, ffnWeight1 *Tensor, ffnBias1 *Tensor, ffnWeight2 *Tensor, ffnBias2 *Tensor, mask *Tensor, maskType []int64)(retVal *Tensor) {
retVal, err := _TransformerEncoderLayerFwd(src, embedDim, numHeads, qkvWeight, qkvBias, projWeight, projBias, useGelu, normFirst, eps, normWeight1, normBias1, normWeight2, normBias2, ffnWeight1, ffnBias1, ffnWeight2, ffnBias2, mask, maskType)
@ -3001,6 +3201,22 @@ func Must_UnpackDual(dual *Tensor, level int64)(retVal0 *Tensor, retVal1 *Tensor
return retVal0, retVal1
}
func(ts *Tensor) Must_UnsafeIndex(indices []*Tensor, del bool)(retVal *Tensor) {
retVal, err := ts._UnsafeIndex(indices, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_UnsafeIndexPut(indices []*Tensor, values *Tensor, accumulate bool, del bool)(retVal *Tensor) {
retVal, err := ts._UnsafeIndexPut(indices, values, accumulate, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) Must_UnsafeView(size []int64, del bool)(retVal *Tensor) {
retVal, err := ts._UnsafeView(size, del)
@ -4529,17 +4745,17 @@ func MustBatchNorm(input *Tensor, weight *Tensor, bias *Tensor, runningMean *Ten
return retVal
}
func MustBatchNormBackwardElemt(gradOut *Tensor, input *Tensor, mean *Tensor, invstd *Tensor, weight *Tensor, meanDy *Tensor, meanDyXmu *Tensor, count *Tensor)(retVal *Tensor) {
func MustBatchNormBackwardElemt(gradOut *Tensor, input *Tensor, mean *Tensor, invstd *Tensor, weight *Tensor, sumDy *Tensor, sumDyXmu *Tensor, count *Tensor)(retVal *Tensor) {
retVal, err := BatchNormBackwardElemt(gradOut, input, mean, invstd, weight, meanDy, meanDyXmu, count)
retVal, err := BatchNormBackwardElemt(gradOut, input, mean, invstd, weight, sumDy, sumDyXmu, count)
if err != nil { log.Fatal(err) }
return retVal
}
func MustBatchNormBackwardElemtOut(out *Tensor, gradOut *Tensor, input *Tensor, mean *Tensor, invstd *Tensor, weight *Tensor, meanDy *Tensor, meanDyXmu *Tensor, count *Tensor)(retVal *Tensor) {
func MustBatchNormBackwardElemtOut(out *Tensor, gradOut *Tensor, input *Tensor, mean *Tensor, invstd *Tensor, weight *Tensor, sumDy *Tensor, sumDyXmu *Tensor, count *Tensor)(retVal *Tensor) {
retVal, err := BatchNormBackwardElemtOut(out, gradOut, input, mean, invstd, weight, meanDy, meanDyXmu, count)
retVal, err := BatchNormBackwardElemtOut(out, gradOut, input, mean, invstd, weight, sumDy, sumDyXmu, count)
if err != nil { log.Fatal(err) }
return retVal
@ -7145,6 +7361,22 @@ func MustEmptyOut(out *Tensor, size []int64)(retVal *Tensor) {
return retVal
}
func MustEmptyPermuted(size []int64, physicalLayout []int64, optionsKind gotch.DType, optionsDevice gotch.Device)(retVal *Tensor) {
retVal, err := EmptyPermuted(size, physicalLayout, optionsKind, optionsDevice)
if err != nil { log.Fatal(err) }
return retVal
}
func MustEmptyPermutedOut(out *Tensor, size []int64, physicalLayout []int64)(retVal *Tensor) {
retVal, err := EmptyPermutedOut(out, size, physicalLayout)
if err != nil { log.Fatal(err) }
return retVal
}
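// Usage sketch for the new MustEmptyPermuted wrapper (torch.empty_permuted): an uninitialized
// tensor with the given logical size whose physical memory layout follows physicalLayout, a
// permutation of the dimension indices. Not part of the generated file; assumes the standard
// gotch imports ("github.com/sugarme/gotch" and its "ts" tensor package) plus "fmt".
//
//	t := ts.MustEmptyPermuted([]int64{2, 3, 4}, []int64{2, 0, 1}, gotch.Float, gotch.CPU)
//	defer t.MustDrop()
//	fmt.Println(t.MustSize()) // [2 3 4]; memory is ordered as dims 2, 0, 1 (channels-last style)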
func MustEmptyQuantized(size []int64, qtensor *Tensor, optionsKind gotch.DType, optionsDevice gotch.Device)(retVal *Tensor) {
retVal, err := EmptyQuantized(size, qtensor, optionsKind, optionsDevice)
@ -12121,6 +12353,14 @@ func(ts *Tensor) MustMinOut(out *Tensor, other *Tensor, del bool)(retVal *Tensor
return retVal
}
func(ts *Tensor) MustMinUnaryOut(out *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts.MinUnaryOut(out, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustMinimum(other *Tensor, del bool)(retVal *Tensor) {
retVal, err := ts.Minimum(other, del)
@ -13385,6 +13625,22 @@ func(ts *Tensor) MustNonzeroOut(out *Tensor, del bool)(retVal *Tensor) {
return retVal
}
func(ts *Tensor) MustNonzeroStatic(size int64, fillValue int64, del bool)(retVal *Tensor) {
retVal, err := ts.NonzeroStatic(size, fillValue, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustNonzeroStaticOut(out *Tensor, size int64, fillValue int64, del bool)(retVal *Tensor) {
retVal, err := ts.NonzeroStaticOut(out, size, fillValue, del)
if err != nil { log.Fatal(err) }
return retVal
}
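// Usage sketch for the new MustNonzeroStatic wrapper (torch.nonzero_static): a nonzero()
// variant with a fixed number of output rows, padding unused rows with fillValue so the
// result shape stays static for tracing/export. Not part of the generated file; assumes the
// standard gotch imports.
//
//	x := ts.MustOfSlice([]float64{0.0, 1.5, 0.0, 2.0})
//	idx := x.MustNonzeroStatic(4, -1, true) // at most 4 index rows, padded with -1
//	idx.Print()                             // [[1], [3], [-1], [-1]]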
func(ts *Tensor) MustNorm(del bool)(retVal *Tensor) {
retVal, err := ts.Norm(del)
@ -14177,6 +14433,22 @@ func(ts *Tensor) MustQuantizedMaxPool2dOut(out *Tensor, kernelSize []int64, stri
return retVal
}
func(ts *Tensor) MustQuantizedMaxPool3d(kernelSize []int64, stride []int64, padding []int64, dilation []int64, ceilMode bool, del bool)(retVal *Tensor) {
retVal, err := ts.QuantizedMaxPool3d(kernelSize, stride, padding, dilation, ceilMode, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustQuantizedMaxPool3dOut(out *Tensor, kernelSize []int64, stride []int64, padding []int64, dilation []int64, ceilMode bool, del bool)(retVal *Tensor) {
retVal, err := ts.QuantizedMaxPool3dOut(out, kernelSize, stride, padding, dilation, ceilMode, del)
if err != nil { log.Fatal(err) }
return retVal
}
func MustQuantizedRnnReluCell(input *Tensor, hx *Tensor, wIh *Tensor, wHh *Tensor, bIh *Tensor, bHh *Tensor, packedIh *Tensor, packedHh *Tensor, colOffsetsIh *Tensor, colOffsetsHh *Tensor, scaleIh *Scalar, scaleHh *Scalar, zeroPointIh *Scalar, zeroPointHh *Scalar)(retVal *Tensor) {
retVal, err := QuantizedRnnReluCell(input, hx, wIh, wHh, bIh, bHh, packedIh, packedHh, colOffsetsIh, colOffsetsHh, scaleIh, scaleHh, zeroPointIh, zeroPointHh)
@ -15289,9 +15561,9 @@ func MustScalarTensorOut(out *Tensor, s *Scalar)(retVal *Tensor) {
return retVal
}
func MustScaledDotProductAttention(query *Tensor, key *Tensor, value *Tensor, attnMask *Tensor, dropoutP float64, isCausal bool)(retVal *Tensor) {
func MustScaledDotProductAttention(query *Tensor, key *Tensor, value *Tensor, attnMask *Tensor, dropoutP float64, isCausal bool, scale []float64)(retVal *Tensor) {
retVal, err := ScaledDotProductAttention(query, key, value, attnMask, dropoutP, isCausal)
retVal, err := ScaledDotProductAttention(query, key, value, attnMask, dropoutP, isCausal, scale)
if err != nil { log.Fatal(err) }
return retVal
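// ScaledDotProductAttention gained a trailing optional scale argument in libtorch 2.1,
// surfaced here in the usual gotch style as a []float64: nil or empty keeps the default
// 1/sqrt(headDim), a one-element slice overrides it. Usage sketch only, not part of the
// generated file; assumes the standard gotch imports, with q/k/v shaped
// [batch, heads, seqLen, headDim]:
//
//	q := ts.MustRandn([]int64{2, 4, 8, 16}, gotch.Float, gotch.CPU)
//	k := ts.MustRandn([]int64{2, 4, 8, 16}, gotch.Float, gotch.CPU)
//	v := ts.MustRandn([]int64{2, 4, 8, 16}, gotch.Float, gotch.CPU)
//	mask := ts.MustOnes([]int64{8, 8}, gotch.Bool, gotch.CPU) // boolean mask, true = may attend
//	out := ts.MustScaledDotProductAttention(q, k, v, mask, 0.0, false, nil)                   // default scaling
//	outScaled := ts.MustScaledDotProductAttention(q, k, v, mask, 0.0, false, []float64{0.125}) // explicit 1/8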
@ -16209,17 +16481,17 @@ func MustSparseCooTensor(size []int64, optionsKind gotch.DType, optionsDevice go
return retVal
}
func MustSparseCooTensorIndices(indices *Tensor, values *Tensor, optionsKind gotch.DType, optionsDevice gotch.Device)(retVal *Tensor) {
func MustSparseCooTensorIndices(indices *Tensor, values *Tensor, optionsKind gotch.DType, optionsDevice gotch.Device, isCoalesced bool)(retVal *Tensor) {
retVal, err := SparseCooTensorIndices(indices, values, optionsKind, optionsDevice)
retVal, err := SparseCooTensorIndices(indices, values, optionsKind, optionsDevice, isCoalesced)
if err != nil { log.Fatal(err) }
return retVal
}
func MustSparseCooTensorIndicesSize(indices *Tensor, values *Tensor, size []int64, optionsKind gotch.DType, optionsDevice gotch.Device)(retVal *Tensor) {
func MustSparseCooTensorIndicesSize(indices *Tensor, values *Tensor, size []int64, optionsKind gotch.DType, optionsDevice gotch.Device, isCoalesced bool)(retVal *Tensor) {
retVal, err := SparseCooTensorIndicesSize(indices, values, size, optionsKind, optionsDevice)
retVal, err := SparseCooTensorIndicesSize(indices, values, size, optionsKind, optionsDevice, isCoalesced)
if err != nil { log.Fatal(err) }
return retVal
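// The sparse COO constructors (SparseCooTensorIndices, SparseCooTensorIndicesSize and the
// internal _SparseCooTensor* variants above) gained a trailing isCoalesced flag in libtorch
// 2.1: an assertion that the supplied indices are already sorted and unique, so later ops can
// skip coalescing. Usage sketch only, not part of the generated file; assumes the standard
// gotch imports:
//
//	// 2x3 sparse matrix with entries at (0,2), (1,0) and (1,2), already coalesced.
//	indices := ts.MustOfSlice([]int64{0, 1, 1, 2, 0, 2}).MustReshape([]int64{2, 3}, true)
//	values := ts.MustOfSlice([]float64{3.0, 4.0, 5.0})
//	sp := ts.MustSparseCooTensorIndicesSize(indices, values, []int64{2, 3}, gotch.Double, gotch.CPU, true)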
@ -17897,7 +18169,7 @@ func(ts *Tensor) MustStd(unbiased bool, del bool)(retVal *Tensor) {
return retVal
}
func(ts *Tensor) MustStdCorrection(dim []int64, correction []int64, keepdim bool, del bool)(retVal *Tensor) {
func(ts *Tensor) MustStdCorrection(dim []int64, correction *Scalar, keepdim bool, del bool)(retVal *Tensor) {
retVal, err := ts.StdCorrection(dim, correction, keepdim, del)
if err != nil { log.Fatal(err) }
@ -17905,7 +18177,7 @@ func(ts *Tensor) MustStdCorrection(dim []int64, correction []int64, keepdim bool
return retVal
}
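// StdCorrection (and the VarCorrection / StdMeanCorrection / VarMeanCorrection families below)
// now take the correction as a *Scalar rather than a []int64, matching the libtorch 2.1
// signature where the correction may be fractional. Usage sketch only, not part of the
// generated file; assumes the standard gotch imports and the FloatScalar helper:
//
//	x := ts.MustRandn([]int64{3, 4}, gotch.Float, gotch.CPU)
//	std := x.MustStdCorrection([]int64{1}, ts.FloatScalar(1.0), true, false) // sample std over dim 1, keepdim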
func(ts *Tensor) MustStdCorrectionOut(out *Tensor, dim []int64, correction []int64, keepdim bool, del bool)(retVal *Tensor) {
func(ts *Tensor) MustStdCorrectionOut(out *Tensor, dim []int64, correction *Scalar, keepdim bool, del bool)(retVal *Tensor) {
retVal, err := ts.StdCorrectionOut(out, dim, correction, keepdim, del)
if err != nil { log.Fatal(err) }
@ -17929,7 +18201,7 @@ func(ts *Tensor) MustStdMean(unbiased bool, del bool)(retVal0 *Tensor, retVal1 *
return retVal0, retVal1
}
func(ts *Tensor) MustStdMeanCorrection(dim []int64, correction []int64, keepdim bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
func(ts *Tensor) MustStdMeanCorrection(dim []int64, correction *Scalar, keepdim bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := ts.StdMeanCorrection(dim, correction, keepdim, del)
if err != nil { log.Fatal(err) }
@ -17937,7 +18209,7 @@ func(ts *Tensor) MustStdMeanCorrection(dim []int64, correction []int64, keepdim
return retVal0, retVal1
}
func(ts *Tensor) MustStdMeanCorrectionOut(out0 *Tensor, out1 *Tensor, dim []int64, correction []int64, keepdim bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
func(ts *Tensor) MustStdMeanCorrectionOut(out0 *Tensor, out1 *Tensor, dim []int64, correction *Scalar, keepdim bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := ts.StdMeanCorrectionOut(out0, out1, dim, correction, keepdim, del)
if err != nil { log.Fatal(err) }
@ -18353,17 +18625,17 @@ func(ts *Tensor) MustTo(device gotch.Device, del bool)(retVal *Tensor) {
return retVal
}
func(ts *Tensor) MustToDense(dtype gotch.DType, del bool)(retVal *Tensor) {
func(ts *Tensor) MustToDense(dtype gotch.DType, maskedGrad bool, del bool)(retVal *Tensor) {
retVal, err := ts.ToDense(dtype, del)
retVal, err := ts.ToDense(dtype, maskedGrad, del)
if err != nil { log.Fatal(err) }
return retVal
}
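// ToDense (together with ToDenseBackward and the _ToDense* variants above) gained a maskedGrad
// flag in libtorch 2.1 that controls whether the backward pass masks the gradient to the
// specified sparse elements. Usage sketch only, not part of the generated file; assumes the
// standard gotch imports:
//
//	dense := ts.MustOfSlice([]float64{0.0, 0.0, 3.0, 4.0}).MustReshape([]int64{2, 2}, true)
//	sp := dense.MustToSparseSparseDim(2, true)       // dense -> sparse COO
//	back := sp.MustToDense(gotch.Double, true, true) // maskedGrad=true: gradient is masked to the stored elements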
func MustToDenseBackward(grad *Tensor, input *Tensor)(retVal *Tensor) {
func MustToDenseBackward(grad *Tensor, input *Tensor, maskedGrad bool)(retVal *Tensor) {
retVal, err := ToDenseBackward(grad, input)
retVal, err := ToDenseBackward(grad, input, maskedGrad)
if err != nil { log.Fatal(err) }
return retVal
@ -18457,14 +18729,6 @@ func(ts *Tensor) MustToSparseBsc(blocksize []int64, denseDim []int64, del bool)(
return retVal
}
func(ts *Tensor) MustToSparseBscOut(out *Tensor, blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseBscOut(out, blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustToSparseBsr(blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseBsr(blocksize, denseDim, del)
@ -18473,14 +18737,6 @@ func(ts *Tensor) MustToSparseBsr(blocksize []int64, denseDim []int64, del bool)(
return retVal
}
func(ts *Tensor) MustToSparseBsrOut(out *Tensor, blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseBsrOut(out, blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustToSparseCsc(denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseCsc(denseDim, del)
@ -18489,14 +18745,6 @@ func(ts *Tensor) MustToSparseCsc(denseDim []int64, del bool)(retVal *Tensor) {
return retVal
}
func(ts *Tensor) MustToSparseCscOut(out *Tensor, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseCscOut(out, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustToSparseCsr(denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseCsr(denseDim, del)
@ -18505,22 +18753,6 @@ func(ts *Tensor) MustToSparseCsr(denseDim []int64, del bool)(retVal *Tensor) {
return retVal
}
func(ts *Tensor) MustToSparseCsrOut(out *Tensor, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseCsrOut(out, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustToSparseOut(out *Tensor, layout Layout, blocksize []int64, denseDim []int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseOut(out, layout, blocksize, denseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustToSparseSparseDim(sparseDim int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseSparseDim(sparseDim, del)
@ -18529,14 +18761,6 @@ func(ts *Tensor) MustToSparseSparseDim(sparseDim int64, del bool)(retVal *Tensor
return retVal
}
func(ts *Tensor) MustToSparseSparseDimOut(out *Tensor, sparseDim int64, del bool)(retVal *Tensor) {
retVal, err := ts.ToSparseSparseDimOut(out, sparseDim, del)
if err != nil { log.Fatal(err) }
return retVal
}
func(ts *Tensor) MustTopk(k int64, dim int64, largest bool, sorted bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := ts.Topk(k, dim, largest, sorted, del)
@ -19305,7 +19529,7 @@ func(ts *Tensor) MustVar(unbiased bool, del bool)(retVal *Tensor) {
return retVal
}
func(ts *Tensor) MustVarCorrection(dim []int64, correction []int64, keepdim bool, del bool)(retVal *Tensor) {
func(ts *Tensor) MustVarCorrection(dim []int64, correction *Scalar, keepdim bool, del bool)(retVal *Tensor) {
retVal, err := ts.VarCorrection(dim, correction, keepdim, del)
if err != nil { log.Fatal(err) }
@ -19313,7 +19537,7 @@ func(ts *Tensor) MustVarCorrection(dim []int64, correction []int64, keepdim bool
return retVal
}
func(ts *Tensor) MustVarCorrectionOut(out *Tensor, dim []int64, correction []int64, keepdim bool, del bool)(retVal *Tensor) {
func(ts *Tensor) MustVarCorrectionOut(out *Tensor, dim []int64, correction *Scalar, keepdim bool, del bool)(retVal *Tensor) {
retVal, err := ts.VarCorrectionOut(out, dim, correction, keepdim, del)
if err != nil { log.Fatal(err) }
@ -19337,7 +19561,7 @@ func(ts *Tensor) MustVarMean(unbiased bool, del bool)(retVal0 *Tensor, retVal1 *
return retVal0, retVal1
}
func(ts *Tensor) MustVarMeanCorrection(dim []int64, correction []int64, keepdim bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
func(ts *Tensor) MustVarMeanCorrection(dim []int64, correction *Scalar, keepdim bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := ts.VarMeanCorrection(dim, correction, keepdim, del)
if err != nil { log.Fatal(err) }
@ -19345,7 +19569,7 @@ func(ts *Tensor) MustVarMeanCorrection(dim []int64, correction []int64, keepdim
return retVal0, retVal1
}
func(ts *Tensor) MustVarMeanCorrectionOut(out0 *Tensor, out1 *Tensor, dim []int64, correction []int64, keepdim bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
func(ts *Tensor) MustVarMeanCorrectionOut(out0 *Tensor, out1 *Tensor, dim []int64, correction *Scalar, keepdim bool, del bool)(retVal0 *Tensor, retVal1 *Tensor) {
retVal0, retVal1, err := ts.VarMeanCorrectionOut(out0, out1, dim, correction, keepdim, del)
if err != nil { log.Fatal(err) }

File diff suppressed because it is too large