From d2260b91c9b36cb1400cce0de9006543953e7f54 Mon Sep 17 00:00:00 2001
From: Hisham
Date: Mon, 12 Aug 2024 03:24:49 +0300
Subject: [PATCH 1/4] add_Jittor: Passing model tests, Parameter and Module Container tests

add_Jittor: Passing model tests, Parameter and Module Container tests.

Additional Functionality:
1- TrainOneStep integration.
2- Updated core/train_jt so that accuracy can be measured during training.
3- Updated the Jittor optimizer: replaced the gradient and apply_gradient functions with Jittor's default zero_grad() and step() functions, and added a new Set() function to set the trainable_weights parameters for the optimizer.
4- Updated the Jittor metrics for Accuracy, Recall, Precision and AUC.
5- Created the Jittor models tutorial file jiitor_models_tutorial.py (renamed from jiitor_tutorial.py).
6- Module container and parameter container: updated the core_jittor ModuleList and ParameterDict to enable OrderedDict initialization, which was previously not available because the parent class (the Jittor Module) initializes a plain dict by default and caused integration issues. This was handled by updating the functions and excluding the parent Module for them.

Areas to optimize in the integration: enabling Jittor to run large-model training, as it is currently limited in the complexity of NN layers it supports.
---
 examples/basic_tutorials/cifar10_cnn.py | 377 +++++++--------
 examples/basic_tutorials/cifar10_cnn_dist.py | 5 +-
 examples/basic_tutorials/cifar10_cnn_train.py | 7 +-
 .../gradient_clip_mixed_tensorflow.py | 5 +-
 ..._tutorial.py => jiitor_models_tutorial.py} | 370 +++++++++------
 examples/basic_tutorials/mnist_dataflow.py | 5 +-
 .../basic_tutorials/mnist_mlp_custom_train.py | 5 +-
 .../mnist_mlp_mix_programming.py | 300 ++++++------
 .../basic_tutorials/mnist_mlp_simple_train.py | 5 +-
 examples/basic_tutorials/mnist_sequential.py | 4 +-
 examples/basic_tutorials/module_container.py | 7 +-
 examples/basic_tutorials/quick_start.py | 31 +-
 .../basic_tutorials/tensorlayerx_graph.py | 32 +-
 .../tensorlayerx_model_load.py | 4 +-
 ...ts.out.tfevents.1722986988.LAPTOP-48J7839G | Bin 0 -> 40 bytes
 tensorlayerx/backend/ops/jittor_backend.py | 4 +-
 tensorlayerx/backend/ops/jittor_nn.py | 52 +-
 tensorlayerx/files/utils.py | 21 +-
 tensorlayerx/metrics/jittor_metric.py | 196 ++++----
 tensorlayerx/model/core.py | 16 +-
 tensorlayerx/model/utils.py | 35 +-
 tensorlayerx/nn/core/core_jittor.py | 95 ++--
 .../nn/layers/convolution/deformable_conv.py | 11 +-
 tensorlayerx/optimizers/jittor_optimizers.py | 443 +++++-------------
 24 files changed, 937 insertions(+), 1093 deletions(-)
 rename examples/basic_tutorials/{jiitor_tutorial.py => jiitor_models_tutorial.py} (67%)
 create mode 100644 runs/mlp/events.out.tfevents.1722986988.LAPTOP-48J7839G

diff --git a/examples/basic_tutorials/cifar10_cnn.py b/examples/basic_tutorials/cifar10_cnn.py
index 569a7bb..089a41b 100644
--- a/examples/basic_tutorials/cifar10_cnn.py
+++ b/examples/basic_tutorials/cifar10_cnn.py
@@ -1,31 +1,38 @@
 #! /usr/bin/python
 # -*- coding: utf-8 -*-
+
+
+################################ TensorLayerX and Torch can be mixed programming.
################################# + import os +# os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'mindspore' +os.environ['TL_BACKEND'] = 'torch' + + import time -import numpy as np -import tensorlayerx as tlx from tensorlayerx.dataflow import Dataset, DataLoader from tensorlayerx.vision.transforms import ( Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop ) -from tensorlayerx.nn import Conv2d, Linear, Flatten, Module -from tensorlayerx.optimizers import Adam -from tqdm import tqdm - -# Enable debug logging +from tensorlayerx.model import TrainOneStep +from tensorlayerx.nn import Module +import tensorlayerx as tlx +from tensorlayerx.nn import (Conv2d, Linear, Flatten, MaxPool2d, BatchNorm2d) +# enable debug logging tlx.logging.set_verbosity(tlx.logging.DEBUG) -os.environ['TL_BACKEND'] = 'jittor' - - - -# Download and prepare the CIFAR10 dataset with progress bar -print("Downloading CIFAR10 dataset...") +# ################## Download and prepare the CIFAR10 dataset ################## +# This is just some way of getting the CIFAR10 dataset from an online location +# and loading it into numpy arrays with shape [32,32,3] X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) -# Define the CIFAR10 dataset -class CIFAR10Dataset(Dataset): +# ################## CIFAR10 dataset ################## +# We define a Dataset class for Loading CIFAR10 images and labels. +class make_dataset(Dataset): + def __init__(self, data, label, transforms): self.data = data self.label = label @@ -35,113 +42,161 @@ def __getitem__(self, idx): x = self.data[idx].astype('uint8') y = self.label[idx].astype('int64') x = self.transforms(x) + return x, y def __len__(self): + return len(self.label) -# Define the CIFAR10 images preprocessing pipeline -train_transforms = Compose([ - RandomCrop(size=[24, 24]), - RandomFlipHorizontal(), - RandomBrightness(brightness_factor=(0.5, 1.5)), - RandomContrast(contrast_factor=(0.5, 1.5)), - StandardizePerImage() -]) +# We define the CIFAR10 iamges preprocessing pipeline. +train_transforms = Compose( # Combining multiple operations sequentially + [ + RandomCrop(size=[24, 24]), #random crop from images to shape [24, 24] + RandomFlipHorizontal(), # random invert each image horizontally by probability + RandomBrightness(brightness_factor=(0.5, 1.5)), # Within the range of values (0.5, 1.5), adjust brightness randomly + RandomContrast(contrast_factor=(0.5, 1.5)), # Within the range of values (0.5, 1.5), adjust contrast randomly + StandardizePerImage() #Normalize the values of each image to [-1, 1] + ] +) test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) -# Create DataLoaders for training and testing -print("Processing CIFAR10 dataset...") -train_dataset = CIFAR10Dataset(data=X_train, label=y_train, transforms=train_transforms) -test_dataset = CIFAR10Dataset(data=X_test, label=y_test, transforms=test_transforms) +# We use DataLoader to batch and shuffle data, and make data into iterators. 
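# A quick, optional sanity check for this input pipeline (illustrative only, not part
# of the training script): pull a single batch from the loader built below and inspect
# its shape. With RandomCrop(size=[24, 24]) and batch_size=128, each batch is expected
# to contain roughly [128, 24, 24, 3] images and [128] labels.
#
#   x_batch, y_batch = next(iter(train_dataset))
#   print(x_batch.shape, y_batch.shape)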
+train_dataset = make_dataset(data=X_train, label=y_train, transforms=train_transforms) +test_dataset = make_dataset(data=X_test, label=y_test, transforms=test_transforms) -train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) -test_dataloader = DataLoader(test_dataset, batch_size=128) +train_dataset = DataLoader(train_dataset, batch_size=128, shuffle=True) +test_dataset = DataLoader(test_dataset, batch_size=128) -# Define a simple CNN model -class SimpleCNN(Module): - def __init__(self): - super(SimpleCNN, self).__init__() - self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) - self.flatten = Flatten() - self.fc1 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=16 * 24 * 24) - self.fc2 = Linear(out_features=10, act=None, in_features=64) +# ################## CNN network ################## +class CNN(Module): + def __init__(self): + super(CNN, self).__init__() + # Parameter initialization method + W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) + W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) + b_init2 = tlx.nn.initializers.constant(value=0.1) + + # 2D Convolutional Neural Network, Set padding method "SAME", convolutional kernel size [5,5], stride [1,1], in channels, out channels + self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3) + # Add 2D BatchNormalize, using ReLU for output. + self.bn = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) + # Add 2D Max pooling layer. + self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') + + self.conv2 = Conv2d( + 64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, name='conv2', in_channels=64 + ) + self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') + # Flatten 2D data to 1D data + self.flatten = Flatten(name='flatten') + # Linear layer with 384 units, using ReLU for output. + self.linear1 = Linear(384, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) + self.linear2 = Linear(192, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384) + self.linear3 = Linear(10, act=None, W_init=W_init2, name='output', in_features=192) + + # We define the forward computation process. def forward(self, x): z = self.conv1(x) + z = self.bn(z) + z = self.maxpool1(z) + z = self.conv2(z) + z = self.maxpool2(z) z = self.flatten(z) - z = self.fc1(z) - z = self.fc2(z) + z = self.linear1(z) + z = self.linear2(z) + z = self.linear3(z) return z -# Instantiate the model -model = SimpleCNN() -# Define the optimizer -optimizer = Adam(model.trainable_weights, lr=0.001) +# get the network +net = CNN() -# Define the loss function -loss_fn = tlx.losses.softmax_cross_entropy_with_logits +# training settings +n_epoch = 500 +learning_rate = 0.0001 +print_freq = 5 +n_step_epoch = int(len(y_train) / 128) +n_step = n_epoch * n_step_epoch +shuffle_buffer_size = 128 +# Get training parameters +train_weights = net.trainable_weights +# Define the optimizer, use the Adam optimizer. +optimizer = tlx.optimizers.Adam(learning_rate) +# Define evaluation metrics. 
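# The Accuracy object created below follows an update()/result()/reset() cycle, which
# is how the custom training loop further down measures per-batch accuracy (and the
# cycle the updated Jittor metrics in this patch are described as implementing).
# A minimal, illustrative example with made-up values:
#
#   acc = tlx.metrics.Accuracy()
#   logits = tlx.convert_to_tensor([[0.1, 0.9], [0.8, 0.2]])   # two toy predictions
#   labels = tlx.convert_to_tensor([1, 0])
#   acc.update(logits, labels)   # accumulate statistics for this batch
#   print(acc.result())          # expected 1.0 for this toy batch
#   acc.reset()                  # clear the state before the next measurement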
+metrics = tlx.metrics.Accuracy() -# Training loop -n_epoch = 2 -for epoch in range(n_epoch): - start_time = time.time() - model.set_train() - train_loss, n_iter = 0, 0 - - with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{n_epoch}", unit="batch") as pbar: - for X_batch, y_batch in train_dataloader: - X_batch = tlx.convert_to_tensor(X_batch) - y_batch = tlx.convert_to_tensor(y_batch) - _logits = model(X_batch) - loss = loss_fn(_logits, y_batch) - - optimizer.zero_grad() - optimizer.step(loss) - - train_loss += loss.item() - n_iter += 1 - pbar.update(1) +# Define the loss calculation process +class WithLoss(Module): - print(f"Epoch {epoch + 1} of {n_epoch} took {time.time() - start_time:.2f}s") - print(f" train loss: {train_loss / n_iter:.4f}") + def __init__(self, net, loss_fn): + super(WithLoss, self).__init__() + self._net = net + self._loss_fn = loss_fn + def forward(self, data, label): + out = self._net(data) + loss = self._loss_fn(out, label) + return loss -################################ TensorLayerX and Jittor can be mixed programming. ################################# +net_with_loss = WithLoss(net, loss_fn=tlx.losses.softmax_cross_entropy_with_logits) +# Initialize one-step training +net_with_train = TrainOneStep(net_with_loss, optimizer, train_weights) +# Custom training loops +for epoch in range(n_epoch): + start_time = time.time() + # Set the network to training state + net.set_train() + train_loss, train_acc, n_iter = 0, 0, 0 + # Get training data and labels + for X_batch, y_batch in train_dataset: + # Calculate the loss value, and automatically complete the gradient update + _loss_ce = net_with_train(X_batch, y_batch) + train_loss += _loss_ce + + n_iter += 1 + _logits = net(X_batch) + # Calculate accuracy + metrics.update(_logits, y_batch) + train_acc += metrics.result() + metrics.reset() + print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) + print(" train loss: {}".format(train_loss / n_iter)) + print(" train acc: {}".format(train_acc / n_iter)) -# import os -# # os.environ['TL_BACKEND'] = 'paddle' -# # os.environ['TL_BACKEND'] = 'tensorflow' -# # os.environ['TL_BACKEND'] = 'mindspore' -# os.environ['TL_BACKEND'] = 'torch' +################################ TensorLayerX and Jittor can be mixed programming. 
################################# +# import os # import time +# import numpy as np +# import tensorlayerx as tlx # from tensorlayerx.dataflow import Dataset, DataLoader # from tensorlayerx.vision.transforms import ( # Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop # ) -# from tensorlayerx.model import TrainOneStep -# from tensorlayerx.nn import Module -# import tensorlayerx as tlx -# from tensorlayerx.nn import (Conv2d, Linear, Flatten, MaxPool2d, BatchNorm2d) -# # enable debug logging +# from tensorlayerx.nn import Conv2d, Linear, Flatten, Module +# from tensorlayerx.optimizers import Adam +# from tqdm import tqdm + +# # Enable debug logging # tlx.logging.set_verbosity(tlx.logging.DEBUG) -# # ################## Download and prepare the CIFAR10 dataset ################## -# # This is just some way of getting the CIFAR10 dataset from an online location -# # and loading it into numpy arrays with shape [32,32,3] -# X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) +# os.environ['TL_BACKEND'] = 'jittor' + -# # ################## CIFAR10 dataset ################## -# # We define a Dataset class for Loading CIFAR10 images and labels. -# class make_dataset(Dataset): +# # Download and prepare the CIFAR10 dataset with progress bar +# print("Downloading CIFAR10 dataset...") +# X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) + +# # Define the CIFAR10 dataset +# class CIFAR10Dataset(Dataset): # def __init__(self, data, label, transforms): # self.data = data # self.label = label @@ -151,131 +206,79 @@ def forward(self, x): # x = self.data[idx].astype('uint8') # y = self.label[idx].astype('int64') # x = self.transforms(x) - # return x, y # def __len__(self): - # return len(self.label) -# # We define the CIFAR10 iamges preprocessing pipeline. -# train_transforms = Compose( # Combining multiple operations sequentially -# [ -# RandomCrop(size=[24, 24]), #random crop from images to shape [24, 24] -# RandomFlipHorizontal(), # random invert each image horizontally by probability -# RandomBrightness(brightness_factor=(0.5, 1.5)), # Within the range of values (0.5, 1.5), adjust brightness randomly -# RandomContrast(contrast_factor=(0.5, 1.5)), # Within the range of values (0.5, 1.5), adjust contrast randomly -# StandardizePerImage() #Normalize the values of each image to [-1, 1] -# ] -# ) +# # Define the CIFAR10 images preprocessing pipeline +# train_transforms = Compose([ +# RandomCrop(size=[24, 24]), +# RandomFlipHorizontal(), +# RandomBrightness(brightness_factor=(0.5, 1.5)), +# RandomContrast(contrast_factor=(0.5, 1.5)), +# StandardizePerImage() +# ]) # test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) -# # We use DataLoader to batch and shuffle data, and make data into iterators. 
-# train_dataset = make_dataset(data=X_train, label=y_train, transforms=train_transforms) -# test_dataset = make_dataset(data=X_test, label=y_test, transforms=test_transforms) - -# train_dataset = DataLoader(train_dataset, batch_size=128, shuffle=True) -# test_dataset = DataLoader(test_dataset, batch_size=128) +# # Create DataLoaders for training and testing +# print("Processing CIFAR10 dataset...") +# train_dataset = CIFAR10Dataset(data=X_train, label=y_train, transforms=train_transforms) +# test_dataset = CIFAR10Dataset(data=X_test, label=y_test, transforms=test_transforms) -# # ################## CNN network ################## -# class CNN(Module): +# train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) +# test_dataloader = DataLoader(test_dataset, batch_size=128) +# # Define a simple CNN model +# class SimpleCNN(Module): # def __init__(self): -# super(CNN, self).__init__() -# # Parameter initialization method -# W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) -# W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) -# b_init2 = tlx.nn.initializers.constant(value=0.1) +# super(SimpleCNN, self).__init__() +# self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) +# self.flatten = Flatten() +# self.fc1 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=16 * 24 * 24) +# self.fc2 = Linear(out_features=10, act=None, in_features=64) -# # 2D Convolutional Neural Network, Set padding method "SAME", convolutional kernel size [5,5], stride [1,1], in channels, out channels -# self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3) -# # Add 2D BatchNormalize, using ReLU for output. -# self.bn = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) -# # Add 2D Max pooling layer. -# self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') - -# self.conv2 = Conv2d( -# 64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, name='conv2', in_channels=64 -# ) -# self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') -# # Flatten 2D data to 1D data -# self.flatten = Flatten(name='flatten') -# # Linear layer with 384 units, using ReLU for output. -# self.linear1 = Linear(384, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) -# self.linear2 = Linear(192, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384) -# self.linear3 = Linear(10, act=None, W_init=W_init2, name='output', in_features=192) - -# # We define the forward computation process. # def forward(self, x): # z = self.conv1(x) -# z = self.bn(z) -# z = self.maxpool1(z) -# z = self.conv2(z) -# z = self.maxpool2(z) # z = self.flatten(z) -# z = self.linear1(z) -# z = self.linear2(z) -# z = self.linear3(z) +# z = self.fc1(z) +# z = self.fc2(z) # return z +# # Instantiate the model +# model = SimpleCNN() -# # get the network -# net = CNN() - -# # training settings -# n_epoch = 500 -# learning_rate = 0.0001 -# print_freq = 5 -# n_step_epoch = int(len(y_train) / 128) -# n_step = n_epoch * n_step_epoch -# shuffle_buffer_size = 128 -# # Get training parameters -# train_weights = net.trainable_weights -# # Define the optimizer, use the Adam optimizer. -# optimizer = tlx.optimizers.Adam(learning_rate) -# # Define evaluation metrics. 
-# metrics = tlx.metrics.Accuracy() - -# # Define the loss calculation process -# class WithLoss(Module): - -# def __init__(self, net, loss_fn): -# super(WithLoss, self).__init__() -# self._net = net -# self._loss_fn = loss_fn - -# def forward(self, data, label): -# out = self._net(data) -# loss = self._loss_fn(out, label) -# return loss +# # Define the optimizer +# optimizer = Adam(model.trainable_weights, lr=0.001) +# # Define the loss function +# loss_fn = tlx.losses.softmax_cross_entropy_with_logits -# net_with_loss = WithLoss(net, loss_fn=tlx.losses.softmax_cross_entropy_with_logits) -# # Initialize one-step training -# net_with_train = TrainOneStep(net_with_loss, optimizer, train_weights) - -# # Custom training loops +# # Training loop +# n_epoch = 2 # for epoch in range(n_epoch): # start_time = time.time() -# # Set the network to training state -# net.set_train() -# train_loss, train_acc, n_iter = 0, 0, 0 -# # Get training data and labels -# for X_batch, y_batch in train_dataset: -# # Calculate the loss value, and automatically complete the gradient update -# _loss_ce = net_with_train(X_batch, y_batch) -# train_loss += _loss_ce +# model.set_train() +# train_loss, n_iter = 0, 0 + +# with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{n_epoch}", unit="batch") as pbar: +# for X_batch, y_batch in train_dataloader: +# X_batch = tlx.convert_to_tensor(X_batch) +# y_batch = tlx.convert_to_tensor(y_batch) +# _logits = model(X_batch) +# loss = loss_fn(_logits, y_batch) + +# optimizer.zero_grad() +# optimizer.step(loss) + +# train_loss += loss.item() +# n_iter += 1 +# pbar.update(1) + +# print(f"Epoch {epoch + 1} of {n_epoch} took {time.time() - start_time:.2f}s") +# print(f" train loss: {train_loss / n_iter:.4f}") -# n_iter += 1 -# _logits = net(X_batch) -# # Calculate accuracy -# metrics.update(_logits, y_batch) -# train_acc += metrics.result() -# metrics.reset() -# print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) -# print(" train loss: {}".format(train_loss / n_iter)) -# print(" train acc: {}".format(train_acc / n_iter)) ################################ TensorLayerX and TensorFlow can be mixed programming. 
################################# diff --git a/examples/basic_tutorials/cifar10_cnn_dist.py b/examples/basic_tutorials/cifar10_cnn_dist.py index c4713e0..c72c704 100644 --- a/examples/basic_tutorials/cifar10_cnn_dist.py +++ b/examples/basic_tutorials/cifar10_cnn_dist.py @@ -2,10 +2,11 @@ # -*- coding: utf-8 -*- import os -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -# os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'torch' import paddle from paddle.distributed import fleet diff --git a/examples/basic_tutorials/cifar10_cnn_train.py b/examples/basic_tutorials/cifar10_cnn_train.py index 2661ce5..1a549cc 100644 --- a/examples/basic_tutorials/cifar10_cnn_train.py +++ b/examples/basic_tutorials/cifar10_cnn_train.py @@ -5,12 +5,11 @@ import os # os.environ['TL_BACKEND'] = 'paddle' - -os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' +# os.environ['TL_BACKEND'] = 'jittor' -# os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'torch' @@ -76,7 +75,7 @@ def forward(self, x): # 定义损失函数、优化器等 loss_fn=tlx.losses.softmax_cross_entropy_with_logits -optimizer = tlx.optimizers.Adam(net.trainable_weights, lr=learning_rate) +optimizer = tlx.optimizers.Adam(learning_rate) metrics = tlx.metrics.Accuracy() diff --git a/examples/basic_tutorials/gradient_clip_mixed_tensorflow.py b/examples/basic_tutorials/gradient_clip_mixed_tensorflow.py index 4432e81..baf54a8 100644 --- a/examples/basic_tutorials/gradient_clip_mixed_tensorflow.py +++ b/examples/basic_tutorials/gradient_clip_mixed_tensorflow.py @@ -2,9 +2,10 @@ # -*- coding: utf-8 -*- # The tensorlayerx and tensorflow operators can be mixed import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'paddle' -# os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'torch' +# os.environ['TL_BACKEND'] = 'jittor' import time diff --git a/examples/basic_tutorials/jiitor_tutorial.py b/examples/basic_tutorials/jiitor_models_tutorial.py similarity index 67% rename from examples/basic_tutorials/jiitor_tutorial.py rename to examples/basic_tutorials/jiitor_models_tutorial.py index 654835f..afb4495 100644 --- a/examples/basic_tutorials/jiitor_tutorial.py +++ b/examples/basic_tutorials/jiitor_models_tutorial.py @@ -1,142 +1,123 @@ +# """" +# Here we have a Tutorial of Jittor backend being used with several different models, which includes: +# """ # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# +++++++++++++++++++++++++++++++++++++ Jittor CNN ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - -# import os -# import time -# import tensorlayerx as tlx -# from tensorlayerx.dataflow import Dataset, DataLoader -# from tensorlayerx.vision.transforms import ( -# Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop -# ) -# from tensorlayerx.nn import Conv2d, Linear, Flatten, Module, MaxPool2d, BatchNorm2d -# from tensorlayerx.optimizers import Adam -# from tqdm import tqdm - -# # Enable debug logging -# tlx.logging.set_verbosity(tlx.logging.DEBUG) - -# os.environ['TL_BACKEND'] = 'jittor' - -# # Download and prepare the CIFAR10 dataset -# print("Downloading CIFAR10 dataset...") -# X_train, y_train, X_test, y_test = 
tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) - -# # Define the CIFAR10 dataset -# class CIFAR10Dataset(Dataset): -# def __init__(self, data, label, transforms): -# self.data = data -# self.label = label -# self.transforms = transforms - -# def __getitem__(self, idx): -# x = self.data[idx].astype('uint8') -# y = self.label[idx].astype('int64') -# x = self.transforms(x) -# return x, y - -# def __len__(self): -# return len(self.label) - -# # Define the CIFAR10 images preprocessing pipeline -# train_transforms = Compose([ -# RandomCrop(size=[24, 24]), -# RandomFlipHorizontal(), -# RandomBrightness(brightness_factor=(0.5, 1.5)), -# RandomContrast(contrast_factor=(0.5, 1.5)), -# StandardizePerImage() -# ]) - -# test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) - -# # Create DataLoaders for training and testing -# print("Processing CIFAR10 dataset...") -# train_dataset = CIFAR10Dataset(data=X_train, label=y_train, transforms=train_transforms) -# test_dataset = CIFAR10Dataset(data=X_test, label=y_test, transforms=test_transforms) - -# train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) -# test_dataloader = DataLoader(test_dataset, batch_size=128) - - -# class SimpleCNN(Module): -# def __init__(self): -# super(SimpleCNN, self).__init__() -# self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) -# self.conv2 = Conv2d(32, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=16) -# self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME') -# self.conv3 = Conv2d(64, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=32) -# self.bn1 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) -# self.conv4 = Conv2d(128, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=64) -# self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME') -# self.flatten = Flatten() -# self.fc1 = Linear(out_features=128, act=tlx.nn.ReLU, in_features=128 * 6 * 6) -# self.fc2 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=128) -# self.fc3 = Linear(out_features=10, act=None, in_features=64) - -# def forward(self, x): -# z = self.conv1(x) -# z = self.conv2(z) -# z = self.maxpool1(z) -# z = self.conv3(z) -# z = self.bn1(z) -# z = self.conv4(z) -# z = self.maxpool2(z) -# z = self.flatten(z) -# z = self.fc1(z) -# z = self.fc2(z) -# z = self.fc3(z) -# return z - - - - -# # Instantiate the model -# model = SimpleCNN() - -# # Define the optimizer -# optimizer = Adam(model.trainable_weights, lr=0.001) - -# # Define the loss function -# loss_fn = tlx.losses.softmax_cross_entropy_with_logits - -# # Training loop -# n_epoch = 2 -# for epoch in range(n_epoch): -# start_time = time.time() -# model.set_train() -# train_loss, n_iter = 0, 0 - -# with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{n_epoch}", unit="batch") as pbar: -# for X_batch, y_batch in train_dataloader: - -# X_batch = tlx.convert_to_tensor(X_batch) -# y_batch = tlx.convert_to_tensor(y_batch) -# _logits = model(X_batch) -# loss = loss_fn(_logits, y_batch) -# optimizer.zero_grad() -# optimizer.step(loss) - -# train_loss += loss.item() # Using .item() to get the scalar value -# n_iter += 1 -# pbar.update(1) - -# print(f"Epoch {epoch + 1} of {n_epoch} took {time.time() - start_time:.2f}s") -# print(f" train loss: {train_loss / n_iter:.4f}") - - +# +++++++++++++++++++++++++++++++++++++ Jittor CIFAR CNN ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +import os +import time +import tensorlayerx as tlx +from 
tensorlayerx.dataflow import Dataset, DataLoader +from tensorlayerx.vision.transforms import ( + Compose, Resize, RandomFlipHorizontal, RandomContrast, RandomBrightness, StandardizePerImage, RandomCrop +) +from tensorlayerx.nn import Conv2d, Linear, Flatten, Module, MaxPool2d, BatchNorm2d +from tensorlayerx.optimizers import Adam +from tqdm import tqdm + +# Enable debug logging +tlx.logging.set_verbosity(tlx.logging.DEBUG) + +os.environ['TL_BACKEND'] = 'jittor' + +# Download and prepare the CIFAR10 dataset +print("Downloading CIFAR10 dataset...") +X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) + +# Define the CIFAR10 dataset +class CIFAR10Dataset(Dataset): + def __init__(self, data, label, transforms): + self.data = data + self.label = label + self.transforms = transforms + + def __getitem__(self, idx): + x = self.data[idx].astype('uint8') + y = self.label[idx].astype('int64') + x = self.transforms(x) + return x, y + + def __len__(self): + return len(self.label) + +# Define the CIFAR10 images preprocessing pipeline +train_transforms = Compose([ + RandomCrop(size=[24, 24]), + RandomFlipHorizontal(), + RandomBrightness(brightness_factor=(0.5, 1.5)), + RandomContrast(contrast_factor=(0.5, 1.5)), + StandardizePerImage() +]) + +test_transforms = Compose([Resize(size=(24, 24)), StandardizePerImage()]) + +# Create DataLoaders for training and testing +print("Processing CIFAR10 dataset...") +train_dataset = CIFAR10Dataset(data=X_train, label=y_train, transforms=train_transforms) +test_dataset = CIFAR10Dataset(data=X_test, label=y_test, transforms=test_transforms) + +train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True) +test_dataloader = DataLoader(test_dataset, batch_size=128) + + +class SimpleCNN(Module): + def __init__(self): + super(SimpleCNN, self).__init__() + self.conv1 = Conv2d(16, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=3) + self.conv2 = Conv2d(32, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=16) + self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME') + self.conv3 = Conv2d(64, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=32) + self.bn1 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) + self.conv4 = Conv2d(128, (3, 3), (1, 1), padding='SAME', act=tlx.nn.ReLU, in_channels=64) + self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME') + self.flatten = Flatten() + self.fc1 = Linear(out_features=128, act=tlx.nn.ReLU, in_features=128 * 6 * 6) + self.fc2 = Linear(out_features=64, act=tlx.nn.ReLU, in_features=128) + self.fc3 = Linear(out_features=10, act=None, in_features=64) + + def forward(self, x): + z = self.conv1(x) + z = self.conv2(z) + z = self.maxpool1(z) + z = self.conv3(z) + z = self.bn1(z) + z = self.conv4(z) + z = self.maxpool2(z) + z = self.flatten(z) + z = self.fc1(z) + z = self.fc2(z) + z = self.fc3(z) + return z + + +# Instantiate the model +model = SimpleCNN() + +# Define the optimizer +optimizer = Adam(lr=0.001) +# optimizer = Adam(lr=0.001, params=model.trainable_weights ) + +# Define the loss function +loss_fn = tlx.losses.softmax_cross_entropy_with_logits + +# Use the built-in training method +metric = tlx.metrics.Accuracy() +tlx_model = tlx.model.Model(network=model, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) +tlx_model.train(n_epoch=2, train_dataset=train_dataloader, print_freq=1, print_train_batch=True) # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# 
+++++++++++++++++++++++++++++++++++++ Jittor LSTM ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor IMDB LSTM ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + # import os # import sys # import tensorlayerx as tlx -# from tensorlayerx.nn import Module, Linear, LSTM, Embedding +# from tensorlayerx.nn import Module, LSTM, Embedding, Linear # from tensorlayerx.dataflow import Dataset -# from keras.datasets import imdb -# from keras.preprocessing import sequence # import numpy as np + # os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # os.environ['TL_BACKEND'] = 'jittor' # sys.setrecursionlimit(10000) # Increase recursion limit @@ -144,15 +125,9 @@ # # Set parameters # max_features = 20000 # maxlen = 200 - # prev_h = np.random.random([1, 200, 64]).astype(np.float32) # prev_h = tlx.convert_to_tensor(prev_h) - -# # Load and preprocess the IMDB dataset -# (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) -# X_train = sequence.pad_sequences(X_train, maxlen=maxlen) -# X_test = sequence.pad_sequences(X_test, maxlen=maxlen) - +# X_train, y_train, X_test, y_test = tlx.files.load_imdb_dataset('data', nb_words=20000, test_split=0.2) # vocab_size = max_features # seq_Len = 200 @@ -209,7 +184,7 @@ # print(net) # # Define optimizer, metric, and loss function using TLX functions -# optimizer = tlx.optimizers.Adam(lr=1e-3, params=net.trainable_weights) +# optimizer = tlx.optimizers.Adam(lr=1e-3) # metric = tlx.metrics.Accuracy() # loss_fn = tlx.losses.softmax_cross_entropy_with_logits @@ -217,11 +192,8 @@ # model = tlx.model.Model(network=net, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) # model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=True) - - - # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# +++++++++++++++++++++++++++++++++++++ Jittor MLP ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor MNIST MLP ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # # ! /usr/bin/python # # -*- coding: utf-8 -*- @@ -296,7 +268,7 @@ # # Get training parameters # train_weights = MLP.trainable_weights # # Define the optimizer, use the Momentum optimizer, and set the learning rate to 0.05, momentum to 0.9 -# optimizer = tlx.optimizers.Momentum(lr=0.05, momentum= 0.9, params = train_weights ) +# optimizer = tlx.optimizers.Momentum(lr=0.05, momentum= 0.9 ) # # Define evaluation metrics. # metric = tlx.metrics.Accuracy() # # Define loss function, this operator implements the cross entropy loss function with softmax. This function @@ -315,11 +287,12 @@ # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +++++++++++++++++++++++++++++++++++++ Jittor MNIST Sequential ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -#! /usr/bin/python +# ! 
/usr/bin/python # -*- coding: utf-8 -*- # import os # os.environ['TL_BACKEND'] = 'jittor' +# # os.environ['TL_BACKEND'] = 'torch' # from tensorlayerx.nn import Sequential # from tensorlayerx.nn import Linear @@ -358,7 +331,7 @@ # shuffle_buffer_size = 128 # train_weights = MLP.trainable_weights -# optimizer = tlx.optimizers.Momentum(lr=0.05,momentum= 0.9, params=train_weights) +# optimizer = tlx.optimizers.Momentum(lr=0.05,momentum= 0.9) # train_dataset = mnistdataset(data=X_train, label=y_train) # train_loader = tlx.dataflow.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # metric = tlx.metrics.Accuracy() @@ -367,14 +340,14 @@ # ) # model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=False) # model.save_weights('./model.npz', format='npz_dict') -# model.load_weights('./model.npz', format='npz_dict') +# model.load_weights('./model.npz', format='npz_dict', skip=True) # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ # +++++++++++++++++++++++++++++++++++++ Jittor MNIST GAN ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -# #! /usr/bin/python -# # -*- coding: utf-8 -*- +#! /usr/bin/python +# -*- coding: utf-8 -*- # import os # os.environ['TL_BACKEND'] = 'jittor' @@ -487,22 +460,18 @@ # # loss_fn = tlx.losses.sigmoid_cross_entropy # # optimizer = tlx.optimizers.Momentum(learning_rate=5e-4, momentum=0.5) # loss_fn = tlx.losses.mean_squared_error - +# # Define the optimizers, use the Adam optimizer. +# optimizer_g = tlx.optimizers.Adam(lr=3e-4, beta_1=0.5, beta_2=0.999) +# optimizer_d = tlx.optimizers.Adam(lr=3e-4) # # Get training parameters # g_weights = G.trainable_weights # d_weights = D.trainable_weights - # net_with_loss_G = WithLossG(G, D, loss_fn) # net_with_loss_D = WithLossD(G, D, loss_fn) - -# # Define the optimizers, use the Adam optimizer. 
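# The removed lines below show the old pattern, in which the trainable weights were
# passed to the optimizer at construction time (params=...). After this patch the
# Jittor optimizers are created with hyper-parameters only and the weights are bound
# afterwards, either implicitly by TrainOneStep or, per the commit message, through
# the new Set() function. A rough, illustrative sketch of the assumed usage (method
# name taken from the commit message; exact casing and signature are assumptions):
#
#   optimizer_g = tlx.optimizers.Adam(lr=3e-4, beta_1=0.5, beta_2=0.999)
#   optimizer_g.set(G.trainable_weights)          # bind the generator weights to the optimizer
#   g_loss = net_with_loss_G(z_batch, x_batch)    # z_batch / x_batch are placeholder names
#   optimizer_g.zero_grad()
#   optimizer_g.step(g_loss)                      # Jittor's step() consumes the loss directly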
-# optimizer_g = tlx.optimizers.Adam(lr=3e-4, beta_1=0.5, beta_2=0.999, params= g_weights) -# optimizer_d = tlx.optimizers.Adam(lr=3e-4, params= d_weights) - # # Initialize one-step training # train_one_step_g = TrainOneStep(net_with_loss_G, optimizer_g, g_weights) # train_one_step_d = TrainOneStep(net_with_loss_D, optimizer_d, d_weights) -# n_epoch = 50 +# n_epoch = 2 # def plot_fake_image(fake_image, num): @@ -535,3 +504,96 @@ # print(" g loss: {}".format(g_loss / n_iter)) # fake_image = G(tlx.convert_to_tensor(np.random.random(size=(36, 100)), dtype=tlx.float32)) # plot_fake_image(fake_image, 36) + + + +# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ +# +++++++++++++++++++++++++++++++++++++ Jittor IMDB RNN +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + +# import os +# import sys +# import tensorlayerx as tlx +# from tensorlayerx.nn import Module, RNN, Embedding, Linear +# from tensorlayerx.dataflow import Dataset +# import numpy as np + + +# os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' +# os.environ['TL_BACKEND'] = 'jittor' +# sys.setrecursionlimit(10000) # Increase recursion limit + +# # Set parameters +# max_features = 20000 +# maxlen = 200 +# prev_h = np.random.random([1, 200, 64]).astype(np.float32) +# prev_h = tlx.convert_to_tensor(prev_h) +# X_train, y_train, X_test, y_test = tlx.files.load_imdb_dataset('data', nb_words=20000, test_split=0.2) +# vocab_size = max_features +# seq_Len = 200 + + +# class ImdbDataset(Dataset): + +# def __init__(self, X, y): +# self.X = X +# self.y = y + +# def __getitem__(self, index): +# data = self.X[index] +# data = np.concatenate([data[:seq_Len], [0] * (seq_Len - len(data))]).astype('int64') # set +# label = self.y[index].astype('int64') +# return data, label + +# def __len__(self): +# return len(self.y) + + +# class ImdbNet(Module): + +# def __init__(self): +# super(ImdbNet, self).__init__() +# self.embedding = Embedding(num_embeddings=vocab_size, embedding_dim=64) +# self.rnn = RNN(input_size=64, hidden_size=64) +# self.linear1 = Linear(in_features=64, out_features=64, act=tlx.nn.ReLU) +# self.linear2 = Linear(in_features=64, out_features=2) + +# def forward(self, x): +# x = self.embedding(x) +# x, _ = self.rnn(x) +# x = tlx.reduce_mean(x, axis=1) +# x = self.linear1(x) +# x = self.linear2(x) +# return x + +# def __repr__(self): +# return "ImdbNet(embedding_dim=64, hidden_size=64, num_classes=2)" + +# def __str__(self): +# return self.__repr__() + +# # Training settings +# n_epoch = 1 +# batch_size = 64 +# print_freq = 2 + +# # Create DataLoader +# train_dataset = ImdbDataset(X=X_train, y=y_train) +# train_loader = tlx.dataflow.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + +# # Initialize the network +# net = ImdbNet() +# print(net) + +# # Define optimizer, metric, and loss function using TLX functions +# optimizer = tlx.optimizers.Adam(lr=1e-3) +# metric = tlx.metrics.Accuracy() +# loss_fn = tlx.losses.softmax_cross_entropy_with_logits + +# # Create and train the model +# model = tlx.model.Model(network=net, loss_fn=loss_fn, optimizer=optimizer, metrics=metric) +# model.train(n_epoch=n_epoch, train_dataset=train_loader, print_freq=print_freq, print_train_batch=True) +# Optionally, you could now dump the network weights to a file like this: +# model.save_weights('./rnn_model.npz', format='npz_dict') +# model.load_weights('./rnn_model.npz', format='npz_dict', skip= True) + diff --git a/examples/basic_tutorials/mnist_dataflow.py 
b/examples/basic_tutorials/mnist_dataflow.py index a4856d9..f5fb3de 100644 --- a/examples/basic_tutorials/mnist_dataflow.py +++ b/examples/basic_tutorials/mnist_dataflow.py @@ -2,9 +2,12 @@ # -*- coding: utf-8 -*- import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' # os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'jittor' +os.environ['TL_BACKEND'] = 'torch' + import tensorlayerx as tlx from tensorlayerx.nn import Module diff --git a/examples/basic_tutorials/mnist_mlp_custom_train.py b/examples/basic_tutorials/mnist_mlp_custom_train.py index 60e0bce..514098e 100644 --- a/examples/basic_tutorials/mnist_mlp_custom_train.py +++ b/examples/basic_tutorials/mnist_mlp_custom_train.py @@ -5,9 +5,10 @@ import os # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' # os.environ['TL_BACKEND'] = 'oneflow' -# os.environ['TL_BACKEND'] = 'torch' +# os.environ['TL_BACKEND'] = 'jittor' +os.environ['TL_BACKEND'] = 'torch' import time import tensorlayerx as tlx diff --git a/examples/basic_tutorials/mnist_mlp_mix_programming.py b/examples/basic_tutorials/mnist_mlp_mix_programming.py index 4f2035d..9602605 100644 --- a/examples/basic_tutorials/mnist_mlp_mix_programming.py +++ b/examples/basic_tutorials/mnist_mlp_mix_programming.py @@ -1,41 +1,43 @@ -################################ TensorLayerX and TensorFlow can be mixed programming. ################################# +################################## TensorLayerX and Torch can be mixed programming. ################################## import os -os.environ['TL_BACKEND'] = 'tensorflow' +os.environ['TL_BACKEND'] = 'torch' -import numpy as np -import time - -import tensorflow as tf +import torch +from tensorlayerx.nn import Module, Linear, Dropout import tensorlayerx as tlx -from tensorlayerx.nn import Module -from tensorlayerx.nn import Linear, Dropout +from tensorlayerx.dataflow import Dataset, DataLoader + +# Get cpu or gpu device for training. 
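# Because TL_BACKEND is set to 'torch' in this example, the tensors and weights that
# TensorLayerX produces are plain torch tensors, which is what makes the mixing below
# work: model.trainable_weights can be handed straight to a native torch optimizer and
# loss.backward() behaves as usual. A small illustrative check (not part of the script):
#
#   t = tlx.convert_to_tensor([1.0, 2.0, 3.0])
#   print(isinstance(t, torch.Tensor))   # expected: True under the torch backend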
+device = "cuda" if torch.cuda.is_available() else "cpu" +print("Using {} device".format(device)) -# Load MNIST data by TensorLayerX +# Load MNIST data and make Dataset by TensorLayerX X_train, y_train, X_val, y_val, X_test, y_test = tlx.files.load_mnist_dataset(shape=(-1, 784)) -def generator_train(): - inputs = X_train - targets = y_train - if len(inputs) != len(targets): - raise AssertionError("The length of inputs and targets should be equal") - for _input, _target in zip(inputs, targets): - yield _input, _target +class mnistdataset(Dataset): -# Make Dataset by TensorFlow -train_ds = tf.data.Dataset.from_generator(generator_train, output_types=(tf.float32, tf.int32)) -shuffle_buffer_size = 128 -batch_size = 128 -train_ds = train_ds.shuffle(shuffle_buffer_size) -train_ds = train_ds.batch(batch_size) + def __init__(self, data=X_train, label=y_train): + self.data = data + self.label = label + def __getitem__(self, index): + data = self.data[index].astype('float32') + label = self.label[index].astype('int64') + return data, label -# Define the network through tensorlayerx -class CustomModel(Module): + def __len__(self): + return len(self.data) + +train_dataset = mnistdataset(data=X_train, label=y_train) +train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True) + +# Define the network through TensorLayerX +class MLP(Module): def __init__(self): - super(CustomModel, self).__init__() + super(MLP, self).__init__() self.dropout1 = Dropout(p=0.2) - self.linear1 = Linear(out_features=800, in_features=784) + self.linear1 = Linear(out_features=800, act=tlx.nn.ReLU, in_features=784) self.dropout2 = Dropout(p=0.2) self.linear2 = Linear(out_features=800, act=tlx.nn.ReLU, in_features=800) self.dropout3 = Dropout(p=0.2) @@ -51,47 +53,130 @@ def forward(self, x): return out -MLP = CustomModel() +model = MLP().to(device) + +# Define the loss fucntion through TensorLayerX +loss_fn = tlx.losses.softmax_cross_entropy_with_logits +# Define the optimizer through torch +optimizer = torch.optim.SGD(lr=0.05, momentum=0.9, params=model.trainable_weights) + n_epoch = 50 -batch_size = 500 -print_freq = 1 -train_weights = MLP.trainable_weights -# Define the optimizer through tensorlayerx -optimizer = tlx.optimizers.Adam(lr=0.0001) - -for epoch in range(n_epoch): ## iterate the dataset n_epoch times - start_time = time.time() - ## iterate over the entire training set once (shuffle the data via training) - for X_batch, y_batch in train_ds : - MLP.set_train() # enable dropout - with tf.GradientTape() as tape: # use tf.GradientTape() to record gradient - ## compute outputs - _logits = MLP(X_batch) - ## compute loss and update model - _loss = tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) - grad = tape.gradient(_loss, train_weights) - optimizer.apply_gradients(zip(grad, train_weights)) - - ## use training and evaluation sets to evaluate the model every print_freq epoch - if epoch + 1 == 1 or (epoch + 1) % print_freq == 0: - print("Epoch {} of {} took {}".format(epoch + 1, n_epoch, time.time() - start_time)) - train_loss, train_acc, n_iter = 0, 0, 0 - for X_batch, y_batch in train_ds : - _logits = MLP(X_batch) - train_loss += tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) - train_acc += np.mean(np.equal(np.argmax(_logits, 1), y_batch)) - n_iter += 1 - print(" train loss: {}".format(train_loss / n_iter)) - print(" train acc: {}".format(train_acc / n_iter)) - - val_loss, val_acc, n_iter = 0, 0, 0 - for X_batch, y_batch in train_ds: - _logits = MLP(X_batch) # is_train=False, 
disable dropout - val_loss += tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) - val_acc += np.mean(np.equal(np.argmax(_logits, 1), y_batch)) - n_iter += 1 - print(" val loss: {}".format(val_loss / n_iter)) - print(" val acc: {}".format(val_acc / n_iter)) +size = len(train_loader.dataset) +model.train() + +# We use tlx's Model, loss function, Dataset and torch's optimizer to train the network +for epoch in range(n_epoch): + for batch, (X, y) in enumerate(train_loader): + X, y = X.to(device), y.to(device) + + # Compute prediction error + pred = model(X) + loss = loss_fn(pred, y) + acc = tlx.metrics.acc(pred, y) + # Backpropagation + optimizer.zero_grad() + loss.backward() + optimizer.step() + + if batch % 100 == 0: + loss, current = loss.item(), batch * len(X) + print(f"loss: {loss:>7f} acc: {acc:>7f} [{current:>5d}/{size:>5d}] [{epoch} / {n_epoch}epoch]") + + +################################ TensorLayerX and TensorFlow can be mixed programming. ################################# +# import os +# os.environ['TL_BACKEND'] = 'tensorflow' + +# import numpy as np +# import time + +# import tensorflow as tf +# import tensorlayerx as tlx +# from tensorlayerx.nn import Module +# from tensorlayerx.nn import Linear, Dropout + +# # Load MNIST data by TensorLayerX +# X_train, y_train, X_val, y_val, X_test, y_test = tlx.files.load_mnist_dataset(shape=(-1, 784)) + +# def generator_train(): +# inputs = X_train +# targets = y_train +# if len(inputs) != len(targets): +# raise AssertionError("The length of inputs and targets should be equal") +# for _input, _target in zip(inputs, targets): +# yield _input, _target + +# # Make Dataset by TensorFlow +# train_ds = tf.data.Dataset.from_generator(generator_train, output_types=(tf.float32, tf.int32)) +# shuffle_buffer_size = 128 +# batch_size = 128 +# train_ds = train_ds.shuffle(shuffle_buffer_size) +# train_ds = train_ds.batch(batch_size) + + +# # Define the network through tensorlayerx +# class CustomModel(Module): + +# def __init__(self): +# super(CustomModel, self).__init__() +# self.dropout1 = Dropout(p=0.2) +# self.linear1 = Linear(out_features=800, in_features=784) +# self.dropout2 = Dropout(p=0.2) +# self.linear2 = Linear(out_features=800, act=tlx.nn.ReLU, in_features=800) +# self.dropout3 = Dropout(p=0.2) +# self.linear3 = Linear(out_features=10, act=tlx.nn.ReLU, in_features=800) + +# def forward(self, x): +# z = self.dropout1(x) +# z = self.linear1(z) +# z = self.dropout2(z) +# z = self.linear2(z) +# z = self.dropout3(z) +# out = self.linear3(z) +# return out + + +# MLP = CustomModel() +# n_epoch = 50 +# batch_size = 500 +# print_freq = 1 +# train_weights = MLP.trainable_weights +# # Define the optimizer through tensorlayerx +# optimizer = tlx.optimizers.Adam(lr=0.0001) + +# for epoch in range(n_epoch): ## iterate the dataset n_epoch times +# start_time = time.time() +# ## iterate over the entire training set once (shuffle the data via training) +# for X_batch, y_batch in train_ds : +# MLP.set_train() # enable dropout +# with tf.GradientTape() as tape: # use tf.GradientTape() to record gradient +# ## compute outputs +# _logits = MLP(X_batch) +# ## compute loss and update model +# _loss = tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) +# grad = tape.gradient(_loss, train_weights) +# optimizer.apply_gradients(zip(grad, train_weights)) + +# ## use training and evaluation sets to evaluate the model every print_freq epoch +# if epoch + 1 == 1 or (epoch + 1) % print_freq == 0: +# print("Epoch {} of {} took {}".format(epoch + 1, 
n_epoch, time.time() - start_time)) +# train_loss, train_acc, n_iter = 0, 0, 0 +# for X_batch, y_batch in train_ds : +# _logits = MLP(X_batch) +# train_loss += tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) +# train_acc += np.mean(np.equal(np.argmax(_logits, 1), y_batch)) +# n_iter += 1 +# print(" train loss: {}".format(train_loss / n_iter)) +# print(" train acc: {}".format(train_acc / n_iter)) + +# val_loss, val_acc, n_iter = 0, 0, 0 +# for X_batch, y_batch in train_ds: +# _logits = MLP(X_batch) # is_train=False, disable dropout +# val_loss += tlx.losses.softmax_cross_entropy_with_logits(_logits, y_batch) +# val_acc += np.mean(np.equal(np.argmax(_logits, 1), y_batch)) +# n_iter += 1 +# print(" val loss: {}".format(val_loss / n_iter)) +# print(" val acc: {}".format(val_acc / n_iter)) ################################ TensorLayerX and MindSpore can be mixed programming. ################################# # import os @@ -260,86 +345,3 @@ def forward(self, x): # print(" train acc: {}".format(acc.numpy())) -################################## TensorLayerX and Torch can be mixed programming. ################################## -# import os -# os.environ['TL_BACKEND'] = 'torch' -# -# import torch -# from tensorlayerx.nn import Module, Linear, Dropout -# import tensorlayerx as tlx -# from tensorlayerx.dataflow import Dataset, DataLoader -# -# # Get cpu or gpu device for training. -# device = "cuda" if torch.cuda.is_available() else "cpu" -# print("Using {} device".format(device)) -# -# # Load MNIST data and make Dataset by TensorLayerX -# X_train, y_train, X_val, y_val, X_test, y_test = tlx.files.load_mnist_dataset(shape=(-1, 784)) -# -# class mnistdataset(Dataset): -# -# def __init__(self, data=X_train, label=y_train): -# self.data = data -# self.label = label -# -# def __getitem__(self, index): -# data = self.data[index].astype('float32') -# label = self.label[index].astype('int64') -# return data, label -# -# def __len__(self): -# return len(self.data) -# -# train_dataset = mnistdataset(data=X_train, label=y_train) -# train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True) -# -# # Define the network through TensorLayerX -# class MLP(Module): -# -# def __init__(self): -# super(MLP, self).__init__() -# self.dropout1 = Dropout(p=0.2) -# self.linear1 = Linear(out_features=800, act=tlx.nn.ReLU, in_features=784) -# self.dropout2 = Dropout(p=0.2) -# self.linear2 = Linear(out_features=800, act=tlx.nn.ReLU, in_features=800) -# self.dropout3 = Dropout(p=0.2) -# self.linear3 = Linear(out_features=10, act=tlx.nn.ReLU, in_features=800) -# -# def forward(self, x): -# z = self.dropout1(x) -# z = self.linear1(z) -# z = self.dropout2(z) -# z = self.linear2(z) -# z = self.dropout3(z) -# out = self.linear3(z) -# return out -# -# -# model = MLP().to(device) -# -# # Define the loss fucntion through TensorLayerX -# loss_fn = tlx.losses.softmax_cross_entropy_with_logits -# # Define the optimizer through torch -# optimizer = torch.optim.SGD(lr=0.05, momentum=0.9, params=model.trainable_weights) -# -# n_epoch = 50 -# size = len(train_loader.dataset) -# model.train() -# -# # We use tlx's Model, loss function, Dataset and torch's optimizer to train the network -# for epoch in range(n_epoch): -# for batch, (X, y) in enumerate(train_loader): -# X, y = X.to(device), y.to(device) -# -# # Compute prediction error -# pred = model(X) -# loss = loss_fn(pred, y) -# acc = tlx.metrics.acc(pred, y) -# # Backpropagation -# optimizer.zero_grad() -# loss.backward() -# optimizer.step() -# -# if batch % 100 == 
0: -# loss, current = loss.item(), batch * len(X) -# print(f"loss: {loss:>7f} acc: {acc:>7f} [{current:>5d}/{size:>5d}] [{epoch} / {n_epoch}epoch]") diff --git a/examples/basic_tutorials/mnist_mlp_simple_train.py b/examples/basic_tutorials/mnist_mlp_simple_train.py index b9787ae..f1ff42a 100644 --- a/examples/basic_tutorials/mnist_mlp_simple_train.py +++ b/examples/basic_tutorials/mnist_mlp_simple_train.py @@ -5,9 +5,10 @@ import os # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'oneflow' -# os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'torch' import tensorlayerx as tlx from tensorlayerx.nn import Module diff --git a/examples/basic_tutorials/mnist_sequential.py b/examples/basic_tutorials/mnist_sequential.py index edfe109..2e28bbb 100644 --- a/examples/basic_tutorials/mnist_sequential.py +++ b/examples/basic_tutorials/mnist_sequential.py @@ -3,7 +3,9 @@ import os # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' -os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'jittor' +os.environ['TL_BACKEND'] = 'torch' from tensorlayerx.nn import Sequential from tensorlayerx.nn import Linear diff --git a/examples/basic_tutorials/module_container.py b/examples/basic_tutorials/module_container.py index ef57193..bd929af 100644 --- a/examples/basic_tutorials/module_container.py +++ b/examples/basic_tutorials/module_container.py @@ -2,10 +2,13 @@ # -*- coding: utf-8 -*- import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' +# os.environ['TL_BACKEND'] = 'jittor' # os.environ['TL_BACKEND'] = 'paddle' -# os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'torch' + + import numpy as np from tensorlayerx.nn import Module, ModuleList, Linear, ModuleDict import tensorlayerx as tlx diff --git a/examples/basic_tutorials/quick_start.py b/examples/basic_tutorials/quick_start.py index 916f7ef..70f2615 100644 --- a/examples/basic_tutorials/quick_start.py +++ b/examples/basic_tutorials/quick_start.py @@ -1,8 +1,10 @@ # TensorlayerX目前支持包括TensorFlow、Pytorch、PaddlePaddle、MindSpore作为计算后端,指定计算后端的方法也非常简单,只需要设置环境变量即可 import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' # os.environ['TL_BACKEND'] = 'paddle' +# os.environ['TL_BACKEND'] = 'jittor' +os.environ['TL_BACKEND'] = 'torch' import tensorlayerx as tlx @@ -30,35 +32,34 @@ class CNN(Module): def __init__(self): super(CNN, self).__init__() - # weights init W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) + b_init = tlx.nn.initializers.constant(value=0.1) b_init2 = tlx.nn.initializers.constant(value=0.1) - self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3) - self.bn = BatchNorm2d(num_features=64, act=tlx.ReLU) - self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') + self.conv1 = Conv2d(32, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=b_init, name='conv1', in_channels=3) + self.bn1 = BatchNorm2d(num_features=32, act=tlx.nn.ReLU) + self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME', name='pool1') - self.conv2 = Conv2d( - 64, (5, 5), (1, 1), padding='SAME', 
act=tlx.ReLU, W_init=W_init, name='conv2', in_channels=64 - ) - self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') + self.conv2 = Conv2d(64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, b_init=b_init, name='conv2', in_channels=32) + self.bn2 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) + self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME', name='pool2') self.flatten = Flatten(name='flatten') - self.linear1 = Linear(384, act=tlx.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) - self.linear2 = Linear(192, act=tlx.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384) - self.linear3 = Linear(10, act=None, W_init=W_init2, name='output', in_features=192) + self.linear1 = Linear(1024, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) + + self.linear2 = Linear(10, act=None, W_init=W_init2, b_init=b_init2, name='output', in_features=1024) def forward(self, x): z = self.conv1(x) - z = self.bn(z) + z = self.bn1(z) z = self.maxpool1(z) z = self.conv2(z) + z = self.bn2(z) z = self.maxpool2(z) z = self.flatten(z) z = self.linear1(z) - z = self.linear2(z) - z = self.linear3(z) + z = self.linear2(z) return z X_train, y_train, X_test, y_test = tlx.files.load_cifar10_dataset(shape=(-1, 32, 32, 3), plotable=False) diff --git a/examples/basic_tutorials/tensorlayerx_graph.py b/examples/basic_tutorials/tensorlayerx_graph.py index 259d797..f36bba4 100644 --- a/examples/basic_tutorials/tensorlayerx_graph.py +++ b/examples/basic_tutorials/tensorlayerx_graph.py @@ -4,6 +4,7 @@ import os # os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'mindspore' +# os.environ['TL_BACKEND'] = 'jittor' os.environ['TL_BACKEND'] = 'torch' import tensorlayerx as tlx @@ -14,39 +15,34 @@ class CNN(Module): def __init__(self): super(CNN, self).__init__() - # weights init W_init = tlx.nn.initializers.truncated_normal(stddev=5e-2) W_init2 = tlx.nn.initializers.truncated_normal(stddev=0.04) + b_init = tlx.nn.initializers.constant(value=0.1) b_init2 = tlx.nn.initializers.constant(value=0.1) - self.conv1 = Conv2d(64, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=None, name='conv1', in_channels=3, act=tlx.nn.ReLU) - self.bn = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) - self.maxpool1 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool1') + self.conv1 = Conv2d(32, (5, 5), (1, 1), padding='SAME', W_init=W_init, b_init=b_init, name='conv1', in_channels=3) + self.bn1 = BatchNorm2d(num_features=32, act=tlx.nn.ReLU) + self.maxpool1 = MaxPool2d((2, 2), (2, 2), padding='SAME', name='pool1') - self.conv2 = Conv2d( - 64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, b_init=None, name='conv2', in_channels=64 - ) - self.maxpool2 = MaxPool2d((3, 3), (2, 2), padding='SAME', name='pool2') + self.conv2 = Conv2d(64, (5, 5), (1, 1), padding='SAME', act=tlx.nn.ReLU, W_init=W_init, b_init=b_init, name='conv2', in_channels=32) + self.bn2 = BatchNorm2d(num_features=64, act=tlx.nn.ReLU) + self.maxpool2 = MaxPool2d((2, 2), (2, 2), padding='SAME', name='pool2') self.flatten = Flatten(name='flatten') - self.linear1 = Linear(384, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) - self.linear2 = Linear(192, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear2relu', in_features=384) - self.linear3 = Linear(10, act=None, W_init=W_init2, name='output1', in_features=192) - self.linear4 = Linear(20, act=None, W_init=W_init2, name='output2', 
in_features=192) - self.concat = tlx.nn.Concat(name='concat') + self.linear1 = Linear(1024, act=tlx.nn.ReLU, W_init=W_init2, b_init=b_init2, name='linear1relu', in_features=2304) + + self.linear2 = Linear(10, act=None, W_init=W_init2, b_init=b_init2, name='output', in_features=1024) def forward(self, x): z = self.conv1(x) - z = self.bn(z) + z = self.bn1(z) z = self.maxpool1(z) z = self.conv2(z) + z = self.bn2(z) z = self.maxpool2(z) z = self.flatten(z) z = self.linear1(z) - z = self.linear2(z) - z1 = self.linear3(z) - z2 = self.linear4(z) - z = self.concat([z1, z2]) + z = self.linear2(z) return z model = CNN() diff --git a/examples/basic_tutorials/tensorlayerx_model_load.py b/examples/basic_tutorials/tensorlayerx_model_load.py index 49a5df5..4f9a16e 100644 --- a/examples/basic_tutorials/tensorlayerx_model_load.py +++ b/examples/basic_tutorials/tensorlayerx_model_load.py @@ -2,10 +2,10 @@ # -*- coding: utf-8 -*- import os -os.environ['TL_BACKEND'] = 'tensorflow' +# os.environ['TL_BACKEND'] = 'tensorflow' # os.environ['TL_BACKEND'] = 'paddle' # os.environ['TL_BACKEND'] = 'mindspore' -# os.environ['TL_BACKEND'] = 'torch' +os.environ['TL_BACKEND'] = 'torch' import tensorlayerx as tlx from tensorlayerx.nn import Module diff --git a/runs/mlp/events.out.tfevents.1722986988.LAPTOP-48J7839G b/runs/mlp/events.out.tfevents.1722986988.LAPTOP-48J7839G new file mode 100644 index 0000000000000000000000000000000000000000..a976b13fec485e1f648a54763a2351591164fc7e GIT binary patch literal 40 rcmb1OfPlsI-b$QR2ZgKGthwnZ#hX-=n3<>NT9%quVr5j(cv&0()ffz( literal 0 HcmV?d00001 diff --git a/tensorlayerx/backend/ops/jittor_backend.py b/tensorlayerx/backend/ops/jittor_backend.py index 774cd05..bbce67f 100644 --- a/tensorlayerx/backend/ops/jittor_backend.py +++ b/tensorlayerx/backend/ops/jittor_backend.py @@ -72,7 +72,7 @@ def zeros(shape, dtype=None, device = None): if device == 'gpu': jt.flags.use_cuda = 1 - return jt.zeros(shape=shape, dtype=dtype) + return jt.zeros(shape, dtype) def ones(shape, dtype=None, device = None): @@ -545,7 +545,7 @@ def reduce_mean(input_tensor, axis=None, keepdims=False): if axis is not None: if isinstance(axis, (tuple, list)): axis = tuple(axis) - return jt.mean(input_tensor, dims=axis, keepdims=keepdims) + return jt.mean(input_tensor, dim=axis, keepdims=keepdims) else: return jt.mean(input_tensor) diff --git a/tensorlayerx/backend/ops/jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py index 5ebd1a2..985f3f0 100644 --- a/tensorlayerx/backend/ops/jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -552,7 +552,7 @@ def same_padding(input, weight, strides, dilations): # H(out) = = floor( -------------------------------------------------------------- + 1 ) # stride[0] - print(type(weight)) + if isinstance(weight, jt.Var): if len(input.shape) == 3: filter_rows = weight.size(2) @@ -594,9 +594,6 @@ def same_padding(input, weight, strides, dilations): out_cols = (input_cols + strides[1] - 1) // strides[1] - # print(f"4D output rows: {out_rows}, output cols: {out_cols}") - # print(f"4D dilations: {dilations}") - padding_rows = max(0, (out_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - input_rows) padding_cols = max(0, (out_cols - 1) * strides[1] + (filter_cols - 1) * dilations[1] + 1 - input_cols) @@ -609,11 +606,6 @@ def same_padding(input, weight, strides, dilations): # if cols_odd: # padding_cols += 1 - # print(f"Filter Rows: {filter_rows}, Filter Cols: {filter_cols}") - # print(f"Input Rows: {input_rows}, Input Cols: {input_cols}") - # print(f"Output Rows: 
{out_rows}, Output Cols: {out_cols}") - # print(f"Padding Rows: {padding_rows}, Padding Cols: {padding_cols}") - # print(f"Rows Odd: {rows_odd}, Cols Odd: {cols_odd}") return rows_odd, cols_odd, padding_rows, padding_cols @@ -653,15 +645,10 @@ def __init__(self, strides, padding, data_format='NHWC', dilations=None, out_cha self.strides = (strides[1], strides[2]) self.dilations = (dilations[1], dilations[2]) self.groups = groups - # print(f"strides = {strides}") + def __call__(self, input, filters): - # print(f"Conv2D_Input shape: {input.shape}") - # print(f"Conv2D_Filters shape: {filters.shape}") - # print(f"Conv2D_Strides: {self.strides}") - # print(f"Conv2D_Padding: {self.padding}") - # print(f"Conv2D_Dilations: {self.dilations}") - # print(f"Conv2D_Groups: {self.groups}") + if self.data_format == 'NHWC': input = nhwc_to_nchw(input) @@ -678,7 +665,6 @@ def __call__(self, input, filters): def conv2d_same_padding(self, input, weight, bias=None): rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, weight, self.strides, self.dilations) - # print(f"Padding rows: {padding_rows}, Padding cols: {padding_cols}") if rows_odd or cols_odd: input = nn.pad(input, [0, int(cols_odd), 0, int(rows_odd)]) @@ -1316,10 +1302,7 @@ def same_padding_deconvolution(input, weight, strides, dilations): out_cols = (input_cols - 1) * strides[1] + filter_cols out_depth = (input_depth - 1) * strides[2] + filter_depth - # print(f"SAME_PADDING_Stride : {strides}") - # print(f"out_rows = {input_rows} * {strides[0]} - {strides[0]} + 1") - # print(f"out_cols = {input_cols} * {strides[1]} - {strides[1]} + 1") - # print(f"out_depth = {input_depth} * {strides[2]} - {strides[2]} + 1") + padding_rows = max(0, (input_rows - 1) * strides[0] + (filter_rows - 1) * dilations[0] + 1 - out_rows) @@ -1330,12 +1313,6 @@ def same_padding_deconvolution(input, weight, strides, dilations): cols_odd = (padding_cols % 2 != 0) depth_odd = (padding_depth % 2 != 0) - # print(f"SAME_PADDING_Filter: {filter_rows}, {filter_cols}, {filter_depth if 'filter_depth' in locals() else 'N/A'}") - # print(f"SAME_PADDING_Input : {input_rows}, {input_cols}, {input_depth}") - # print(f"SAME_PADDING_Output : {out_rows}, {out_cols}, {out_depth}") - - # print(f"SAME_PADDING_Padding: {padding_rows}, {padding_cols}, {padding_depth}") - # print(f"SAME_PADDING_Rows Odd: {rows_odd}, Cols Odd: {cols_odd}, Depth Odd: {depth_odd}") return rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth @@ -1632,12 +1609,9 @@ def __init__( self.name = name self.out_channel = out_channel self.data_format, self.padding = preprocess_3d_format(data_format, padding) - - # print(f'__init__Conv3D_TRANSPOSE_Stride = {self.strides}' ) - # print(f'__init__SAME_PADDING_Dialation = {self.dilations}' ) + def __call__(self, input, filters): - # print(f"conv3D_Transpose_Call: input shape={input.shape}, filters shape={filters.shape}") if self.data_format == 'NDHWC': input = nhwc_to_nchw(input) @@ -1658,9 +1632,6 @@ def __call__(self, input, filters): def conv3d_transpore_same(self, input, filters): - # print(f'conv3d_transpore_same_Conv3D_TRANSPOSE_Stride = {self.strides}' ) - # print(f'conv3d_transpore_same_SAME_PADDING_Dialation = {self.dilations}' ) - rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth = same_padding_deconvolution( input, filters, self.strides, (1, 1, 1)) @@ -1861,10 +1832,8 @@ class SeparableConv2D(object): def __init__(self, strides, padding, data_format, dilations, out_channel, k_size, in_channel, depth_multiplier): 
self.data_format, self.padding = preprocess_2d_format(data_format, padding) - # print(f"SeparableConv2D-_strides = {strides}") dilations = dilations[1:] + [dilations[0]] - # print(f"SeparableConv2D-_dilations = {dilations}") self.depthwise_conv = Conv2D(strides, self.padding, self.data_format, dilations, groups=in_channel) self.strides = (0,1,1,0) self.dialations = (1,1) @@ -1875,7 +1844,6 @@ def __call__(self, input, filter, point_filter=None): depthwise_conv = self.depthwise_conv(input, filter) pointwise_conv = self.pointwise_conv(depthwise_conv, point_filter) - # print(f'pointwise_conv = {pointwise_conv.shape}' ) return pointwise_conv @@ -1987,17 +1955,7 @@ def __call__(self, inputs): raise NotImplementedError # if self.data_format == 'NDHWC': # inputs = nhwc_to_nchw(inputs) - - # # Debugging print statements - # print(f"Input shape before pooling: {inputs.shape}") - # print(f"Input type before pooling: {type(inputs)}") - # output = self.op(inputs) - - # # Debugging print statements - # print(f"Output shape after pooling: {output.shape}") - # print(f"Output type after pooling: {type(output)}") - # if self.data_format == 'NDHWC': # output = nchw_to_nhwc(output) # return output diff --git a/tensorlayerx/files/utils.py b/tensorlayerx/files/utils.py index 2dbf3e1..c1d00d7 100644 --- a/tensorlayerx/files/utils.py +++ b/tensorlayerx/files/utils.py @@ -1981,8 +1981,16 @@ def save_npz_dict(save_list=None, name='model.npz'): save_list_var.append(values.cpu().detach().numpy()) else: raise NotImplementedError('Not implemented') + + save_var_dict = {save_list_names[idx]: val for idx, val in enumerate(save_list_var)} - np.savez(name, **save_var_dict) + + if isinstance(save_var_dict, dict): + save_var_dict = {str(k): v for k, v in save_var_dict.items()} + np.savez(name, **save_var_dict) + else: + raise ValueError("save_var_dict must be a dictionary") + save_list_var = None save_var_dict = None del save_list_var @@ -1990,7 +1998,8 @@ def save_npz_dict(save_list=None, name='model.npz'): logging.info("[*] Model saved in npz_dict %s" % name) -def load_and_assign_npz_dict(name='model.npz', network=None, skip=False): +def load_and_assign_npz_dict(name='model.npz', network=None, skip=False, name_map=None): + """Restore the parameters saved by ``tlx.files.save_npz_dict()``. Parameters @@ -2015,16 +2024,20 @@ def load_and_assign_npz_dict(name='model.npz', network=None, skip=False): if tlx.BACKEND == 'torch': net_weights_name = [n for n, v in network.named_parameters()] torch_weights_dict = {n: v for n, v in network.named_parameters()} + elif tlx.BACKEND == 'jittor': + net_weights_name = [w.name() for w in network.all_weights] + else: net_weights_name = [w.name for w in network.all_weights] for key in weights.keys(): + if key not in net_weights_name: if skip: logging.warning("Weights named '%s' not found in network. Skip it." % key) else: raise RuntimeError( - "Weights named '%s' not found in network. Hint: set argument skip=Ture " + "Weights named '%s' not found in network. Hint: set argument skip=True " "if you want to skip redundant or mismatch weights." 
% key ) else: @@ -2037,6 +2050,8 @@ def load_and_assign_npz_dict(name='model.npz', network=None, skip=False): assign_pd_variable(network.all_weights[net_weights_name.index(key)], weights[key]) elif tlx.BACKEND == 'torch': assign_th_variable(torch_weights_dict[key], weights[key]) + elif tlx.BACKEND == 'jittor': + network.all_weights[net_weights_name.index(key)].update(weights[key]) else: raise NotImplementedError('Not implemented') diff --git a/tensorlayerx/metrics/jittor_metric.py b/tensorlayerx/metrics/jittor_metric.py index d5a163c..81a6e2c 100644 --- a/tensorlayerx/metrics/jittor_metric.py +++ b/tensorlayerx/metrics/jittor_metric.py @@ -35,67 +35,66 @@ def reset(self): - - -class Accuracy(Metric): - def __init__(self, topk=1): - super(Accuracy, self).__init__() - self.topk = int(topk) # Ensure topk is an integer - self.reset() +class Accuracy: + def __init__(self): + self.correct = 0 + self.total = 0 def update(self, y_pred, y_true): - y_pred = jt.argsort(y_pred, dim=-1, descending=True)[0] - - if (len(y_true.shape) == 1) or (len(y_true.shape) == 2 and y_true.shape[-1] == 1): - y_true = jt.reshape(y_true, (-1, 1)) - elif y_true.shape[-1] != 1: - y_true = jt.argmax(y_true, dim=-1, keepdim=True) - - correct = y_pred == y_true - correct = correct.to(jt.float32) - correct = correct.numpy() - num_samples = np.prod(np.array(correct.shape[:-1])) - num_corrects = correct.sum() - self.total += num_corrects - self.count += num_samples + # Step 1: Get the predicted class labels using argmax + y_pred = jt.argmax(y_pred, dim=-1) + + # Step 2: Ensure y_true is reshaped to match y_pred + y_true = np.reshape(y_true, (-1,)) + + # Step 3: Compare the predicted labels to the true labels + correct_predictions = np.equal(y_pred, y_true) + + # Step 4: Count the number of correct predictions + num_correct_predictions = np.sum(correct_predictions).item() + + # Update the running totals + self.correct += num_correct_predictions + self.total += y_true.shape[0] def result(self): - return float(self.total) / self.count if self.count > 0 else 0. 
+ # Calculate the accuracy + return self.correct / self.total if self.total > 0 else 0.0 def reset(self): - self.total = 0.0 - self.count = 0.0 + # Reset the counters + self.correct = 0 + self.total = 0 -class Auc(object): - def __init__( - self, - curve='ROC', - num_thresholds=4095, - ): - self.curve = curve +class Auc: + def __init__(self, num_thresholds=4095): self.num_thresholds = num_thresholds self.reset() def update(self, y_pred, y_true): - if isinstance(y_true, jt.array()): - y_true = y_true.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_true must be a numpy array or Tensor.") - - if isinstance(y_pred, jt.array): - y_pred = y_pred.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_pred must be a numpy array or Tensor.") - + # Convert Jittor tensors to NumPy arrays if necessary + if isinstance(y_true, jt.Var): + y_true = y_true.numpy() + if isinstance(y_pred, jt.Var): + y_pred = y_pred.numpy() + + # Flatten y_true to ensure it's 1-dimensional + y_true = np.reshape(y_true, (-1,)) + + # Get the positive class probabilities + pos_prob = y_pred[:, 1] + + # Bin the predictions into thresholds + bin_idx = np.floor(pos_prob * self.num_thresholds).astype(int) + bin_idx = np.clip(bin_idx, 0, self.num_thresholds) + + # Update the histogram bins for i, label in enumerate(y_true): - value = y_pred[i, 1] # positive probability - bin_idx = int(value * self.num_thresholds) - assert bin_idx <= self.num_thresholds if label: - self._stat_pos[bin_idx] += 1.0 + self._stat_pos[bin_idx[i]] += 1 else: - self._stat_neg[bin_idx] += 1.0 + self._stat_neg[bin_idx[i]] += 1 @staticmethod def trapezoid_area(x1, x2, y1, y2): @@ -105,91 +104,80 @@ def result(self): tot_pos = 0.0 tot_neg = 0.0 auc = 0.0 - idx = self.num_thresholds - while idx > 0: + + for idx in range(self.num_thresholds, 0, -1): tot_pos_prev = tot_pos tot_neg_prev = tot_neg tot_pos += self._stat_pos[idx] tot_neg += self._stat_neg[idx] auc += self.trapezoid_area(tot_neg, tot_neg_prev, tot_pos, tot_pos_prev) - idx -= 1 - return auc / tot_pos / tot_neg if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 + return auc / (tot_pos * tot_neg) if tot_pos > 0.0 and tot_neg > 0.0 else 0.0 def reset(self): - """ - Reset states and result - """ - _num_pred_buckets = self.num_thresholds + 1 - self._stat_pos = np.zeros(_num_pred_buckets) - self._stat_neg = np.zeros(_num_pred_buckets) + self._stat_pos = np.zeros(self.num_thresholds + 1) + self._stat_neg = np.zeros(self.num_thresholds + 1) -class Precision(object): - +class Precision: def __init__(self): self.reset() def update(self, y_pred, y_true): - if isinstance(y_true, jt.array): - y_true = y_true.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_true must be a numpy array or Tensor.") - - if isinstance(y_pred, jt.array): - y_pred = y_pred.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_pred must be a numpy array or Tensor.") - - sample_num = y_true.shape[0] - y_pred = np.rint(y_pred).astype('int32') - - for i in range(sample_num): - pred = y_pred[i] - label = y_true[i] - if pred == 1: - if pred == label: - self.tp += 1 - else: - self.fp += 1 + # Convert Jittor tensors to NumPy arrays if necessary + if isinstance(y_true, jt.Var): + y_true = y_true.numpy() + if isinstance(y_pred, jt.Var): + y_pred = y_pred.numpy() + + # Ensure y_true is reshaped to match y_pred + y_true = np.reshape(y_true, (-1,)) + + # Convert probabilities to class predictions + y_pred = np.argmax(y_pred, axis=1) + + # 
Update true positives (tp) and false positives (fp) + self.tp += np.sum((y_pred == 1) & (y_true == 1)) + self.fp += np.sum((y_pred == 1) & (y_true == 0)) def result(self): - ap = self.tp + self.fp - return float(self.tp) / ap if ap != 0 else .0 + return float(self.tp) / ap if ap != 0 else 0.0 def reset(self): self.tp = 0 self.fp = 0 -class Recall(object): - +class Recall: def __init__(self): self.reset() def update(self, y_pred, y_true): - if isinstance(y_true, jt.array): - y_true = y_true.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_true must be a numpy array or Tensor.") - - if isinstance(y_pred, jt.array): - y_pred = y_pred.cpu().numpy() - elif not isinstance(y_pred, np.ndarray): - raise TypeError("The y_pred must be a numpy array or Tensor.") - - sample_num = y_true.shape[0] - y_pred = np.rint(y_pred).astype('int32') - - for i in range(sample_num): - pred = y_pred[i] - label = y_true[i] - if label == 1: - if pred == label: - self.tp += 1 - else: - self.fn += 1 + # Convert Jittor tensors to NumPy arrays if necessary + if isinstance(y_true, jt.Var): + y_true = y_true.numpy() + if isinstance(y_pred, jt.Var): + y_pred = y_pred.numpy() + + # Ensure y_true is reshaped to match y_pred + y_true = np.reshape(y_true, (-1,)) + + # Convert probabilities to class predictions + y_pred = np.argmax(y_pred, axis=1) + + # Update true positives (tp) and false negatives (fn) + self.tp += np.sum((y_pred == 1) & (y_true == 1)) + self.fn += np.sum((y_true == 1) & (y_pred == 0)) + + def result(self): + recall = self.tp + self.fn + return float(self.tp) / recall if recall != 0 else 0.0 + + def reset(self): + self.tp = 0 + self.fn = 0 + def result(self): diff --git a/tensorlayerx/model/core.py b/tensorlayerx/model/core.py index 218d40e..02ef47f 100644 --- a/tensorlayerx/model/core.py +++ b/tensorlayerx/model/core.py @@ -24,7 +24,7 @@ if tlx.BACKEND == 'torch': import torch if tlx.BACKEND == 'jittor': - import torch + import jittor as jt __all__ = ['Model', 'WithLoss', 'WithGrad', 'TrainOneStep', 'TrainOneStepWithGradientClipping'] @@ -662,17 +662,21 @@ def jt_train( network.set_train() output = network(X_batch) loss = loss_fn(output, y_batch) + # optimizer.apply_gradients(loss, train_weights) # grads = optimizer.gradient(loss, train_weights) # optimizer.apply_gradients(zip(grads, train_weights)) + + optimizer.set(train_weights) optimizer.zero_grad() optimizer.step(loss) - train_loss += loss + train_loss += loss.item() + if metrics: - metrics.update(output, y_batch) - train_acc += metrics.result() + metrics.update(y_pred=output,y_true= y_batch) + train_acc += metrics.result() metrics.reset() else: - train_acc += (output.argmax(1) == y_batch).type(torch.float).mean().item() + train_acc += np.mean(np.equal(np.argmax(output, axis=1), y_batch)) n_iter += 1 if print_train_batch: @@ -701,7 +705,7 @@ def jt_train( val_acc += metrics.result() metrics.reset() else: - val_acc += (_logits.argmax(1) == y_batch).type(torch.float).mean().item() + val_acc += (_logits.argmax(1) == y_batch).type(jt.float).mean().item() n_iter += 1 print(" val loss: {}".format(val_loss / n_iter)) print(" val acc: {}".format(val_acc / n_iter)) diff --git a/tensorlayerx/model/utils.py b/tensorlayerx/model/utils.py index d7d9c2a..a1229b7 100644 --- a/tensorlayerx/model/utils.py +++ b/tensorlayerx/model/utils.py @@ -144,9 +144,10 @@ def __init__(self, network, loss_fn=None, optimizer=None): self.network.set_train() def __call__(self, inputs, label): - loss = self.network_with_loss(inputs, label) - grads = 
self.optimizer.gradient(loss, self.train_weights) - return grads + raise NotImplementedError("WithGradJT not Implemented") + # loss = self.network_with_loss(inputs, label) + # grads = self.optimizer.gradient(loss, self.train_weights) + # return grads @@ -227,12 +228,14 @@ def __init__(self, net_with_loss, optimizer, train_weights): self.optimizer = optimizer self.train_weights = train_weights - def __call__(self, data, label, *args, **kwargs): - # loss = self.net_with_loss(data, label, *args, **kwargs) - # grads = self.optimizer.gradient(loss, self.train_weights) - # self.optimizer.apply_gradients(zip(grads, self.train_weights)) - # return loss.numpy() - return NotImplementedError('TrainOneStep With jittor is not Implemented') + def __call__(self, data, label): + loss = self.net_with_loss(data, label) + self.optimizer.set(self.train_weights) + self.optimizer.zero_grad() + # if self.grad_clip is not None: + # self.grad_clip(self.train_weights) + self.optimizer.step(loss) + return loss.numpy() class TrainOneStepWithGradientClippingTF(object): @@ -296,7 +299,7 @@ def __call__(self, data, label): class TrainOneStepWithGradientClippingJT(object): - def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping): + def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping=None): self.net_with_loss = net_with_loss self.optimizer = optimizer self.train_weights = train_weights @@ -304,7 +307,11 @@ def __init__(self, net_with_loss, optimizer, train_weights, gradient_clipping): def __call__(self, data, label): loss = self.net_with_loss(data, label) - grads = self.optimizer.gradient(loss, self.train_weights, grad_clip=self.gradient_clipping) - self.optimizer.apply_gradients(zip(grads, self.train_weights)) - return loss.numpy() - + self.optimizer.set(self.train_weights) + self.optimizer.zero_grad() + + if self.gradient_clipping is not None: + self.gradient_clipping(self.train_weights) + + self.optimizer.step() + return loss.numpy() \ No newline at end of file diff --git a/tensorlayerx/nn/core/core_jittor.py b/tensorlayerx/nn/core/core_jittor.py index f69d712..0a49980 100644 --- a/tensorlayerx/nn/core/core_jittor.py +++ b/tensorlayerx/nn/core/core_jittor.py @@ -406,7 +406,7 @@ def forward(self, input_data): # tensor._info = (new_node, idx) -class ModuleList(Module): +class ModuleList(): """ Holds Modules in a list. @@ -448,8 +448,26 @@ class ModuleList(Module): def __init__(self, modules=None): super(ModuleList, self).__init__() + + # Force _modules to be an OrderedDict right after parent's __init__ + self._modules = OrderedDict() + if modules is not None: self.extend(modules) + + def extend(self, layers): + """ + Appends layers from a Python iterable to the end of the list. + """ + if not isinstance(layers, list): + raise TypeError('Modules should be a list of sublayers') + + for layer in layers: + if _valid_module(layer): + self._modules[str(len(self._modules))] = layer + # print(f"self._modules after layers added: {self._modules}") + + return self def __getitem__(self, index): if isinstance(index, slice): @@ -503,18 +521,7 @@ def insert(self, index, layer): length -= 1 self._modules[str(idx)] = layer - def extend(self, layers): - """ - Appends layers from a Python iterable to the end of the list. 
- - """ - if not isinstance(layers, list): - raise TypeError('Modules {} should be list of sublayers'.format(layers)) - for layer in layers: - if _valid_module(layer): - self._modules[str(len(self))] = layer - return self def append(self, layer): """ @@ -529,6 +536,8 @@ def forward(self, *inputs): raise NotImplementedError + + class ModuleDict(Module): def __init__(self, modules=None): @@ -680,11 +689,14 @@ def __call__(self, input): -class ParameterDict(Module): - +class ParameterDict(): def __init__(self, parameters=None): super(ParameterDict, self).__init__() self._initialized = True + + # Bypass the __setattr__ method's restriction by directly setting _parameters + self.__dict__['_parameters'] = OrderedDict() + if parameters is not None: self.update(parameters) @@ -699,12 +711,21 @@ def __getitem__(self, key): def __setitem__(self, key, parameter): self.register_parameter(key, parameter) + def register_parameter(self, key, parameter): + # Ensure that parameter is of type jt.Var or jt.nn.Parameter + if not isinstance(parameter, (jt.Var, jt.nn.Parameter)): + raise TypeError(f"Expected jt.nn.Parameter or jt.Var, but got {type(parameter)} for key '{key}'") + + # Add the parameter to the _parameters dictionary + self._parameters[key] = parameter + print(f"Registered parameter: {key} -> type: {type(parameter)}, shape: {parameter.shape}") + def __delitem__(self, key): del self._parameters[key] def __setattr__(self, key, value): if getattr(self, "_initialized", False): - if not hasattr(self, key) and not isinstance(value, jt.nn.Parameter): + if not hasattr(self, key) and not isinstance(value, (jt.nn.Parameter, jt.Var)): warnings.warn("Setting attributes on ParameterDict is not supported.") super(ParameterDict, self).__setattr__(key, value) @@ -718,7 +739,6 @@ def __reversed__(self): return reversed(list(self._parameters.keys())) def copy(self): - return ParameterDict(self._parameters.copy()) def __contains__(self, key): @@ -742,58 +762,35 @@ def popitem(self): return self._parameters.popitem() def get(self, key, default=None): - return self._parameters.get(key, default) def fromkeys(self, keys, default=None): - - return ParameterDict(self._parameters.fromkeys(keys, default)) # type: ignore[arg-type] + return ParameterDict(self._parameters.fromkeys(keys, default)) def keys(self): - return self._parameters.keys() def items(self): - return self._parameters.items() def values(self): - return self._parameters.values() def update(self, parameters): - if not isinstance(parameters, container_abcs.Iterable): - raise TypeError( - "ParametersDict.update should be called with an " - "iterable of key/value pairs, but got " + type(parameters).__name__ - ) - - if isinstance(parameters, (OrderedDict, ParameterDict)): - for key, parameter in parameters.items(): - self[key] = parameter - elif isinstance(parameters, container_abcs.Mapping): - for key, parameter in sorted(parameters.items()): - self[key] = parameter - else: - for j, p in enumerate(parameters): - if not isinstance(p, container_abcs.Iterable): - raise TypeError( - "ParameterDict update sequence element " - "#" + str(j) + " should be Iterable; is" + type(p).__name__ - ) - if not len(p) == 2: - raise ValueError( - "ParameterDict update sequence element " - "#" + str(j) + " has length " + str(len(p)) + "; 2 is required" - ) - # parameters as length-2 list too cumbersome to type, see ModuleDict.update comment - self[p[0]] = p[1] # type: ignore[assignment] + if not isinstance(parameters, dict): + raise TypeError("ParametersDict.update should be called 
with a dictionary.") + + for key, parameter in parameters.items(): + self[key] = parameter def __call__(self, input): raise RuntimeError('ParameterDict should not be called.') + + def _valid_index(layer_num, index): + if not isinstance(index, int): raise TypeError("Index {} is not int type") if not -layer_num <= index < layer_num: @@ -801,6 +798,8 @@ def _valid_index(layer_num, index): return index % layer_num + + def _valid_module(layer): if issubclass(layer.__class__, Module): return True diff --git a/tensorlayerx/nn/layers/convolution/deformable_conv.py b/tensorlayerx/nn/layers/convolution/deformable_conv.py index db74a43..299d1c2 100644 --- a/tensorlayerx/nn/layers/convolution/deformable_conv.py +++ b/tensorlayerx/nn/layers/convolution/deformable_conv.py @@ -89,8 +89,15 @@ def __init__( self.in_channels = in_channels self.kernel_n = kernel_size[0] * kernel_size[1] - if self.offset_layer.get_shape()[-1] != 2 * self.kernel_n: - raise AssertionError("offset.get_shape()[-1] is not equal to: %d" % 2 * self.kernel_n) + + # Check if offset_layer has get_shape method, if not use reshape + if hasattr(self.offset_layer, 'get_shape'): + offset_shape = self.offset_layer.get_shape()[-1] + else: + offset_shape = self.offset_layer.shape[-1] + + if offset_shape != 2 * self.kernel_n: + raise AssertionError("offset shape[-1] is not equal to: %d" % (2 * self.kernel_n)) logging.info( "DeformableConv2d %s: out_channels: %d, kernel_size: %s act: %s" % ( diff --git a/tensorlayerx/optimizers/jittor_optimizers.py b/tensorlayerx/optimizers/jittor_optimizers.py index 4905940..ed2b782 100644 --- a/tensorlayerx/optimizers/jittor_optimizers.py +++ b/tensorlayerx/optimizers/jittor_optimizers.py @@ -31,7 +31,6 @@ def app_gradients(self): class Adam(object): def __init__( self, - params, lr=0.001, beta_1=0.9, beta_2=0.999, @@ -39,134 +38,103 @@ def __init__( weight_decay=0.0, momentum = 0.0, grad_clip=None - ): - self.optimizer = optimizer.Adam( - params, - lr=lr, - eps=eps, - betas=(beta_1, beta_2), - weight_decay=weight_decay) - self.lr = lr self.beta_1 = beta_1 self.beta_2 = beta_2 + self.betas = (beta_1,beta_2) self.eps = eps self.init_optim = False self.weight_decay = weight_decay self.grad_clip = grad_clip - - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Can not apply gradients before zero_grad call.") - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for group in self.optimizer_adam.param_groups: - params_with_grad = [] - grads = [] - exp_avgs = [] - exp_avg_sqs = [] - max_exp_avg_sqs = [] - state_steps = [] - beta1, beta2 = group['betas'] - - for p in group['params']: - if p.grad is not None: - params_with_grad.append(p) - if p.grad.is_sparse: - raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') - grads.append(p.grad) - - state = self.optimizer_adam.state[p] - # Lazy state initialization - if len(state) == 0: - state['step'] = 0 - # Exponential moving average of gradient values - state['exp_avg'] = jt.zeros_like(p) - # Exponential moving average of squared gradient values - state['exp_avg_sq'] = jt.zeros_like(p) - if group['amsgrad']: - # Maintains max of all exp. moving avg. of sq. grad. 
values - state['max_exp_avg_sq'] = jt.zeros_like(p) - - exp_avgs.append(state['exp_avg']) - exp_avg_sqs.append(state['exp_avg_sq']) - - if group['amsgrad']: - max_exp_avg_sqs.append(state['max_exp_avg_sq']) - - # update the steps for each param group update - state['step'] += 1 - # record the step after step update - state_steps.append(state['step']) - - jt.optim.Adam(params_with_grad, - grads, - exp_avgs, - exp_avg_sqs, - max_exp_avg_sqs, - state_steps, - amsgrad=group['amsgrad'], - beta1=beta1, - beta2=beta2, - lr=get_lr(self.lr), - weight_decay=group['weight_decay'], - eps=group['eps']) - return loss - - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") + def set(self, weights): if not self.init_optim: self.optimizer_adam = optimizer.Adam( - params=weights, lr=get_lr(self.lr), betas=(self.beta_1, self.beta_2), eps=self.eps, + params=weights, lr=self.lr, betas=self.betas, eps=self.eps, weight_decay=self.weight_decay ) self.init_optim = True - self.optimizer_adam.zero_grad() - self.optimizer_adam.step(loss) - if self.grad_clip is not None: - self.grad_clip(weights) + def zero_grad(self): + self.optimizer_adam.zero_grad() - if return_grad ==True: - return _grads(weights) - else: - return None + def step(self, loss=None): + self.optimizer_adam.step(loss) +class AdamW(object): + def __init__( + self, + lr=0.001, + beta_1=0.9, + beta_2=0.999, + eps=1e-8, + weight_decay=0.01, + grad_clip=None + ): + + self.lr = lr + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.betas = (beta_1, beta_2) + self.eps = eps + self.init_optim = False + self.weight_decay = weight_decay + self.grad_clip = grad_clip - def step(self, loss=None): - self.optimizer.step(loss) + def set(self, weights): + if not self.init_optim: + self.optimizer_adamw = optimizer.AdamW( + params=weights, lr=self.lr, betas=self.betas, eps=self.eps, + weight_decay=self.weight_decay + ) + self.init_optim = True def zero_grad(self): - self.optimizer.zero_grad() - -class AdamW(object): - def __init__(self, params, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=0.0): - self.optimizer = optimizer.AdamW(params, lr=lr, eps=eps, betas=(beta_1, beta_2), weight_decay=weight_decay) + self.optimizer_adamw.zero_grad() def step(self, loss=None): - self.optimizer.step(loss) + self.optimizer_adamw.step(loss) - def zero_grad(self): - self.optimizer.zero_grad() class Adan(object): - def __init__(self, params, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=0.0): - self.optimizer = optimizer.Adan(params, lr=lr, eps=eps, betas=(beta_1, beta_2), weight_decay=weight_decay) + def __init__( + self, + lr=0.001, + beta_1=0.9, + beta_2=0.999, + beta_3=0.99, + eps=1e-8, + weight_decay=0.0, + grad_clip=None + ): + + self.lr = lr + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.beta_3 = beta_3 + self.betas = (beta_1, beta_2, beta_3) + self.eps = eps + self.init_optim = False + self.weight_decay = weight_decay + self.grad_clip = grad_clip - def step(self, loss=None): - self.optimizer.step(loss) + def set(self, weights): + if not self.init_optim: + self.optimizer_adan = optimizer.Adan( + params=weights, lr=self.lr, betas=self.betas, eps=self.eps, + weight_decay=self.weight_decay + ) + self.init_optim = True def zero_grad(self): - self.optimizer.zero_grad() + self.optimizer_adan.zero_grad() + + def step(self, loss=None): + self.optimizer_adan.step(loss) + class Adamax(object): @@ -204,201 +172,81 @@ def gradient(self, train_weights=None): class 
RMSprop(object): - def __init__( - self, - lr=0.001, - rho=0.99, - momentum=0.0, - eps=1e-08, - centered=False, - weight_decay=0.0, - grad_clip=None, - ): + self, + lr=0.001, + eps=1e-8, + alpha=0.99, + # weight_decay=0.0, + grad_clip=None + ): + self.lr = lr - self.rho = rho - self.momentum = momentum self.eps = eps - self.centered = centered + self.alpha = alpha self.init_optim = False - self.weight_decay = weight_decay + # self.weight_decay = weight_decay self.grad_clip = grad_clip - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Can not apply gradients before zero_grad call.") - - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for group in self.optimizer_rmsprop.param_groups: - params_with_grad = [] - grads = [] - square_avgs = [] - grad_avgs = [] - momentum_buffer_list = [] - - for p in group['params']: - if p.grad is None: - continue - params_with_grad.append(p) - - if p.grad.is_sparse: - raise RuntimeError('RMSprop does not support sparse gradients') - grads.append(p.grad) - - state = self.optimizer_rmsprop.state[p] - - # State initialization - if len(state) == 0: - state['step'] = 0 - state['square_avg'] = jt.zeros_like(p) - if group['momentum'] > 0: - state['momentum_buffer'] = jt.zeros_like(p) - if group['centered']: - state['grad_avg'] = jt.zeros_like(p) - - square_avgs.append(state['square_avg']) - - if group['momentum'] > 0: - momentum_buffer_list.append(state['momentum_buffer']) - if group['centered']: - grad_avgs.append(state['grad_avg']) - - state['step'] += 1 - - optimizer.RMSprop(params_with_grad, - grads, - square_avgs, - grad_avgs, - momentum_buffer_list, - lr=get_lr(self.lr), - alpha=group['alpha'], - eps=group['eps'], - weight_decay=group['weight_decay'], - momentum=group['momentum'], - centered=group['centered']) - - return loss - - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") + def set(self, weights): if not self.init_optim: self.optimizer_rmsprop = optimizer.RMSprop( - params=weights, lr=get_lr(self.lr), alpha=self.rho, eps=self.eps, momentum=self.momentum, - centered=self.centered, weight_decay=self.weight_decay + params=weights, lr=self.lr, eps=self.eps, alpha=self.alpha, ) self.init_optim = True + + def zero_grad(self): self.optimizer_rmsprop.zero_grad() - loss.backward() - if self.grad_clip is not None: - self.grad_clip(weights) + def step(self, loss=None): + self.optimizer_rmsprop.step(loss) - if return_grad ==True: - return _grads(weights) - else: - return None class SGD(object): - def __init__( - self, - lr=0.001, - momentum=0, - weight_decay=0.0, - grad_clip=None, - ): + self, + lr=0.01, + momentum=0.0, + weight_decay=0.0, + dampening=0.0, + nesterov=False, + grad_clip=None + ): + self.lr = lr self.momentum = momentum - self.init_optim = False self.weight_decay = weight_decay + self.dampening = dampening + self.nesterov = nesterov + self.init_optim = False self.grad_clip = grad_clip - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Can not apply gradients before zero_grad call.") - - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for group in self.optimizer_sgd.param_groups: - params_with_grad = [] - d_p_list = [] - momentum_buffer_list = [] - weight_decay = group['weight_decay'] - momentum = group['momentum'] - dampening 
= group['dampening'] - nesterov = group['nesterov'] - lr = get_lr(self.lr) - - for p in group['params']: - if p.grad is not None: - params_with_grad.append(p) - d_p_list.append(p.grad) - - state = self.optimizer_sgd.state[p] - if 'momentum_buffer' not in state: - momentum_buffer_list.append(None) - else: - momentum_buffer_list.append(state['momentum_buffer']) - - optimizer.SGD(params_with_grad, - d_p_list, - momentum_buffer_list, - weight_decay=weight_decay, - momentum=momentum, - lr=lr, - dampening=dampening, - nesterov=nesterov) - - # update momentum_buffers in state - for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): - state = self.optimizer_sgd.state[p] - state['momentum_buffer'] = momentum_buffer - - return loss - - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") + def set(self, weights): if not self.init_optim: self.optimizer_sgd = optimizer.SGD( - params=weights, lr=get_lr(self.lr), momentum=self.momentum, weight_decay=self.weight_decay + params=weights, lr=self.lr, momentum=self.momentum, weight_decay=self.weight_decay, + dampening=self.dampening, nesterov=self.nesterov ) self.init_optim = True - self.optimizer_sgd.zero_grad() - loss.backward() - - if self.grad_clip is not None: - self.grad_clip(weights) - if return_grad ==True: - return _grads(weights) - else: - return None + def zero_grad(self): + self.optimizer_sgd.zero_grad() + def step(self, loss=None): + self.optimizer_sgd.step(loss) class Momentum(object): - def __init__( - self, - params, # Add params to the constructor - lr=0.001, - momentum=0.9, - weight_decay=0.0, - nesterov=False, - grad_clip=None, - ): + self, + lr=0.001, + momentum=0.9, + weight_decay=0.0, + nesterov=False, + grad_clip=None + ): + self.lr = lr self.momentum = momentum self.weight_decay = weight_decay @@ -406,76 +254,19 @@ def __init__( self.grad_clip = grad_clip self.init_optim = False - self.optimizer = optimizer.SGD( # Initialize the Jittor SGD optimizer - params, lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov - ) - - @jt.no_grad() - def apply_gradients(self, grads_and_vars=None, closure=None): - if not self.init_optim: - raise AttributeError("Cannot apply gradients before zero_grad call.") - - loss = None - if closure is not None: - with jt.enable_grad(): - loss = closure() - - for group in self.optimizer.param_groups: - params_with_grad = [] - d_p_list = [] - momentum_buffer_list = [] - - for p in group['params']: - if p.grad is not None: - params_with_grad.append(p) - d_p_list.append(p.grad) - - state = self.optimizer.state[p] - if 'momentum_buffer' not in state: - momentum_buffer_list.append(None) - else: - momentum_buffer_list.append(state['momentum_buffer']) - - optimizer.SGD(params_with_grad, - d_p_list, - momentum_buffer_list, - weight_decay=group['weight_decay'], - momentum=group['momentum'], - lr=self.lr, - dampening=group['dampening'], - nesterov=group['nesterov']) - - # Update momentum_buffers in state - for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): - state = self.optimizer.state[p] - state['momentum_buffer'] = momentum_buffer - - return loss - - def gradient(self, loss, weights=None, return_grad=True): - if weights is None: - raise AttributeError("Parameter train_weights must be entered.") + def set(self, weights): if not self.init_optim: - self.optimizer = optimizer.SGD( + self.optimizer_momentum = optimizer.SGD( params=weights, lr=self.lr, momentum=self.momentum, 
weight_decay=self.weight_decay, nesterov=self.nesterov ) self.init_optim = True - self.optimizer.zero_grad() - loss.backward() - if self.grad_clip is not None: - self.grad_clip(weights) - - if return_grad: - return _grads(weights) - else: - return None + def zero_grad(self): + self.optimizer_momentum.zero_grad() def step(self, loss=None): - self.optimizer.step(loss) + self.optimizer_momentum.step(loss) - def zero_grad(self): - self.optimizer.zero_grad() @@ -487,10 +278,10 @@ def LARS(**kwargs): raise Exception('LARS optimizer function not implemented') -def _grads(weights, optimizer_adam): +def _grads(weights, optimizer): grads = [] for w in weights: - grads.append(w.opt_grad(optimizer_adam)) + grads.append(w.opt_grad(optimizer)) return grads From 42f84ee9b5b7b6c2dd6fc6899a59234f01ac322d Mon Sep 17 00:00:00 2001 From: Brilliant Hanabi Date: Wed, 27 Nov 2024 19:02:22 +0800 Subject: [PATCH 2/4] Fix some bugs in jittor backend implement --- tensorlayerx/backend/ops/jittor_backend.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorlayerx/backend/ops/jittor_backend.py b/tensorlayerx/backend/ops/jittor_backend.py index bbce67f..e74acb7 100644 --- a/tensorlayerx/backend/ops/jittor_backend.py +++ b/tensorlayerx/backend/ops/jittor_backend.py @@ -1179,7 +1179,7 @@ def __call__(self, x, y): class CountNonzero(object): - def __init__(self, keepdims=None, dtype=None): + def __init__(self, keepdims=None, dtype="float32"): self.keepdims = keepdims self.dtype = dtype @@ -1354,7 +1354,7 @@ def angle(x): def argmax(x, axis=None, keepdim=False, dtype='int64'): - return jt.argmax(x, dim=axis, keepdim=keepdim) + return jt.argmax(x, dim=axis, keepdims=keepdim) def argmin(x, axis=None, dtype='int64'): @@ -1646,8 +1646,8 @@ def where(condition, x, y): return jt.where(condition,x, y) -def ones_like(x, dtype=None): - return jt.ones_like(x, dtype=dtype) +def ones_like(x): + return jt.ones_like(x) def zeros_like(x, dtype=None): @@ -1734,7 +1734,7 @@ def set_seed(seed): def is_tensor(x): - return isinstance(x, jt.Tensor) + return isinstance(x, jt.Var) def tensor_scatter_nd_update(tensor, indices, updates): tensor = jt.array(tensor) @@ -1765,10 +1765,10 @@ def mask_select(x, mask, axis = 0): elif axis == 3: return x[:,:,:, mask] -def eye(n, m=None, dtype=None): +def eye(n, m=None, dtype="float32"): if m is None: m = n - return jt.init.eye((n,m), dtype =dtype) + return jt.init.eye((n,m), dtype=dtype) def einsum(equation, *operands): From 613b577b6badc6d6dda410a021a571c13d0feb48 Mon Sep 17 00:00:00 2001 From: Hisham Date: Wed, 27 Nov 2024 19:22:54 +0800 Subject: [PATCH 3/4] fixed some issues with jittor nn --- tensorlayerx/backend/ops/jittor_nn.py | 165 ++++++++++++++++++-------- 1 file changed, 113 insertions(+), 52 deletions(-) diff --git a/tensorlayerx/backend/ops/jittor_nn.py b/tensorlayerx/backend/ops/jittor_nn.py index 985f3f0..40e2438 100644 --- a/tensorlayerx/backend/ops/jittor_nn.py +++ b/tensorlayerx/backend/ops/jittor_nn.py @@ -88,11 +88,17 @@ def preprocess_1d_format(data_format, padding): data_format = "NLC" elif data_format in ["channels_first", "NCW", "NCL"]: data_format = "NCL" - elif data_format == None: + elif data_format is None: data_format = None else: raise Exception("Unsupported data format: " + str(data_format)) + padding = padding_format(padding) + # Convert padding to numerical representation for arithmetic operations + if padding == "same": + padding = 1 + elif padding == "valid": + padding = 0 return data_format, padding @@ -634,18 +640,25 @@ def 
same_padding(input, weight, strides, dilations): return rows_odd, cols_odd, depth_odd, padding_rows, padding_cols, padding_depth + class Conv2D(object): def __init__(self, strides, padding, data_format='NHWC', dilations=None, out_channel=None, k_size=None, groups=1): self.data_format, self.padding = preprocess_2d_format(data_format, padding) + + # Ensure strides is a tuple/list of length 2 with non-zero values + if len(strides) != 2 or strides[0] == 0 or strides[1] == 0: + raise ValueError("Stride values must be greater than zero and of length 2") + + # Adjust the strides and dilations for the data format if self.data_format == 'NHWC': - self.strides = (strides[1], strides[2]) + self.strides = (strides[0], strides[1]) self.dilations = (dilations[0], dilations[1]) elif self.data_format == 'NCHW': - self.strides = (strides[1], strides[2]) - self.dilations = (dilations[1], dilations[2]) - self.groups = groups + self.strides = (strides[0], strides[1]) + self.dilations = (dilations[0], dilations[1]) + self.groups = groups def __call__(self, input, filters): @@ -847,8 +860,7 @@ def __call__(self, *args, **kwargs): class MaxPool(object): - - def __init__(self, ksize, strides, padding, return_mask = False, data_format=None): + def __init__(self, ksize, strides, padding, return_mask=False, data_format=None): self.ksize = ksize self.strides = strides self.return_mask = return_mask @@ -863,6 +875,7 @@ def __init__(self, ksize, strides, padding, return_mask = False, data_format=Non def __call__(self, inputs): if self.data_format == 'channels_last': inputs = nhwc_to_nchw(inputs) + if len(inputs.shape) == 2 or len(inputs.shape) == 3: raise NotImplementedError @@ -872,6 +885,7 @@ def __call__(self, inputs): else: out = nn.max_pool2d(inputs, self.ksize, self.strides, padding=self.padding, return_indices=self.return_mask) + if len(inputs.shape) == 5: if self.padding in ['SAME', 'same']: out = self.maxpool3d_same_padding(inputs) @@ -879,6 +893,7 @@ def __call__(self, inputs): out = nn.max_pool3d(inputs, self.ksize, self.strides, padding=self.padding, return_indices=self.return_mask) + if self.data_format == 'channels_last': if self.return_mask: outputs = [None, None] @@ -891,6 +906,7 @@ def __call__(self, inputs): return out + def maxpool2d_same_padding(self, input): rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, self.ksize, self.strides, (1, 1)) if rows_odd or cols_odd: @@ -965,7 +981,6 @@ def __call__(self, *args, **kwargs): raise NotImplementedError("AvgPool1d is not implemented in Jittor backend") - class AvgPool(object): def __init__(self, ksize, strides, padding, data_format=None): @@ -994,7 +1009,7 @@ def __call__(self, inputs): if self.padding in ['SAME', 'same']: out = self.avgpool3d_same_padding(inputs) else: - out = nn.AvgPool2d(inputs, self.ksize, self.strides, padding=self.padding) + out = nn.AvgPool3d(inputs, self.ksize, self.strides, padding=self.padding) if self.data_format == 'channels_last': return nchw_to_nhwc(out) @@ -1002,6 +1017,7 @@ def __call__(self, inputs): return out + def avgpool2d_same_padding(self, input): rows_odd, cols_odd, padding_rows, padding_cols = same_padding(input, self.ksize, self.strides, (1, 1)) if rows_odd or cols_odd: @@ -1065,7 +1081,7 @@ def avg_pool2d(input, kernel_size, stride=None, padding=0, data_format='NCHW'): def avg_pool3d(input, kernel_size, stride=None, padding=0, data_format='NCDHW'): data_format, padding = preprocess_3d_format(data_format, padding) - avg_pool_obj = AvgPool(kernel_size, stride, padding, data_format) + 
avg_pool_obj = AvgPool3d(kernel_size, stride, padding) return avg_pool_obj(input) class MaxPool3d(object): @@ -1149,7 +1165,6 @@ def __call__(self, inputs): # avg_pool_obj = AvgPool(ksize, strides, padding, data_format) # return avg_pool_obj(input) - def pool(input, window_shape, pooling_type, strides=None, padding='VALID', data_format=None, dilations=None, name=None): """ Performs an N-D pooling operation. @@ -1158,8 +1173,6 @@ def pool(input, window_shape, pooling_type, strides=None, padding='VALID', data_ ---------- input : tensor Tensor of rank N+2, of shape [batch_size] + input_spatial_shape + [num_channels] - if data_format does not start with "NC" (default), or [batch_size, num_channels] + input_spatial_shape - if data_format starts with "NC". Pooling happens over the spatial dimensions only. window_shape : int Sequence of N ints >= 1. pooling_type : string @@ -1168,12 +1181,9 @@ def pool(input, window_shape, pooling_type, strides=None, padding='VALID', data_ Sequence of N ints >= 1. Defaults to [1]*N. If any value of strides is > 1, then all values of dilation_rate must be 1. padding : string The padding algorithm, must be "SAME" or "VALID". Defaults to "SAME". - See the "returns" section of tf.ops.convolution for details. data_format : string Specifies whether the channel dimension of the input and output is the last dimension (default, or if data_format does not start with "NC"), or the second dimension (if data_format starts with "NC"). - For N=1, the valid values are "NWC" (default) and "NCW". For N=2, the valid values are "NHWC" (default) and "NCHW". - For N=3, the valid values are "NDHWC" (default) and "NCDHW". dilations : list of ints Dilation rate. List of N ints >= 1. Defaults to [1]*N. If any value of dilation_rate is > 1, then all values of strides must be 1. 
name : string @@ -1193,7 +1203,6 @@ def pool(input, window_shape, pooling_type, strides=None, padding='VALID', data_ return pool_obj(input) - class DepthwiseConv2d(object): def __init__(self, strides, padding, data_format=None, dilations=None, ksize=None, channel_multiplier=1, in_channels=None): @@ -2023,58 +2032,97 @@ def __call__(self, input, hx=None): -class lstmcell(object): - def __init__(self, weight_ih, weight_hh, bias_ih, bias_hh): - self.weight_ih = weight_ih - self.weight_hh = weight_hh - self.bias_ih = bias_ih - self.bias_hh = bias_hh - def __call__(self, input, h, c): - gates = jt.matmul(input, jt.transpose(self.weight_ih)) + jt.matmul(h, jt.transpose(self.weight_hh)) +class lstmcell(Module): + def __init__(self, weight_ih, weight_hh, bias_ih=None, bias_hh=None): + super(lstmcell, self).__init__() + + self.weight_ih = weight_ih # Shape: [input_size, 4 * hidden_size] + self.weight_hh = weight_hh # Shape: [hidden_size, 4 * hidden_size] + self.bias_ih = bias_ih if bias_ih is not None else jt.ones(4 * weight_ih.shape[1]) # Bias for input-to-hidden + self.bias_hh = bias_hh if bias_hh is not None else jt.ones(4 * weight_hh.shape[1]) # Bias for hidden-to-hidden + + # Extract input_size and hidden_size from the weight shapes + self.input_size = weight_ih.shape[0] + self.hidden_size = weight_hh.shape[0] + + def execute(self, input, h, c): + + gates_input = jt.matmul(input, self.weight_ih) # [batch_size, 4 * hidden_size] + gates_hidden = jt.matmul(h, self.weight_hh) # [batch_size, 4 * hidden_size] + + gates = gates_input + gates_hidden + + # Add bias terms if provided if self.bias_ih is not None: gates += self.bias_ih + self.bias_hh i, f, g, o = jt.chunk(gates, 4, dim=1) - i = jt.sigmoid(i) - f = jt.sigmoid(f) - g = jt.tanh(g) - o = jt.sigmoid(o) - c_new = f * c + i * g - h_new = o * jt.tanh(c_new) - return h_new, h_new, c_new + # Apply activations to the gates + i = jt.sigmoid(i) # Input gate + f = jt.sigmoid(f) # Forget gate + g = jt.tanh(g) # Cell gate (candidate cell state) + o = jt.sigmoid(o) # Output gate + + # Compute new cell state + c_new = f * c + i * g # Cell state update + + # Compute new hidden state + h_new = o * jt.tanh(c_new) # Hidden state + + return h_new, h_new, c_new # Return hidden state and cell state + + + class grucell(Module): def __init__(self, weight_ih, weight_hh, bias_ih=None, bias_hh=None): super(grucell, self).__init__() - self.weight_ih = weight_ih - self.weight_hh = weight_hh - self.bias_ih = bias_ih - self.bias_hh = bias_hh - self.hidden_size = weight_hh.shape[1] - def execute(self, inputs, states): - hx = states[0] if isinstance(states, (tuple, list)) else states - gates = jt.matmul(inputs, self.weight_ih.t()) + jt.matmul(hx, self.weight_hh.t()) - if self.bias_ih is not None and self.bias_hh is not None: - gates += self.bias_ih + self.bias_hh + # Initialize the weights and biases + self.weight_ih = weight_ih # Shape: [input_size, 3 * hidden_size] + self.weight_hh = weight_hh # Shape: [hidden_size, 3 * hidden_size] + self.bias_ih = bias_ih if bias_ih is not None else jt.ones(weight_ih.shape[1]) # Bias for input-to-hidden + self.bias_hh = bias_hh if bias_hh is not None else jt.ones(weight_hh.shape[1]) # Bias for hidden-to-hidden + + # Extract input_size and hidden_size from weight shapes + self.input_size = weight_ih.shape[0] + self.hidden_size = weight_hh.shape[0] + + def execute(self, inputs, hx): + """ + Args: + - inputs: Input tensor [batch_size, input_size] + - hx: Previous hidden state [batch_size, hidden_size] - # Separate the gates - r, z, n = 
jt.chunk(gates, 3, dim=1) - - r = jt.sigmoid(r) - z = jt.sigmoid(z) - n = jt.tanh(n + r * (jt.matmul(hx, self.weight_hh[2 * self.hidden_size:].t()) + (self.bias_hh[2 * self.hidden_size:] if self.bias_hh is not None else 0))) - hy = (1 - z) * n + z * hx + Returns: + - hy: New hidden state [batch_size, hidden_size] + - hy_new: New hidden state (same as hy) for consistency + """ + + # Split the weights for the gates (GRU uses 3 * hidden_size) + weight_ih_r, weight_ih_z, weight_ih_h = jt.split(self.weight_ih, 3, dim=1) + weight_hh_r, weight_hh_z, weight_hh_h = jt.split(self.weight_hh, 3, dim=1) - return hy, hy + # Bias terms for reset, update, and candidate hidden states + bias_ih_r, bias_ih_z, bias_ih_h = jt.split(self.bias_ih, 3) + bias_hh_r, bias_hh_z, bias_hh_h = jt.split(self.bias_hh, 3) + # 1. Compute the reset gate (r) + r = jt.sigmoid(jt.matmul(inputs, weight_ih_r) + bias_ih_r + jt.matmul(hx, weight_hh_r) + bias_hh_r) + # 2. Compute the update gate (z) + z = jt.sigmoid(jt.matmul(inputs, weight_ih_z) + bias_ih_z + jt.matmul(hx, weight_hh_z) + bias_hh_z) + # 3. Compute the candidate hidden state (h') + h_hat = jt.tanh(jt.matmul(inputs, weight_ih_h) + bias_ih_h + r * (jt.matmul(hx, weight_hh_h) + bias_hh_h)) + # 4. Compute the new hidden state (h) + hy = (1 - z) * hx + z * h_hat + return hy, hy # Return the new hidden state as both outputs (for consistency) class rnnbase(Module): @@ -2746,9 +2794,22 @@ def swish(input): return NotImplementedError -def linear(input, weight, bias = None): +def linear(input, weight, bias=None): + ''' Custom Linear Layer Implementation ''' + + # Perform matrix multiplication (input * weight^T) + x = jt.matmul(input, weight) # input is of shape [batch, in_features], weight is of shape [in_features, out_features] + + if bias is not None: + # Ensure the bias is correctly reshaped for broadcasting + if bias.ndim == 1: + # Bias should be broadcasted across the batch dimension + bias = bias.reshape(1, -1) # Shape: [1, out_features] + + # Add bias to the result + x = x + bias # Broadcasting bias to match the result shape - return nn.linear(input, weight, bias) + return x def unfold(input, kernel_size, dilation = 1, padding = 0, stride = 1): From d64bfffedc2867367a826a1d57901926792dee20 Mon Sep 17 00:00:00 2001 From: root Date: Thu, 28 Nov 2024 13:27:48 +0800 Subject: [PATCH 4/4] User: moehanabi updated oneflow_backend.py --- tensorlayerx/__init__.py | 1 + tensorlayerx/backend/ops/oneflow_backend.py | 27 ++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tensorlayerx/__init__.py b/tensorlayerx/__init__.py index 481c043..b174e13 100644 --- a/tensorlayerx/__init__.py +++ b/tensorlayerx/__init__.py @@ -40,6 +40,7 @@ 'paddle': '2.2.0', 'torch': '1.10.0', 'jittor': '1.3.8.5', + 'oneflow':'0.9.0' } if BACKEND_VERSION != backend_v[BACKEND]: diff --git a/tensorlayerx/backend/ops/oneflow_backend.py b/tensorlayerx/backend/ops/oneflow_backend.py index b4990b7..47e0b3e 100644 --- a/tensorlayerx/backend/ops/oneflow_backend.py +++ b/tensorlayerx/backend/ops/oneflow_backend.py @@ -179,7 +179,7 @@ def random_uniform(shape, minval=0, maxval=1, dtype=None, seed=None): if seed is not None: flow.manual_seed(seed) else: - flow.manual_seed(flow.random.gen_seed()) + flow.manual_seed(flow.initial_seed()) w = flow.randn(shape, dtype=_dtypeDict[dtype]) out = w.uniform_(minval, maxval) @@ -211,7 +211,7 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): if seed is not None: flow.manual_seed(seed) else: - 
flow.manual_seed(flow.random.gen_seed()) + flow.manual_seed(flow.initial_seed()) return flow.normal(shape, mean=mean, std=stddev, dtype=_dtypeDict[dtype]) @@ -241,7 +241,7 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): if seed is not None: flow.manual_seed(seed) else: - flow.manual_seed(flow.random.gen_seed()) + flow.manual_seed(flow.initial_seed()) w = flow.empty(shape, dtype=_dtypeDict[dtype]) out = nn.init.truncated_normal_(w, mean=mean, std=stddev) @@ -271,7 +271,7 @@ def he_normal(shape, dtype=None, seed=None): if seed is not None: flow.manual_seed(seed) else: - flow.manual_seed(flow.random.gen_seed()) + flow.manual_seed(flow.initial_seed()) w = flow.empty(shape, dtype=_dtypeDict[dtype]) out = nn.init.kaiming_normal_(w) @@ -301,7 +301,7 @@ def he_uniform(shape, dtype=None, seed=None): if seed is not None: flow.manual_seed(seed) else: - flow.manual_seed(flow.random.gen_seed()) + flow.manual_seed(flow.initial_seed()) w = flow.empty(shape, dtype=_dtypeDict[dtype]) out = nn.init.kaiming_uniform_(w) @@ -331,7 +331,7 @@ def xavier_normal(shape, dtype=None, seed=None): if seed is not None: flow.manual_seed(seed) else: - flow.manual_seed(flow.random.gen_seed()) + flow.manual_seed(flow.initial_seed()) w = flow.empty(shape, dtype=_dtypeDict[dtype]) out = nn.init.xavier_normal_(w) @@ -363,7 +363,7 @@ def xavier_uniform(shape, gain=1.0, dtype=None, seed=None): if seed is not None: flow.manual_seed(seed) else: - flow.manual_seed(flow.random.gen_seed()) + flow.manual_seed(flow.initial_seed()) w = flow.empty(shape, dtype=_dtypeDict[dtype]) out = nn.init.xavier_uniform_(w, gain=gain) @@ -674,7 +674,7 @@ def reduce_mean(input_tensor, axis=None, keepdims=False): if axis is not None: return flow.mean(input_tensor, dim=axis, keepdim=keepdims) else: - return flow.mean(input_tensor, keepdim=keepdims) + return flow.mean(input_tensor) class ReduceMax(object): @@ -718,7 +718,7 @@ def reduce_max(input_tensor, axis=None, keepdims=False): if axis is not None: return flow.max(input_tensor, dim=axis, keepdim=keepdims) else: - return flow.max(input_tensor, keepdim=keepdims) + return flow.max(input_tensor) def reduce_min(input_tensor, axis=None, keepdims=False): @@ -1582,11 +1582,11 @@ def count_nonzero(x, axis=None, keepdims=None, dtype="int64"): return convert_to_tensor(non_zero) -def cumprod(x, axis=None, dtype=None, out=None): +def cumprod(x, axis=0, dtype=None, out=None): return flow.cumprod(x, dim=axis) -def cumsum(x, axis=None, dtype=None, out=None): +def cumsum(x, axis=0, dtype=None, out=None): return flow.cumsum(x, dim=axis) def equal(x, y): @@ -1892,7 +1892,7 @@ def mask_select(x, mask, axis = 0): elif axis == 3: return x[:,:,:, mask] -def eye(n, m=None, dtype=None): +def eye(n, m=None, dtype=flow.float32): if m is None: m = n return flow.eye(n, m, dtype=dtype) @@ -2014,5 +2014,4 @@ def flip(x, axis): def mv(x, vec): - raise NotImplementedError - + raise NotImplementedError \ No newline at end of file