SGLang HiCache NIXL Connector (#8488) · sgl-project/sglang@2cd2e27 · GitHub
Commit 2cd2e27

Authored by vvenkates27, mkhazraee, and xiezhq-hermann
SGLang HiCache NIXL Connector (#8488)
Signed-off-by: Vishwanath Venkatesan <vvenkatesan@nvidia.com>
Co-authored-by: Moein Khazraee <moein@nvidia.com>
Co-authored-by: Zhiqiang Xie <xiezhq@stanford.edu>
1 parent 743638b commit 2cd2e27

File tree: 8 files changed, +837 −9 lines

python/sglang/srt/managers/cache_controller.py: 11 additions, 2 deletions

```diff
@@ -265,6 +265,11 @@ def __init__(
         if storage_backend == "file":
             self.storage_backend = HiCacheFile()
             self.get_hash_str = get_hash_str
+        elif storage_backend == "nixl":
+            from sglang.srt.mem_cache.nixl.hicache_nixl import HiCacheNixl
+
+            self.storage_backend = HiCacheNixl()
+            self.get_hash_str = get_hash_str
         elif storage_backend == "mooncake":
             self.storage_backend = MooncakeStore()
             self.get_hash_str = get_hash_str_mooncake
@@ -545,7 +550,11 @@ def terminate_prefetch(self, operation):
     def generic_page_transfer(self, operation, batch_size=8):
         for i in range(0, len(operation.hash_value), batch_size):
             page_hashes = operation.hash_value[i : i + batch_size]
-            page_data = self.storage_backend.batch_get(page_hashes)
+            # todo: zero copy
+            dummy_page_dst = [self.mem_pool_host.get_dummy_flat_data_page()] * len(
+                page_hashes
+            )
+            page_data = self.storage_backend.batch_get(page_hashes, dummy_page_dst)
             if page_data is None:
                 logger.warning(
                     f"Prefetch operation {operation.request_id} failed to retrieve page {page_hashes}."
@@ -679,7 +688,7 @@ def generic_page_backup(self, operation, batch_size=8):
         for i in range(0, len(operation.hash_value), batch_size):
             page_hashes = operation.hash_value[i : i + batch_size]
             page_data = [
-                self.mem_pool_host.get_flat_data_pages(
+                self.mem_pool_host.get_flat_data_page(
                     operation.host_indices[j * self.page_size]
                 )
                 for j in range(i, i + len(page_hashes))
```
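The prefetch change above switches `batch_get` to a destination-buffer contract: the controller now preallocates one flat page per hash and the backend fills it in place, instead of the backend allocating return tensors. A minimal sketch of that contract, with a hypothetical in-memory backend standing in for `HiCacheNixl`:

```python
import torch


class InMemoryBackend:
    """Hypothetical stand-in illustrating the batch_get(keys, dests) contract."""

    def __init__(self):
        self.store = {}

    def batch_set(self, keys, values):
        for key, value in zip(keys, values):
            self.store[key] = value.clone()

    def batch_get(self, keys, dest_buffers):
        # Fill each caller-provided destination buffer in place.
        for key, dst in zip(keys, dest_buffers):
            if key not in self.store:
                return None  # mirrors the failure path logged above
            dst.copy_(self.store[key])
        return dest_buffers


backend = InMemoryBackend()
page = torch.randn(1024)
backend.batch_set(["hash_0"], [page])

# As in generic_page_transfer: one preallocated flat page per hash.
dests = [torch.empty_like(page)]
assert backend.batch_get(["hash_0"], dests) is not None
assert torch.equal(dests[0], page)
```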

python/sglang/srt/mem_cache/hicache_storage.py: 15 additions, 6 deletions

```diff
@@ -123,13 +123,22 @@ def get(
         key = self._get_suffixed_key(key)
         tensor_path = os.path.join(self.file_path, f"{key}.bin")
         try:
-            # todo: fixing the target_location logic to enable in-place loading
-            loaded_tensor = torch.load(tensor_path)
-            if isinstance(loaded_tensor, torch.Tensor):
-                return loaded_tensor
+            if target_location is not None:
+                # Load directly into target_location's memory buffer
+                with open(tensor_path, "rb") as f:
+                    target_location.set_(
+                        torch.frombuffer(f.read(), dtype=target_location.dtype)
+                        .reshape(target_location.shape)
+                        .storage()
+                    )
+                return target_location
             else:
-                logger.error(f"Loaded data for key {key} is not a tensor.")
-                return None
+                loaded_tensor = torch.load(tensor_path)
+                if isinstance(loaded_tensor, torch.Tensor):
+                    return loaded_tensor
+                else:
+                    logger.error(f"Loaded data for key {key} is not a tensor.")
+                    return None
         except FileNotFoundError:
             return None
 
```
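When `target_location` is provided, the bytes on disk are read straight into the caller's existing buffer rather than into a fresh tensor from `torch.load`. A standalone sketch of the same idea (file path and sizes invented for illustration; this variant uses `copy_` rather than `set_`):

```python
import torch

# Persist a tensor's raw bytes, then load them back in place.
src = torch.arange(8, dtype=torch.float32)
with open("/tmp/kv_page.bin", "wb") as f:
    f.write(src.numpy().tobytes())

dst = torch.empty(8, dtype=torch.float32)  # preallocated target_location
with open("/tmp/kv_page.bin", "rb") as f:
    # frombuffer wraps the bytes without copying; copy_ then writes them
    # into dst's existing storage, so no new tensor is allocated for dst.
    dst.copy_(torch.frombuffer(bytearray(f.read()), dtype=dst.dtype).reshape(dst.shape))

assert torch.equal(src, dst)
```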

python/sglang/srt/mem_cache/memory_pool_host.py: 29 additions, 0 deletions

```diff
@@ -105,6 +105,14 @@ def get_flat_data_page(self, index) -> torch.Tensor:
         """
         raise NotImplementedError()
 
+    @abc.abstractmethod
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        """
+        Get a dummy flat data page from the host memory pool.
+        This is used for prefetching or initializing empty pages.
+        """
+        raise NotImplementedError()
+
     @abc.abstractmethod
     def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
         """
@@ -256,6 +264,14 @@ def init_kv_buffer(self):
     def get_flat_data_page(self, index) -> torch.Tensor:
         return self.kv_buffer[:, :, index : index + self.page_size, :, :].flatten()
 
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        return torch.zeros(
+            (2, self.layer_num, self.page_size, self.head_num, self.head_dim),
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+        ).flatten()
+
     def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
         self.kv_buffer[:, :, index : index + self.page_size, :, :] = data_page.reshape(
             2,
@@ -355,6 +371,19 @@ def init_kv_buffer(self):
     def get_flat_data_page(self, index) -> torch.Tensor:
         return self.kv_buffer[:, index : index + self.page_size, :, :].flatten()
 
+    def get_dummy_flat_data_page(self) -> torch.Tensor:
+        return torch.zeros(
+            (
+                self.layer_num,
+                self.page_size,
+                1,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+            ),
+            dtype=self.dtype,
+            device=self.device,
+            pin_memory=self.pin_memory,
+        ).flatten()
+
     def set_from_flat_data_page(self, index: int, data_page: torch.Tensor) -> None:
         self.kv_buffer[:, index : index + self.page_size, :, :] = data_page.reshape(
             self.layer_num,
```
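Each dummy page mirrors the flattened layout of a real page so it can serve as a `batch_get` destination. For the MHA pool, the flat length is `2 * layer_num * page_size * head_num * head_dim` (K and V for every layer, token slot, and head); a quick check with made-up dimensions:

```python
import torch

# Illustrative dimensions only; real values come from the model config.
layer_num, page_size, head_num, head_dim = 32, 64, 8, 128

# Same shape-then-flatten pattern as get_dummy_flat_data_page above.
dummy = torch.zeros(
    (2, layer_num, page_size, head_num, head_dim), dtype=torch.float16
).flatten()
assert dummy.numel() == 2 * layer_num * page_size * head_num * head_dim
```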
python/sglang/srt/mem_cache/nixl/README.md: 164 additions, 0 deletions (new file)

# NIXL Integration for HiCache

This directory contains the **NIXL (NVIDIA Inference Xfer Library)** integration for **HiCache**, enabling high-performance storage across multiple backends.

NIXL provides a unified API for accessing various storage plugins, including but not limited to:

- **Deepseek's 3FS APIs** for high-throughput file operations
- **GPU Direct Storage (GDS)** for direct data movement between storage and GPU memory, bypassing CPU memory copies
- **Amazon S3-compatible object storage** for key-value access patterns

Additional backend integrations are planned for future releases.

## NIXL Resources

- **Project Repository**: [NIXL on GitHub](https://github.com/ai-dynamo/nixl)
- **Documentation**: [NIXL Documentation](https://github.com/ai-dynamo/nixl/tree/main/docs)

## Overview

The NIXL integration consists of two main files:

- **`hicache_nixl.py`** - Main HiCache storage connector using NIXL
- **`nixl_utils.py`** - Utility classes for backend selection, registration, and file management

## Components

### HiCacheNixl
The main storage connector that provides:
- Single and batch tensor set/get operations
- Automatic backend selection (3FS > POSIX > GDS_MT > GDS > OBJ)
- High-performance file- or object-based storage access using NIXL

### NixlUtils
Consolidated utility classes:
- **NixlBackendSelection** - Handles backend selection and creation
- **NixlRegistration** - Manages memory registration for tensors, files, and objects
- **NixlFileManager** - Handles file system operations and NIXL tuple creation
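For orientation, a hypothetical usage sketch following the storage interface visible in the diffs above (the `set`/`batch_get` names and signatures are inferred from `cache_controller.py`; exact APIs may differ):

```python
import torch

from sglang.srt.mem_cache.nixl.hicache_nixl import HiCacheNixl

storage = HiCacheNixl()  # backend auto-selected (3FS > POSIX > GDS_MT > GDS > OBJ)

key = "page_hash_0"
page = torch.randn(256)

# Store one page, then read it back into a preallocated destination,
# as generic_page_transfer does with dummy flat data pages.
storage.set(key, page)
dst = torch.empty_like(page)
result = storage.batch_get([key], [dst])  # one destination tensor per key
assert result is not None
```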
## Running Unit Tests

### Prerequisites
- NIXL library installed and available (latest main is required for object-query support)
- PyTorch installed
- Python 3.8+

### Unit Tests from the Project Root
Navigate to the project root directory (`/path/to/sglang`) and run:

#### Run all NIXL tests:
```bash
PYTHONPATH=. python -m pytest test/srt/test_hicache_nixl_storage.py -o asyncio_mode=strict
```

#### Run with verbose output:
```bash
PYTHONPATH=. python -m pytest test/srt/test_hicache_nixl_storage.py -v -o asyncio_mode=strict
```

Note: The `-v` flag provides more detailed output, showing each test case name and its result.

#### Run a specific test:
```bash
PYTHONPATH=. python -m pytest test/srt/test_hicache_nixl_storage.py -v -k test_single_set_get -o asyncio_mode=strict
```

### From the Tests Directory
Navigate to the tests directory and run:

```bash
cd test/srt
PYTHONPATH=../.. python -m pytest test_hicache_nixl_storage.py -o asyncio_mode=strict
```

Note: The `-o asyncio_mode=strict` flag suppresses warnings about asyncio configuration. It is not required for test functionality but produces cleaner output.

## Test Coverage

A test suite for this integration is available at `test_hicache_nixl_storage.py`; it covers:

### HiCache Integration Tests (4 tests)
- Single tensor set/get operations
- Batch tensor set/get operations
- Mixed single and batch operations
- Data integrity for various tensor types

### File Management Tests (5 tests)
- Basic file operations
- NIXL tuple creation
- Error handling in file operations

### Registration Tests (2 tests)
- Tensor registration with memory type detection
- File registration using NIXL tuples

## Expected Output

When tests run successfully, you should see:
- NIXL agent initialization messages
- Backend selection messages (e.g., "Backend POSIX was instantiated")
- Test results with "ok" for passed tests
- A summary showing "Ran X tests in Y seconds" and "OK"

## Troubleshooting

### Import Errors
If you encounter `ModuleNotFoundError`, ensure:
- You're running from the correct directory
- `PYTHONPATH` is set correctly
- The NIXL library is properly installed

### NIXL Errors
If NIXL operations fail:
- Check that NIXL is properly installed
- Verify that the required plugins are available
- Ensure file permissions are correct for test directories

## File Structure

```
python/sglang/srt/mem_cache/nixl/
├── hicache_nixl.py           # Main HiCache storage connector
├── nixl_utils.py             # All NIXL utility classes
├── README.md                 # This file
└── tests/
    └── test_nixl_unified.py  # All tests in one file
```

## Dependencies

- **NIXL**: NVIDIA Inference Xfer Library (version 0.4 or later)
  - Required plugins: POSIX (minimum), 3FS/GDS (optional, for better performance)
  - See the [NIXL Installation Guide](https://github.com/ai-dynamo/nixl/blob/main/README.md#installation)
- **PyTorch**: For tensor operations (version 1.8 or later)
- **Python 3.8+**: For type hints and modern language features

## Supported Features

### Memory Types
- **Tensor side**: multi-dimensional tensors of all numeric types (int32, int64, float32, float64) are supported.
  - Tensors can be on CPU or GPU (as long as a GPU-capable backend such as GDS_MT is available).
  - Currently each tensor is mapped to a single file or object key, but this could be extended to support multiple tensors per file or key.
- **Storage side**: files and objects are supported through their relevant backends (e.g., 3FS or OBJ).
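A quick illustrative round-trip over the dtypes listed above, mimicking how a tensor's raw bytes survive a store-and-load cycle (illustration only; the connector handles this internally):

```python
import torch

for dtype in (torch.int32, torch.int64, torch.float32, torch.float64):
    src = torch.arange(6, dtype=dtype).reshape(2, 3)
    raw = src.numpy().tobytes()  # what would be written to a file/object
    back = torch.frombuffer(bytearray(raw), dtype=dtype).reshape(src.shape)
    assert torch.equal(src, back)
```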
### Backend Priority

The NIXL backend selection follows this priority order (a selection sketch follows this list):

1. **3FS** - Highest performance (if available)
   - Best for high-throughput file operations using Deepseek 3FS APIs
2. **POSIX** - Standard file I/O (fallback)
   - Universal compatibility
   - Good for development and testing; leverages both libaio and liburing
3. **GDS_MT** - Multi-threaded GDS (if available)
   - Optimized for concurrent operations
   - Supports GPU Direct Storage with multiple lightweight threads
4. **GDS** - GPU Direct Storage (if available)
   - Direct GPU-storage data path
   - Best for filesystems benefiting from batch operations and smaller I/Os
5. **OBJ** - Amazon S3-based object storage
   - Key-value based storage

The system automatically selects the best available backend, with POSIX as the default fallback.
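A minimal sketch of that priority-ordered fallback (the helper below is hypothetical; the real logic lives in `nixl_utils.NixlBackendSelection`):

```python
# Plugin names from the priority list above.
PRIORITY = ["3FS", "POSIX", "GDS_MT", "GDS", "OBJ"]


def pick_backend(available_plugins: set) -> str:
    """Return the highest-priority available plugin, defaulting to POSIX."""
    for name in PRIORITY:
        if name in available_plugins:
            return name
    return "POSIX"


assert pick_backend({"POSIX", "GDS"}) == "POSIX"
assert pick_backend({"GDS_MT", "OBJ"}) == "GDS_MT"
```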
## Note

This is v0 of the NIXL connector. Future versions will focus on further performance optimizations, such as memory pre-registration (pre-allocating and registering memory buffers to reduce registration overhead during transfers) and block merging (combining related blocks as offsets within the same file to reduce file operations and improve throughput). These optimizations require changes at a higher layer, as the current HiCache API doesn't expose information such as block relationships or hash patterns that would enable them.
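To make the block-merging idea concrete, a future layout might pack related pages at fixed offsets within one file instead of one file per page; a hypothetical sketch:

```python
import torch

PAGE_BYTES = 4096  # 1024 float32 elements per page


def write_pages(path, pages):
    # Pack each page at a fixed offset within a single file.
    with open(path, "wb") as f:
        for i, page in enumerate(pages):
            f.seek(i * PAGE_BYTES)
            f.write(page.numpy().tobytes())


def read_page(path, i, dtype=torch.float32):
    # One seek+read retrieves a page; related pages share one file.
    with open(path, "rb") as f:
        f.seek(i * PAGE_BYTES)
        return torch.frombuffer(bytearray(f.read(PAGE_BYTES)), dtype=dtype)


pages = [torch.full((1024,), float(i)) for i in range(4)]
write_pages("/tmp/merged_pages.bin", pages)
assert torch.equal(read_page("/tmp/merged_pages.bin", 2), pages[2])
```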
