Document dynamo (#146736)
Many files in dynamo are currently lacking file/module-level documentation, which makes it hard to know what they do at a glance and without digging into the code. This fixes that.

Note: documentation was AI-generated and could be incorrect, please review carefully.

Pull Request resolved: #146736
Approved by: https://github.com/jansel, https://github.com/StrongerXi, https://github.com/anijain2305, https://github.com/zou3519
Raymo111 committed Feb 20, 2025
1 parent 119b096 commit 0541817
Showing 71 changed files with 1,337 additions and 86 deletions.
10 changes: 10 additions & 0 deletions torch/_dynamo/__init__.py
@@ -1,3 +1,13 @@
"""
TorchDynamo is a Python-level JIT compiler designed to make unmodified PyTorch programs faster.
TorchDynamo hooks into the frame evaluation API in CPython (PEP 523) to dynamically modify Python
bytecode right before it is executed. It rewrites Python bytecode in order to extract sequences of
PyTorch operations into an FX Graph which is then just-in-time compiled with a customizable backend.
It creates this FX Graph through bytecode analysis and is designed to mix Python execution with
compiled backends to get the best of both worlds: usability and performance. This allows it to
seamlessly optimize PyTorch programs, including those using modern Python features.
"""

import torch

from . import convert_frame, eval_frame, resume_execution
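For orientation, a minimal sketch of how TorchDynamo is reached in practice through torch.compile; the toy function and shapes below are illustrative and not part of this commit:

```python
import torch

def toy(x, y):
    # TorchDynamo rewrites this function's bytecode, captures the PyTorch ops
    # into an FX graph, and hands that graph to the chosen compiler backend.
    return torch.sin(x) + torch.cos(y)

compiled_toy = torch.compile(toy)  # default backend is "inductor"
out = compiled_toy(torch.randn(8), torch.randn(8))
```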
60 changes: 30 additions & 30 deletions torch/_dynamo/_trace_wrapped_higher_order_op.py
@@ -1,3 +1,33 @@
"""trace_wrapped(*args, fn) is equivalent to fn(*args), but with a twist:
if you make_fx trace through this call, we will not actually trace into fn; instead,
we will directly insert it as a call_function to fn in the graph.
(Unlike make_fx, Dynamo WILL inline into fn.)
You can think of this as a one-off allow_in_graph equivalent for proxy tensor tracing.
Because proxy tensor tracing does not actually run the function, there are
requirements on the behavior of fn. We are still figuring it out, but here is the current state:
1) fn SHOULD only take a single argument, which must be a tensor
2) fn MUST return a new tensor with the same metadata as the original tensor
(e.g., zeros_like(input) is a permissible implementation of fn).
This is verified via an extra assert that is inserted into the traced graph.
3) fn MAY have side effects, but it MAY NOT perform metadata mutation on other tensors
participating in proxy tensor tracing (it MAY mutate other tensors, it MAY mutate Python state)
These requirements stem from the need to continue performing proxy tensor tracing,
which assumes accurate fake tensor metadata, without actually running fn.
In the future, we may allow for a "meta" function associated with fn to allow for more interesting input-output patterns.
Note that tensors / Python state are allowed to be mutated.
This relaxed constraint is not always sound, but it is sound for backward tracing with fake
tensors as it takes place in AOTAutograd, because the backward pass is guaranteed not to depend on concrete
tensor values (via fake tensor) or Python state (the autograd engine doesn't depend on Python).
The intended use case for this function is to allow AOTAutograd to defer complex
backward hooks to compiled autograd. AOTAutograd performs a make_fx trace which preserves
the function call as-is in the graph, and only when Dynamo traces through the backward graph in
compiled autograd do we inline into the function.
"""

from typing import Any, Optional

import torch
@@ -19,36 +49,6 @@
__all__ = ["trace_wrapped"]


# trace_wrapped(*args, fn) is equivalent to fn(*args), but with a twist:
# if you make_fx trace through this call, we will not actually trace into fn; instead,
# we will directly insert it as a call_function to fn in the graph.
# (Unlike make_fx, Dynamo WILL inline into fn.)
# You can think of this as a one off allow_in_graph equivalent for proxy tensor tracing.
#
# Because proxy tensor tracing does not actually run the function, there are
# requirements on the behavior of fn. We are still figuring it out, but here is the current state:
#
# 1) fn SHOULD only take a single argument, which must be a tensor
# 2) fn MUST return a new tensor with the same metadata as the original tensor
# (e.g., zeros_like(input) is a permissible implementation of fn).
# This is verified via an extra assert that is inserted into the traced graph.
# 3) fn MAY have side effects, but it MAY NOT perform metadata mutation on other tensors
# participating in proxy tensor tracing (it MAY mutate other tensors, it MAY mutate Python state)
# These requirements stem from the requirement that we need to continue performing proxy tensor tracing,
# which assumes accurate fake tensor metadata, without actually running fn.
# In the future, we may allow for a "meta" function associated with fn to allow for more interesting input-output patterns.
#
# Note that tensors / Python state are allowed to be mutated.
# This is relaxed constraint is not always sound, but it is sound for backward tracing with fake
# tensors as it takes place in AOTAutograd, as the backward pass is guaranteed not to depend on concrete
# tensor values (via fake tensor) or Python state (because the autograd engine doesn't depend on Python).
#
# The intended use case for this function is to allow AOTAutograd to defer complex
# backward hooks to compiled autograd. AOTAutograd performs a make_fx trace which preserves
# the function call as is in the graph, and only when we Dynamo through the backward graph in
# compiled autograd do we inline into the function.


if not torch._running_with_deploy():
# torch.library.custom_op does not work with torch.deploy/multipy
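A hedged sketch of an fn that satisfies the constraints in the docstring above; hook_fn is an illustrative name, and the fn= keyword simply follows the trace_wrapped(*args, fn) signature described there:

```python
import torch

def hook_fn(grad: torch.Tensor) -> torch.Tensor:
    # Single tensor argument in, fresh tensor with identical metadata out;
    # zeros_like is the canonical permissible implementation mentioned above.
    # Python-level side effects are allowed, but metadata mutation of other
    # traced tensors is not.
    return torch.zeros_like(grad)

# Used as described above, e.g. out = trace_wrapped(grad, fn=hook_fn):
# make_fx keeps hook_fn as an opaque call_function node, while Dynamo
# (in compiled autograd) later inlines into it.
```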

18 changes: 18 additions & 0 deletions torch/_dynamo/backends/common.py
@@ -1,5 +1,23 @@
# mypy: ignore-errors

"""
This module provides common utilities and base classes for TorchDynamo backends.
Key components:
- AotAutograd: Base class for implementing AOT (Ahead-of-Time) autograd backends
- Backend utilities for handling:
- Fake tensor conversion
- Device/dtype detection from inputs
- Memory efficient fusion
- Graph flattening
- Common compiler configurations
The utilities here are used by various backend implementations to handle
common operations and provide consistent behavior across different backends.
AOT autograd functionality is particularly important as it enables ahead-of-time
optimization of both forward and backward passes.
"""

import contextlib
import functools
import logging
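A hedged sketch of wiring these utilities into a custom backend, following the pattern the builtin aot_* backends use; my_compiler is illustrative, and the aot_autograd/make_boxed_func helpers are assumed to accept the arguments shown:

```python
import torch
from functorch.compile import make_boxed_func
from torch._dynamo.backends.common import aot_autograd

def my_compiler(gm, example_inputs):
    # Inspect or transform the forward/backward FX graph here; returning the
    # graph module's own forward (boxed) just runs it unchanged.
    return make_boxed_func(gm.forward)

my_backend = aot_autograd(fw_compiler=my_compiler, bw_compiler=my_compiler)
compiled = torch.compile(torch.nn.Linear(4, 4), backend=my_backend)
compiled(torch.randn(2, 4)).sum().backward()
```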
23 changes: 23 additions & 0 deletions torch/_dynamo/backends/cudagraphs.py
@@ -1,5 +1,28 @@
# mypy: ignore-errors

"""
This module implements CUDA graphs support for TorchDynamo backends.
CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:
- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees
The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking
Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
"""

import functools
from collections import defaultdict
from typing import Optional
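A hedged usage sketch for the backend described above; it assumes a CUDA device is available and that the model and inputs live on it, since CUDA graphs capture GPU work only:

```python
import torch

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU()).cuda()
x = torch.randn(32, 64, device="cuda")

# "cudagraphs" wraps forward and backward work in captured/replayed CUDA graphs
compiled = torch.compile(model, backend="cudagraphs")
compiled(x).sum().backward()
```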
30 changes: 25 additions & 5 deletions torch/_dynamo/backends/debugging.py
@@ -1,5 +1,30 @@
# mypy: ignore-errors

"""
This module provides debugging backends for TorchDynamo to help diagnose and troubleshoot
compilation and execution issues. It includes:
Key Debugging Backends:
- eager: Simple pass-through backend that runs models in eager mode
- eager_noexcept: Similar to eager but with additional exception handling
- eager_debug: Adds schema validation checks for custom operators
- aot_eager: Uses AOT Autograd with nop compiler for debugging
- aot_eager_decomp_partition: Uses TorchInductor decompositions for debugging
- torchscript: Compiles using TorchScript for debugging JIT-related issues
Testing and Development Tools:
- Backends for inducing specific errors (compile/runtime/accuracy)
- ExplainOutput class for detailed graph compilation analysis
- Utilities for cross-referencing and mode management
- Tools for graph detail inspection and break reason analysis
These backends are primarily used for:
1. Debugging graph breaks and compilation failures
2. Testing error handling and recovery mechanisms
3. Analyzing performance bottlenecks
4. Validating operator schemas and decompositions
"""

import dataclasses
import functools
import logging
@@ -19,11 +44,6 @@
log = logging.getLogger(__name__)


"""
This file contains TorchDynamo backends intended for debugging uses.
"""


@register_backend
def eager(gm, fake_tensor_inputs, **kwargs):
if kwargs:
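For orientation, a hedged sketch of the usual triage flow with the backends listed above: compile against progressively larger slices of the stack to localize where a failure first appears (the model and shapes are illustrative):

```python
import torch

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
x = torch.randn(2, 8)

torch.compile(model, backend="eager")(x)      # Dynamo capture only, no compilation
torch.compile(model, backend="aot_eager")(x)  # adds AOT Autograd, still no codegen
torch.compile(model, backend="inductor")(x)   # full compiler stack, for comparison
```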
18 changes: 18 additions & 0 deletions torch/_dynamo/backends/distributed.py
@@ -1,5 +1,23 @@
# mypy: ignore-errors

"""
This module implements distributed training optimizations for TorchDynamo backends.
It provides functionality to optimize models wrapped in DistributedDataParallel (DDP)
by intelligently splitting compiled graphs to align with DDP's gradient synchronization
boundaries. Key features include:
- Graph partitioning based on parameter bucket sizes
- Optimization of allreduce operations for distributed training
- Support for parameter ignoring and buffer handling
- Submodule compilation and management
- Debugging utilities for distributed training
The main component is the DDPOptimizer class, which handles graph splitting and
recompilation to enable efficient distributed training while maintaining the benefits
of compilation.
"""

import logging
import traceback
from dataclasses import dataclass, field
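A hedged sketch of how DDPOptimizer is typically reached; it assumes a distributed process group has already been initialized and a GPU is bound, and uses an illustrative model:

```python
import torch
from torch.nn.parallel import DistributedDataParallel as DDP

# assumes torch.distributed.init_process_group(...) has already run
model = torch.nn.Linear(16, 16).cuda()
ddp_model = DDP(model)

# Compiling a DDP-wrapped module lets DDPOptimizer split the captured graph
# at gradient-bucket boundaries so allreduce can overlap with backward compute.
compiled = torch.compile(ddp_model)
```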
11 changes: 11 additions & 0 deletions torch/_dynamo/backends/inductor.py
@@ -1,5 +1,16 @@
# mypy: ignore-errors

"""
This module provides the TorchInductor backend integration for TorchDynamo.
TorchInductor is a compiler backend that generates optimized code for both CPU and GPU.
This module lazily imports and registers the TorchInductor compiler to avoid loading it
into memory when it is not being used. This helps reduce memory overhead when using
other backends.
The inductor backend can be used with torch.compile():
model = torch.compile(model, backend="inductor")
"""

from torch._dynamo import register_backend

60 changes: 60 additions & 0 deletions torch/_dynamo/backends/registry.py
@@ -1,5 +1,65 @@
# mypy: ignore-errors

"""
This module implements TorchDynamo's backend registry system for managing compiler backends.
The registry provides a centralized way to register, discover and manage different compiler
backends that can be used with torch.compile(). It handles:
- Backend registration and discovery through decorators and entry points
- Lazy loading of backend implementations
- Lookup and validation of backend names
- Categorization of backends using tags (debug, experimental, etc.)
Key components:
- CompilerFn: Type for backend compiler functions that transform FX graphs
- _BACKENDS: Registry mapping backend names to entry points
- _COMPILER_FNS: Registry mapping backend names to loaded compiler functions
Example usage:
    @register_backend
    def my_compiler(fx_graph, example_inputs):
        # Transform FX graph into optimized implementation
        return compiled_fn

    # Use registered backend
    torch.compile(model, backend="my_compiler")
The registry also supports discovering backends through setuptools entry points
in the "torch_dynamo_backends" group. Example:
```
setup.py
---
from setuptools import setup

setup(
    name='my_torch_backend',
    version='0.1',
    packages=['my_torch_backend'],
    entry_points={
        'torch_dynamo_backends': [
            # name = path to entry point of backend implementation
            'my_compiler = my_torch_backend.compiler:my_compiler_function',
        ],
    },
)
```
```
my_torch_backend/compiler.py
---
def my_compiler_function(fx_graph, example_inputs):
    # Transform FX graph into optimized implementation
    return compiled_fn
```
Using `my_compiler` backend:
```
import torch
model = ... # Your PyTorch model
optimized_model = torch.compile(model, backend="my_compiler")
```
"""

import functools
import logging
import sys
22 changes: 22 additions & 0 deletions torch/_dynamo/backends/tvm.py
@@ -1,5 +1,27 @@
# mypy: ignore-errors

"""
This module provides TVM backend integration for TorchDynamo.
Apache TVM is a deep learning compiler framework that can optimize and execute
models on various hardware backends. This module enables:
- Compilation of PyTorch models to TVM's computation graphs
- Multiple scheduling options:
- Default scheduler
- Auto-scheduler for automatic optimization
- Meta-schedule for evolutionary search-based tuning
- Hardware-specific optimizations:
- CUDA GPU support
- CPU support with LLVM targeting and architecture-specific tuning
- Automatic detection of CPU capabilities (AVX2, AVX512)
- Tensor conversion utilities between PyTorch and TVM formats
- Configurable optimization levels and tuning trials
The backend can be used with torch.compile():
model = torch.compile(model, backend="tvm")
"""

import functools
import importlib
import logging
15 changes: 15 additions & 0 deletions torch/_dynamo/bytecode_analysis.py
@@ -1,4 +1,19 @@
# mypy: allow-untyped-defs

"""
This module provides utilities for analyzing and optimizing Python bytecode.
Key functionality includes:
- Dead code elimination
- Jump instruction optimization
- Stack size analysis and verification
- Live variable analysis
- Line number propagation and cleanup
- Exception table handling for Python 3.11+
The utilities in this module are used to analyze and transform bytecode
for better performance while maintaining correct semantics.
"""

import bisect
import dataclasses
import dis
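Not this module's API; a hedged illustration, using the standard-library dis module, of the instruction stream that passes such as dead-code elimination and liveness analysis walk:

```python
import dis

def f(x):
    unused = x * 3   # a dead-code candidate: the result is never read
    y = x + 1        # x is live here
    return y * 2     # y is live here; unused is not

dis.dis(f)  # prints the raw bytecode the analyses described above operate on
```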
16 changes: 16 additions & 0 deletions torch/_dynamo/bytecode_transformation.py
@@ -1,4 +1,20 @@
# mypy: allow-untyped-defs

"""
This module provides utilities for analyzing, transforming and manipulating Python bytecode.
It includes functionality for:
- Converting between different bytecode formats and versions
- Virtualizing jumps and managing jump targets
- Handling exception tables and their entries
- Managing instruction offsets and extended arguments
- Providing a clean API for bytecode modification and transformation
- Supporting Python version-specific bytecode features
- Generating bytecode from template functions
The module is designed to work across different Python versions (3.7+) and handles
version-specific bytecode differences transparently.
"""

import copy
import dataclasses
import dis
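Again not this module's API; a hedged look, via the standard-library dis.get_instructions, at the offsets and jump targets that the transformation utilities above virtualize and re-resolve when instructions are inserted or removed:

```python
import dis

def g(x):
    if x > 0:        # compiles to a conditional jump whose target offset
        return x     # must be recomputed whenever instructions shift
    return -x

for inst in dis.get_instructions(g):
    print(inst.offset, inst.opname, inst.argval,
          "<- jump target" if inst.is_jump_target else "")
```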
27 changes: 27 additions & 0 deletions torch/_dynamo/callback.py
@@ -1,3 +1,30 @@
"""
This module provides callback management functionality for TorchDynamo's compilation process.
It implements a thread-safe system for registering, managing and executing callbacks that run
at the start and end of TorchDynamo compilations. Key features include:
- Registration and deregistration of compilation callbacks
- Thread-safe callback handling with proper locking mechanisms
- Prevention of duplicate callback execution when configured
- Decorator utilities for easy callback registration
- Context manager for controlled callback lifecycle
The module centers around the CompilationCallbackHandler class which maintains separate
lists for start and end callbacks, manages their execution order, and ensures thread-safety.
Utility decorators @on_compile_start and @on_compile_end provide a convenient way to
register compilation hooks.
Example usage:
    @on_compile_start
    def my_start_callback():
        print("Starting compilation")

    @on_compile_end
    def my_end_callback():
        print("Compilation complete")
"""

import threading
from collections.abc import Generator
from contextlib import contextmanager