From a437e0b1ca2744c83c2b301caf8df0584a28f910 Mon Sep 17 00:00:00 2001 From: Ray Wang Date: Wed, 11 Jun 2025 18:56:20 -0700 Subject: [PATCH 1/4] Add more GPU architectures support --- CMakeLists.txt | 4 +- README.md | 42 +- deep_gemm/__init__.py | 42 +- deep_gemm/config.py | 28 + deep_gemm/dispatch.py | 189 ++++++ .../include/deep_gemm/common/scheduler.cuh | 119 ++++ .../include/deep_gemm/common/sm100_utils.cuh | 156 +++++ .../include/deep_gemm/common/sm90_utils.cuh | 17 + deep_gemm/include/deep_gemm/common/utils.cuh | 109 +++ deep_gemm/include/deep_gemm/fp8_gemm.cuh | 444 ------------- .../include/deep_gemm/fp8_wgrad_gemm.cuh | 363 ---------- .../deep_gemm/impls/sm100_bf16_gemm.cuh | 3 + .../deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh | 624 ++++++++++++++++++ .../deep_gemm/impls/sm90_bf16_gemm.cuh | 3 + .../deep_gemm/impls/sm90_fp8_gemm_1d1d.cuh | 3 + .../deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh | 3 + deep_gemm/include/deep_gemm/mma_utils.cuh | 212 ------ deep_gemm/include/deep_gemm/scheduler.cuh | 163 ----- deep_gemm/include/deep_gemm/tma_utils.cuh | 19 - deep_gemm/include/deep_gemm/utils.cuh | 34 - deep_gemm/jit/__init__.py | 2 +- deep_gemm/jit/compiler.py | 93 ++- deep_gemm/jit/runtime.py | 9 + deep_gemm/jit/scripts/__init__.py | 1 + .../sm90_interleave_ffma.py} | 0 deep_gemm/jit_kernels/__init__.py | 17 +- deep_gemm/jit_kernels/gemm.py | 242 ------- deep_gemm/jit_kernels/heuristics/__init__.py | 5 + deep_gemm/jit_kernels/heuristics/common.py | 49 ++ .../heuristics/sm100_heuristics.py | 171 +++++ .../jit_kernels/heuristics/sm90_heuristics.py | 0 deep_gemm/jit_kernels/impls/__init__.py | 7 + .../jit_kernels/impls/sm100_bf16_gemm.py | 0 .../jit_kernels/impls/sm100_fp8_gemm_1d1d.py | 339 ++++++++++ deep_gemm/jit_kernels/impls/sm90_bf16_gemm.py | 0 .../jit_kernels/impls/sm90_fp8_gemm_1d1d.py | 0 .../jit_kernels/impls/sm90_fp8_gemm_1d2d.py | 0 deep_gemm/jit_kernels/m_grouped_gemm.py | 205 ------ deep_gemm/jit_kernels/runtime.py | 359 +++------- 
deep_gemm/jit_kernels/utils.py | 109 --- deep_gemm/jit_kernels/wgrad_gemm.py | 158 ----- deep_gemm/testing/__init__.py | 1 + deep_gemm/{utils.py => testing/bench.py} | 34 +- deep_gemm/testing/numeric.py | 19 + deep_gemm/utils/__init__.py | 1 + deep_gemm/utils/layout.py | 170 +++++ deep_gemm/utils/math.py | 46 ++ indexing/main.cu | 7 +- install.sh | 12 + setup.py | 23 +- tests/generators.py | 87 +++ tests/test_core.py | 328 ++------- third-party/cutlass | 2 +- 53 files changed, 2485 insertions(+), 2588 deletions(-) create mode 100644 deep_gemm/config.py create mode 100644 deep_gemm/dispatch.py create mode 100644 deep_gemm/include/deep_gemm/common/scheduler.cuh create mode 100644 deep_gemm/include/deep_gemm/common/sm100_utils.cuh create mode 100644 deep_gemm/include/deep_gemm/common/sm90_utils.cuh create mode 100644 deep_gemm/include/deep_gemm/common/utils.cuh delete mode 100644 deep_gemm/include/deep_gemm/fp8_gemm.cuh delete mode 100644 deep_gemm/include/deep_gemm/fp8_wgrad_gemm.cuh create mode 100644 deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh create mode 100644 deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh create mode 100644 deep_gemm/include/deep_gemm/impls/sm90_bf16_gemm.cuh create mode 100644 deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d1d.cuh create mode 100644 deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh delete mode 100644 deep_gemm/include/deep_gemm/mma_utils.cuh delete mode 100644 deep_gemm/include/deep_gemm/scheduler.cuh delete mode 100644 deep_gemm/include/deep_gemm/tma_utils.cuh delete mode 100644 deep_gemm/include/deep_gemm/utils.cuh create mode 100644 deep_gemm/jit/scripts/__init__.py rename deep_gemm/jit/{interleave_ffma.py => scripts/sm90_interleave_ffma.py} (100%) delete mode 100644 deep_gemm/jit_kernels/gemm.py create mode 100644 deep_gemm/jit_kernels/heuristics/__init__.py create mode 100644 deep_gemm/jit_kernels/heuristics/common.py create mode 100644 deep_gemm/jit_kernels/heuristics/sm100_heuristics.py 
create mode 100644 deep_gemm/jit_kernels/heuristics/sm90_heuristics.py create mode 100644 deep_gemm/jit_kernels/impls/__init__.py create mode 100644 deep_gemm/jit_kernels/impls/sm100_bf16_gemm.py create mode 100644 deep_gemm/jit_kernels/impls/sm100_fp8_gemm_1d1d.py create mode 100644 deep_gemm/jit_kernels/impls/sm90_bf16_gemm.py create mode 100644 deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d1d.py create mode 100644 deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d2d.py delete mode 100644 deep_gemm/jit_kernels/m_grouped_gemm.py delete mode 100644 deep_gemm/jit_kernels/utils.py delete mode 100644 deep_gemm/jit_kernels/wgrad_gemm.py create mode 100644 deep_gemm/testing/__init__.py rename deep_gemm/{utils.py => testing/bench.py} (79%) create mode 100644 deep_gemm/testing/numeric.py create mode 100644 deep_gemm/utils/__init__.py create mode 100644 deep_gemm/utils/layout.py create mode 100644 deep_gemm/utils/math.py create mode 100755 install.sh create mode 100644 tests/generators.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 658aa7bd..240f6b17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,8 +3,8 @@ cmake_minimum_required(VERSION 3.10) project(deep_gemm LANGUAGES CXX CUDA) -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) set(CMAKE_VERBOSE_MAKEFILE ON) find_package(CUDAToolkit REQUIRED) diff --git a/README.md b/README.md index 8df722aa..e1df304a 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ DeepGEMM is a library designed for clean and efficient FP8 General Matrix Multiplications (GEMMs) with fine-grained scaling, as proposed in [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3). It supports both normal and Mix-of-Experts (MoE) grouped GEMMs. Written in CUDA, the library has no compilation need during installation, by compiling all kernels at runtime using a lightweight Just-In-Time (JIT) module. -Currently, DeepGEMM exclusively supports NVIDIA Hopper tensor cores. 
To address the imprecise FP8 tensor core accumulation, it employs CUDA-core two-level accumulation (promotion). While it leverages some concepts from [CUTLASS](https://github.com/nvidia/cutlass) and [CuTe](https://github.com/NVIDIA/cutlass/tree/main/include/cute), it avoids heavy reliance on their templates or algebras. Instead, the library is designed for simplicity, with only one core kernel function. This makes it a clean and accessible resource for learning Hopper FP8 matrix multiplication and optimization techniques. +DeepGEMM leverages some concepts from [CUTLASS](https://github.com/nvidia/cutlass) and [CuTe](https://github.com/NVIDIA/cutlass/tree/main/include/cute), it avoids heavy reliance on their templates or algebras. Instead, the library is designed for simplicity, with only one core kernel function. This makes it a clean and accessible resource for learning SM90 and SM100 FP8 matrix multiplication and optimization techniques. Despite its lightweight design, DeepGEMM's performance matches or exceeds expert-tuned libraries across various matrix shapes. 
@@ -40,12 +40,14 @@ Despite its lightweight design, DeepGEMM's performance matches or exceeds expert ### Requirements -- Hopper architecture GPUs, `sm_90a` must be supported -- Python 3.8 or above -- CUDA 12.3 or above - - **But we highly recommend 12.8 or above for the best performance** -- PyTorch 2.1 or above -- CUTLASS 3.6 or above (could be cloned by Git submodule) +- NVIDIA SM90 or SM100 architecture GPU +- Python 3.8 or higher +- CUDA Toolkit: + - CUDA 12.3 or higher for SM90 + - **We highly recommend 12.8 or higher for the best performance** + - CUDA 12.8 or higher for SM100 +- PyTorch 2.1 or higher +- CUTLASS 3.6 or higher (could be cloned by Git submodule) ### Development @@ -53,8 +55,8 @@ Despite its lightweight design, DeepGEMM's performance matches or exceeds expert # Submodule must be cloned git clone --recursive git@github.com:deepseek-ai/DeepGEMM.git -# Make symbolic links for third-party (CUTLASS and CuTe) include directories -python setup.py develop +# Install DeepGEMM +python setup.py install # Test JIT compilation python tests/test_jit.py @@ -75,11 +77,19 @@ Then, import `deep_gemm` in your Python project, and enjoy! #### Notices -This library exclusively contains GEMM kernels. It requires the LHS scaling factor to be TMA-aligned and transposed, and it only supports the NT format (non-transposed LHS and transposed RHS). For transposition or other FP8 casting operations, please implement or fuse them into prior kernels independently. While the library provides some simple PyTorch utility functions, these may result in slower performance, but our primary focus is on optimizing the GEMM kernels themselves. +This library provides optimized GEMM kernels for NVIDIA GPUs. The input shape layout is NT (non-transposed LHS, transposed RHS). While the SM90 implementation supports only the NT memory layout (row-major, col-major), the SM100 implementation supports all memory layouts (NT, TN, NN, TT). 
+ +For both architectures, the LHS scaling factor is required to have a TMA-aligned and transposed layout. And the data format for the scaling factor of SM90 and SM100 is different: + +- SM90 requires scaling factors in FP32 format. + +- SM100 requires scaling factors in [UE8M0](https://docs.nvidia.com/cuda/parallel-thread-execution/#alternate-floating-point-data-formats) format. + +Please note that operations like input transposition or FP8 casting must be handled separately by the user, please implement or fuse them into prior kernels independently. While the library provides some simple PyTorch utility functions, these may result in slower performance, but our primary focus is on optimizing the GEMM kernels themselves. #### Normal dense GEMMs (non-grouped) -To perform a basic non-grouped FP8 GEMM, call the `deep_gemm.gemm_fp8_fp8_bf16_nt` function. For more details, please refer to the function documentation. +To perform a basic non-grouped FP8 GEMM, call the `fp8_gemm_nt` function. For more details, please refer to the function documentation. #### Grouped GEMMs (contiguous layout) @@ -87,13 +97,13 @@ Unlike traditional grouped GEMMs in CUTLASS, DeepGEMM groups only the M-axis, wh For training forward passes or inference prefilling, where each expert may process a varying number of tokens, we concatenate these tokens into a single tensor, referred to as the "contiguous" layout. Note that each expert segment must be aligned to the GEMM M block size (`get_m_alignment_for_contiguous_layout()`). -For more information, please refer to the `m_grouped_gemm_fp8_fp8_bf16_nt_contiguous` function documentation. +For more information, please refer to the `m_grouped_fp8_gemm_nt_contiguous` function documentation. #### Grouped GEMMs (masked layout) During the inference decoding phase, when CUDA graph is enabled and the CPU is unaware of the number of tokens each expert receives, we support masked grouped GEMMs. 
By providing a mask tensor, the kernel computes only the valid portions. -Use `m_grouped_gemm_fp8_fp8_bf16_nt_masked` for this purpose and consult the relevant documentation. An example usage is to use the output of low-latency kernels from [DeepEP](https://github.com/deepseek-ai/DeepEP) as input. +Use `fp8_m_grouped_gemm_nt_masked` for this purpose and consult the relevant documentation. An example usage is to use the output of low-latency kernels from [DeepEP](https://github.com/deepseek-ai/DeepEP) as input. #### Utilities @@ -121,7 +131,7 @@ The library also provides some environment variables, which may be useful: - `DG_JIT_PRINT_REG_REUSE`: `0` or `1`, print FFMA-interleaving details, `0` by default - `DG_JIT_PRINT_COMPILER_COMMAND`: `0` or `1`, print NVCC compilation command, `0` by default - Post optimization - - `DG_JIT_DISABLE_FFMA_INTERLEAVE`: `0` or `1`, disable FFMA-interleaving optimization, `0` by default + - `DG_JIT_DISABLE_FFMA_INTERLEAVE`: `0` or `1`, disable FFMA-interleaving optimization, `0` by default (only valid for SM90) - Heuristic selection - `DG_PRINT_CONFIGS`: `0` or `1`, print selected configs for each shape, `0` by default - Testing @@ -139,7 +149,7 @@ Following the CUTLASS design, the kernels in DeepGEMM are warp-specialized, enab ![design](figures/design.png) -#### Hopper TMA features +#### TMA features The [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/hopper-tuning-guide/index.html#tensor-memory-accelerator) (TMA) is a new hardware feature introduced by the Hopper architecture, designed for faster and asynchronous data movement. Specifically, we utilize TMA for: @@ -205,4 +215,4 @@ This code repository is released under [the MIT License](LICENSE). 
publisher = {GitHub}, howpublished = {\url{https://github.com/deepseek-ai/DeepGEMM}}, } -``` +``` \ No newline at end of file diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index 8e6b2996..a2bbfe21 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -1,15 +1,29 @@ -import torch - -from . import jit -from .jit_kernels import ( - gemm_fp8_fp8_bf16_nt, - m_grouped_gemm_fp8_fp8_bf16_nt_contiguous, - m_grouped_gemm_fp8_fp8_bf16_nt_masked, - wgrad_gemm_fp8_fp8_fp32_nt, - k_grouped_wgrad_gemm_fp8_fp8_fp32_nt, - ceil_div, - set_num_sms, get_num_sms, - get_col_major_tma_aligned_tensor, - get_m_alignment_for_contiguous_layout +import os + +# Set some default environment provided at setup +try: + # noinspection PyUnresolvedReferences + from .envs import persistent_envs + for key, value in persistent_envs.items(): + if key not in os.environ: + os.environ[key] = value +except ImportError: + pass + +# All modules +from . import ( + dispatch, + jit, + jit_kernels, + testing, + utils +) + +# All kernels +from .dispatch import * + +# Some useful utils +from .utils.layout import ( + get_device_arch, + get_m_alignment_for_contiguous_layout, ) -from .utils import bench, bench_kineto, calc_diff diff --git a/deep_gemm/config.py b/deep_gemm/config.py new file mode 100644 index 00000000..fe3d959d --- /dev/null +++ b/deep_gemm/config.py @@ -0,0 +1,28 @@ +import torch + +_num_sms = None + +def set_num_sms(num_sms: int) -> None: + """ + Set the maximum SM count for all GEMM kernels to use. + + Arguments: + num_sms: the desired maximum SM count for all GEMM kernels to use. + """ + global _num_sms + assert 0 < num_sms <= torch.cuda.get_device_properties(device='cuda').multi_processor_count + _num_sms = num_sms + + +def get_num_sms() -> int: + """ + Get the current maximum limit of SM count for all GEMM kernels to use. + If the count is never specified, the function will return the number of device SMs. 
+ + Returns: + Current maximum limit of SM count for all GEMM kernels to use. + """ + global _num_sms + if _num_sms is None: + _num_sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count + return _num_sms diff --git a/deep_gemm/dispatch.py b/deep_gemm/dispatch.py new file mode 100644 index 00000000..c22506a4 --- /dev/null +++ b/deep_gemm/dispatch.py @@ -0,0 +1,189 @@ +import functools +import torch +from typing import Tuple, Optional + +# TODO: add Ampere Triton/tile-lang kernels +from .jit.compiler import get_device_arch +from .jit_kernels.impls import ( + sm90_bf16_gemm, + sm100_bf16_gemm, + sm90_fp8_gemm_1d1d, + sm90_fp8_gemm_1d2d, + sm100_fp8_gemm_1d1d, +) +from .utils.layout import ( + MajorTypeAB, MajorTypeCD, + get_major_type_ab, get_major_type_cd, + transform_sf_into_required_layout +) + + +@functools.lru_cache(maxsize=None) +def must_be_k_major() -> bool: + return { + '90a': True, + '100a': False, + }[get_device_arch()] + + +@functools.lru_cache(maxsize=None) +def get_default_recipe(sfa_dtype: torch.dtype, sfb_dtype: torch.dtype) -> Tuple[int, int, int]: + assert sfa_dtype in (torch.float, torch.int) + return { + ('90a', torch.float): (1, 128, 128), + ('100a', torch.float): (1, 128, 128), + ('100a', torch.int): (1, 1, 128), + }[(get_device_arch(), sfb_dtype)] + + +def fp8_gemm_nt(a: Tuple[torch.Tensor, torch.Tensor], + b: Tuple[torch.Tensor, torch.Tensor], + d: torch.Tensor, + c: Optional[torch.Tensor] = None, + recipe: Optional[Tuple[int, int, int]] = None, + compiled_dims: str = 'nk') -> None: + """ + Perform `d = c + (a @ b)`. + TODO: add more docs. 
+ """ + # Compiled dims can be upper cases + compiled_dims = compiled_dims.lower() + + # NOTES: shape must be `[M, K] @ [N, K].T` + major_a = get_major_type_ab(a[0]) + major_b = get_major_type_ab(b[0]) + if must_be_k_major(): + assert major_a == major_b == MajorTypeAB.KMajor + + a, sfa = a + b, sfb = b + m, k = a.shape + n, k_ = b.shape + m_, n_ = d.shape + + # Type and shape checks + assert m == m_ and n == n_ and k == k_ + assert n > 0 and k > 0 + assert a.dtype == torch.float8_e4m3fn + assert b.dtype == torch.float8_e4m3fn + assert d.dtype in (torch.bfloat16, torch.float) + + # D must be N-major + assert get_major_type_cd(d) == MajorTypeCD.NMajor + + # Check C as well + if c is not None: + assert c.dtype == d.dtype == torch.float + assert get_major_type_cd(c) == MajorTypeCD.NMajor + + # Do nothing if the problem is empty + if m == 0: + return + + # Transform SFA and SFB into compute-required layout + recipe = get_default_recipe(sfa.dtype, sfb.dtype) if recipe is None else recipe + sfa = transform_sf_into_required_layout(sfa, mn=m, k=k, recipe=recipe, is_sfa=True) + sfb = transform_sf_into_required_layout(sfb, mn=n, k=k, recipe=recipe, is_sfa=False) + + impl = { + '100a': functools.partial(sm100_fp8_gemm_1d1d.fp8_gemm_nt, + major_a=major_a, major_b=major_b, major_cd=MajorTypeCD.NMajor, + compiled_dims=compiled_dims) + }[get_device_arch()] + impl(a, sfa, b, sfb, c, d) + + +def m_grouped_fp8_gemm_nt_contiguous(a: Tuple[torch.Tensor, torch.Tensor], + b: Tuple[torch.Tensor, torch.Tensor], + d: torch.Tensor, + m_indices: torch.Tensor, + recipe: Optional[Tuple[int, int, int]] = None, + compiled_dims: str = 'nk') -> None: + # Compiled dims can be upper cases + compiled_dims = compiled_dims.lower() + + # NOTES: shape must be `[M, K] @ [G, N, K].mT` + major_a = get_major_type_ab(a[0]) + major_b = get_major_type_ab(b[0]) + assert major_a == MajorTypeAB.KMajor + if must_be_k_major(): + assert major_b == MajorTypeAB.KMajor + assert m_indices.is_contiguous() + + a, sfa = a + 
b, sfb = b + m, k = a.shape + num_groups, n, k_ = b.shape + m_, n_ = d.shape + m__ = m_indices.numel() + + # Type and shape checks + assert m == m_ == m__ and n == n_ and k == k_ + assert n > 0 and k > 0 and num_groups > 0 + assert a.dtype == torch.float8_e4m3fn + assert b.dtype == torch.float8_e4m3fn + assert d.dtype == torch.bfloat16 + assert m_indices.dtype == torch.int32 + + # D must be N-major + assert get_major_type_cd(d) == MajorTypeCD.NMajor + + # Do nothing if the problem is empty + if m == 0: + return + + # Transform SFA and SFB into compute-required layout + recipe = get_default_recipe(sfa.dtype, sfb.dtype) if recipe is None else recipe + sfa = transform_sf_into_required_layout(sfa, mn=m, k=k, recipe=recipe, is_sfa=True) + sfb = transform_sf_into_required_layout(sfb, mn=n, k=k, recipe=recipe, num_groups=num_groups, is_sfa=False) + + impl = { + '100a': functools.partial(sm100_fp8_gemm_1d1d.m_grouped_fp8_gemm_nt_contiguous, major_a=major_a, major_b=major_b, compiled_dims=compiled_dims) + }[get_device_arch()] + impl(a, sfa, b, sfb, d, m_indices) + + +def fp8_m_grouped_gemm_nt_masked(a: Tuple[torch.Tensor, torch.Tensor], + b: Tuple[torch.Tensor, torch.Tensor], + d: torch.Tensor, + masked_m: torch.Tensor, + expected_m: int, + recipe: Optional[Tuple[int, int, int]] = None, + compiled_dims: str = 'nk') -> None: + # Compiled dims can be upper cases + compiled_dims = compiled_dims.lower() + + # NOTES: shape must be `[G, M, K] @ [G, N, K].mT` + major_a = get_major_type_ab(a[0]) + major_b = get_major_type_ab(b[0]) + assert major_a == major_b == MajorTypeAB.KMajor + assert masked_m.is_contiguous() + + a, sfa = a + b, sfb = b + num_groups, m, k = a.shape + num_groups_, n, k_ = b.shape + num_groups__, m_, n_ = d.shape + num_groups___ = masked_m.numel() + + # Type and shape checks + assert num_groups == num_groups_ == num_groups__ == num_groups___ + assert m == m_ and n == n_ and k == k_ + assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0 + assert 
a.dtype == torch.float8_e4m3fn + assert b.dtype == torch.float8_e4m3fn + assert d.dtype == torch.bfloat16 + assert masked_m.dtype == torch.int32 + + # D must be N-major + assert get_major_type_cd(d) == MajorTypeCD.NMajor + + # Transform SFA and SFB into compute-required layout + recipe = get_default_recipe(sfa.dtype, sfb.dtype) if recipe is None else recipe + sfa = transform_sf_into_required_layout(sfa, mn=m, k=k, recipe=recipe, num_groups=num_groups, is_sfa=True) + sfb = transform_sf_into_required_layout(sfb, mn=n, k=k, recipe=recipe, num_groups=num_groups, is_sfa=False) + + impl = { + '100a': functools.partial(sm100_fp8_gemm_1d1d.fp8_m_grouped_gemm_nt_masked, major_a=major_a, major_b=major_b, compiled_dims=compiled_dims) + }[get_device_arch()] + impl(a, sfa, b, sfb, d, masked_m, expected_m) diff --git a/deep_gemm/include/deep_gemm/common/scheduler.cuh b/deep_gemm/include/deep_gemm/common/scheduler.cuh new file mode 100644 index 00000000..d5c7f1c0 --- /dev/null +++ b/deep_gemm/include/deep_gemm/common/scheduler.cuh @@ -0,0 +1,119 @@ +#pragma once + +#include + +namespace deep_gemm { + +enum class GemmType { + Normal, + GroupedContiguous, + GroupedMasked +}; + +#pragma clang diagnostic push +#pragma ide diagnostic ignored "cppcoreguidelines-pro-type-member-init" +template +struct Scheduler { + int current_iter = -1; + + // Block configs + uint32_t num_blocks; + uint32_t num_m_blocks; + uint32_t num_n_blocks; + + // For grouped GEMM + int* grouped_layout; + // Only used for masked layout + uint32_t curr_group_idx, curr_cumsum; + + __device__ __forceinline__ explicit Scheduler(const uint32_t& shape_m, const uint32_t& shape_n, + int* grouped_layout = nullptr) { + num_m_blocks = ceil_div(shape_m, BLOCK_M); + num_n_blocks = ceil_div(shape_n, BLOCK_N); + if constexpr (kGemmType == GemmType::Normal) { + num_blocks = num_m_blocks * num_n_blocks; + } else if (kGemmType == GemmType::GroupedContiguous) { + num_blocks = num_m_blocks * num_n_blocks; + this->grouped_layout = 
grouped_layout; + } else if (kGemmType == GemmType::GroupedMasked) { + curr_group_idx = curr_cumsum = 0; + this->grouped_layout = grouped_layout; + } + } + + __device__ __forceinline__ void get_swizzled_block_idx(const uint32_t& block_idx, uint32_t& m_block_idx, uint32_t& n_block_idx) { + DG_STATIC_ASSERT(kNum1DBlocksPerGroup % kNumMulticast == 0, "Invalid group size"); + + // Swizzle for better L2 usages + // TODO: unify these 2 branches + if constexpr (kIsMulticastOnA) { + auto num_blocks_per_group = num_m_blocks * kNum1DBlocksPerGroup; + auto group_idx = block_idx / num_blocks_per_group; + auto first_n_block_idx = group_idx * kNum1DBlocksPerGroup; + auto num_n_blocks_in_group = min(kNum1DBlocksPerGroup, num_n_blocks - first_n_block_idx); + auto in_group_idx = block_idx % num_blocks_per_group; + m_block_idx = in_group_idx / num_n_blocks_in_group; + n_block_idx = first_n_block_idx + in_group_idx % num_n_blocks_in_group; + } else { + auto num_blocks_per_group = num_n_blocks * kNum1DBlocksPerGroup; + auto group_idx = block_idx / num_blocks_per_group; + auto first_m_block_idx = group_idx * kNum1DBlocksPerGroup; + auto num_m_blocks_in_group = min(kNum1DBlocksPerGroup, num_m_blocks - first_m_block_idx); + auto in_group_idx = block_idx % num_blocks_per_group; + m_block_idx = first_m_block_idx + in_group_idx % num_m_blocks_in_group; + n_block_idx = in_group_idx / num_m_blocks_in_group; + } + } + + template + __device__ __forceinline__ uint32_t get_global_idx(const uint32_t shape_dim, const uint32_t block_size, + const uint32_t& block_idx, const uint32_t& m_block_idx = 0) { + if constexpr (kGemmType == GemmType::Normal) { + return block_idx * block_size; + } else if (kGemmType == GemmType::GroupedContiguous) { + auto offset = kWithGroupOffset ? __ldg(grouped_layout + m_block_idx * BLOCK_M) : 0; + return offset * shape_dim + block_idx * block_size; + } else if (kGemmType == GemmType::GroupedMasked) { + auto offset = kWithGroupOffset ? 
curr_group_idx : 0; + return offset * shape_dim + block_idx * block_size; + } + } + + __device__ __forceinline__ bool get_next_block(uint32_t& m_block_idx, uint32_t& n_block_idx) { + const auto next_block_idx = (++ current_iter) * gridDim.x + blockIdx.x; + + if constexpr (kGemmType == GemmType::GroupedMasked) { + while (true) { + // End of the task + if (curr_group_idx == kNumGroups) + return false; + + // Within current group + num_m_blocks = ceil_div(static_cast(__ldg(grouped_layout + curr_group_idx)), BLOCK_M); + auto current_m_block_cumsum = curr_cumsum + num_m_blocks; + if (next_block_idx < current_m_block_cumsum * num_n_blocks) + break; + + // Move to check the next group + curr_group_idx ++, curr_cumsum = current_m_block_cumsum; + } + + get_swizzled_block_idx(next_block_idx - curr_cumsum * num_n_blocks, m_block_idx, n_block_idx); + } else { + if (next_block_idx >= num_blocks) + return false; + + get_swizzled_block_idx(next_block_idx, m_block_idx, n_block_idx); + } + return true; + } +}; + +#pragma clang diagnostic pop + +} // namespace deep_gemm diff --git a/deep_gemm/include/deep_gemm/common/sm100_utils.cuh b/deep_gemm/include/deep_gemm/common/sm100_utils.cuh new file mode 100644 index 00000000..671b0779 --- /dev/null +++ b/deep_gemm/include/deep_gemm/common/sm100_utils.cuh @@ -0,0 +1,156 @@ +#pragma once + +#include +#include +#include + +#include + +namespace deep_gemm::sm100 { + +template +constexpr uint32_t get_inner_block_atom_size() { + return kSwizzleMode == 0 ? 
BLOCK_INNER : kSwizzleMode / sizeof(dtype_t); +} + +template +__device__ __forceinline__ void +tma_copy(void const* desc_ptr, cutlass::arch::ClusterTransactionBarrier* barrier_ptr, + dtype_t* smem_ptr, const uint32_t& inner_idx, const int32_t& outer_idx) { + DG_STATIC_ASSERT(1 <= kNumMulticast and kNumMulticast <= 2, "Invalid multicast config"); + DG_STATIC_ASSERT(static_cast(cute::TMA::CacheHintSm90::EVICT_NORMAL) == + static_cast(cute::TMA::CacheHintSm100::EVICT_NORMAL), "Invalid cache hint"); + + // 2-CTA function will send signals to the leader CTA only + const auto copy_func = kNumMulticast == 1 ? cute::SM90_TMA_LOAD_2D::copy : cute::SM100_TMA_2SM_LOAD_2D::copy; + + // Issue multiple TMAs + constexpr uint32_t BLOCK_INNER_ATOM = get_inner_block_atom_size(); + #pragma unroll + for (uint32_t i = 0; i < BLOCK_INNER / BLOCK_INNER_ATOM; ++ i) { + copy_func(desc_ptr, reinterpret_cast(barrier_ptr), + static_cast(cute::TMA::CacheHintSm100::EVICT_NORMAL), + smem_ptr + i * BLOCK_OUTER * BLOCK_INNER_ATOM, inner_idx + i * BLOCK_INNER_ATOM, outer_idx); + } +} + +__device__ __forceinline__ +cute::UMMA::SmemDescriptor make_smem_desc(cute::UMMA::LayoutType layout, void* smem_ptr, + uint32_t stride_byte_offset, uint32_t leading_byte_offset) { + cute::UMMA::SmemDescriptor desc; + + // Set the version for SM100 + desc.version_ = 1; + + // Legacy mode + desc.lbo_mode_ = 0; + + // Layout + desc.layout_type_ = static_cast(layout); + + // Start address + const auto uint_ptr = cute::cast_smem_ptr_to_uint(smem_ptr); + desc.start_address_ = static_cast(uint_ptr >> 4); + + // Base offset + desc.base_offset_ = 0; + + // SBO and LBO + desc.stride_byte_offset_ = stride_byte_offset >> 4; + desc.leading_byte_offset_ = leading_byte_offset >> 4; + + return desc; +} + +__device__ __forceinline__ +cute::UMMA::SmemDescriptor make_sf_desc(void* smem_ptr) { + // NOTES: the UTCCP layout is K-major by default + // Atom size: 8 x 128 bits + // {SBO, LBO} means the byte stride between atoms on {MN, K} + 
// Since the UTCCP we used is 128b-wide (only 1 atom on K), so LBO can be zero + return make_smem_desc(cute::UMMA::LayoutType::SWIZZLE_NONE, smem_ptr, 8 * 16, 0); +} + +__device__ __forceinline__ +void replace_smem_desc_addr(cute::UMMA::SmemDescriptor& desc, const void* smem_ptr) { + const auto uint_ptr = cute::cast_smem_ptr_to_uint(smem_ptr); + desc.start_address_ = static_cast(uint_ptr >> 4); +} + +// ReSharper disable once CppNotAllPathsReturnValue +template +constexpr static cute::UMMA::LayoutType to_umma_layout_type() { + DG_STATIC_ASSERT(kSwizzleMode == 0 or kSwizzleMode == 16 or + kSwizzleMode == 32 or kSwizzleMode == 64 or + kSwizzleMode == 128, "Invalid swizzling mode"); + if constexpr (kSwizzleMode == 0) return cute::UMMA::LayoutType::SWIZZLE_NONE; + if constexpr (kSwizzleMode == 16) return cute::UMMA::LayoutType::SWIZZLE_NONE; + if constexpr (kSwizzleMode == 32) return cute::UMMA::LayoutType::SWIZZLE_32B; + if constexpr (kSwizzleMode == 64) return cute::UMMA::LayoutType::SWIZZLE_64B; + if constexpr (kSwizzleMode == 128) return cute::UMMA::LayoutType::SWIZZLE_128B; +} + +template +__device__ __forceinline__ +cute::UMMA::SmemDescriptor make_umma_desc(dtype_t* base_smem_ptr, uint32_t mn_idx, uint32_t k_idx) { + if constexpr (kMajorMode == cute::UMMA::Major::K) { + // NOTES: for K-major layout, the swizzle must be 128B (also, atom index must be 0), as `BLOCK_K` is always 128 + DG_STATIC_ASSERT(kSwizzleMode == BLOCK_K * sizeof(dtype_t), "Unexpected value"); + + // Atom size: 8 x `kSwizzleMode` (in bytes, on K) + // {SBO, LBO} means the byte stride between atoms on {MN, K} + // NOTES: on K, there is only 1 atom as asserted previously, so LBO can be 0 + const uint32_t stride_byte_offset = 8 * BLOCK_K * sizeof(dtype_t); + const uint32_t leading_byte_offset = 0; + return make_smem_desc(to_umma_layout_type(), + base_smem_ptr + mn_idx * BLOCK_K + k_idx, + stride_byte_offset, leading_byte_offset); + } else { + constexpr uint32_t BLOCK_MN_ATOM = 
get_inner_block_atom_size(); + + // Must have no in-atom MN-idx + // NOTES: no worries for the runtime assert, the `mn_idx` are constants at compilation time + DG_DEVICE_ASSERT(mn_idx % BLOCK_MN_ATOM == 0); + DG_STATIC_ASSERT(kSwizzleMode > 0, "Invalid swizzling"); + + // Atom size: `kSwizzleMode` (in bytes, on MN) x 8 + // NOTES: `kSwizzleMode == 16` mean non-swizzling but interleaving + // {SBO, LBO} means the byte stride between atoms on {K, MN} for swizzling + // {SBO, LBO} means the byte stride between atoms on {MN, K} for non-swizzling + uint32_t stride_byte_offset = 8 * BLOCK_MN_ATOM * sizeof(dtype_t); + uint32_t leading_byte_offset = BLOCK_K * BLOCK_MN_ATOM * sizeof(dtype_t); + if constexpr (kSwizzleMode == 16) + swap(stride_byte_offset, leading_byte_offset); + return make_smem_desc(to_umma_layout_type(), + base_smem_ptr + mn_idx * BLOCK_K + BLOCK_MN_ATOM * k_idx, + stride_byte_offset, leading_byte_offset); + } +} + +__device__ __forceinline__ +uint64_t make_runtime_instr_desc_with_sf_id(cute::UMMA::InstrDescriptorBlockScaled desc, const uint32_t& sf_id) { + desc.a_sf_id_ = sf_id, desc.b_sf_id_ = sf_id; + return static_cast(static_cast(desc)) << 32; +} + +template +__device__ constexpr uint32_t get_num_aligned_tmem_cols() { + DG_STATIC_ASSERT(kNumCols <= 512, "Too many tensor memory columns"); + if (kNumCols <= 32) return 32; + if (kNumCols <= 64) return 64; + if (kNumCols <= 128) return 128; + if (kNumCols <= 256) return 256; + return 512; +} + +__device__ __forceinline__ void tcgen05_before_thread_sync() { + asm volatile("tcgen05.fence::before_thread_sync;"); +} + +__device__ __forceinline__ void tcgen05_after_thread_sync() { + asm volatile("tcgen05.fence::after_thread_sync;"); +} + +} // namespace `deep_gemm::sm100` diff --git a/deep_gemm/include/deep_gemm/common/sm90_utils.cuh b/deep_gemm/include/deep_gemm/common/sm90_utils.cuh new file mode 100644 index 00000000..05ed0ba5 --- /dev/null +++ b/deep_gemm/include/deep_gemm/common/sm90_utils.cuh @@ -0,0 
+1,17 @@ +#pragma once + +#include + +namespace deep_gemm::sm90 { + +template +struct SM90_U32x2_STSM_N { + __device__ __forceinline__ static void + copy(dtype_t src_0, dtype_t src_1, void* smem_dst) { + const uint32_t src[2] = {*reinterpret_cast(&src_0), *reinterpret_cast(&src_1)}; + asm volatile("stmatrix.sync.aligned.x2.m8n8.shared.b16 [%0], {%1, %2};\n" + :: "l"(smem_dst), "r"(src[0]), "r"(src[1])); + } +}; + +} // namespace `deep_gemm::sm90` diff --git a/deep_gemm/include/deep_gemm/common/utils.cuh b/deep_gemm/include/deep_gemm/common/utils.cuh new file mode 100644 index 00000000..127d80b6 --- /dev/null +++ b/deep_gemm/include/deep_gemm/common/utils.cuh @@ -0,0 +1,109 @@ +#pragma once + +#include +#include + +#ifdef __CLION_IDE__ + +__host__ __device__ __forceinline__ void host_device_printf(const char* format, ...) { + asm volatile("trap;"); +} + +#define printf host_device_printf +#endif + +#ifndef DG_DEVICE_ASSERT +#define DG_DEVICE_ASSERT(cond) \ +do { \ + if (not (cond)) { \ + printf("Assertion failed: %s:%d, condition: %s\n", __FILE__, __LINE__, #cond); \ + asm("trap;"); \ + } \ +} while (0) +#endif + +#ifndef DG_TRAP_ONLY_DEVICE_ASSERT +#define DG_TRAP_ONLY_DEVICE_ASSERT(cond) \ +do { \ + if (not (cond)) \ + asm("trap;"); \ +} while (0) +#endif + +#ifndef DG_STATIC_ASSERT +#define DG_STATIC_ASSERT(cond, ...) static_assert(cond, __VA_ARGS__) +#endif + +namespace deep_gemm { + +template +__device__ __host__ constexpr T ceil_div(T a, T b) { + return (a + b - 1) / b; +} + +template +__device__ __host__ constexpr T align(T a, T b) { + return ceil_div(a, b) * b; +} + +template +__device__ __host__ constexpr T constexpr_gcd(T a, T b) { + return b == 0 ? 
a : constexpr_gcd(b, a % b); +} + +template +__forceinline__ __device__ void swap(T& a, T& b) { + T temp = a; + a = b; + b = temp; +} + +__forceinline__ __device__ uint32_t get_sm_idx() { + uint32_t sm_idx; + asm ("mov.u32 %0, %%smid;" : "=r"(sm_idx)); + return sm_idx; +} + +__forceinline__ __device__ uint32_t get_lane_idx() { + uint32_t lane_id; + asm ("mov.u32 %0, %laneid;" : "=r"(lane_id)); + return lane_id; +} + +__device__ __forceinline__ uint32_t ld_shared(const uint32_t* ptr) { + uint32_t ret; + asm volatile("ld.shared.u32 %0, [%1];" : "=r"(ret) : "l"(ptr)); + return ret; +} + +__device__ __forceinline__ float4 ld_shared(const float4* ptr) { + float4 ret; + asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : "l"(ptr)); + return ret; +} + +__device__ __forceinline__ float ld_shared(const float* ptr) { + float ret; + asm volatile("ld.shared.f32 %0, [%1];" : "=f"(ret) : "l"(ptr)); + return ret; +} + +__device__ __forceinline__ void st_shared(const float* ptr, float val) { + asm volatile("st.shared.f32 [%0], %1;" :: "l"(ptr), "f"(val)); +} + +__device__ __forceinline__ void st_shared(const uint32_t* ptr, uint32_t val) { + asm volatile("st.shared.u32 [%0], %1;" :: "l"(ptr), "r"(val)); +} + +__device__ __forceinline__ void st_shared(const void* ptr, uint32_t x, uint32_t y, uint32_t z, uint32_t w) { + asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};" :: "l"(ptr), "r"(x), "r"(y), "r"(z), "r"(w)); +} + +template +__device__ __forceinline__ int cast_into_bf16_and_pack(old_t& x, old_t& y) { + auto bf16x2 = __float22bfloat162_rn({*reinterpret_cast(&x), *reinterpret_cast(&y)}); + return *reinterpret_cast(&bf16x2); +} + +} // namespace `deep_gemm` diff --git a/deep_gemm/include/deep_gemm/fp8_gemm.cuh b/deep_gemm/include/deep_gemm/fp8_gemm.cuh deleted file mode 100644 index 5c11cd3d..00000000 --- a/deep_gemm/include/deep_gemm/fp8_gemm.cuh +++ /dev/null @@ -1,444 +0,0 @@ -#pragma once - -#pragma clang 
diagnostic push -#pragma clang diagnostic ignored "-Wunknown-attributes" - -#include -#include - -#include -#include -#include - -#include "mma_utils.cuh" -#include "scheduler.cuh" -#include "tma_utils.cuh" -#include "utils.cuh" - -namespace deep_gemm { - -template -__device__ __host__ void outer_launch_k_iterations(const auto& inner_launch_k_iterations, const auto& func, uint32_t num_former_iters) { - if (num_former_iters == kNumFormerIters) { - inner_launch_k_iterations(func, cute::Int{}); - return; - } - - if constexpr (kNumFormerIters + kGap <= kEnd) - outer_launch_k_iterations(inner_launch_k_iterations, func, num_former_iters); -} - -template -__global__ void __launch_bounds__(get_num_threads_per_sm(BLOCK_M), 1) -fp8_gemm_kernel(float* scales_b, int* grouped_layout, - uint32_t shape_m, - const __grid_constant__ CUtensorMap tensor_map_a, - const __grid_constant__ CUtensorMap tensor_map_b, - const __grid_constant__ CUtensorMap tensor_map_scales_a, - const __grid_constant__ CUtensorMap tensor_map_d) { -#if (defined(__CUDA_ARCH__) and (__CUDA_ARCH__ >= 900)) or defined(__CLION_IDE__) - // Scaling checks - DG_STATIC_ASSERT(BLOCK_K == 128, "Only support per-128-channel FP8 scaling"); - DG_STATIC_ASSERT(ceil_div(BLOCK_N, BLOCK_K) == 1 or (constexpr_gcd(BLOCK_N, BLOCK_K) == BLOCK_N - BLOCK_K), "Too much B scales in a single block"); - - // Types - using WGMMA = typename FP8MMASelector::type; - using Barrier = cutlass::arch::ClusterTransactionBarrier; - DG_STATIC_ASSERT(BLOCK_M % WGMMA::M == 0, "Invalid block size"); - - // Shared memory - static constexpr bool kMustUseUniformedScaleB = (BLOCK_K % BLOCK_N == 0); - static constexpr uint32_t SMEM_D_SIZE = BLOCK_M * (BLOCK_N + BLOCK_N_PADDING) * sizeof(__nv_bfloat16); - static constexpr uint32_t SMEM_A_SIZE_PER_STAGE = BLOCK_M * BLOCK_K * sizeof(__nv_fp8_e4m3); - static constexpr uint32_t SMEM_B_SIZE_PER_STAGE = BLOCK_N * BLOCK_K * sizeof(__nv_fp8_e4m3); - static constexpr uint32_t SMEM_SCALES_A_SIZE_PER_STAGE = BLOCK_M * 
sizeof(float); - static constexpr uint32_t SHAPE_K_SCALES = ceil_div(SHAPE_K, BLOCK_K); - static constexpr uint32_t SMEM_SCALES_B_SIZE = ceil_div(SHAPE_K_SCALES * (kMustUseUniformedScaleB ? 1 : 2) * sizeof(float), sizeof(Barrier)) * sizeof(Barrier); - - // Configs - constexpr uint32_t kFullKOfAllStages = kNumStages * BLOCK_K; - constexpr uint32_t kNumThreads = get_num_threads_per_sm(BLOCK_M); - constexpr uint32_t kNumMathThreads = kNumThreads - kNumTMAThreads; - constexpr uint32_t kNumIterations = ceil_div(SHAPE_K, kFullKOfAllStages); - const uint32_t warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); - const uint32_t lane_idx = get_lane_id(); - - // Prefetch TMA descriptors at the very beginning - if (threadIdx.x == kNumMathThreads) { - // NOTES: `reinterpret_cast` must be here, or NVRTC will fail - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_a)); - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_b)); - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_scales_a)); - - // `tensor_map_d` is only used in swizzling mode - // For the `kSwizzleDMode == 0 and BLOCK_N_PADDING == 0` case, it will be treated as padding mode - if constexpr (kSwizzleDMode > 0) - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_d)); - } - __syncwarp(); - - // Align to 1024 bytes for swizzle-128B - extern __shared__ __align__(1024) uint8_t smem_buffer[]; - DG_STATIC_ASSERT(SMEM_D_SIZE % 1024 == 0, "Shared memory of A/B must be aligned to 1024 bytes"); - - // Data on shared memory - auto smem_d = reinterpret_cast<__nv_bfloat16*>(smem_buffer); - __nv_fp8_e4m3* smem_a[kNumStages]; - __nv_fp8_e4m3* smem_b[kNumStages]; - float* smem_scales_a[kNumStages]; - float* smem_scales_b; - - // TMA Barrier for both divisible and non-divisible cases - Barrier* full_barriers[kNumStages]; - Barrier* empty_barriers[kNumStages]; - - // Fill shared memory pointers - #pragma unroll - for (uint32_t i = 0; i < kNumStages; ++ i) { - smem_a[i] = 
reinterpret_cast<__nv_fp8_e4m3*>(smem_buffer + SMEM_D_SIZE + i * SMEM_A_SIZE_PER_STAGE); - smem_b[i] = reinterpret_cast<__nv_fp8_e4m3*>(smem_buffer + SMEM_D_SIZE + kNumStages * SMEM_A_SIZE_PER_STAGE + i * SMEM_B_SIZE_PER_STAGE); - smem_scales_a[i] = reinterpret_cast(smem_buffer + SMEM_D_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE) + i * SMEM_SCALES_A_SIZE_PER_STAGE); - } - smem_scales_b = reinterpret_cast(smem_buffer + SMEM_D_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SCALES_A_SIZE_PER_STAGE)); - - // Fill barriers - auto barrier_start_ptr = reinterpret_cast(reinterpret_cast(smem_scales_b) + SMEM_SCALES_B_SIZE); - #pragma unroll - for (uint32_t i = 0; i < kNumStages; ++ i) { - full_barriers[i] = barrier_start_ptr + i; - empty_barriers[i] = barrier_start_ptr + kNumStages + i; - } - - // Initialize barriers - DG_STATIC_ASSERT(kNumTMAMulticast <= 32, "Too many TMA multicast"); - if (threadIdx.x == kNumMathThreads) { - // NOTES: we always use `lane_idx` to arrive for the `lane_idx`-th CTA in the cluster, - // even with TMA multicast disabled, we want to make the behavior aligned - #pragma unroll - for (uint32_t i = 0; i < kNumStages; ++ i) { - full_barriers[i]->init(1); - empty_barriers[i]->init(kNumTMAMulticast * kNumMathThreads / 32); - } - - // Make initialized barrier visible in async proxy - cutlass::arch::fence_view_async_shared(); - (kNumTMAMulticast > 1) ? cutlass::arch::fence_barrier_init() : void(); - } - - // Synchronize all threads to make barrier visible in normal memory model - (kNumTMAMulticast > 1) ? 
cute::cluster_sync() : __syncthreads(); - - // For pipeline unrolling - struct DivisibleK {}; - struct NotDivisibleK {}; - struct SkipComputation {}; - struct NotSkipComputation {}; - auto launch_k_iterations = [](const auto& func, bool skip_computation, uint32_t num_former_iters) { - constexpr bool kShouldOptimize = BLOCK_K / constexpr_gcd(BLOCK_K, BLOCK_N) <= 4 and not kMustUseUniformedScaleB; - constexpr uint32_t kGap = constexpr_gcd(BLOCK_K, BLOCK_N) / 8; - constexpr uint32_t kEnd = kShouldOptimize ? BLOCK_K / 8 : 0; - - // NOTES: for too-many branches (> 5), we disable this optimization - // Otherwise, the compiler must know the dynamic variable `num_former_iters`'s real value - outer_launch_k_iterations<0, kGap, kEnd>([=](const auto& func, auto num_former_iters_type) { - if (skip_computation) { - for (uint32_t k_iter = 0; k_iter < kNumIterations; ++ k_iter) - func(k_iter, DivisibleK{}, SkipComputation{}, num_former_iters_type); - } else if (SHAPE_K % kFullKOfAllStages == 0) { - for (uint32_t k_iter = 0; k_iter < kNumIterations; ++ k_iter) - func(k_iter, DivisibleK{}, NotSkipComputation{}, num_former_iters_type); - } else { - for (uint32_t k_iter = 0; k_iter < kNumIterations - 1; ++ k_iter) - func(k_iter, DivisibleK{}, NotSkipComputation{}, num_former_iters_type); - func(kNumIterations - 1, NotDivisibleK{}, NotSkipComputation{}, num_former_iters_type); - } - }, func, kShouldOptimize ? 
num_former_iters : 0); - }; - - // Register reconfigurations - constexpr uint32_t kNumTMARegisters = 40; - constexpr uint32_t kNumMathRegisters = 232; - - // Block scheduler - uint32_t m_block_idx, n_block_idx; - auto scheduler = Scheduler(shape_m, grouped_layout); - - if (threadIdx.x >= kNumMathThreads) { - // TMA warp-group for loading data - cutlass::arch::warpgroup_reg_dealloc(); - - // NOTES: only one thread (or warp) will be used - if (threadIdx.x == kNumMathThreads) { - // Persistently schedule over blocks - while (scheduler.get_next_block(m_block_idx, n_block_idx)) { - launch_k_iterations([&](uint32_t k_iter, auto divisible_type, auto _, auto __) { - constexpr bool kHasDivisibleStages = std::is_same_v; - constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? kNumStages : (SHAPE_K % kFullKOfAllStages) / BLOCK_K; - - // Assign TMA multicast number into A and B - // NOTES: there may be additional odd rows/columns or cases where multicast is not possible. - const bool is_tma_multicast_valid = scheduler.is_tma_multicast_valid(m_block_idx); - const uint32_t num_tma_multicast_a = (kIsTMAMulticastOnA and is_tma_multicast_valid) ? kNumTMAMulticast : 1; - const uint32_t num_tma_multicast_b = (not kIsTMAMulticastOnA and is_tma_multicast_valid) ? kNumTMAMulticast : 1; - DG_STATIC_ASSERT(kNumTMAMulticast <= 2, "Scheduler does not support > 2 TMA multicast"); - - // NOTES: unrolling and `kNumInnerStages` are vital for performance, NVCC will try to eliminate all - // shared memory pointers, e.g. 
`full_barriers` registers, if all the access indices are constant - #pragma unroll - for (uint32_t s = 0; s < kNumInnerStages; ++ s) { - // Wait consumer release - empty_barriers[s]->wait((scheduler.current_iter * kNumIterations + k_iter + 1) & 1); - - // Issue TMA A - auto& full_barrier = *full_barriers[s]; - uint32_t k_idx = k_iter * kFullKOfAllStages + s * BLOCK_K; - tma_copy(&tensor_map_a, reinterpret_cast(&full_barrier), - smem_a[s], k_idx, scheduler.get_global_idx(shape_m, BLOCK_M, m_block_idx), - num_tma_multicast_a); - tma_copy(&tensor_map_scales_a, reinterpret_cast(&full_barrier), - smem_scales_a[s], m_block_idx * BLOCK_M, - scheduler.get_global_idx(SHAPE_K_SCALES, 1, k_idx / BLOCK_K), - num_tma_multicast_a); - - // Issue TMA B - tma_copy(&tensor_map_b, reinterpret_cast(&full_barrier), - smem_b[s], k_idx, scheduler.get_global_idx(SHAPE_N, BLOCK_N, n_block_idx, m_block_idx), - num_tma_multicast_b); - full_barrier.arrive_and_expect_tx(SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SCALES_A_SIZE_PER_STAGE); - } - - // Wait unaligned cases - #pragma unroll - for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { - empty_barriers[s]->wait((scheduler.current_iter * kNumIterations + k_iter + 1) & 1); - full_barriers[s]->arrive(); - } - }, false, 0); - } - - // To safely deconstruct distributed shared barriers, we need another round of empty waits - if constexpr (kNumTMAMulticast > 1) { - #pragma unroll - for (uint32_t s = 0; s < kNumStages; ++ s) - empty_barriers[s]->wait((scheduler.current_iter * kNumIterations + 1) & 1); - } - } - } else { - // Math warp-groups for WGMMA - cutlass::arch::warpgroup_reg_alloc(); - - // NOTES: use `__shfl_sync` to encourage NVCC to use unified registers - const auto math_wg_idx = __shfl_sync(0xffffffff, threadIdx.x / kNumMathThreadsPerGroup, 0); - const auto r_0 = warp_idx * 16 + lane_idx / 4, r_1 = r_0 + 8; - - // Persistently schedule over blocks - while (scheduler.get_next_block(m_block_idx, n_block_idx)) { - // 
Decide the number of scales B to load - DG_STATIC_ASSERT(SHAPE_N % 8 == 0, "Invalid shape N"); - uint32_t num_former_iters = BLOCK_N / 8, num_full_iters = num_former_iters; - if constexpr (not kMustUseUniformedScaleB) { - num_former_iters = min(BLOCK_N, BLOCK_K - n_block_idx * BLOCK_N % BLOCK_K) / 8; - num_full_iters = min(SHAPE_N - n_block_idx * BLOCK_N, BLOCK_N) / 8; - } - uint32_t num_scales_b = SHAPE_K_SCALES * (num_former_iters >= num_full_iters ? 1 : 2); - - // Load B scales with math warp-groups - // NOTES: except the first warp, we want to overlap loading B scales with TMA stores between tasks - if (threadIdx.x >= 32) { - auto num_previous_lines = scheduler.get_global_idx(ceil_div(SHAPE_N, BLOCK_K), 0, 0, m_block_idx); - auto local_scales_b = scales_b + (num_previous_lines + ((n_block_idx * BLOCK_N) / BLOCK_K)) * SHAPE_K_SCALES; - #pragma unroll - for (uint32_t i = threadIdx.x - 32; i < num_scales_b; i += kNumMathThreads - 32) - st_shared(smem_scales_b + i, __ldg(local_scales_b + i)); - } - cutlass::arch::NamedBarrier(kNumMathThreads).sync(); - - // Accumulation for WGMMA or CUDA promotion - constexpr uint32_t WAVE_BLOCK_M = WGMMA::M * get_num_math_warpgroups(BLOCK_M); - DG_STATIC_ASSERT(BLOCK_M % WAVE_BLOCK_M == 0, "Invalid block sizes"); - float accum[WGMMA::kNumAccum], final_accum[WGMMA::kNumAccum * (BLOCK_M / WAVE_BLOCK_M)] = {0}; - - // Empty barrier arrival - auto empty_barrier_arrive = [&](uint32_t s) { - if constexpr (kNumTMAMulticast == 1) { - lane_idx == 0 ? empty_barriers[s]->arrive() : void(); - } else { - auto target_cta = scheduler.is_peer_cta_alive ? lane_idx : cute::block_rank_in_cluster(); - lane_idx < kNumTMAMulticast ? 
empty_barriers[s]->arrive(target_cta) : void(); - } - }; - - // Launch MMAs - launch_k_iterations([&](uint32_t k_iter, auto divisible_type, auto skip_type, auto _) { - constexpr bool kSkipComputation = std::is_same_v; - constexpr bool kHasDivisibleStages = std::is_same_v; - constexpr uint32_t kNumInnerStages = kSkipComputation ? 0 : - (kHasDivisibleStages ? kNumStages : (SHAPE_K % kFullKOfAllStages) / BLOCK_K); - - #pragma unroll - for (uint32_t s = 0; s < kNumInnerStages; ++ s) { - // Read B scales - float scale_b_0 = ld_shared(smem_scales_b + k_iter * kNumStages + s), scale_b_1; - // NOTES: even some blocks do not need to read the second row, but we still load one to align with other blocks - if constexpr (not kMustUseUniformedScaleB) - scale_b_1 = ld_shared(smem_scales_b + k_iter * kNumStages + s + SHAPE_K_SCALES); - - // Wait TMA arrivals - full_barriers[s]->wait((scheduler.current_iter * kNumIterations + k_iter) & 1); - - // TODO: remove some useless computation for unaligned Ms - #pragma unroll - for (uint32_t local_idx = 0; local_idx < BLOCK_M / WAVE_BLOCK_M; ++ local_idx) { - auto m_offset = local_idx * WAVE_BLOCK_M; - - // Read A scales - // NOTES: all shared memory read must be prior to `warpgroup_arrive` to avoid next scheduled block polluting the results - auto scale_a_0 = ld_shared(smem_scales_a[s] + r_0 + m_offset); - auto scale_a_1 = ld_shared(smem_scales_a[s] + r_1 + m_offset); - - // Commit WGMMA instructions - #pragma unroll - for (uint32_t i = 0; i < WGMMA::kNumAccum; ++ i) - warpgroup_fence_operand(accum[i]); - warpgroup_arrive(); - #pragma unroll - for (uint32_t k = 0; k < BLOCK_K / WGMMA::K; ++ k) { - auto desc_a = make_smem_desc(smem_a[s] + (math_wg_idx * WGMMA::M + m_offset) * BLOCK_K + k * WGMMA::K, 1); - auto desc_b = make_smem_desc(smem_b[s] + k * WGMMA::K, 1); - WGMMA::wgmma(desc_a, desc_b, accum, k); - } - warpgroup_commit_batch(); - #pragma unroll - for (uint32_t i = 0; i < WGMMA::kNumAccum; ++ i) - warpgroup_fence_operand(accum[i]); - 
warpgroup_wait<0>(); - - // Notify barrier arrival at the last warpgroup wave - if (local_idx == BLOCK_M / WAVE_BLOCK_M - 1) - empty_barrier_arrive(s); - - // Promote with scales - // NOTES: making it as predicates is very important for performance, comparing to two loops - float scale_0_0 = scale_a_0 * scale_b_0, scale_1_0 = scale_a_1 * scale_b_0; - float scale_0_1, scale_1_1; - if constexpr (not kMustUseUniformedScaleB) - scale_0_1 = scale_a_0 * scale_b_1, scale_1_1 = scale_a_1 * scale_b_1; - - auto shifted_accum = final_accum + WGMMA::kNumAccum * local_idx; - #pragma unroll - for (uint32_t i = 0; i < WGMMA::kNumAccum / 4; ++ i) { - // NOTES: for unrolled `num_former_iters` cases, we expect the compiler to automatically make it a constant - bool predicate = kMustUseUniformedScaleB or i < num_former_iters; - shifted_accum[i * 4 + 0] += (predicate ? scale_0_0 : scale_0_1) * accum[i * 4 + 0]; - shifted_accum[i * 4 + 1] += (predicate ? scale_0_0 : scale_0_1) * accum[i * 4 + 1]; - shifted_accum[i * 4 + 2] += (predicate ? scale_1_0 : scale_1_1) * accum[i * 4 + 2]; - shifted_accum[i * 4 + 3] += (predicate ? scale_1_0 : scale_1_1) * accum[i * 4 + 3]; - } - } - } - - // Wait unaligned cases - #pragma unroll - for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { - full_barriers[s]->wait((scheduler.current_iter * kNumIterations + k_iter) & 1); - empty_barrier_arrive(s); - } - }, not scheduler.is_computation_valid(m_block_idx, math_wg_idx * WGMMA::M), num_former_iters); - - // TMA checks - constexpr uint32_t kNumElemBytes = sizeof(nv_bfloat16); - constexpr uint32_t TMA_D_BLOCK_N = kSwizzleDMode == 0 ? 
BLOCK_N : (kSwizzleDMode / kNumElemBytes); - constexpr uint32_t WGMMA_M_PER_WARP = WGMMA::M / 4; - DG_STATIC_ASSERT(BLOCK_M % 8 == 0, "Invalid swizzling atom"); - DG_STATIC_ASSERT(BLOCK_N % TMA_D_BLOCK_N == 0 and BLOCK_N / TMA_D_BLOCK_N <= 32, - "Unaligned TMA store or too many TMA store instructions"); - DG_STATIC_ASSERT(TMA_D_BLOCK_N % 8 == 0, "Invalid TMA block N"); - DG_STATIC_ASSERT(static_cast(kSwizzleDMode > 0) + static_cast(BLOCK_N_PADDING > 0) <= 1, - "Swizzling and padding are not compatible"); - - // Wait last TMA store to be finished - if (threadIdx.x < BLOCK_N / TMA_D_BLOCK_N) - cute::tma_store_wait<0>(); - cutlass::arch::NamedBarrier(kNumMathThreads).sync(); - - // Write back to shared memory using STSM and issue TMA stores - DG_STATIC_ASSERT(WGMMA::kNumAccum % 4 == 0, "Invalid STSM x2 vectorization"); - #pragma unroll - for (uint32_t local_idx = 0; local_idx < BLOCK_M / WAVE_BLOCK_M; ++ local_idx) { - auto m_offset = local_idx * WAVE_BLOCK_M; - auto shifted_accum = final_accum + WGMMA::kNumAccum * local_idx; - #pragma unroll - for (auto i = 0; i < WGMMA::kNumAccum / 4; ++ i) { - // Swizzle or padding into the correct address - uint8_t* smem_ptr = nullptr; - if constexpr (kSwizzleDMode > 0) { - // Calculate the swizzling atom offset and in-atom offset - constexpr uint32_t kNumBankGroupBytes = 16; - auto atom_offset = i / (TMA_D_BLOCK_N / 8), in_atom_offset = i % (TMA_D_BLOCK_N / 8); - - // Calculate the index of the bank group to be written in the atom - auto bank_group_index = in_atom_offset + lane_idx * (kSwizzleDMode / kNumBankGroupBytes); - - // Reshape the atom in another view and swizzle - // - original: `(BLOCK_M, kSwizzleDMode / kNumBankGroupBytes)` - // - new: `(BLOCK_M * kSwizzleDMode / kNumBankGroupBytes / 8, 8)` - constexpr bool kHasShortcut = (kSwizzleDMode / kNumBankGroupBytes) == 8; - auto row = kHasShortcut ? (in_atom_offset / 8 + lane_idx) : (bank_group_index / 8); - auto col = kHasShortcut ? 
(in_atom_offset) : (bank_group_index % 8); - col ^= row % (kSwizzleDMode / 16); - - // Add back into the base pointer - // NOTES: think twice before modifying this, as changes may affect the number of instructions - smem_ptr = reinterpret_cast(smem_d) + // Base pointer - warp_idx * (WGMMA_M_PER_WARP * kSwizzleDMode) + // Warp offset - m_offset * kSwizzleDMode + // Wave offset - atom_offset * BLOCK_M * kSwizzleDMode + // Swizzle atom offset (constants) - row * (kNumBankGroupBytes * 8) + col * kNumBankGroupBytes; // In-atom offset - } else { - // No swizzling, just padding - // NOTES: padding must be zero for BF16 output - DG_STATIC_ASSERT(BLOCK_N_PADDING == 0, "Padding must be zero for BF16 output"); - smem_ptr = reinterpret_cast(smem_d + (m_offset + warp_idx * WGMMA_M_PER_WARP + lane_idx) * (BLOCK_N + BLOCK_N_PADDING) + i * 8); - } - - // NOTES: only 16 lanes' addresses are used - SM90_U32x2_STSM_N::copy( - __float22bfloat162_rn({shifted_accum[i * 4 + 0], shifted_accum[i * 4 + 1]}), - __float22bfloat162_rn({shifted_accum[i * 4 + 2], shifted_accum[i * 4 + 3]}), - smem_ptr - ); - } - } - cute::tma_store_fence(); - cutlass::arch::NamedBarrier(kNumMathThreads).sync(); - - // Use TMA store to write back to global memory - // TODO: compatible with FP32 output - DG_STATIC_ASSERT(kNumMathThreads >= BLOCK_N / TMA_D_BLOCK_N, "Too many TMA blocks"); - if (threadIdx.x < BLOCK_N / TMA_D_BLOCK_N) { - auto in_block_n_offset = threadIdx.x * TMA_D_BLOCK_N; - auto smem_ptr = smem_d + in_block_n_offset * BLOCK_M; - cute::SM90_TMA_STORE_2D::copy(&tensor_map_d, smem_ptr, - n_block_idx * BLOCK_N + in_block_n_offset, - scheduler.get_global_idx(shape_m, BLOCK_M, m_block_idx)); - cute::tma_store_arrive(); - } - __syncwarp(); - } - } -#else - if (blockIdx.x == 0 and threadIdx.x == 0) - DG_DEVICE_ASSERT(false and "This kernel only support sm_90a"); -#endif -} - -}; // namespace deep_gemm - -#pragma clang diagnostic pop \ No newline at end of file diff --git 
a/deep_gemm/include/deep_gemm/fp8_wgrad_gemm.cuh b/deep_gemm/include/deep_gemm/fp8_wgrad_gemm.cuh deleted file mode 100644 index 7b7e3d31..00000000 --- a/deep_gemm/include/deep_gemm/fp8_wgrad_gemm.cuh +++ /dev/null @@ -1,363 +0,0 @@ -#pragma once - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunknown-attributes" - -#include -#include - -#include -#include -#include - -#include "mma_utils.cuh" -#include "scheduler.cuh" -#include "tma_utils.cuh" -#include "utils.cuh" - -namespace deep_gemm { - -template -__global__ void __launch_bounds__(get_num_threads_per_sm(BLOCK_M), 1) -fp8_wgrad_gemm_kernel(uint32_t shape_k, - const __grid_constant__ CUtensorMap tensor_map_a, - const __grid_constant__ CUtensorMap tensor_map_b, - const __grid_constant__ CUtensorMap tensor_map_scales_a, - const __grid_constant__ CUtensorMap tensor_map_scales_b, - const __grid_constant__ CUtensorMap tensor_map_d) { -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) || defined(__CLION_IDE__) - // Scaling checks - DG_STATIC_ASSERT(BLOCK_K == 128, "Only support per-128-channel FP8 scaling"); - - // Types - using WGMMA = typename FP8MMASelector::type; - using Barrier = cutlass::arch::ClusterTransactionBarrier; - DG_STATIC_ASSERT(BLOCK_M % WGMMA::M == 0, "Invalid block size"); - - // Shared memory - static constexpr uint32_t SMEM_D_SIZE = BLOCK_M * BLOCK_N * sizeof(float); - static constexpr uint32_t SMEM_A_SIZE_PER_STAGE = BLOCK_M * BLOCK_K * sizeof(__nv_fp8_e4m3); - static constexpr uint32_t SMEM_B_SIZE_PER_STAGE = BLOCK_N * BLOCK_K * sizeof(__nv_fp8_e4m3); - static constexpr uint32_t SMEM_SCALES_A_SIZE_PER_STAGE = BLOCK_M * sizeof(float); - static constexpr uint32_t SMEM_SCALES_B_SIZE_PER_STAGE = BLOCK_N * sizeof(float); - static constexpr uint32_t ALIGNED_SMEM_SCALES_B_SIZE_PER_STAGE = ceil_div(SMEM_SCALES_B_SIZE_PER_STAGE, 128U) * 128U; - - // Configs - constexpr uint32_t kFullKOfAllStages = kNumStages * BLOCK_K; - constexpr uint32_t kNumThreads = 
get_num_threads_per_sm(BLOCK_M); - constexpr uint32_t kNumMathThreads = kNumThreads - kNumTMAThreads; - - const uint32_t shape_k_scales = ceil_div(shape_k, BLOCK_K); - const uint32_t num_iterations = ceil_div(shape_k, kFullKOfAllStages); - const uint32_t warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); - const uint32_t lane_idx = get_lane_id(); - - // Prefetch TMA descriptors at the very beginning - if (threadIdx.x == kNumMathThreads) { - // NOTES: `reinterpret_cast` must be here, or NVRTC will fail - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_a)); - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_b)); - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_scales_a)); - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_scales_b)); - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_d)); - } - __syncwarp(); - - // Align to 1024 bytes for swizzle-128B - extern __shared__ __align__(1024) uint8_t smem_buffer[]; - DG_STATIC_ASSERT(SMEM_D_SIZE % 1024 == 0, "Shared memory of A/B must be aligned to 1024 bytes"); - - // Data on shared memory - auto smem_d = reinterpret_cast(smem_buffer); - __nv_fp8_e4m3* smem_a[kNumStages]; - __nv_fp8_e4m3* smem_b[kNumStages]; - float* smem_scales_a[kNumStages]; - float* smem_scales_b[kNumStages]; - - // TMA Barrier for both divisible and non-divisible cases - Barrier* full_barriers[kNumStages + 1]; - Barrier* empty_barriers[kNumStages + 1]; - - // Fill shared memory pointers - #pragma unroll - for (int i = 0; i < kNumStages; ++ i) { - smem_a[i] = reinterpret_cast<__nv_fp8_e4m3*>(smem_buffer + SMEM_D_SIZE + i * SMEM_A_SIZE_PER_STAGE); - smem_b[i] = reinterpret_cast<__nv_fp8_e4m3*>(smem_buffer + SMEM_D_SIZE + kNumStages * SMEM_A_SIZE_PER_STAGE + i * SMEM_B_SIZE_PER_STAGE); - smem_scales_a[i] = reinterpret_cast(smem_buffer + SMEM_D_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE) - + i * SMEM_SCALES_A_SIZE_PER_STAGE); - smem_scales_b[i] = 
reinterpret_cast(smem_buffer + SMEM_D_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SCALES_A_SIZE_PER_STAGE) - + i * ALIGNED_SMEM_SCALES_B_SIZE_PER_STAGE); - } - - // Fill barriers - DG_STATIC_ASSERT(sizeof(Barrier) % sizeof(float) == 0, "Misaligned barriers"); - auto barrier_start_ptr = reinterpret_cast(smem_buffer + SMEM_D_SIZE + kNumStages - * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SCALES_A_SIZE_PER_STAGE + ALIGNED_SMEM_SCALES_B_SIZE_PER_STAGE)); - #pragma unroll - for (int i = 0; i < kNumStages + 1; ++ i) { - full_barriers[i] = barrier_start_ptr + i; - empty_barriers[i] = barrier_start_ptr + kNumStages + 1 + i; - } - - // Initialize barriers - DG_STATIC_ASSERT(kNumTMAMulticast <= 32, "To many TMA multicast"); - if (threadIdx.x == kNumMathThreads) { - // NOTES: we always use `lane_idx` to arrive for the `lane_idx`-th CTA in the cluster, - // even with TMA multicast disabled, we want to make the behavior aligned - #pragma unroll - for (int i = 0; i < kNumStages; ++ i) { - full_barriers[i]->init(1); - empty_barriers[i]->init(kNumTMAMulticast * kNumMathThreads / 32); - } - full_barriers[kNumStages]->init(1); - empty_barriers[kNumStages]->init(1); - - // Make initialized barrier visible in async proxy - cutlass::arch::fence_view_async_shared(); - (kNumTMAMulticast > 1) ? cutlass::arch::fence_barrier_init() : void(); - } - - // Synchronize all threads to make barrier visible in normal memory model - (kNumTMAMulticast > 1) ? 
cute::cluster_sync() : __syncthreads(); - - // For pipeline unrolling - struct DivisibleK {}; - struct NotDivisibleK {}; - auto launch_k_iterations = [&](const auto& func) { - if constexpr (kNumLastStages == 0) { - for (int k_iter = 0; k_iter < num_iterations; ++ k_iter) - func(k_iter, DivisibleK{}); - } else { - for (int k_iter = 0; k_iter < num_iterations - 1; ++ k_iter) - func(k_iter, DivisibleK{}); - func(num_iterations - 1, NotDivisibleK{}); - } - }; - - // Register reconfigurations - constexpr int kNumTMARegisters = 40; - constexpr int kNumMathRegisters = 232; - - // Block scheduler - uint32_t m_block_idx, n_block_idx; - auto scheduler = Scheduler(SHAPE_M); - - if (threadIdx.x >= kNumMathThreads) { - // TMA warp-group for loading data - cutlass::arch::warpgroup_reg_dealloc(); - - // NOTES: only one thread (or warp) will be used - if (threadIdx.x == kNumMathThreads) { - // Persistently schedule over blocks - while (scheduler.get_next_block(m_block_idx, n_block_idx)) { - launch_k_iterations([&](int k_iter, auto type) { - constexpr bool kHasDivisibleStages = std::is_same_v; - constexpr int kNumInnerStages = kHasDivisibleStages ? kNumStages : kNumLastStages; - DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); - - // Assign TMA multicast number into A and B - // NOTES: there may be additional odd rows/columns or cases where multicast is not possible. - const bool is_tma_multicast_valid = scheduler.is_tma_multicast_valid(m_block_idx); - const uint32_t num_tma_multicast_a = (kIsTMAMulticastOnA and is_tma_multicast_valid) ? kNumTMAMulticast : 1; - const uint32_t num_tma_multicast_b = (not kIsTMAMulticastOnA and is_tma_multicast_valid) ? 
kNumTMAMulticast : 1; - DG_STATIC_ASSERT(kNumTMAMulticast <= 2, "Scheduler does not support > 2 TMA multicast"); - - #pragma unroll - for (uint32_t s = 0; s < kNumInnerStages; ++ s) { - // Wait consumer release - empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); - - // Issue TMA A - auto& full_barrier = *full_barriers[s]; - int k_idx = k_iter * kFullKOfAllStages + s * BLOCK_K; - tma_copy(&tensor_map_a, reinterpret_cast(&full_barrier), - smem_a[s], k_idx, m_block_idx * BLOCK_M, num_tma_multicast_a); - tma_copy(&tensor_map_scales_a, reinterpret_cast(&full_barrier), - smem_scales_a[s], m_block_idx * BLOCK_M, - k_idx / BLOCK_K, num_tma_multicast_a); - - // Issue TMA B - tma_copy(&tensor_map_b, reinterpret_cast(&full_barrier), - smem_b[s], k_idx, n_block_idx * BLOCK_N, num_tma_multicast_b); - tma_copy(&tensor_map_scales_b, reinterpret_cast(&full_barrier), - smem_scales_b[s], n_block_idx * BLOCK_N, k_idx / BLOCK_K, num_tma_multicast_b); - - full_barrier.arrive_and_expect_tx(SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SCALES_A_SIZE_PER_STAGE + SMEM_SCALES_B_SIZE_PER_STAGE); - } - - // Wait unaligned cases - #pragma unroll - for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { - empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); - full_barriers[s]->arrive(); - } - }); - - // Issue TMA D - empty_barriers[kNumStages]->wait((scheduler.current_iter + 1) & 1); - auto& full_barrier = *full_barriers[kNumStages]; - tma_copy(&tensor_map_d, reinterpret_cast(&full_barrier), - smem_d, n_block_idx * BLOCK_N, m_block_idx * BLOCK_M, 1); - full_barrier.arrive_and_expect_tx(SMEM_D_SIZE); - } - - // To safely deconstruct distributed shared barriers, we need another round of empty waits - if constexpr (kNumTMAMulticast > 1) { - #pragma unroll - for (uint32_t s = 0; s < kNumStages; ++ s) - empty_barriers[s]->wait((scheduler.current_iter * num_iterations + 1) & 1); - } - } - } else { - // Math warp-groups for 
WGMMA - cutlass::arch::warpgroup_reg_alloc(); - - // NOTES: use `__shfl_sync` to encourage NVCC to use unified registers - const auto math_wg_idx = __shfl_sync(0xffffffff, threadIdx.x / kNumMathThreadsPerGroup, 0); - const auto row_idx = lane_idx / 4, col_idx = lane_idx % 4; - const auto r_0 = warp_idx * 16 + row_idx, r_1 = r_0 + 8; - - // Empty barrier arrival - auto empty_barrier_arrive = [&](int s) { - if constexpr (kNumTMAMulticast == 1) { - lane_idx == 0 ? empty_barriers[s]->arrive() : void(); - } else { - auto target_cta = scheduler.is_peer_cta_alive ? lane_idx : cute::block_rank_in_cluster(); - lane_idx < kNumTMAMulticast ? empty_barriers[s]->arrive(target_cta) : void(); - } - }; - - // Persistently schedule over blocks - while (scheduler.get_next_block(m_block_idx, n_block_idx)) { - // Decide the number of scales B to load - DG_STATIC_ASSERT(SHAPE_N % 8 == 0, "Invalid shape N"); - cutlass::arch::NamedBarrier(kNumMathThreads).sync(); - - // Accumulation for WGMMA or CUDA promotion - constexpr int WAVE_BLOCK_M = WGMMA::M * get_num_math_warpgroups(BLOCK_M); - float accum[WGMMA::kNumAccum], final_accum[WGMMA::kNumAccum * (BLOCK_M / WAVE_BLOCK_M)] = {0}; - float2 scales_b[WGMMA::kNumAccum / 4]; - - // Launch MMAs - launch_k_iterations([&](int k_iter, auto type) { - constexpr bool kHasDivisibleStages = std::is_same_v; - constexpr int kNumInnerStages = kHasDivisibleStages ? 
kNumStages : kNumLastStages; - DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); - - #pragma unroll - for (int s = 0; s < kNumInnerStages; ++ s) { - // Wait TMA arrivals - full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); - - #pragma unroll - for (uint32_t local_idx = 0; local_idx < BLOCK_M / WAVE_BLOCK_M; ++ local_idx) { - auto m_offset = local_idx * WAVE_BLOCK_M; - - // Read A scales - auto scale_a_0 = ld_shared(smem_scales_a[s] + r_0 + m_offset); - auto scale_a_1 = ld_shared(smem_scales_a[s] + r_1 + m_offset); - - // Commit WGMMA instructions - #pragma unroll - for (int i = 0; i < WGMMA::kNumAccum; ++ i) - warpgroup_fence_operand(accum[i]); - warpgroup_arrive(); - #pragma unroll - for (int k = 0; k < BLOCK_K / WGMMA::K; ++ k) { - auto desc_a = make_smem_desc(smem_a[s] + (math_wg_idx * WGMMA::M + m_offset) * BLOCK_K + k * WGMMA::K, 1); - auto desc_b = make_smem_desc(smem_b[s] + k * WGMMA::K, 1); - WGMMA::wgmma(desc_a, desc_b, accum, k); - } - warpgroup_commit_batch(); - - // Read B scales at the first warpgroup wave - if (local_idx == 0) { - #pragma unroll - for (int i = 0; i < WGMMA::kNumAccum / 4; ++i) - scales_b[i] = ld_shared(reinterpret_cast(smem_scales_b[s] + i * 8 + col_idx * 2)); - __syncwarp(); - } - - #pragma unroll - for (int i = 0; i < WGMMA::kNumAccum; ++ i) - warpgroup_fence_operand(accum[i]); - warpgroup_wait<0>(); - - // Notify barrier arrival at the last warpgroup wave - if (local_idx == BLOCK_M / WAVE_BLOCK_M - 1) - empty_barrier_arrive(s); - - // Promote with scales - auto shifted_accum = final_accum + WGMMA::kNumAccum * local_idx; - #pragma unroll - for (int i = 0; i < WGMMA::kNumAccum / 4; ++ i) { - const float &scale_b_0 = scales_b[i].x; - const float &scale_b_1 = scales_b[i].y; - shifted_accum[i * 4 + 0] += scale_a_0 * scale_b_0 * accum[i * 4 + 0]; - shifted_accum[i * 4 + 1] += scale_a_0 * scale_b_1 * accum[i * 4 + 1]; - shifted_accum[i * 4 + 2] += scale_a_1 * scale_b_0 * accum[i * 4 + 
2]; - shifted_accum[i * 4 + 3] += scale_a_1 * scale_b_1 * accum[i * 4 + 3]; - } - } - } - - // Wait last TMA store to be finished - if (k_iter == 0 and scheduler.current_iter > 0) { - if (threadIdx.x == 0) { - cute::tma_store_wait<0>(); - empty_barriers[kNumStages]->arrive(); - } - __syncwarp(); - } - - // Wait unaligned cases - #pragma unroll - for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { - full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); - empty_barrier_arrive(s); - } - }); - - // Wait TMA D arrivals - full_barriers[kNumStages]->wait(scheduler.current_iter & 1); - - // Accumulate to D shared memory - #pragma unroll - for (uint32_t local_idx = 0; local_idx < BLOCK_M / WAVE_BLOCK_M; ++ local_idx) { - auto m_offset = local_idx * WAVE_BLOCK_M; - auto shifted_accum = final_accum + WGMMA::kNumAccum * local_idx; - auto smem_d_0 = reinterpret_cast(smem_d + (m_offset + r_0) * BLOCK_N + col_idx * 2); - auto smem_d_1 = reinterpret_cast(smem_d + (m_offset + r_1) * BLOCK_N + col_idx * 2); - #pragma unroll - for (auto i = 0; i < WGMMA::kNumAccum / 4; ++ i) { - float2 d_0 = ld_shared(smem_d_0 + i * 4); - st_shared(smem_d_0 + i * 4, {d_0.x + shifted_accum[i * 4 + 0], d_0.y + shifted_accum[i * 4 + 1]}); - float2 d_1 = ld_shared(smem_d_1 + i * 4); - st_shared(smem_d_1 + i * 4, {d_1.x + shifted_accum[i * 4 + 2], d_1.y + shifted_accum[i * 4 + 3]}); - } - } - - cute::tma_store_fence(); - cutlass::arch::NamedBarrier(kNumMathThreads).sync(); - - // Use TMA store to write back to global memory - if (threadIdx.x == 0) { - cute::SM90_TMA_STORE_2D::copy(&tensor_map_d, smem_d, n_block_idx * BLOCK_N, m_block_idx * BLOCK_M); - cute::tma_store_arrive(); - } - __syncwarp(); - } - } -#else - if (blockIdx.x == 0 and threadIdx.x == 0) - DG_DEVICE_ASSERT(false && "This kernel only support sm_90a"); -#endif -} - -}; // namespace deep_gemm - -#pragma clang diagnostic pop diff --git a/deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh 
b/deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh new file mode 100644 index 00000000..28b5399a --- /dev/null +++ b/deep_gemm/include/deep_gemm/impls/sm100_bf16_gemm.cuh @@ -0,0 +1,3 @@ +#pragma once + +// TODO: add implement \ No newline at end of file diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh new file mode 100644 index 00000000..12bbc20c --- /dev/null +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh @@ -0,0 +1,624 @@ +#pragma once +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunknown-attributes" + +#include + +#include +#include +#include + +namespace deep_gemm { + +using namespace deep_gemm::sm100; + +template +__global__ void __launch_bounds__(kNumNonEpilogueThreads + kNumEpilogueThreads, 1) +sm100_fp8_gemm_1d1d_impl(int* grouped_layout, + uint32_t shape_m, uint32_t shape_n, uint32_t shape_k, + const __grid_constant__ CUtensorMap tensor_map_a, + const __grid_constant__ CUtensorMap tensor_map_b, + const __grid_constant__ CUtensorMap tensor_map_sfa, + const __grid_constant__ CUtensorMap tensor_map_sfb, + const __grid_constant__ CUtensorMap tensor_map_c, + const __grid_constant__ CUtensorMap tensor_map_d) { +#if (defined(__CUDA_ARCH__) and (__CUDA_ARCH__ >= 1000)) or defined(__CLION_IDE__) + using Barrier = cutlass::arch::ClusterTransactionBarrier; + + // GEMM with accumulation must have FP32 output + if constexpr (kWithAccumulation) + DG_STATIC_ASSERT(std::is_same_v, "Invalid C/D data dtype"); + + // Configs + constexpr uint32_t LAYOUT_AD_M = 128; + constexpr uint32_t kNumMWaves = BLOCK_M / LAYOUT_AD_M; + constexpr uint32_t kNumTMAStoreStages = 2; + constexpr uint32_t kNumSFStagesPerLoad = sizeof(uint32_t) / sizeof(cutlass::float_ue8m0_t); + constexpr uint32_t kNumUTCCPAlignedElems = 128; + DG_STATIC_ASSERT(BLOCK_K == 128, "Invalid block K"); + DG_STATIC_ASSERT(BLOCK_M % LAYOUT_AD_M == 0 and 2 % kNumMWaves == 0, "Invalid block M"); + 
+ // Overwrite shape constants if the compiler gives + shape_m = SHAPE_M != 0 ? SHAPE_M : shape_m; + shape_n = SHAPE_N != 0 ? SHAPE_N : shape_n; + shape_k = SHAPE_K != 0 ? SHAPE_K : shape_k; + const uint32_t shape_sf_k = ceil_div(shape_k, BLOCK_K * kNumSFStagesPerLoad); + + // Utils + bool is_leader_cta = cute::block_rank_in_cluster() == 0; + const auto warp_idx = cutlass::canonical_warp_idx_sync(); + const auto lane_idx = get_lane_idx(); + + // Align to 1024 bytes for swizzle-128B + extern __shared__ __align__(1024) uint8_t smem_buffer[]; + + // 2-CTA MMA + constexpr uint32_t LOAD_BLOCK_M = BLOCK_M / (kIsMulticastOnA ? kNumMulticast: 1); + constexpr uint32_t LOAD_BLOCK_N = BLOCK_N / (kIsMulticastOnA ? 1 : kNumMulticast); + constexpr uint32_t STORE_BLOCK_M = std::min(BLOCK_M, LAYOUT_AD_M); + constexpr uint32_t STORE_BLOCK_N = kSwizzleCDMode / sizeof(cd_dtype_t); + DG_STATIC_ASSERT(not kIsMulticastOnA or kNumMulticast == 1, "Invalid multicast"); + DG_STATIC_ASSERT(LOAD_BLOCK_M == BLOCK_M and BLOCK_M % LAYOUT_AD_M == 0, "Only support tensor memory layout A/D"); + DG_STATIC_ASSERT(kNumMulticast == 1 or kNumMulticast == 2, "Only support 1/2 multicast"); + + // Share memory sizes + constexpr uint32_t SMEM_CD_SIZE_PER_STAGE = STORE_BLOCK_M * kSwizzleCDMode; + constexpr uint32_t SMEM_CD_SIZE = SMEM_CD_SIZE_PER_STAGE * kNumTMAStoreStages; + constexpr uint32_t SMEM_A_SIZE_PER_STAGE = LOAD_BLOCK_M * BLOCK_K * sizeof(__nv_fp8_e4m3); + constexpr uint32_t SMEM_B_SIZE_PER_STAGE = LOAD_BLOCK_N * BLOCK_K * sizeof(__nv_fp8_e4m3); + constexpr uint32_t SF_BLOCK_M = align(BLOCK_M, kNumUTCCPAlignedElems); + constexpr uint32_t SF_BLOCK_N = align(BLOCK_N, kNumUTCCPAlignedElems); + constexpr uint32_t SMEM_SFA_SIZE_PER_STAGE = SF_BLOCK_M * sizeof(uint32_t); + constexpr uint32_t SMEM_SFB_SIZE_PER_STAGE = SF_BLOCK_N * sizeof(uint32_t); + DG_STATIC_ASSERT(SMEM_CD_SIZE % 1024 == 0, "Shared memory of A/B must be aligned to 1024 bytes"); + DG_STATIC_ASSERT(kNumTMAStoreStages >= 1, "Invalid 
number of TMA stages"); + + // Automatically deduce the number of epilogue stages (1 or 2), according to the tensor memory size + // TODO: test cases of `kNumMWaves == 2 and kNumEpilogueStages == 2` + constexpr uint32_t kNumSFATmemCols = SF_BLOCK_M / 32; + constexpr uint32_t kNumSFBTmemCols = SF_BLOCK_N / 32; + constexpr uint32_t kNumEpilogueStages = (2 * kNumMWaves * BLOCK_N + kNumSFATmemCols + kNumSFBTmemCols) > 512 ? 1 : 2; + + // Real tensor memory size and offsets + constexpr uint32_t kNumAccumTmemCols = kNumEpilogueStages * kNumMWaves * BLOCK_N; + constexpr uint32_t kNumTmemCols = get_num_aligned_tmem_cols(); + constexpr uint32_t kTmemStartColOfSFA = kNumAccumTmemCols; + constexpr uint32_t kTmemStartColOfSFB = kNumAccumTmemCols + kNumSFATmemCols; + + // Prefetch TMA descriptors at the very beginning + if (threadIdx.x == 0) { + cute::prefetch_tma_descriptor(&tensor_map_a); + cute::prefetch_tma_descriptor(&tensor_map_b); + cute::prefetch_tma_descriptor(&tensor_map_sfa); + cute::prefetch_tma_descriptor(&tensor_map_sfb); + cute::prefetch_tma_descriptor(&tensor_map_d); + if constexpr (kWithAccumulation) + cute::prefetch_tma_descriptor(&tensor_map_c); + } + + // Data on shared memory (layout as ordered below) + cd_dtype_t* smem_cd[kNumTMAStoreStages]; + cutlass::float_e4m3_t* smem_a[kNumStages]; + cutlass::float_e4m3_t* smem_b[kNumStages]; + uint32_t* smem_sfa[kNumStages]; + uint32_t* smem_sfb[kNumStages]; + + // TMA Barrier for both divisible and non-divisible cases + Barrier* full_barriers[kNumStages]; + Barrier* empty_barriers[kNumStages]; + Barrier* with_sf_full_barriers[kNumStages]; + Barrier* tmem_full_barriers[kNumEpilogueStages]; + Barrier* tmem_empty_barriers[kNumEpilogueStages]; + Barrier* accumulation_full_barrier; + + // Fill D/A/B pointers + #pragma unroll + for (uint32_t i = 0; i < kNumTMAStoreStages; ++ i) + smem_cd[i] = reinterpret_cast(smem_buffer + i * SMEM_CD_SIZE_PER_STAGE); + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + 
smem_a[i] = reinterpret_cast(smem_buffer + SMEM_CD_SIZE + i * SMEM_A_SIZE_PER_STAGE); + smem_b[i] = reinterpret_cast(smem_buffer + SMEM_CD_SIZE + kNumStages * SMEM_A_SIZE_PER_STAGE + i * SMEM_B_SIZE_PER_STAGE); + } + + // Fill SFA/SFB + auto sf_start_ptr = smem_buffer + SMEM_CD_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE); + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + smem_sfa[i] = reinterpret_cast(sf_start_ptr + i * SMEM_SFA_SIZE_PER_STAGE); + smem_sfb[i] = reinterpret_cast(sf_start_ptr + kNumStages * SMEM_SFA_SIZE_PER_STAGE + i * SMEM_SFB_SIZE_PER_STAGE); + } + + // Fill barriers + auto barrier_start_ptr = reinterpret_cast(smem_buffer + + SMEM_CD_SIZE + + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE) + + kNumStages * (SMEM_SFA_SIZE_PER_STAGE + SMEM_SFB_SIZE_PER_STAGE)); + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + full_barriers[i] = barrier_start_ptr + i; + empty_barriers[i] = barrier_start_ptr + kNumStages + i; + with_sf_full_barriers[i] = barrier_start_ptr + kNumStages * 2 + i; + } + #pragma unroll + for (uint32_t i = 0; i < kNumEpilogueStages; ++ i) { + tmem_full_barriers[i] = barrier_start_ptr + kNumStages * 3 + i; + tmem_empty_barriers[i] = barrier_start_ptr + kNumStages * 3 + kNumEpilogueStages + i; + } + accumulation_full_barrier = barrier_start_ptr + kNumStages * 3 + kNumEpilogueStages * 2; + + // Fill the tensor memory pointer + auto tmem_ptr_in_smem = reinterpret_cast(barrier_start_ptr + kNumStages * 3 + kNumEpilogueStages * 2 + 1); + DG_STATIC_ASSERT(32 <= kNumTmemCols and kNumTmemCols <= 512, "Invalid tensor memory columns"); + + // Initialize barriers + if (threadIdx.x == 0) { + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + // Arrive at all CTAs + full_barriers[i]->init(1); + empty_barriers[i]->init(1); + // Arrive only at the leader CTA + with_sf_full_barriers[i]->init(kNumMulticast * 32); + } + #pragma unroll + for (uint32_t i = 0; i < 
kNumEpilogueStages; ++ i) { + // Arrive at all CTAs + tmem_full_barriers[i]->init(1); + // Arrive only at the leader CTA + tmem_empty_barriers[i]->init(kNumMulticast * kNumEpilogueThreads); + } + if constexpr (kWithAccumulation) + accumulation_full_barrier->init(1); + + // Make initialized barrier visible in async proxy + cutlass::arch::fence_view_async_shared(); + cutlass::arch::fence_barrier_init(); + } else if (threadIdx.x >= 32 and threadIdx.x < 64) { + // Allocate tensor memory + cute::TMEM::Allocator1Sm().allocate(kNumTmemCols, tmem_ptr_in_smem); + } + kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads(); + + // For pipeline unrolling + struct DivisibleK {}; + struct NotDivisibleK {}; + const uint32_t num_iterations = ceil_div(shape_k, kNumStages * BLOCK_K); + auto launch_k_iterations = [=](const auto& func) { + if constexpr (kNumLastStages == 0) { + for (uint32_t k_iter = 0; k_iter < num_iterations; ++ k_iter) + func(k_iter, DivisibleK{}, k_iter == num_iterations - 1); + } else { + for (uint32_t k_iter = 0; k_iter < num_iterations - 1; ++ k_iter) + func(k_iter, DivisibleK{}, false); + func(num_iterations - 1, NotDivisibleK{}, true); + } + }; + + auto dispatch_accum_stage_idx = [&](uint32_t accum_stage_idx, const auto& func) { + DG_STATIC_ASSERT(1 <= kNumEpilogueStages and kNumEpilogueStages <= 2, + "Too many epilogue stages, please modify the Python heuristic as well"); + accum_stage_idx == 0 ? func(0) : func(1); + }; + + // Block scheduler + uint32_t m_block_idx, n_block_idx; + auto scheduler = Scheduler(shape_m, shape_n, grouped_layout); + + // Dispatch warps into different roles + if (warp_idx == 0) { + // TMA load warp + // Persistently schedule over blocks + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter) { + constexpr bool kHasDivisibleStages = std::is_same_v; + constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? 
kNumStages : kNumLastStages; + DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + + #pragma unroll + for (uint32_t s = 0; s < kNumInnerStages; ++ s) { + // Wait consumer release + empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); + + // Compute offsets + // NOTES: the group is always concatenated with the outer dimension + uint32_t m_idx = scheduler.get_global_idx<(kGemmType != GemmType::GroupedContiguous)>( + shape_m, BLOCK_M, m_block_idx); + uint32_t n_idx = scheduler.get_global_idx<(kMajorB == cute::UMMA::Major::K)>( + shape_n, BLOCK_N, n_block_idx, m_block_idx); + + // NOTES: `k_idx` is actually the k index default for K-major, while `k_b_idx` may be MN-major + // And for all grouped GEMMs, A must be K-majored + DG_STATIC_ASSERT(kGemmType == GemmType::Normal or kMajorA == cute::UMMA::Major::K, "Invalid major"); + uint32_t k_block_idx = k_iter * kNumStages + s; + uint32_t k_idx = k_block_idx * BLOCK_K; + uint32_t k_b_idx = scheduler.get_global_idx<(kMajorB == cute::UMMA::Major::MN)>( + shape_k, BLOCK_K, k_block_idx, m_block_idx); + + // Add 2 CTA offsets + if constexpr (kNumMulticast > 1) { + m_idx += kIsMulticastOnA ? (cute::block_rank_in_cluster() * LOAD_BLOCK_M) : 0; + n_idx += kIsMulticastOnA ? 
0 : (cute::block_rank_in_cluster() * LOAD_BLOCK_N); + } + + // Issue TMAs + if (cute::elect_one_sync()) { + if constexpr (kMajorA == cute::UMMA::Major::K) + tma_copy(&tensor_map_a, full_barriers[s], smem_a[s], k_idx, m_idx); + if constexpr (kMajorA == cute::UMMA::Major::MN) + tma_copy(&tensor_map_a, full_barriers[s], smem_a[s], m_idx, k_idx); + if constexpr (kMajorB == cute::UMMA::Major::K) + tma_copy(&tensor_map_b, full_barriers[s], smem_b[s], k_b_idx, n_idx); + if constexpr (kMajorB == cute::UMMA::Major::MN) + tma_copy(&tensor_map_b, full_barriers[s], smem_b[s], n_idx, k_b_idx); + } + auto num_arrival_bytes = SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE; + + // Issue SFA and SFB TMAs at certain stages + // No swizzling, so one TMA for one SF is enough + const uint32_t sf_stage_in_group_idx = (k_iter * kNumStages + s) % kNumSFStagesPerLoad; + if (sf_stage_in_group_idx == 0 and cute::elect_one_sync()) { + tma_copy(&tensor_map_sfa, full_barriers[s], smem_sfa[s], m_block_idx * BLOCK_M, + scheduler.get_global_idx<(kGemmType != GemmType::GroupedContiguous)>(shape_sf_k, 1, ceil_div(k_idx, BLOCK_K * kNumSFStagesPerLoad))); + tma_copy(&tensor_map_sfb, full_barriers[s], smem_sfb[s], n_block_idx * BLOCK_N, + scheduler.get_global_idx(shape_sf_k, 1, ceil_div(k_idx, BLOCK_K * kNumSFStagesPerLoad), m_block_idx)); + num_arrival_bytes += (BLOCK_M + BLOCK_N) * sizeof(uint32_t); + } + + // Arrive at full barriers + if (cute::elect_one_sync()) + full_barriers[s]->arrive_and_expect_tx(num_arrival_bytes); + } + + // Wait unaligned cases + #pragma unroll + for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { + empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); + if (cute::elect_one_sync()) + full_barriers[s]->arrive(); + } + }); + } + } else if (warp_idx == 1 and is_leader_cta) { + // MMA issue warp + // NOTES: only the leader CTA will do this + // Make instruction descriptor + // TODO: refactor `UMMA_M` calculation + constexpr uint32_t UMMA_M = 
LAYOUT_AD_M * (kIsMulticastOnA ? 1 : kNumMulticast); + constexpr uint32_t UMMA_N = BLOCK_N * (kIsMulticastOnA ? kNumMulticast : 1); + constexpr uint32_t UMMA_K = 32 / sizeof(cutlass::float_e4m3_t); + auto instr_desc = cute::UMMA::make_instr_desc_block_scaled(); + auto sf_desc = make_sf_desc(nullptr); + + // Checks for MMA instructions + // NOTES: CUTLASS does not have such checks except the MMA traits, but we are not using these traits + DG_STATIC_ASSERT((UMMA_M == 64 and UMMA_N % 8 == 0 and 8 <= UMMA_N and UMMA_N <= 256) or + (UMMA_M == 128 and UMMA_N % 16 == 0 and 16 <= UMMA_N and UMMA_N <= 256) or + (UMMA_M == 256 and UMMA_N % 16 == 0 and 16 <= UMMA_N and UMMA_N <= 256), + "Invalid MMA instruction shape"); + + // Persistently schedule over blocks + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + dispatch_accum_stage_idx(scheduler.current_iter % kNumEpilogueStages, [&](uint32_t accum_stage_idx) { + // Wait tensor memory empty barrier arrival + auto accum_phase_idx = (scheduler.current_iter / kNumEpilogueStages) & 1; + tmem_empty_barriers[accum_stage_idx]->wait(accum_phase_idx ^ 1); + tcgen05_after_thread_sync(); + + // Empty barrier arrival + auto empty_barrier_arrive = [&](uint32_t s, bool do_tmem_full_arrive) { + auto umma_arrive = [](const uint64_t* barrier) { + if constexpr (kNumMulticast == 1) { + cutlass::arch::umma_arrive(barrier); + } else { + constexpr uint16_t kCTAMask = (1 << kNumMulticast) - 1; + cutlass::arch::umma_arrive_multicast_2x1SM(barrier, kCTAMask); + } + }; + umma_arrive(reinterpret_cast(empty_barriers[s])); + + // NOTES: the tensor memory accumulator pipeline has nothing to do with multicasting + if (do_tmem_full_arrive) + umma_arrive(reinterpret_cast(tmem_full_barriers[accum_stage_idx])); + }; + + // Launch MMAs + launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter) { + constexpr bool kHasDivisibleStages = std::is_same_v; + constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? 
kNumStages : kNumLastStages; + DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + + #pragma unroll + for (uint32_t s = 0; s < kNumInnerStages; ++ s) { + // Wait TMA and SF-transpose arrival + with_sf_full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + tcgen05_after_thread_sync(); + + // Do SF copy at certain stages + // NOTES: CUTLASS UTCCP's interface does not have `elect_one_sync`, we must do it by ourselves + const uint32_t sf_stage_in_group_idx = (k_iter * kNumStages + s) % kNumSFStagesPerLoad; + if (sf_stage_in_group_idx == 0 and cute::elect_one_sync()) { + using cute_utccp_t = std::conditional_t; + + // SFA and SFB copy + // TODO: preprocess shared memory descriptor + #pragma unroll + for (uint32_t i = 0; i < SF_BLOCK_M / kNumUTCCPAlignedElems; ++ i) { + auto smem_ptr = smem_sfa[s] + i * kNumUTCCPAlignedElems; + replace_smem_desc_addr(sf_desc, smem_ptr); + cute_utccp_t::copy(sf_desc, kTmemStartColOfSFA + i * 4); + } + #pragma unroll + for (uint32_t i = 0; i < SF_BLOCK_N / kNumUTCCPAlignedElems; ++ i) { + auto smem_ptr = smem_sfb[s] + i * kNumUTCCPAlignedElems; + replace_smem_desc_addr(sf_desc, smem_ptr); + cute_utccp_t::copy(sf_desc, kTmemStartColOfSFB + i * 4); + } + } + __syncwarp(); + + // Issue UMMA in the leader CTA + using cute_mma_t = std::conditional_t, + cute::SM100_MMA_MXF8F6F4_2x1SM_SS>; + #pragma unroll + for (uint32_t k = 0; k < BLOCK_K / UMMA_K; ++ k) { + auto b_desc = make_umma_desc(smem_b[s], 0, k * UMMA_K); + // TODO: optimize runtime instruction creation + #pragma unroll + for (uint32_t w = 0; w < kNumMWaves; ++ w) { + auto a_desc = make_umma_desc(smem_a[s], w * LAYOUT_AD_M, k * UMMA_K); + auto runtime_instr_desc = make_runtime_instr_desc_with_sf_id(instr_desc, sf_stage_in_group_idx); + cute_mma_t::fma(a_desc, b_desc, + accum_stage_idx * kNumMWaves * BLOCK_N + w * BLOCK_N, + k_iter > 0 or s > 0 or k > 0, + runtime_instr_desc, + kTmemStartColOfSFA + w * (kNumUTCCPAlignedElems / 32), + 
kTmemStartColOfSFB); + } + } + + // Commit to the mbarrier object + tcgen05_before_thread_sync(); + empty_barrier_arrive(s, is_last_iter and s == kNumInnerStages - 1); + } + + // Wait unaligned cases + #pragma unroll + for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { + with_sf_full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + empty_barrier_arrive(s, false); + } + }); + }); + } + } else if (warp_idx == 2) { + // UTCCP transposer + auto utccp_required_smem_warp_transpose = [&](const uint32_t* smem_ptr) { + DG_STATIC_ASSERT(kNumUTCCPAlignedElems == 128, "Invalid aligned elements"); + uint32_t values[4]; + #pragma unroll + for (uint32_t i = 0; i < 4; ++ i) + values[i] = ld_shared(smem_ptr + (i ^ (lane_idx >> 3)) * 32 + lane_idx); + __syncwarp(); + #pragma unroll + for (uint32_t i = 0; i < 4; ++ i) + st_shared(smem_ptr + lane_idx * 4 + (i ^ (lane_idx >> 3)), values[i]); + }; + + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter) { + constexpr bool kHasDivisibleStages = std::is_same_v; + constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? 
kNumStages : kNumLastStages; + DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + + #pragma unroll + for (uint32_t s = 0; s < kNumInnerStages; ++ s) { + // Wait TMA arrival + full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + + // Transpose for UTCCP at certain stages + const uint32_t sf_stage_in_group_idx = (k_iter * kNumStages + s) % kNumSFStagesPerLoad; + if (sf_stage_in_group_idx == 0) { + #pragma unroll + for (uint32_t i = 0; i < SF_BLOCK_M / kNumUTCCPAlignedElems; ++ i) + utccp_required_smem_warp_transpose(smem_sfa[s] + i * kNumUTCCPAlignedElems); + #pragma unroll + for (uint32_t i = 0; i < SF_BLOCK_N / kNumUTCCPAlignedElems; ++ i) + utccp_required_smem_warp_transpose(smem_sfb[s] + i * kNumUTCCPAlignedElems); + // TODO: figure out whether the proxy fence is valid for 2-CTA cases + cutlass::arch::fence_view_async_shared(); + } + + // Arrive + with_sf_full_barriers[s]->arrive(0u); + } + + // Wait unaligned cases + #pragma unroll + for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { + full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + with_sf_full_barriers[s]->arrive(0u); + } + }); + } + } else if (warp_idx >= kNumNonEpilogueThreads / 32) { + // Epilogue warp groups + const auto epilogue_thread_idx = threadIdx.x - kNumNonEpilogueThreads; + const auto epilogue_warp_idx = warp_idx - (kNumNonEpilogueThreads / 32); + + // NOTES: tensor memory addresses are simplified, as the hardware will ignore the warp index bits, + // i.e., no need for `tmem_ptr |= (epilogue_warp_idx * 32) << 16`. 
+ // NOTES: we also forbid two CTAs to share the same SM and its tensor memory + DG_TRAP_ONLY_DEVICE_ASSERT(ld_shared(tmem_ptr_in_smem) == 0); + + // TMA checks + constexpr uint32_t kNumBankGroupBytes = 16; + constexpr uint32_t kNumElemsPerBankGroup = kNumBankGroupBytes / sizeof(cd_dtype_t); + DG_STATIC_ASSERT(kSwizzleCDMode > 0, "TMA D must be swizzled"); + DG_STATIC_ASSERT(STORE_BLOCK_N % kNumElemsPerBankGroup == 0, "Invalid swizzling"); + + // Persistently schedule over blocks + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + dispatch_accum_stage_idx(scheduler.current_iter % kNumEpilogueStages, [&](uint32_t accum_stage_idx) { + auto accum_phase_idx = (scheduler.current_iter / kNumEpilogueStages) & 1; + + // Flush TMA stores + // NOTES: for the first store, we have to flush all previous TMA, + // as we don't share pipeline stages between two blocks + if (epilogue_thread_idx == 0) + cute::tma_store_wait<0>(); + cutlass::arch::NamedBarrier(kNumEpilogueThreads).sync(); + + // Wait UMMA arrival + tmem_full_barriers[accum_stage_idx]->wait(accum_phase_idx); + tcgen05_after_thread_sync(); + + // Load from tensor memory into registers, and write shared memory with STSM + DG_STATIC_ASSERT(kNumEpilogueThreads == 128, "Epilogue threads not enough"); + DG_STATIC_ASSERT(BLOCK_N % STORE_BLOCK_N == 0, "Invalid block sizes"); + + // Iterate over M waves + #pragma unroll + for (uint32_t w = 0; w < kNumMWaves; ++ w) { + // Issue every swizzled atom and pipeline STSM and TMA store + constexpr uint32_t kNumStores = BLOCK_N / STORE_BLOCK_N; + #pragma unroll + for (uint32_t s = 0; s < kNumStores; ++ s) { + // Wait shared memory to be released + const uint32_t iter_idx = w * kNumStores + s; + if (iter_idx >= kNumTMAStoreStages) { + if (epilogue_thread_idx == 0) + cute::tma_store_wait(); + cutlass::arch::NamedBarrier(kNumEpilogueThreads).sync(); + } + + // The pipeline stage + const auto tma_stage_idx = iter_idx % kNumTMAStoreStages; + const auto m_idx = 
scheduler.get_global_idx<(kGemmType != GemmType::GroupedContiguous)>(shape_m, BLOCK_M, m_block_idx) + w * LAYOUT_AD_M; + const auto n_idx = n_block_idx * BLOCK_N + s * STORE_BLOCK_N; + + // Issue accumulation TMA + if (kWithAccumulation and epilogue_thread_idx == 0) { + tma_copy( + &tensor_map_c, accumulation_full_barrier, smem_cd[tma_stage_idx], n_idx, m_idx); + accumulation_full_barrier->arrive_and_expect_tx(STORE_BLOCK_M * kSwizzleCDMode); + } + + // Store into shared memory + #pragma unroll + for (uint32_t i = 0; i < STORE_BLOCK_N / kNumElemsPerBankGroup; ++ i) { + // Calculate the index of the bank group to be written in the atom + auto bank_group_index = i + lane_idx * (kSwizzleCDMode / kNumBankGroupBytes); + + // Reshape the atom in another view and swizzle + // - original: `(LAYOUT_AD_M, kSwizzleCDMode / kNumBankGroupBytes)` + // - new: `(LAYOUT_AD_M * kSwizzleCDMode / kNumBankGroupBytes / 8, 8)` + // NOTES: "8" is the number of bank groups, "16" is the swizzling pattern + constexpr bool kHasShortcut = (kSwizzleCDMode / kNumBankGroupBytes) == 8; + auto row = kHasShortcut ? (i / 8 + lane_idx) : (bank_group_index / 8); + auto col = kHasShortcut ? 
(i) : (bank_group_index % 8); + col ^= row % (kSwizzleCDMode / 16); + + // Source and destination memory address + uint32_t tmem_addr = accum_stage_idx * kNumMWaves * BLOCK_N + // Accumulator offset + w * BLOCK_N + // Wave offset + s * STORE_BLOCK_N + i * kNumElemsPerBankGroup; // In-block offset + auto smem_ptr = reinterpret_cast(smem_cd[tma_stage_idx]) + // Base pointer + epilogue_warp_idx * 32 * kSwizzleCDMode + // Warp offset + row * (kNumBankGroupBytes * 8) + col * kNumBankGroupBytes; // In-atom offset + + // Load from tensor memory, store into shared memory + uint32_t values[kNumElemsPerBankGroup]; + if constexpr (std::is_same_v) { + // For FP32 output, read and store + DG_STATIC_ASSERT(kNumElemsPerBankGroup == 4, "Invalid type"); + cute::SM100_TMEM_LOAD_32dp32b4x::copy(tmem_addr, + values[0], values[1], values[2], values[3]); + cutlass::arch::fence_view_async_tmem_load(); + if constexpr (kWithAccumulation) { + // Wait TMA arrival before the first accumulation + if (i == 0) + accumulation_full_barrier->wait((scheduler.current_iter * (kNumMWaves * kNumStores) + iter_idx) & 1); + + // Load the same position and add + auto c_values = ld_shared(reinterpret_cast(smem_ptr)); + *reinterpret_cast(&values[0]) += c_values.x; + *reinterpret_cast(&values[1]) += c_values.y; + *reinterpret_cast(&values[2]) += c_values.z; + *reinterpret_cast(&values[3]) += c_values.w; + } + st_shared(smem_ptr, values[0], values[1], values[2], values[3]); + } else { + // For BF16 output, read, cast and store + DG_STATIC_ASSERT(kNumElemsPerBankGroup == 8 and std::is_same_v, "Invalid type"); + cute::SM100_TMEM_LOAD_32dp32b8x::copy(tmem_addr, + values[0], values[1], values[2], values[3], + values[4], values[5], values[6], values[7]); + cutlass::arch::fence_view_async_tmem_load(); + st_shared(smem_ptr, + cast_into_bf16_and_pack(values[0], values[1]), + cast_into_bf16_and_pack(values[2], values[3]), + cast_into_bf16_and_pack(values[4], values[5]), + cast_into_bf16_and_pack(values[6], values[7])); 
+ } + } + + // Notify tensor memory empty (only at the leader CTA) arrival ASAP + // NOTES: only the last stage needs to do this + if (w == kNumMWaves - 1 and s == BLOCK_N / STORE_BLOCK_N - 1) { + tcgen05_before_thread_sync(); + tmem_empty_barriers[accum_stage_idx]->arrive(0u); + } + __syncwarp(); + + // Synchronize all threads and issue TMA + cute::tma_store_fence(); + cutlass::arch::NamedBarrier(kNumEpilogueThreads).sync(); + if (epilogue_thread_idx == 0) { + cute::SM90_TMA_STORE_2D::copy(&tensor_map_d, smem_cd[tma_stage_idx], n_idx, m_idx); + cute::tma_store_arrive(); + } + } + } + }); + } + + // Flush all stages in the pipeline to make TMA stores visible to the next kernel + // TODO: do we actually need this? + if (epilogue_thread_idx == 0) + cute::tma_store_wait<0>(); + + // Deallocate tensor memory by warp 1 + // NOTES: warp 0 is waiting TMA store + // TODO: do we need 2 SM allocation? + if (epilogue_warp_idx == 1) + cute::TMEM::Allocator1Sm().free(0, kNumTmemCols); + } + + // To safely deconstruct all barriers, we need a cluster sync + // TODO: optimize it by another round of barrier waits + if constexpr (kNumMulticast > 1) + cute::cluster_sync(); +#else + if (blockIdx.x == 0 and threadIdx.x == 0) + DG_DEVICE_ASSERT(false and "This kernel only support sm_100a/sm_101a"); +#endif +} + +}; // namespace deep_gemm + +#pragma clang diagnostic pop diff --git a/deep_gemm/include/deep_gemm/impls/sm90_bf16_gemm.cuh b/deep_gemm/include/deep_gemm/impls/sm90_bf16_gemm.cuh new file mode 100644 index 00000000..0ccec3eb --- /dev/null +++ b/deep_gemm/include/deep_gemm/impls/sm90_bf16_gemm.cuh @@ -0,0 +1,3 @@ +#pragma once + +// TODO: add implement diff --git a/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d1d.cuh b/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d1d.cuh new file mode 100644 index 00000000..28b5399a --- /dev/null +++ b/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d1d.cuh @@ -0,0 +1,3 @@ +#pragma once + +// TODO: add implement \ No newline at end of 
file diff --git a/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh b/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh new file mode 100644 index 00000000..28b5399a --- /dev/null +++ b/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh @@ -0,0 +1,3 @@ +#pragma once + +// TODO: add implement \ No newline at end of file diff --git a/deep_gemm/include/deep_gemm/mma_utils.cuh b/deep_gemm/include/deep_gemm/mma_utils.cuh deleted file mode 100644 index 85b2ccc0..00000000 --- a/deep_gemm/include/deep_gemm/mma_utils.cuh +++ /dev/null @@ -1,212 +0,0 @@ -#pragma once - -#ifndef __CUDACC_RTC__ -#include -#endif - -#include -#include - -#include "utils.cuh" - -namespace deep_gemm { - -template -struct SM90_U32x2_STSM_N { - __device__ __forceinline__ static void - copy(dtype_t src_0, dtype_t src_1, void* smem_dst) { - const uint32_t src[2] = {*reinterpret_cast(&src_0), *reinterpret_cast(&src_1)}; - asm volatile("stmatrix.sync.aligned.x2.m8n8.shared.b16 [%0], {%1, %2};\n" - :: "l"(smem_dst), "r"(src[0]), "r"(src[1])); - } -}; - -template -struct SM90_U32x4_STSM_N { - __device__ __forceinline__ static void - copy(dtype_t src_0, dtype_t src_1, dtype_t src_2, dtype_t src_3, void* smem_dst) { - const uint32_t src[4] = {*reinterpret_cast(&src_0), *reinterpret_cast(&src_1), - *reinterpret_cast(&src_2), *reinterpret_cast(&src_3)}; - asm volatile("stmatrix.sync.aligned.x4.m8n8.shared.b16 [%0], {%1, %2, %3, %4};\n" - :: "l"(smem_dst), "r"(src[0]), "r"(src[1]), "r"(src[2]), "r"(src[3])); - } -}; - -__forceinline__ __device__ void warpgroup_arrive() { - asm volatile("wgmma.fence.sync.aligned;\n" ::: "memory"); -} - -__forceinline__ __device__ void warpgroup_commit_batch() { - asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory"); -} - -__forceinline__ __device__ void warpgroup_fence_operand(float& reg) { - asm volatile("" : "+f"(reg) :: "memory"); -} - -__forceinline__ __device__ uint32_t get_lane_id() { - uint32_t lane_id; - asm("mov.u32 %0, %laneid;" : 
"=r"(lane_id)); - return lane_id; -} - -__device__ __forceinline__ uint32_t ld_shared(const uint32_t* __restrict__ ptr) { - uint32_t ret; - asm volatile("ld.shared.u32 %0, [%1];" : "=r"(ret) : "l"(ptr)); - return ret; -} - -__device__ __forceinline__ int4 ld_shared(const int4* __restrict__ ptr) { - int4 ret; - asm volatile("ld.shared.v4.s32 {%0, %1, %2, %3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : "l"(ptr)); - return ret; -} - -__device__ __forceinline__ float ld_shared(const float* __restrict__ ptr) { - float ret; - asm volatile("ld.shared.f32 %0, [%1];" : "=f"(ret) : "l"(ptr)); - return ret; -} - -__device__ __forceinline__ float2 ld_shared(const float2* __restrict__ ptr) { - float2 ret; - asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : "l"(ptr)); - return ret; -} - -__device__ __forceinline__ void st_shared(const float* ptr, float val) { - asm volatile("st.shared.f32 [%0], %1;" :: "l"(ptr), "f"(val)); -} - -__device__ __forceinline__ void st_shared(const uint32_t* ptr, uint32_t val) { - asm volatile("st.shared.u32 [%0], %1;" :: "l"(ptr), "r"(val)); -} - -__device__ __forceinline__ void st_shared(const float2* ptr, float2 val) { - asm volatile("st.shared.v2.f32 [%0], {%1, %2};" :: "l"(ptr), "f"(val.x), "f"(val.y)); -} - -template -__device__ void warpgroup_wait() { - DG_STATIC_ASSERT(N >= 0 and N <= 7, "WGMMA wait: N must be in range [0, 7]"); - asm volatile("wgmma.wait_group.sync.aligned %0;\n" :: "n"(N) : "memory"); -} - -union GmmaDescriptor { - __host__ __device__ constexpr GmmaDescriptor() noexcept: desc_(0) {} - - __host__ __device__ constexpr GmmaDescriptor(uint64_t desc) noexcept: desc_(desc) {} - - __host__ __device__ constexpr GmmaDescriptor(GmmaDescriptor const &t) noexcept: desc_(t.desc_) {} - - __host__ __device__ constexpr GmmaDescriptor(GmmaDescriptor &&t) noexcept: desc_(t.desc_) {} - - __host__ __device__ constexpr GmmaDescriptor &operator=(GmmaDescriptor const &t) noexcept { - desc_ = t.desc_; - 
return *this; - } - - __host__ __device__ constexpr GmmaDescriptor &operator=(GmmaDescriptor &&t) noexcept { - desc_ = t.desc_; - return *this; - } - - uint64_t desc_; - uint32_t reg32_[2]; - uint16_t reg16_[4]; - - struct { - uint16_t start_address_: 14, : 2; - uint16_t leading_byte_offset_: 14, : 2; - uint16_t stride_byte_offset_: 14, : 2; - uint8_t : 1, base_offset_: 3, : 4; - uint8_t : 6, layout_type_: 2; - } bitfield; - - // Decay to an `uint64_t` - __host__ __device__ constexpr operator uint64_t() const noexcept { return desc_; } -}; - -template -__device__ GmmaDescriptor make_smem_desc(PointerType smem_ptr, int layout_type, - int leading_byte_offset = 0, - int stride_byte_offset = 1024) { - GmmaDescriptor desc; - auto uint_ptr = static_cast(__cvta_generic_to_shared(smem_ptr)); - desc.bitfield.start_address_ = uint_ptr >> 4; - desc.bitfield.layout_type_ = layout_type; - desc.bitfield.leading_byte_offset_ = leading_byte_offset >> 4; - desc.bitfield.stride_byte_offset_ = stride_byte_offset >> 4; - desc.bitfield.base_offset_ = 0; - return desc; -} - -template -struct FP8MMA { - - template - __forceinline__ __device__ static void call_fma_impl(uint64_t const& desc_a, uint64_t const& desc_b, float* d, bool scale_d, std::index_sequence) { - using namespace cute::SM90::GMMA; - MMA::fma(desc_a, desc_b, d[Idx]..., (scale_d ? 
ScaleOut::One : ScaleOut::Zero)); - } - - __forceinline__ __device__ static void wgmma(uint64_t const& desc_a, uint64_t const& desc_b, float* d, bool scale_d) { - call_fma_impl(desc_a, desc_b, d, scale_d, std::make_index_sequence{}); - } - - static constexpr int M = 64; - static constexpr int N = N_; - static constexpr int K = 32; - static constexpr int kNumAccum = M * N / 128; -}; - -template -struct FP8MMASelector { - - static constexpr auto select_mma() { - using namespace cute::SM90::GMMA; - if constexpr (N == 16) return MMA_64x16x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 24) return MMA_64x24x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 32) return MMA_64x32x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 40) return MMA_64x40x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 48) return MMA_64x48x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 56) return MMA_64x56x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 64) return MMA_64x64x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 72) return MMA_64x72x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 80) return MMA_64x80x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 88) return MMA_64x88x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 96) return MMA_64x96x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 104) return MMA_64x104x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 112) return MMA_64x112x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 120) return MMA_64x120x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 128) return MMA_64x128x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 136) return MMA_64x136x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 144) return MMA_64x144x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 152) return MMA_64x152x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 160) return MMA_64x160x32_F32E4M3E4M3_SS_TN(); - if constexpr (N == 192) return MMA_64x192x32_F32E4M3E4M3_SS_TN(); - } - - static constexpr auto select_type() { - return FP8MMA(); - } - - using type = decltype(select_type()); -}; - -enum class Layout { - RowMajor, - ColMajor 
-}; - -__device__ __host__ constexpr int get_num_math_warpgroups(int block_m) { - return block_m == 64 ? 1 : 2; -} - -template -__device__ __host__ constexpr int get_num_threads_per_sm(int block_m) { - DG_STATIC_ASSERT(kNumMathThreadsPerGroup == 128, "Only support 128 threads per math group"); - return get_num_math_warpgroups(block_m) * kNumMathThreadsPerGroup + kNumTMAThreads; -} - -} // namespace deep_gemm diff --git a/deep_gemm/include/deep_gemm/scheduler.cuh b/deep_gemm/include/deep_gemm/scheduler.cuh deleted file mode 100644 index 81bfeba0..00000000 --- a/deep_gemm/include/deep_gemm/scheduler.cuh +++ /dev/null @@ -1,163 +0,0 @@ -#pragma once - -#include "utils.cuh" - -namespace deep_gemm { - -enum class GemmType { - Normal, - GroupedContiguous, - GroupedMasked -}; - -#pragma clang diagnostic push -#pragma ide diagnostic ignored "cppcoreguidelines-pro-type-member-init" -template -struct Scheduler { - int current_iter = -1; - uint32_t num_aligned_m_blocks; - - // For normal GEMM - // Maybe not used in the masked grouped GEMM - uint32_t num_blocks; - uint32_t num_blocks_in_group; - bool is_peer_cta_alive = true; - - // For grouped GEMM - int* grouped_layout; - - // Only used for masked layout - uint32_t curr_group_idx, curr_cumsum; - - __device__ __forceinline__ explicit Scheduler(const uint32_t& shape_m, - int* grouped_layout = nullptr) { - num_aligned_m_blocks = ceil_div(shape_m, BLOCK_M); - if constexpr (kGemmType == GemmType::Normal) { - num_blocks = num_aligned_m_blocks * kNumNBlocks; - } else if (kGemmType == GemmType::GroupedContiguous) { - num_blocks = num_aligned_m_blocks * kNumNBlocks; - this->grouped_layout = grouped_layout; - } else if (kGemmType == GemmType::GroupedMasked) { - curr_group_idx = curr_cumsum = 0; - this->grouped_layout = grouped_layout; - } - } - - // ReSharper disable once CppNotAllPathsReturnValue - __device__ __forceinline__ bool is_computation_valid(const uint32_t& m_block_idx, const uint32_t& m_offset) const { - if constexpr 
(kGemmType == GemmType::Normal) { - return true; - } else if constexpr (kGemmType == GemmType::GroupedContiguous) { - return __ldg(grouped_layout + m_offset + m_block_idx * BLOCK_M) >= 0; - } else if constexpr (kGemmType == GemmType::GroupedMasked) { - return m_offset + m_block_idx * BLOCK_M < __ldg(grouped_layout + curr_group_idx); - } - } - - __device__ __forceinline__ bool is_tma_multicast_valid(const uint32_t& m_block_idx) const { - if (num_blocks_in_group == 1) - return false; - if constexpr (kGemmType == GemmType::Normal or kGemmType == GemmType::GroupedMasked) { - return true; - } else { - DG_STATIC_ASSERT(kGemmType == GemmType::GroupedContiguous, "Invalid Gemm type"); - if constexpr (kIsTMAMulticastOnA) { - return true; - } else { - auto group_idx = __ldg(grouped_layout + m_block_idx * BLOCK_M); - auto peer_group_idx = __ldg(grouped_layout + (m_block_idx ^ 1) * BLOCK_M); - return group_idx == peer_group_idx; - } - } - } - - __device__ __forceinline__ void get_swizzled_block_idx(const uint32_t& num_m_blocks, const uint32_t& block_idx, - uint32_t& m_block_idx, uint32_t& n_block_idx) { - DG_STATIC_ASSERT(kNum1DBlocksPerGroup % kNumTMAMulticast == 0, "Invalid group size"); - - // Swizzle for better L2 usages - auto primary_num_blocks = kIsTMAMulticastOnA ? kNumNBlocks : num_m_blocks; - auto secondary_num_blocks = kIsTMAMulticastOnA ? 
num_m_blocks : kNumNBlocks; - auto num_blocks_per_group = secondary_num_blocks * kNum1DBlocksPerGroup; - auto group_idx = block_idx / num_blocks_per_group; - auto first_block_idx = group_idx * kNum1DBlocksPerGroup; - auto in_group_idx = block_idx % num_blocks_per_group; - num_blocks_in_group = min(kNum1DBlocksPerGroup, primary_num_blocks - first_block_idx); - - // Fix unaligned TMA multicast - if (kNumTMAMulticast > 1 and num_blocks_in_group % 2 != 0) { - if (in_group_idx < (num_blocks_in_group ^ 1) * secondary_num_blocks) { - num_blocks_in_group = num_blocks_in_group ^ 1; - } else { - in_group_idx = in_group_idx - (num_blocks_in_group ^ 1) * secondary_num_blocks; - first_block_idx += num_blocks_in_group ^ 1; - num_blocks_in_group = 1; - } - } - - // Convert to final M/N block indices - if constexpr (kIsTMAMulticastOnA) { - m_block_idx = in_group_idx / num_blocks_in_group; - n_block_idx = first_block_idx + in_group_idx % num_blocks_in_group; - } else { - m_block_idx = first_block_idx + in_group_idx % num_blocks_in_group; - n_block_idx = in_group_idx / num_blocks_in_group; - } - } - - template - __device__ __forceinline__ uint32_t get_global_idx(const uint32_t& shape_dim, const uint32_t& block_size, - const uint32_t& block_idx, const uint32_t& m_block_idx=0) { - if constexpr (kGemmType == GemmType::Normal) { - return block_idx * block_size; - } else if constexpr (kGemmType == GemmType::GroupedContiguous) { - auto offset = kIgnoreGroupedForGroupedContiguous ? 
0 : __ldg(grouped_layout + m_block_idx * BLOCK_M); - return offset * shape_dim + block_idx * block_size; - } else if constexpr (kGemmType == GemmType::GroupedMasked) { - return curr_group_idx * shape_dim + block_idx * block_size; - } - } - - __device__ __forceinline__ bool get_next_block(uint32_t& m_block_idx, uint32_t& n_block_idx) { - const auto next_block_idx = (++ current_iter) * gridDim.x + blockIdx.x; - - if constexpr (kGemmType == GemmType::GroupedMasked) { - uint32_t num_m_blocks; - while (true) { - // End of the task - if (curr_group_idx == kNumGroups) - return false; - - // Within the current group - num_m_blocks = ceil_div(static_cast(__ldg(grouped_layout + curr_group_idx)), BLOCK_M); - auto current_m_block_cumsum = curr_cumsum + num_m_blocks; - if (next_block_idx < current_m_block_cumsum * kNumNBlocks) - break; - - // Move to check the next group - curr_group_idx ++, curr_cumsum = current_m_block_cumsum; - } - - get_swizzled_block_idx(num_m_blocks, next_block_idx - curr_cumsum * kNumNBlocks, m_block_idx, n_block_idx); - } else { - if (next_block_idx >= num_blocks) - return false; - - // NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned - is_peer_cta_alive = kNumNBlocks % kNumTMAMulticast == 0 or // Always aligned on N (constant bypass) - num_aligned_m_blocks % kNumTMAMulticast == 0 or // Always aligned on M (constant bypass) - (next_block_idx ^ 1) < num_blocks; // Peer CTA in bound - get_swizzled_block_idx(num_aligned_m_blocks, next_block_idx, m_block_idx, n_block_idx); - } - return true; - } -}; - -#pragma clang diagnostic pop - -} // namespace deep_gemm diff --git a/deep_gemm/include/deep_gemm/tma_utils.cuh b/deep_gemm/include/deep_gemm/tma_utils.cuh deleted file mode 100644 index 795dca6a..00000000 --- a/deep_gemm/include/deep_gemm/tma_utils.cuh +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include "utils.cuh" - -namespace deep_gemm { - -// TODO: move this function to other files -__device__ 
__forceinline__ void -tma_copy(void const* desc_ptr, uint64_t* barrier_ptr, void* smem_ptr, - int32_t const& crd_0, int32_t const& crd_1, uint32_t num_tma_multicast) { - constexpr auto cache_hint = static_cast(cute::TMA::CacheHintSm90::EVICT_NORMAL); - if (num_tma_multicast == 1) { - cute::SM90_TMA_LOAD_2D::copy(desc_ptr, barrier_ptr, cache_hint, smem_ptr, crd_0, crd_1); - } else if (cute::block_rank_in_cluster() == 0) { - cute::SM90_TMA_LOAD_MULTICAST_2D::copy(desc_ptr, barrier_ptr, (1 << num_tma_multicast) - 1, cache_hint, smem_ptr, crd_0, crd_1); - } -} - -} // namespace deep_gemm diff --git a/deep_gemm/include/deep_gemm/utils.cuh b/deep_gemm/include/deep_gemm/utils.cuh deleted file mode 100644 index 598a4146..00000000 --- a/deep_gemm/include/deep_gemm/utils.cuh +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - -#ifdef __CLION_IDE__ - -__host__ __device__ __forceinline__ void host_device_printf(const char* format, ...) { - asm volatile("trap;"); -} - -#define printf host_device_printf -#endif - -#ifndef DG_DEVICE_ASSERT -#define DG_DEVICE_ASSERT(cond) \ -do { \ - if (not (cond)) { \ - printf("Assertion failed: %s:%d, condition: %s\n", __FILE__, __LINE__, #cond); \ - asm("trap;"); \ - } \ -} while (0) -#endif - -#ifndef DG_STATIC_ASSERT -#define DG_STATIC_ASSERT(cond, reason) static_assert(cond, reason) -#endif - -template -__device__ __host__ constexpr T ceil_div(T a, T b) { - return (a + b - 1) / b; -} - -template -__device__ __host__ constexpr T constexpr_gcd(T a, T b) { - return b == 0 ? 
a : constexpr_gcd(b, a % b); -} diff --git a/deep_gemm/jit/__init__.py b/deep_gemm/jit/__init__.py index 06a51940..3fcc714d 100644 --- a/deep_gemm/jit/__init__.py +++ b/deep_gemm/jit/__init__.py @@ -1,2 +1,2 @@ from .compiler import get_nvcc_compiler, build, NVCCCompiler, NVRTCCompiler -from .runtime import Runtime +from .runtime import Runtime, pytypes_to_ctypes diff --git a/deep_gemm/jit/compiler.py b/deep_gemm/jit/compiler.py index d3f1f762..3306077d 100644 --- a/deep_gemm/jit/compiler.py +++ b/deep_gemm/jit/compiler.py @@ -1,9 +1,11 @@ import functools +import getpass import hashlib import os import re import subprocess import time +import torch import uuid from typing import Any, Dict, List, Tuple, Type @@ -11,12 +13,19 @@ import cuda.bindings.nvrtc as nvrtc from torch.utils.cpp_extension import CUDA_HOME -from . import interleave_ffma +from .scripts import sm90_interleave_ffma from .runtime import Runtime, RuntimeCache runtime_cache = RuntimeCache() +@functools.lru_cache(maxsize=None) +def get_device_arch(): + major, minor = torch.cuda.get_device_capability() + suffix = 'a' if major >= 9 else '' + return f'{major * 10 + minor}{suffix}' + + def hash_to_hex(s: str) -> str: md5 = hashlib.md5() md5.update(s.encode('utf-8')) @@ -35,13 +44,18 @@ def get_deep_gemm_version() -> str: # Update include directories include_dir = os.path.join(get_jit_include_dir(), 'deep_gemm') assert os.path.exists(include_dir), f'Cannot find GEMM include directory {include_dir}' - for filename in filter(lambda x: x.endswith('.cuh'), sorted(os.listdir(include_dir))): - with open(os.path.join(include_dir, filename), 'rb') as f: + # Recursively walk through all subdirectories + for root, dirs, files in os.walk(include_dir): + for filename in filter(lambda x: x.endswith('.cuh'), sorted(files)): + filepath = os.path.join(root, filename) + with open(filepath, 'rb') as f: + md5.update(f.read()) + + # Update post-compilation scripts + script_dir = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), 'scripts') + for filename in filter(lambda x: x.endswith('.py'), sorted(os.listdir(script_dir))): + with open(os.path.join(script_dir, filename), 'rb') as f: md5.update(f.read()) - - # Update `interleave_ffma.py` - with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'interleave_ffma.py'), 'rb') as f: - md5.update(f.read()) return md5.hexdigest()[0:12] @@ -74,28 +88,38 @@ def get_default_user_dir(): path = os.getenv('DG_JIT_CACHE_DIR') os.makedirs(path, exist_ok=True) return path - return os.path.join(os.path.expanduser('~'), '.deep_gemm') + # By default, the user home directory is `~` + path = os.path.expanduser('~') -@functools.lru_cache(maxsize=None) -def get_tmp_dir(): - return os.path.join(get_default_user_dir(), 'tmp') + # For a cluster environment, we may use a shared directory + # e.g., `/cluster/shared/user_0`, `/cluster/shared/user_1` + if 'DG_JIT_CACHE_HOME_DIR' in os.environ: + path = os.path.join(os.environ['DG_JIT_CACHE_HOME_DIR'], getpass.getuser()) + return os.path.join(path, '.deep_gemm') @functools.lru_cache(maxsize=None) -def get_cache_dir(): +def get_default_cache_dir(): return os.path.join(get_default_user_dir(), 'cache') -def make_tmp_dir(): - tmp_dir = get_tmp_dir() +def make_default_tmp_dir(): + tmp_dir = os.path.join(get_default_user_dir(), 'tmp') os.makedirs(tmp_dir, exist_ok=True) return tmp_dir +def get_shared_cache_dirs(name: str): + if 'DG_JIT_CACHE_HOME_DIR' in os.environ and 'DG_JIT_CACHE_SHARED_USERS' in os.environ: + return [os.path.join(os.environ['DG_JIT_CACHE_HOME_DIR'], user, 'cache', name) + for user in os.environ['DG_JIT_CACHE_SHARED_USERS'].split(':')] + return [] + + def put(path, data): # Write and do POSIX atomic replace - tmp_file_path = os.path.join(make_tmp_dir(), f'file.tmp.{str(uuid.uuid4())}.{hash_to_hex(path)}') + tmp_file_path = os.path.join(make_default_tmp_dir(), f'file.tmp.{str(uuid.uuid4())}.{hash_to_hex(path)}') with open(tmp_file_path, 
'wb' if isinstance(data, bytes) else 'w') as f: f.write(data) os.replace(tmp_file_path, path) @@ -121,7 +145,7 @@ def flags() -> List[str]: '--ptxas-options=--register-usage-level=10' + (',--verbose' if 'DG_JIT_PTXAS_VERBOSE' in os.environ else ''), # Suppress some unnecessary warnings, such as unused variables for certain `constexpr` branch cases - '--diag-suppress=39,161,174,177,186,940'] + '--diag-suppress=39,161,174,177,940'] @staticmethod def include_dirs() -> List[str]: @@ -133,23 +157,26 @@ def build(cls, name: str, code: str, runtime_cls: Type[Runtime], kwargs: Dict[st flags = cls.flags() # Build signature - enable_sass_opt = cls.__version__() <= (12, 8) and not int(os.getenv('DG_JIT_DISABLE_FFMA_INTERLEAVE', 0)) + # TODO: refactor post-process scripts if we have more in the future (or remove `< 12.9` support) + enable_sass_opt = cls.__version__() <= (12, 8) and get_device_arch() == '90a' and not int(os.getenv('DG_JIT_DISABLE_FFMA_INTERLEAVE', 0)) signature = f'{name}$${get_deep_gemm_version()}$${cls.signature()}$${flags}$${enable_sass_opt}$${code}' name = f'kernel.{name}.{hash_to_hex(signature)}' - path = os.path.join(get_cache_dir(), name) + path = os.path.join(get_default_cache_dir(), name) # Check runtime cache or file system hit + # NOTES: also try to use other users' cache global runtime_cache - cached_runtime = runtime_cache.get(path, runtime_cls, name, kwargs) - if cached_runtime is not None: - if int(os.getenv('DG_JIT_DEBUG', 0)): - print(f'Using cached JIT runtime {name} during build') - return cached_runtime + for possible_path in [path, *get_shared_cache_dirs(name)]: + cached_runtime = runtime_cache.get(possible_path, runtime_cls, name, kwargs) + if cached_runtime is not None: + if int(os.getenv('DG_JIT_DEBUG', 0)): + print(f'Using cached JIT runtime {name} during build') + return cached_runtime # Compile into a temporary CU file os.makedirs(path, exist_ok=True) cubin_path = os.path.join(path, 'kernel.cubin') - tmp_cubin_path = 
os.path.join(make_tmp_dir(), f'nvcc.tmp.{str(uuid.uuid4())}.{hash_to_hex(cubin_path)}.cubin') + tmp_cubin_path = os.path.join(make_default_tmp_dir(), f'nvcc.tmp.{str(uuid.uuid4())}.{hash_to_hex(cubin_path)}.cubin') start_time = time.time() cls.compile(name, code, tmp_cubin_path) @@ -158,10 +185,10 @@ def build(cls, name: str, code: str, runtime_cls: Type[Runtime], kwargs: Dict[st if int(os.getenv('DG_JIT_DEBUG', 0)): print(f'Compilation of JIT runtime {name} took {elapsed_time:.2f} seconds.') - # Interleave FFMA reuse + # Interleave FFMA reuse (SM90 only) if enable_sass_opt: - interleave_ffma.process(tmp_cubin_path) - + sm90_interleave_ffma.process(tmp_cubin_path) + # Atomic replace files os.replace(tmp_cubin_path, cubin_path) @@ -186,14 +213,14 @@ def signature(cls) -> str: def flags(cls) -> List[str]: cxx_flags = ['-fPIC', '-O3', '-fconcepts', '-Wno-deprecated-declarations', '-Wno-abi'] return [*super().flags(), *[f'-I{d}' for d in cls.include_dirs()], - '-gencode=arch=compute_90a,code=sm_90a', + f'--gpu-architecture=sm_{get_device_arch()}', '-cubin', '-O3', '--expt-relaxed-constexpr', '--expt-extended-lambda', f'--compiler-options={",".join(cxx_flags)}'] @classmethod def compile(cls, name: str, code: str, target_path: str) -> None: # Write the code - path = os.path.join(get_cache_dir(), name) + path = os.path.join(get_default_cache_dir(), name) src_path = os.path.join(path, 'kernel.cu') put(src_path, code) command = [get_nvcc_compiler()[0], @@ -207,6 +234,10 @@ def compile(cls, name: str, code: str, target_path: str) -> None: print(f'NVCC compilation failed: stdout: {result.stdout}, stderr: {result.stderr}') assert False, f'Failed to compile {src_path}' + # Print PTXAS log + if int(os.getenv('DG_JIT_DEBUG', 0)) or int(os.getenv('DG_JIT_PTXAS_VERBOSE', 0)): + print(result.stderr) + class NVRTCCompiler(Compiler): @staticmethod @@ -230,7 +261,7 @@ def include_dirs() -> List[str]: @classmethod def flags(cls) -> List[str]: flags = [*super().flags(), *[f'-I{d}' for d 
in cls.include_dirs()], - '--gpu-architecture=sm_90a', '-default-device'] + f'--gpu-architecture=sm_{get_device_arch()}', '-default-device'] # NOTES: PCH is vital for compilation speed if cls.__version__() >= (12, 8): flags += ['--pch'] @@ -240,6 +271,8 @@ def flags(cls) -> List[str]: @classmethod def compile(cls, name: str, code: str, target_path: str) -> None: + assert int(os.getenv('DG_JIT_PTXAS_VERBOSE', 0)) == 0, '`ptxas --verbose` is not compatible with NVRTC' + # Create program code_bytes = bytes(code, 'utf-8') result, program = nvrtc.nvrtcCreateProgram( diff --git a/deep_gemm/jit/runtime.py b/deep_gemm/jit/runtime.py index 7a63bf1c..e8f980ae 100644 --- a/deep_gemm/jit/runtime.py +++ b/deep_gemm/jit/runtime.py @@ -103,3 +103,12 @@ def get(self, path: str, runtime_cls: Type[Runtime], self.cache[path] = runtime return runtime return None + + +# Map some common Python types into C types +pytypes_to_ctypes = { + True: 'true', + False: 'false', + torch.bfloat16: 'cutlass::bfloat16_t', + torch.float: 'float' +} diff --git a/deep_gemm/jit/scripts/__init__.py b/deep_gemm/jit/scripts/__init__.py new file mode 100644 index 00000000..1661a07a --- /dev/null +++ b/deep_gemm/jit/scripts/__init__.py @@ -0,0 +1 @@ +from . 
import sm90_interleave_ffma diff --git a/deep_gemm/jit/interleave_ffma.py b/deep_gemm/jit/scripts/sm90_interleave_ffma.py similarity index 100% rename from deep_gemm/jit/interleave_ffma.py rename to deep_gemm/jit/scripts/sm90_interleave_ffma.py diff --git a/deep_gemm/jit_kernels/__init__.py b/deep_gemm/jit_kernels/__init__.py index f1fa7bb2..e6c320de 100644 --- a/deep_gemm/jit_kernels/__init__.py +++ b/deep_gemm/jit_kernels/__init__.py @@ -1,14 +1,5 @@ -from .gemm import gemm_fp8_fp8_bf16_nt -from .m_grouped_gemm import ( - m_grouped_gemm_fp8_fp8_bf16_nt_contiguous, - m_grouped_gemm_fp8_fp8_bf16_nt_masked -) -from .wgrad_gemm import ( - wgrad_gemm_fp8_fp8_fp32_nt, - k_grouped_wgrad_gemm_fp8_fp8_fp32_nt -) -from .utils import ( - ceil_div, set_num_sms, get_num_sms, - get_col_major_tma_aligned_tensor, - get_m_alignment_for_contiguous_layout +from . import ( + heuristics, + impls, + runtime ) diff --git a/deep_gemm/jit_kernels/gemm.py b/deep_gemm/jit_kernels/gemm.py deleted file mode 100644 index 574f821f..00000000 --- a/deep_gemm/jit_kernels/gemm.py +++ /dev/null @@ -1,242 +0,0 @@ -import math -import torch -from functools import lru_cache -from typing import Tuple - -from ..jit import build -from .runtime import ( - FP8GemmRuntime, GemmType, - make_2d_tma_a_desc, make_2d_tma_b_desc, - make_2d_tma_d_desc, make_2d_tma_scales_desc) -from .utils import get_num_sms, ceil_div, get_col_major_tma_aligned_tensor, get_m_alignment_for_contiguous_layout - - -def is_tma_multicast_legal(shape_dim: int, block_dim: int, num_tma_multicast: int, num_sms: int, - require_divisible: bool = False) -> bool: - divisible = ceil_div(shape_dim, block_dim) % num_tma_multicast == 0 or not require_divisible - return divisible and num_sms % num_tma_multicast == 0 - - -def get_swizzle_mode(block_n: int) -> int: - elem_size = 2 - for mode_bytes in (128, 64, 32): - if (block_n * elem_size) % mode_bytes == 0: - return mode_bytes - return 0 - - -def get_block_n_padding_for_smem_d(block_n: int) -> int: 
- # NOTES: padding is for solving bank conflicts, but wastes shared memory space - elem_size, requirement = 2, (4, 8) - bank_stride = (block_n * elem_size) // 4 - padding = (requirement[0] - bank_stride) % requirement[1] - return (((padding + requirement[1]) if padding < 0 else padding) * 4) // elem_size - - -def get_smem_config(num_stages: int, k: int, block_m: int, block_n: int, block_k: int = 128, - is_fp32_out: bool = False, is_wgrad: bool = False) -> Tuple[int, int, int]: - assert block_k == 128 - - # Try swizzle first, as it does not waste shared memory - swizzle_mode = get_swizzle_mode(block_n) - block_n_padding = get_block_n_padding_for_smem_d( - block_n) if swizzle_mode == 0 else 0 - - # NOTES: `scales_b` in a total manner or per-stage manner - smem_d = block_m * (block_n + block_n_padding) * (4 if is_fp32_out else 2) - smem_a_per_stage = block_m * block_k - smem_scales_a_per_stage = block_m * 4 - smem_b_per_stage = block_n * block_k - smem_scales_b_per_stage = ceil_div(block_n * 4, block_k) * block_k if is_wgrad else 0 - smem_scales_b = ceil_div(k, block_k) * 4 if not is_wgrad else 0 - smem_barrier = num_stages * 8 * 2 - - smem_size = 0 - smem_size += smem_d - smem_size += num_stages * smem_a_per_stage - smem_size += num_stages * smem_scales_a_per_stage - smem_size += num_stages * smem_b_per_stage - smem_size += num_stages * smem_scales_b_per_stage - smem_size += ceil_div(smem_scales_b * (1 if block_k % block_n == 0 else 2), 8) * 8 - smem_size += smem_barrier - - # Swizzle and padding are not compatible - assert int(swizzle_mode > 0) + int(block_n_padding > 0) <= 1 - - return smem_size, swizzle_mode, block_n_padding - - -@lru_cache(maxsize=None) -def get_best_configs(m: int, n: int, k: int, num_groups: int, num_sms: int, - is_grouped_contiguous: bool = False, is_grouped_masked: bool = False, - is_fp32_out: bool = False, is_wgrad: bool = False) -> \ - Tuple[int, int, int, int, Tuple[int, bool], Tuple[int, int, int]]: - if not is_grouped_contiguous: - 
block_ms = (64, 128, ) + ((256, ) if not is_fp32_out else ()) - else: - block_ms = (get_m_alignment_for_contiguous_layout(), ) - block_ns = tuple(range(16, 129, 8)) + ((136, 152, ) if is_wgrad else (144, 160, )) - - # Avoid bank conflicts for FP32 output - if is_fp32_out: - block_ns = [x for x in block_ns if x % 16 == 8] - - fix_wave_saturate = lambda x: num_sms if x == 0 else x - get_num_waves = lambda bm, bn: (ceil_div(ceil_div(m, bm) * ceil_div(n, bn) * num_groups, num_sms) if bm else None) - get_last_wave_util = lambda bm, bn: fix_wave_saturate((ceil_div(m, bm) * ceil_div(n, bn) * num_groups) % num_sms) - - # Decide block sizes by waves - best_block_m, best_block_n = None, None - for block_m in block_ms: - # NOTES: the block sizes cannot be too large, so at least one dim less than 128 - for block_n in filter(lambda bn: block_m <= 128 or bn <= 128, block_ns): - success = False - num_waves, best_num_waves = get_num_waves(block_m, block_n), get_num_waves(best_block_m, best_block_n) - if best_block_m is None or best_block_n is None: - success = True - elif num_waves < best_num_waves: - success = True - elif num_waves == best_num_waves: - # Check last wave utilization - util = get_last_wave_util(block_m, block_n) - best_util = get_last_wave_util(best_block_m, best_block_n) - success = util > best_util - if util == best_util: - # Case 1: same `block_m`, smaller `block_n` (wasted) - success |= block_m == best_block_m and block_n < best_block_n - # Case 2: same `block_n`, smaller `block_m` (wasted) - success |= block_n == best_block_n and block_m < best_block_m - # Case 3: different for both `block_m` and `block_n`, `block_n` larger is better - success |= block_m != best_block_m and block_n > best_block_n - best_block_m, best_block_n = (block_m, block_n) if success else (best_block_m, best_block_n) - assert best_block_m is not None and best_block_n is not None - - # Always pick the longest one - # NOTES: for double B scales, the best number of stages may be reduced - 
best_num_stages, best_smem_config, sm90_capacity = None, None, 232448 - stage_candidates = tuple(filter(lambda s: s <= max(k // 128, 1), (8, 7, 6, 5, 4, 3, 2, 1))) - if 128 % best_block_n != 0 and 128 // math.gcd(128, best_block_n) <= 4: - # Unrolling both stages and `num_former_iters` will cause large code size - stage_candidates = tuple(filter(lambda s: s <= max(k // 128, 1), (4, 3, 2, 1))) - for num_stages in stage_candidates: - best_smem_config = get_smem_config(num_stages, k, best_block_m, best_block_n, is_fp32_out=is_fp32_out, is_wgrad=is_wgrad) - if best_smem_config[0] <= sm90_capacity: - best_num_stages = num_stages - break - assert best_smem_config is not None - assert best_num_stages is not None - - # Decide the number of TMA multicasts and whether broadcast on A - best_tma_multicast_config = (1, True) - - # Try to multicast on the larger block side first - # NOTES: currently, grouped masked GEMM only supports multicast on A and requires the number of blocks in the N-direction to be even - is_multicast_legal = { - 'A': is_tma_multicast_legal(n, best_block_n, 2, num_sms, is_grouped_masked), - 'B': is_tma_multicast_legal(m, best_block_m, 2, num_sms) and not is_grouped_masked, - } - for i in ('A', 'B') if best_block_m > best_block_n else ('B', 'A'): - if m >= 512 and is_multicast_legal[i]: - best_tma_multicast_config = (2, i == 'A') - break - - # Recompute the minimal number of SMs required - # NOTES: less L2 cache usage and less GPU frequency drop - num_waves = get_num_waves(best_block_m, best_block_n) - num_min_sms = ceil_div(ceil_div(m, best_block_m) * ceil_div(n, best_block_n) * num_groups, num_waves) - num_min_sms = ceil_div(num_min_sms, best_tma_multicast_config[0]) * best_tma_multicast_config[0] - assert num_min_sms <= num_sms - - return num_min_sms, best_block_m, best_block_n, best_num_stages, best_tma_multicast_config, best_smem_config - - -def gemm_fp8_fp8_bf16_nt(lhs: Tuple[torch.Tensor, torch.Tensor], - rhs: Tuple[torch.Tensor, torch.Tensor], - 
out: torch.Tensor) -> None: - """ - Perform a normal GEMM with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling. - - Requirements: - LHS, RHS, and output tensors must be contiguous in dimension 1, i.e., stride(1) = 1. - The stride(0) of LHS and RHS must be a multiple of 16, and the stride(0) of output must be a multiple of 8. - RHS and RHS scaling factors are required to be transposed. - The LHS scaling tensor requires a TMA-aligned transposed format, if your input does not match the requirement, - this function will do a transposing with a set of slow PyTorch operations. - - Arguments: - lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m, k]`, - the second element is an FP32 1x128 scaling tensor for LHS of shape `[m, ⌈k / 128⌉]`. - rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[n, k]`, - the second element is an FP32 128x128 scaling tensor for RHS of shape `[⌈n / 128⌉, ⌈k / 128⌉]`. - out: the BF16 output tensor of shape `[m, n]`, representing the result. 
- """ - lhs, lhs_scales = lhs - rhs, rhs_scales = rhs - m, k = lhs.shape - n, k_ = rhs.shape - m_, n_ = out.shape - - # Type and shape checks - assert m == m_ and n == n_ and k == k_ - assert n > 0 and k > 0 - assert lhs_scales.shape == (m, ceil_div(k, 128)) - assert rhs_scales.shape == (ceil_div(n, 128), ceil_div(k, 128)) - assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32 - assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32 - assert out.dtype == torch.bfloat16 - assert lhs.stride(1) == 1 and out.stride(1) == 1 and rhs.stride(1) == 1 - - # LHS scales must be transposed for TMA loads, but not for RHS scales - # NOTES: `get_col_major_tma_aligned_tensor` may launch a kernel if not processed by previous kernels - lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales) - assert rhs_scales.is_contiguous() - - # Do nothing if `m` is zero - if m == 0: - return - - # K must be aligned to 128 - aligned_k = ceil_div(k, 128) * 128 - - # Auto-tuning with compilation - num_sms = get_num_sms() - num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs(m, n, k, 1, num_sms) - block_k = 128 - num_tma_threads = 128 - num_math_threads_per_group = 128 - - tensor_map_a = make_2d_tma_a_desc(GemmType.Normal, lhs, m, k, lhs.stride(0), block_m, block_k, 1) - tensor_map_b = make_2d_tma_b_desc(GemmType.Normal, rhs, n, k, rhs.stride(0), block_n, block_k, 1) - tensor_map_d = make_2d_tma_d_desc(GemmType.Normal, out, m, n, out.stride(0), block_m, block_n, 1, smem_config[1]) - tensor_map_scales_a = make_2d_tma_scales_desc(GemmType.Normal, lhs_scales, m, k, block_m, block_k, 1) - - kwargs = { - # Templated arguments - 'GEMM_TYPE': GemmType.Normal, - 'NUM_TMA_THREADS': num_tma_threads, - 'NUM_MATH_THREADS_PER_GROUP': num_math_threads_per_group, - 'M': m, 'N': n, 'K': aligned_k, - 'NUM_GROUPS': 1, - 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, - 'SWIZZLE_D_MODE': smem_config[1], - 
'BLOCK_N_PADDING': smem_config[2], - 'NUM_STAGES': num_stages, - 'NUM_TMA_MULTICAST': tma_multicast_config[0], - 'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1], - # Runtime arguments - 'SCALES_B': rhs_scales, - 'GROUPED_LAYOUT': torch.empty(0, dtype=torch.int32, device=out.device), - 'NUM_SMS': num_sms, - 'SMEM_SIZE': smem_config[0], - 'TENSOR_MAP_A': tensor_map_a, - 'TENSOR_MAP_B': tensor_map_b, - 'TENSOR_MAP_SCALES_A': tensor_map_scales_a, - 'TENSOR_MAP_D': tensor_map_d, - 'STREAM': torch.cuda.current_stream().cuda_stream, - 'DEVICE_INDEX': out.device.index - } - - # Generate, build and run the kernel - code = FP8GemmRuntime.generate(kwargs) - runtime = build('gemm_fp8_fp8_bf16_nt', code, FP8GemmRuntime, kwargs) - runtime(**kwargs) diff --git a/deep_gemm/jit_kernels/heuristics/__init__.py b/deep_gemm/jit_kernels/heuristics/__init__.py new file mode 100644 index 00000000..24ec067e --- /dev/null +++ b/deep_gemm/jit_kernels/heuristics/__init__.py @@ -0,0 +1,5 @@ +from . import ( + common, + sm90_heuristics, + sm100_heuristics +) diff --git a/deep_gemm/jit_kernels/heuristics/common.py b/deep_gemm/jit_kernels/heuristics/common.py new file mode 100644 index 00000000..bf683afb --- /dev/null +++ b/deep_gemm/jit_kernels/heuristics/common.py @@ -0,0 +1,49 @@ +from ...jit.compiler import get_device_arch +from ...utils.math import ceil_div + + +class MulticastConfig: + def __init__(self, num_multicast: int, is_multicast_on_a: bool): + self.num_multicast = num_multicast + self.is_multicast_on_a = is_multicast_on_a + + def get_ab_load_block_m(self, block_m: int): + # NOTES: this for >= SM100 only + assert get_device_arch() != '90a' + return block_m // (self.num_multicast if self.is_multicast_on_a else 1) + + def get_ab_load_block_n(self, block_n: int): + # NOTES: this for >= SM100 only + assert get_device_arch() != '90a' + return block_n // (1 if self.is_multicast_on_a else self.num_multicast) + + +class SharedMemoryConfig: + def __init__(self, smem_size: int, 
swizzle_a_mode: int, swizzle_b_mode: int, swizzle_cd_mode: int): + self.smem_size = smem_size + self.swizzle_a_mode = swizzle_a_mode + self.swizzle_b_mode = swizzle_b_mode + # NOTES: sometimes the default swizzling pattern maybe not compatible (e.g., FP32 output) + self.swizzle_cd_mode = swizzle_cd_mode + # TODO: swizzle SF as well + self.swizzle_sf_mode = 0 + + assert self.swizzle_a_mode != 0 + assert self.swizzle_b_mode != 0 + assert self.swizzle_cd_mode > 16 + assert self.swizzle_sf_mode == 0 + + +def is_multicast_legal(shape_dim: int, block_dim: int, num_multicast: int, num_sms: int, + require_divisible: bool = False) -> bool: + divisible = ceil_div(shape_dim, block_dim) % num_multicast == 0 or not require_divisible + return divisible and num_sms % num_multicast == 0 + + +def get_swizzle_mode(block_size: int, elem_size: int) -> int: + # `> 0` means interleaving + # 16B actually means non-swizzling (but interleaving) + for mode_bytes in (128, 64, 32, 16): + if (block_size * elem_size) % mode_bytes == 0: + return mode_bytes + assert False, 'Invalid mode' diff --git a/deep_gemm/jit_kernels/heuristics/sm100_heuristics.py b/deep_gemm/jit_kernels/heuristics/sm100_heuristics.py new file mode 100644 index 00000000..071c61bd --- /dev/null +++ b/deep_gemm/jit_kernels/heuristics/sm100_heuristics.py @@ -0,0 +1,171 @@ +import functools +import torch +from typing import Tuple + +from .common import ( + MulticastConfig, SharedMemoryConfig, + is_multicast_legal, get_swizzle_mode +) +from ...utils.math import align, ceil_div +from ...utils.layout import ( + GemmType, MajorTypeAB, MajorTypeCD, + get_element_size, get_m_alignment_for_contiguous_layout +) + + +def get_sf_aligned_block_sizes(block_m: int, block_n: int, ab_dtype: torch.dtype): + num_utccp_aligned_elems = 128 + assert block_m % num_utccp_aligned_elems == 0 + return { + torch.bfloat16: (0, 0), + torch.float8_e4m3fn: (align(block_m, num_utccp_aligned_elems), align(block_n, num_utccp_aligned_elems)), + }[ab_dtype] + + 
+def is_tmem_size_legal(block_m: int, block_n: int, ab_dtype: torch.float): + # M waves or epilogue stages (* 2), SFA and SFB + sf_block_m, sf_block_n = get_sf_aligned_block_sizes(block_m, block_n, ab_dtype) + return ((2 * block_n) + (sf_block_m // 32) + (sf_block_n // 32)) <= 512 + + +def get_smem_config(block_m: int, block_n: int, block_k: int, + major_a: MajorTypeAB, major_b: MajorTypeAB, major_d: MajorTypeCD, + ab_dtype: torch.dtype, cd_dtype: torch.dtype, + num_stages: int, multicast_config: MulticastConfig) -> SharedMemoryConfig: + assert major_d == MajorTypeCD.NMajor + + ab_elem_size = get_element_size(ab_dtype) + cd_elem_size = get_element_size(cd_dtype) + + load_block_m = multicast_config.get_ab_load_block_m(block_m) + load_block_n = multicast_config.get_ab_load_block_n(block_n) + swizzle_a_mode = get_swizzle_mode(block_k if major_a == MajorTypeAB.KMajor else load_block_m, ab_elem_size) + swizzle_b_mode = get_swizzle_mode(block_k if major_b == MajorTypeAB.KMajor else load_block_n, ab_elem_size) + swizzle_cd_mode = get_swizzle_mode(block_n if major_d == MajorTypeCD.NMajor else block_m, cd_elem_size) + + # 2 stages of STSM and TMA store + # TODO: consider other layouts + layout_ad_m = 128 + smem_d = min(block_m, layout_ad_m) * swizzle_cd_mode * 2 + + # A/B shared memory + smem_a_per_stage = load_block_m * block_k * ab_elem_size + smem_b_per_stage = load_block_n * block_k * ab_elem_size + + # SF shared memory must be aligned to UTCCP + # Each stage must prefetch next 4 stages' SF (including the current) + sf_block_m, sf_block_n = get_sf_aligned_block_sizes(block_m, block_n, ab_dtype) + smem_scales_a_per_stage = sf_block_m * 4 + smem_scales_b_per_stage = sf_block_n * 4 + + # TODO: remove SF barriers for BF16 GEMMs + # TMA full/empty barriers, with-SF full barriers, tensor memory full/empty barriers, accumulation full barrier + # NOTES: some shapes may only have 1 epilogue stage, but we still allocate space for 2 stages + # NOTES: cases without accumulation 
will not use the accumulation full barrier + smem_barrier = num_stages * 8 * 3 + 2 * 8 * 2 + 8 + smem_tmem_ptr = 4 + + # Sum them up + smem_size = 0 + smem_size += smem_d + smem_size += num_stages * smem_a_per_stage + smem_size += num_stages * smem_b_per_stage + smem_size += num_stages * smem_scales_a_per_stage + smem_size += num_stages * smem_scales_b_per_stage + smem_size += smem_barrier + smem_size += smem_tmem_ptr + + return SharedMemoryConfig(smem_size, swizzle_a_mode, swizzle_b_mode, swizzle_cd_mode) + + +@functools.lru_cache(maxsize=None) +def get_best_configs(gemm_type: GemmType, + m: int, n: int, k: int, num_groups: int, + major_a: MajorTypeAB, major_b: MajorTypeAB, major_d: MajorTypeCD, + ab_dtype: torch.dtype, cd_dtype: torch.dtype, + num_sms: int) -> \ + Tuple[int, int, int, int, int, MulticastConfig, SharedMemoryConfig]: + assert ab_dtype == torch.float8_e4m3fn + assert cd_dtype in (torch.bfloat16, torch.float) + + # `BLOCK_M` and `BLOCK_N` are selected according to MMA instructions + if gemm_type == GemmType.GroupedContiguous: + block_ms = (get_m_alignment_for_contiguous_layout(), ) + else: + block_ms = (128, ) if major_b == MajorTypeAB.KMajor else (128, 256) + # NOTES: some `% 32 == 16` cases are not compatible with 2-CTA TMA swizzling + block_ns = tuple(range(16, 257, 16)) if major_b == MajorTypeAB.KMajor else tuple(range(32, 257, 32)) + + # `BLOCK_K` is selected in a fixed manner + block_k = 128 // get_element_size(ab_dtype) + + fix_wave_saturate = lambda x: num_sms if x == 0 else x + get_num_waves = lambda bm, bn: (ceil_div(ceil_div(m, bm) * ceil_div(n, bn) * num_groups, num_sms) if bm else None) + get_last_wave_util = lambda bm, bn: fix_wave_saturate((ceil_div(m, bm) * ceil_div(n, bn) * num_groups) % num_sms) + + # Decide block sizes by waves + # TODO: move block size search into `common.py` + best_block_m, best_block_n = None, None + for block_m in block_ms: + for block_n in block_ns: + success = False + num_waves, best_num_waves = 
get_num_waves(block_m, block_n), get_num_waves(best_block_m, best_block_n) + if best_block_m is None or best_block_n is None: + success = True + elif num_waves < best_num_waves: + success = True + elif num_waves == best_num_waves: + # Check last wave utilization + util = get_last_wave_util(block_m, block_n) + best_util = get_last_wave_util(best_block_m, best_block_n) + success = util > best_util + if util == best_util: + # Case 1: same `block_m`, smaller `block_n` (wasted) + success |= block_m == best_block_m and block_n < best_block_n + # Case 2: same `block_n`, smaller `block_m` (wasted) + success |= block_n == best_block_n and block_m < best_block_m + # Case 3: different for both `block_m` and `block_n`, larger `block_n` is better + success |= block_m != best_block_m and block_n > best_block_n + success &= is_tmem_size_legal(block_m, block_n, ab_dtype) + best_block_m, best_block_n = (block_m, block_n) if success else (best_block_m, best_block_n) + assert best_block_m is not None and best_block_n is not None + + # Decide the number of TMA multicasts and whether broadcast on A + best_multicast_config = MulticastConfig(1, True) + + # Try to multicast on the larger block side first + is_legal = { + # TODO: support other `tcgen05` layouts + 'A': False, + 'B': is_multicast_legal(m, best_block_m, 2, num_sms, True) and gemm_type == GemmType.Normal, + } + for i in ('A', 'B') if best_block_m > best_block_n else ('B', 'A'): + if m >= 512 and is_legal[i]: + best_multicast_config = MulticastConfig(2, i == 'A') + break + + # Always pick the longest one + # NOTES: for double B scales, the best number of stages may be reduced + # TODO: move stage search into `common.py` + best_num_stages, best_smem_config, sm100_capacity = None, None, 232448 + stage_candidates = tuple(filter(lambda s: s <= max(k // 128, 1), (8, 7, 6, 5, 4, 3, 2, 1))) + for num_stages in stage_candidates: + best_smem_config = get_smem_config(best_block_m, best_block_n, block_k, + major_a, major_b, major_d, + 
ab_dtype, cd_dtype, + num_stages, best_multicast_config) + if best_smem_config.smem_size <= sm100_capacity: + best_num_stages = num_stages + break + assert best_smem_config is not None + assert best_num_stages is not None + + # Recompute the minimal number of SMs required + # NOTES: less L2 cache usage and less GPU frequency drop + # TODO: move min SM fix into `common.py` + num_waves = get_num_waves(best_block_m, best_block_n) + num_min_sms = ceil_div(ceil_div(m, best_block_m) * ceil_div(n, best_block_n) * num_groups, num_waves) + num_min_sms = ceil_div(num_min_sms, best_multicast_config.num_multicast) * best_multicast_config.num_multicast + assert num_min_sms <= num_sms + + return num_min_sms, best_block_m, best_block_n, block_k, best_num_stages, best_multicast_config, best_smem_config diff --git a/deep_gemm/jit_kernels/heuristics/sm90_heuristics.py b/deep_gemm/jit_kernels/heuristics/sm90_heuristics.py new file mode 100644 index 00000000..e69de29b diff --git a/deep_gemm/jit_kernels/impls/__init__.py b/deep_gemm/jit_kernels/impls/__init__.py new file mode 100644 index 00000000..203e3565 --- /dev/null +++ b/deep_gemm/jit_kernels/impls/__init__.py @@ -0,0 +1,7 @@ +from . 
import ( + sm90_bf16_gemm, + sm100_bf16_gemm, + sm90_fp8_gemm_1d1d, + sm90_fp8_gemm_1d2d, + sm100_fp8_gemm_1d1d, +) diff --git a/deep_gemm/jit_kernels/impls/sm100_bf16_gemm.py b/deep_gemm/jit_kernels/impls/sm100_bf16_gemm.py new file mode 100644 index 00000000..e69de29b diff --git a/deep_gemm/jit_kernels/impls/sm100_fp8_gemm_1d1d.py b/deep_gemm/jit_kernels/impls/sm100_fp8_gemm_1d1d.py new file mode 100644 index 00000000..ca59fac6 --- /dev/null +++ b/deep_gemm/jit_kernels/impls/sm100_fp8_gemm_1d1d.py @@ -0,0 +1,339 @@ +import ctypes +import os +import torch +import cuda.bindings.driver as cbd +from typing import Any, Dict, Optional + +from ..runtime import ( + make_tma_a_desc, make_tma_b_desc, + make_tma_cd_desc, make_tma_sf_desc +) +from ..heuristics.sm100_heuristics import get_best_configs +from ...config import get_num_sms +from ...jit import Runtime, build, pytypes_to_ctypes +from ...utils.math import align, ceil_div +from ...utils.layout import GemmType, MajorTypeAB, MajorTypeCD + + +class SM100FP8GemmRuntime(Runtime): + def __init__(self, path: str) -> None: + super().__init__(path) + + @staticmethod + def generate(kwargs: Dict[str, Any]) -> str: + assert kwargs['CD_DTYPE_T'] in (torch.bfloat16, torch.float) + code = f''' +#ifdef __CUDACC_RTC__ +#include +#else +#include +#include +#endif + +#include + +using namespace deep_gemm; + +static void __instantiate_kernel() {{ + auto ptr = reinterpret_cast(&sm100_fp8_gemm_1d1d_impl< + {kwargs['MAJOR_A']}, + {kwargs['MAJOR_B']}, + {kwargs['M'] if 'm' in kwargs['COMPILED_DIMS'] else 0}, + {kwargs['N'] if 'n' in kwargs['COMPILED_DIMS'] else 0}, + {kwargs['K'] if 'k' in kwargs['COMPILED_DIMS'] else 0}, + {kwargs['BLOCK_M']}, + {kwargs['BLOCK_N']}, + {kwargs['BLOCK_K']}, + {kwargs['NUM_GROUPS']}, + {kwargs['SWIZZLE_A_MODE']}, + {kwargs['SWIZZLE_B_MODE']}, + {kwargs['SWIZZLE_CD_MODE']}, + {kwargs['NUM_STAGES']}, + {kwargs['NUM_LAST_STAGES']}, + {kwargs['NUM_NON_EPILOGUE_THREADS']}, + {kwargs['NUM_EPILOGUE_THREADS']}, + 
{kwargs['NUM_MULTICAST']}, + {pytypes_to_ctypes[kwargs['IS_MULTICAST_ON_A']]}, + {kwargs['GEMM_TYPE']}, + {pytypes_to_ctypes[kwargs['WITH_ACCUMULATION']]}, + {pytypes_to_ctypes[kwargs['CD_DTYPE_T']]} + >); +}}; +''' + if int(os.getenv('DG_JIT_DEBUG', 0)): + print(f'Generated FP8 GEMM code:\n{code}') + return code + + # noinspection PyMethodOverriding + @staticmethod + def launch(kernel: cbd.CUkernel, kwargs: Dict[str, Any]) -> cbd.CUresult: + result = cbd.cuKernelSetAttribute(cbd.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + kwargs['SMEM_SIZE'], kernel, cbd.CUdevice(kwargs['DEVICE_INDEX']))[0] + assert result == cbd.CUresult.CUDA_SUCCESS, f'Failed to set max dynamic shared memory size: {result}' + + attr_val = cbd.CUlaunchAttributeValue() + attr_val.clusterDim.x = kwargs['NUM_MULTICAST'] + attr_val.clusterDim.y = 1 + attr_val.clusterDim.z = 1 + attr = cbd.CUlaunchAttribute() + attr.id = cbd.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION + attr.value = attr_val + + config = cbd.CUlaunchConfig() + config.numAttrs = 1 + config.attrs = [attr] + config.gridDimX = kwargs['NUM_SMS'] + config.gridDimY = 1 + config.gridDimZ = 1 + config.blockDimX = kwargs['NUM_NON_EPILOGUE_THREADS'] + kwargs['NUM_EPILOGUE_THREADS'] + config.blockDimY = 1 + config.blockDimZ = 1 + config.sharedMemBytes = kwargs['SMEM_SIZE'] + config.hStream = kwargs['STREAM'] + + arg_values = ( + kwargs['GROUPED_LAYOUT'].data_ptr(), + kwargs['M'], + kwargs['N'], + kwargs['K'], + kwargs['TENSOR_MAP_A'], + kwargs['TENSOR_MAP_B'], + kwargs['TENSOR_MAP_SFA'], + kwargs['TENSOR_MAP_SFB'], + kwargs['TENSOR_MAP_C'], + kwargs['TENSOR_MAP_D'], + ) + arg_types = ( + ctypes.c_void_p, + ctypes.c_uint32, + ctypes.c_uint32, + ctypes.c_uint32, + None, None, None, None, None, None + ) + return cbd.cuLaunchKernelEx(config, kernel, (arg_values, arg_types), 0) + + +def fp8_gemm_nt(a: torch.Tensor, sfa: torch.Tensor, + b: torch.Tensor, sfb: torch.Tensor, + c: Optional[torch.Tensor], d: 
torch.Tensor, + major_a: MajorTypeAB, major_b: MajorTypeAB, + major_cd: MajorTypeCD, + compiled_dims: str) -> None: + m, k = a.shape + n, _ = b.shape + assert major_cd == MajorTypeCD.NMajor + + # K must be aligned to 128 + aligned_k = align(k, 128) + + num_sms = get_num_sms() + num_sms, block_m, block_n, block_k, num_stages, multicast_config, smem_config = get_best_configs( + GemmType.Normal, m, n, k, 1, major_a, major_b, major_cd, torch.float8_e4m3fn, d.dtype, num_sms) + + num_groups = 1 + tensor_map_a = make_tma_a_desc(major_a, a, m, k, + multicast_config.get_ab_load_block_m(block_m), block_k, + a.stride(major_a.non_contiguous_dim()), num_groups, + smem_config.swizzle_a_mode) + tensor_map_b = make_tma_b_desc(major_b, b, n, k, + multicast_config.get_ab_load_block_n(block_n), block_k, + b.stride(major_b.non_contiguous_dim()), num_groups, + smem_config.swizzle_b_mode) + tensor_map_d = make_tma_cd_desc(major_cd, d, m, n, + block_m, block_n, + d.stride(major_cd.non_contiguous_dim()), num_groups, + smem_config.swizzle_cd_mode) + tensor_map_c = make_tma_cd_desc(major_cd, c, m, n, + block_m, block_n, + c.stride(major_cd.non_contiguous_dim()), num_groups, + smem_config.swizzle_cd_mode) if c is not None else tensor_map_d + tensor_map_sfa = make_tma_sf_desc(MajorTypeAB.MNMajor, sfa, m, k, block_m, block_k, num_groups, smem_config.swizzle_sf_mode) + tensor_map_sfb = make_tma_sf_desc(MajorTypeAB.MNMajor, sfb, n, k, block_n, block_k, num_groups, smem_config.swizzle_sf_mode) + + kwargs = { + # Templated or runtime arguments according to the `COMPILED_DIMS` + 'COMPILED_DIMS': compiled_dims, + 'M': m, 'N': n, 'K': aligned_k, + # Templated arguments + 'GEMM_TYPE': GemmType.Normal, + 'NUM_NON_EPILOGUE_THREADS': 128, + 'NUM_EPILOGUE_THREADS': 128, + 'MAJOR_A': major_a, + 'MAJOR_B': major_b, + 'NUM_GROUPS': 1, + 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, + 'NUM_STAGES': num_stages, 'NUM_LAST_STAGES': ceil_div(k, block_k) % num_stages, + 'SWIZZLE_A_MODE': 
smem_config.swizzle_a_mode, + 'SWIZZLE_B_MODE': smem_config.swizzle_b_mode, + 'SWIZZLE_CD_MODE': smem_config.swizzle_cd_mode, + 'NUM_MULTICAST': multicast_config.num_multicast, + 'IS_MULTICAST_ON_A': multicast_config.is_multicast_on_a, + 'WITH_ACCUMULATION': c is not None, + 'CD_DTYPE_T': d.dtype, + # Runtime arguments + 'GROUPED_LAYOUT': torch.empty(0, dtype=torch.int32, device=d.device), + 'NUM_SMS': num_sms, + 'SMEM_SIZE': smem_config.smem_size, + 'TENSOR_MAP_A': tensor_map_a, + 'TENSOR_MAP_B': tensor_map_b, + 'TENSOR_MAP_SFA': tensor_map_sfa, + 'TENSOR_MAP_SFB': tensor_map_sfb, + 'TENSOR_MAP_C': tensor_map_c, + 'TENSOR_MAP_D': tensor_map_d, + 'STREAM': torch.cuda.current_stream().cuda_stream, + 'DEVICE_INDEX': d.device.index + } + + # Generate, build and run the kernel + code = SM100FP8GemmRuntime.generate(kwargs) + runtime = build('fp8_gemm', code, SM100FP8GemmRuntime, kwargs) + runtime(**kwargs) + + +def m_grouped_fp8_gemm_nt_contiguous(a: torch.Tensor, sfa: torch.Tensor, + b: torch.Tensor, sfb: torch.Tensor, + d: torch.Tensor, + m_indices: torch.Tensor, + major_a: MajorTypeAB, major_b: MajorTypeAB, + compiled_dims: str) -> None: + m, k = a.shape + num_groups, n, _ = b.shape + major_d = MajorTypeCD.NMajor + + # K must be aligned to 128 + aligned_k = align(k, 128) + + # Auto-tuning with compilation + num_sms = get_num_sms() + num_sms, block_m, block_n, block_k, num_stages, multicast_config, smem_config = get_best_configs( + GemmType.GroupedContiguous, m, n, k, num_groups, major_a, major_b, major_d, torch.float8_e4m3fn, d.dtype, num_sms) + + # NOTES: you cannot distinguish groups for A, SFA, and D + tensor_map_a = make_tma_a_desc(major_a, a, m, k, + multicast_config.get_ab_load_block_m(block_m), block_k, + a.stride(major_a.non_contiguous_dim()), num_groups=1, + swizzle_mode=smem_config.swizzle_a_mode) + tensor_map_b = make_tma_b_desc(major_b, b, n, k, + multicast_config.get_ab_load_block_n(block_n), block_k, + b.stride(major_b.non_contiguous_dim()), 
num_groups=num_groups, + swizzle_mode=smem_config.swizzle_b_mode) + tensor_map_d = make_tma_cd_desc(major_d, d, m, n, + block_m, block_n, + d.stride(major_d.non_contiguous_dim()), num_groups=1, + swizzle_mode=smem_config.swizzle_cd_mode) + tensor_map_sfa = make_tma_sf_desc(MajorTypeAB.MNMajor, sfa, m, k, block_m, block_k, num_groups=1, swizzle_mode=smem_config.swizzle_sf_mode) + tensor_map_sfb = make_tma_sf_desc(MajorTypeAB.MNMajor, sfb, n, k, block_n, block_k, num_groups=num_groups, swizzle_mode=smem_config.swizzle_sf_mode) + + kwargs = { + # Templated or runtime arguments according to the `COMPILED_DIMS` + 'COMPILED_DIMS': compiled_dims, + 'M': m, 'N': n, 'K': aligned_k, + # Templated arguments + 'GEMM_TYPE': GemmType.GroupedContiguous, + 'NUM_NON_EPILOGUE_THREADS': 128, + 'NUM_EPILOGUE_THREADS': 128, + 'MAJOR_A': major_a, + 'MAJOR_B': major_b, + 'NUM_GROUPS': num_groups, + 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, + 'NUM_STAGES': num_stages, 'NUM_LAST_STAGES': ceil_div(k, block_k) % num_stages, + 'SWIZZLE_A_MODE': smem_config.swizzle_a_mode, + 'SWIZZLE_B_MODE': smem_config.swizzle_b_mode, + 'SWIZZLE_CD_MODE': smem_config.swizzle_cd_mode, + 'NUM_MULTICAST': multicast_config.num_multicast, + 'IS_MULTICAST_ON_A': multicast_config.is_multicast_on_a, + 'WITH_ACCUMULATION': False, + 'CD_DTYPE_T': d.dtype, + # Runtime arguments + 'GROUPED_LAYOUT': m_indices, + 'NUM_SMS': num_sms, + 'SMEM_SIZE': smem_config.smem_size, + 'TENSOR_MAP_A': tensor_map_a, + 'TENSOR_MAP_B': tensor_map_b, + 'TENSOR_MAP_SFA': tensor_map_sfa, + 'TENSOR_MAP_SFB': tensor_map_sfb, + 'TENSOR_MAP_C': tensor_map_d, + 'TENSOR_MAP_D': tensor_map_d, + 'STREAM': torch.cuda.current_stream().cuda_stream, + 'DEVICE_INDEX': d.device.index + } + + # Generate, build and run the kernel + code = SM100FP8GemmRuntime.generate(kwargs) + runtime = build('fp8_m_grouped_gemm', code, SM100FP8GemmRuntime, kwargs) + runtime(**kwargs) + + +def fp8_m_grouped_gemm_nt_masked(a: torch.Tensor, sfa: 
torch.Tensor, + b: torch.Tensor, sfb: torch.Tensor, + d: torch.Tensor, + masked_m: torch.Tensor, + expected_m: int, + major_a: MajorTypeAB, major_b: MajorTypeAB, + compiled_dims: str) -> None: + num_groups, m, k = a.shape + _, n, _ = b.shape + major_d = MajorTypeCD.NMajor + + # K must be aligned to 128 + aligned_k = align(k, 128) + + num_sms = get_num_sms() + num_sms, block_m, block_n, block_k, num_stages, multicast_config, smem_config = get_best_configs( + GemmType.GroupedMasked, expected_m, n, k, num_groups, major_a, major_b, major_d, torch.float8_e4m3fn, d.dtype, num_sms) + if num_groups > 1: + assert m % block_m == 0 + + tensor_map_a = make_tma_a_desc(major_a, a, m, k, + multicast_config.get_ab_load_block_m(block_m), block_k, + a.stride(major_a.non_contiguous_dim()), num_groups, + smem_config.swizzle_a_mode) + tensor_map_b = make_tma_b_desc(major_b, b, n, k, + multicast_config.get_ab_load_block_n(block_n), block_k, + b.stride(major_b.non_contiguous_dim()), num_groups, + smem_config.swizzle_b_mode) + tensor_map_d = make_tma_cd_desc(major_d, d, m, n, + block_m, block_n, + d.stride(major_d.non_contiguous_dim()), num_groups, + smem_config.swizzle_cd_mode) + tensor_map_sfa = make_tma_sf_desc(MajorTypeAB.MNMajor, sfa, m, k, block_m, block_k, num_groups, smem_config.swizzle_sf_mode) + tensor_map_sfb = make_tma_sf_desc(MajorTypeAB.MNMajor, sfb, n, k, block_n, block_k, num_groups, smem_config.swizzle_sf_mode) + + kwargs = { + # Templated or runtime arguments according to the `COMPILED_DIMS` + 'COMPILED_DIMS': compiled_dims, + 'M': m, 'N': n, 'K': aligned_k, + # Templated arguments + 'GEMM_TYPE': GemmType.GroupedMasked, + 'NUM_NON_EPILOGUE_THREADS': 128, + 'NUM_EPILOGUE_THREADS': 128, + 'MAJOR_A': major_a, + 'MAJOR_B': major_b, + 'NUM_GROUPS': num_groups, + 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, + 'NUM_STAGES': num_stages, 'NUM_LAST_STAGES': ceil_div(k, block_k) % num_stages, + 'SWIZZLE_A_MODE': smem_config.swizzle_a_mode, + 'SWIZZLE_B_MODE': 
smem_config.swizzle_b_mode, + 'SWIZZLE_CD_MODE': smem_config.swizzle_cd_mode, + 'NUM_MULTICAST': multicast_config.num_multicast, + 'IS_MULTICAST_ON_A': multicast_config.is_multicast_on_a, + 'WITH_ACCUMULATION': False, + 'CD_DTYPE_T': d.dtype, + # Runtime arguments + 'GROUPED_LAYOUT': masked_m, + 'NUM_SMS': num_sms, + 'SMEM_SIZE': smem_config.smem_size, + 'TENSOR_MAP_A': tensor_map_a, + 'TENSOR_MAP_B': tensor_map_b, + 'TENSOR_MAP_SFA': tensor_map_sfa, + 'TENSOR_MAP_SFB': tensor_map_sfb, + 'TENSOR_MAP_C': tensor_map_d, + 'TENSOR_MAP_D': tensor_map_d, + 'STREAM': torch.cuda.current_stream().cuda_stream, + 'DEVICE_INDEX': d.device.index + } + + # Generate, build and run the kernel + code = SM100FP8GemmRuntime.generate(kwargs) + runtime = build('fp8_m_grouped_gemm', code, SM100FP8GemmRuntime, kwargs) + runtime(**kwargs) diff --git a/deep_gemm/jit_kernels/impls/sm90_bf16_gemm.py b/deep_gemm/jit_kernels/impls/sm90_bf16_gemm.py new file mode 100644 index 00000000..e69de29b diff --git a/deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d1d.py b/deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d1d.py new file mode 100644 index 00000000..e69de29b diff --git a/deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d2d.py b/deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d2d.py new file mode 100644 index 00000000..e69de29b diff --git a/deep_gemm/jit_kernels/m_grouped_gemm.py b/deep_gemm/jit_kernels/m_grouped_gemm.py deleted file mode 100644 index ca2fc79a..00000000 --- a/deep_gemm/jit_kernels/m_grouped_gemm.py +++ /dev/null @@ -1,205 +0,0 @@ -import torch -from typing import Tuple - -from ..jit import build -from .gemm import get_best_configs -from .runtime import ( - FP8GemmRuntime, GemmType, - make_2d_tma_a_desc, make_2d_tma_b_desc, - make_2d_tma_d_desc, make_2d_tma_scales_desc) -from .utils import ceil_div, get_col_major_tma_aligned_tensor, get_num_sms - - -def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(lhs: Tuple[torch.Tensor, torch.Tensor], - rhs: Tuple[torch.Tensor, torch.Tensor], - out: torch.Tensor, 
m_indices: torch.Tensor) -> None: - """ - Perform a grouped GEMM (contiguous format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling. - - Requirements: - LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format. - RHS and RHS scaling factors are required to be transposed. - The LHS scaling tensor requires a TMA-aligned transposed format, if your input does not match the requirement, - this function will do a transposing with a set of slow PyTorch operations. - On the M axis, inputs are grouped into several batches, of which batch sizes aligned to - `get_m_alignment_for_contiguous_layout()` (128). - - Arguments: - lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m_sum, k]`, - the second element is an FP32 1x128 scaling tensor for LHS of shape `[m_sum, ⌈k / 128⌉]`. - rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`, - the second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`. - out: the BF16 output tensor of shape `[m_sum, n]`, representing the result. - m_indices: a tensor of shape `[m_sum]` with type `torch.int`. - `m_indices[i]` records the group which the i-th row of the LHS belongs to, - which means that the i-th row of the LHS matrix will be multiplied with `rhs[m_indices[i]]`. - Values of `m_indices` in every-m-alignment-block must also be the same. 
- """ - lhs, lhs_scales = lhs - rhs, rhs_scales = rhs - m, k = lhs.shape - num_groups, n, k_ = rhs.shape - m_, n_ = out.shape - m__ = m_indices.numel() - - # Type and shape checks - assert m == m_ == m__ and k == k_ and n == n_ - assert lhs_scales.shape == (m, ceil_div(k, 128)) - assert rhs_scales.shape == (num_groups, ceil_div(n, 128), ceil_div(k, 128)) - assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32 - assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32 - assert out.dtype == torch.bfloat16 - assert m_indices.dtype == torch.int32 - assert lhs.is_contiguous() and rhs.is_contiguous() - assert out.is_contiguous() and m_indices.is_contiguous() - - # LHS scales must be transposed for TMA load, but not for RHS scales - lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales) - assert rhs_scales.is_contiguous() - - # Do nothing if `m` is zero - if m == 0: - return - - # Auto-tuning with compilation - num_sms = get_num_sms() - num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs( - m, n, k, 1, num_sms, is_grouped_contiguous=True) - block_k = 128 - num_tma_threads = 128 - num_math_threads_per_group = 128 - - tensor_map_a = make_2d_tma_a_desc(GemmType.GroupedContiguous, lhs, m, k, k, block_m, block_k, num_groups) - tensor_map_b = make_2d_tma_b_desc(GemmType.GroupedContiguous, rhs, n, k, k, block_n, block_k, num_groups) - tensor_map_d = make_2d_tma_d_desc(GemmType.GroupedContiguous, out, m, n, n, block_m, block_n, num_groups, smem_config[1]) - tensor_map_scales_a = make_2d_tma_scales_desc(GemmType.GroupedContiguous, lhs_scales, m, k, block_m, block_k, num_groups) - - kwargs = { - # Templated arguments - 'NUM_TMA_THREADS': num_tma_threads, - 'NUM_MATH_THREADS_PER_GROUP': num_math_threads_per_group, - 'M': m, 'N': n, 'K': k, - 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, - 'SWIZZLE_D_MODE': smem_config[1], - 'BLOCK_N_PADDING': smem_config[2], - 'NUM_GROUPS': 
num_groups, - 'NUM_STAGES': num_stages, - 'NUM_TMA_MULTICAST': tma_multicast_config[0], - 'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1], - 'GEMM_TYPE': GemmType.GroupedContiguous, - # Runtime arguments - 'SCALES_B': rhs_scales, - 'GROUPED_LAYOUT': m_indices, - 'NUM_SMS': num_sms, - 'SMEM_SIZE': smem_config[0], - 'TENSOR_MAP_A': tensor_map_a, - 'TENSOR_MAP_B': tensor_map_b, - 'TENSOR_MAP_SCALES_A': tensor_map_scales_a, - 'TENSOR_MAP_D': tensor_map_d, - 'STREAM': torch.cuda.current_stream().cuda_stream, - 'DEVICE_INDEX': out.device.index - } - - # Generate, build and run the kernel - code = FP8GemmRuntime.generate(kwargs) - runtime = build('m_grouped_gemm_fp8_fp8_bf16_nt', code, FP8GemmRuntime, kwargs) - runtime(**kwargs) - - -def m_grouped_gemm_fp8_fp8_bf16_nt_masked(lhs: Tuple[torch.Tensor, torch.Tensor], - rhs: Tuple[torch.Tensor, torch.Tensor], - out: torch.Tensor, masked_m: torch.Tensor, expected_m: int) -> None: - """ - Perform a grouped GEMM (masked format) with FP8 inputs and BF16 output, with 1x128 LHS scaling and 128x128 RHS scaling. - - Requirements: - LHS, RHS, RHS scaling factors, and output tensors must be in contiguous format. - RHS and RHS scaling factors are required to be transposed. - The LHS scaling tensor requires a TMA-aligned transposed format, if your input does not match the requirement, - this function will do a transposing with a set of slow PyTorch operations. - Moreover, this alignment requirement is different with the contiguous-format kernel, as we require that each batch - should be separately transposed. - - Arguments: - lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, m_max, k]`, - the second element is an FP32 1x128 scaling tensor for LHS of shape `[num_groups, m_max, ⌈k / 128⌉]`. - rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[num_groups, n, k]`. 
- The second element is an FP32 128x128 scaling tensor for RHS of shape `[num_groups, ⌈n / 128⌉, ⌈k / 128⌉]`. - out: the BF16 output tensor of shape `[num_groups, m_max, n]`, representing the result. - masked_m: a tensor of shape `[num_groups]`, `masked_m[i]` records actual rows of the `lhs[i]` matrix to compute - in the i-th group. - expected_m: a value hint (which is a value on CPU) for the M expectation of each batch, - correctly setting this value may lead to better performance. - """ - lhs, lhs_scales = lhs - rhs, rhs_scales = rhs - num_groups, m, k = lhs.shape - num_groups_, n, k_ = rhs.shape - num_groups__, m_, n_ = out.shape - num_groups___ = masked_m.numel() - - # Type and shape checks - assert num_groups == num_groups_ == num_groups__ == num_groups___ - assert m == m_ and n == n_ and k == k_ - assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0 - assert lhs_scales.shape == (num_groups, m, ceil_div(k, 128)) - assert rhs_scales.shape == (num_groups, ceil_div(n, 128), ceil_div(k, 128)) - assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32 - assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32 - assert out.dtype == torch.bfloat16 - assert masked_m.dtype == torch.int32 - assert lhs.is_contiguous() and rhs.is_contiguous() - assert out.is_contiguous() and masked_m.is_contiguous() - - # LHS scales must be transposed for TMA load, but not for RHS scales - lhs_scales = get_col_major_tma_aligned_tensor(lhs_scales) - assert rhs_scales.is_contiguous() - - # Auto-tuning with compilation - num_sms = get_num_sms() - num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs( - expected_m, n, k, num_groups, num_sms, is_grouped_masked=True) - - # Extra checks for TMA store - if num_groups > 1 and m > block_m: - assert m % block_m == 0, f'For masked grouped GEMM, shape M should be multiple of the block M (current block M: {block_m})' - - block_k = 128 - num_tma_threads = 
128 - num_math_threads_per_group = 128 - - tensor_map_a = make_2d_tma_a_desc(GemmType.GroupedMasked, lhs, m, k, k, block_m, block_k, num_groups) - tensor_map_b = make_2d_tma_b_desc(GemmType.GroupedMasked, rhs, n, k, k, block_n, block_k, num_groups) - tensor_map_d = make_2d_tma_d_desc(GemmType.GroupedMasked, out, m, n, n, block_m, block_n, num_groups, smem_config[1]) - tensor_map_scales_a = make_2d_tma_scales_desc(GemmType.GroupedMasked, lhs_scales, m, k, block_m, block_k, num_groups) - - kwargs = { - # Templated arguments - 'NUM_TMA_THREADS': num_tma_threads, - 'NUM_MATH_THREADS_PER_GROUP': num_math_threads_per_group, - 'M': m, 'N': n, 'K': k, - 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, - 'SWIZZLE_D_MODE': smem_config[1], - 'BLOCK_N_PADDING': smem_config[2], - 'NUM_GROUPS': num_groups, - 'NUM_STAGES': num_stages, - 'NUM_TMA_MULTICAST': tma_multicast_config[0], - 'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1], - 'GEMM_TYPE': GemmType.GroupedMasked, - # Runtime arguments - 'SCALES_B': rhs_scales, - 'GROUPED_LAYOUT': masked_m, - 'NUM_SMS': num_sms, - 'SMEM_SIZE': smem_config[0], - 'TENSOR_MAP_A': tensor_map_a, - 'TENSOR_MAP_B': tensor_map_b, - 'TENSOR_MAP_SCALES_A': tensor_map_scales_a, - 'TENSOR_MAP_D': tensor_map_d, - 'STREAM': torch.cuda.current_stream().cuda_stream, - 'DEVICE_INDEX': out.device.index - } - - # Generate, build and run the kernel - code = FP8GemmRuntime.generate(kwargs) - runtime = build('m_grouped_gemm_fp8_fp8_bf16_nt', code, FP8GemmRuntime, kwargs) - runtime(**kwargs) diff --git a/deep_gemm/jit_kernels/runtime.py b/deep_gemm/jit_kernels/runtime.py index e65e85aa..2254ef26 100644 --- a/deep_gemm/jit_kernels/runtime.py +++ b/deep_gemm/jit_kernels/runtime.py @@ -1,25 +1,9 @@ -import ctypes -import os -import enum import torch import cuda.bindings.driver as cbd from typing import Any, Dict, Tuple -from .utils import get_tma_aligned_size -from ..jit.runtime import Runtime - - -class GemmType(enum.Enum): - Normal = 0 - 
GroupedContiguous = 1 - GroupedMasked = 2 - - def __str__(self) -> str: - return { - 0: 'Normal', - 1: 'GroupedContiguous', - 2: 'GroupedMasked', - }[self.value] +from ..utils.math import ceil_div +from ..utils.layout import get_tma_aligned_size, GemmType, MajorTypeAB, MajorTypeCD tmap_type_map: Dict[Any, str] = { @@ -42,34 +26,29 @@ def __str__(self) -> str: swizzle_type_map = { 0: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE, + 16: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE, 32: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_32B, 64: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_64B, 128: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B, } +def make_tma_xd_desc(t: torch.Tensor, + gmem_dims: Tuple[cbd.cuuint64_t, ...], gmem_strides: Tuple[cbd.cuuint64_t, ...], + smem_dims: Tuple[cbd.cuuint32_t, ...], + swizzle_type: cbd.CUtensorMapSwizzle) -> cbd.CUtensorMap: + num_dims = len(gmem_dims) + assert len(gmem_strides) == num_dims - 1 + assert len(smem_dims) == num_dims -def get_num_math_warpgroups(block_m: int) -> int: - return 1 if block_m == 64 else 2 - - -def get_num_threads_per_sm(num_tma_threads: int, num_math_threads_per_group: int, block_m: int) -> int: - assert num_math_threads_per_group == 128, 'Only support 128 threads per math group' - return get_num_math_warpgroups(block_m) * num_math_threads_per_group + num_tma_threads - - -def make_2d_tma_copy_desc(t: torch.Tensor, - gmem_dims: Tuple[cbd.cuuint64_t, cbd.cuuint64_t], gmem_outer_stride: cbd.cuuint64_t, - smem_dims: Tuple[cbd.cuuint32_t, cbd.cuuint32_t], - swizzle_type: cbd.CUtensorMapSwizzle) -> cbd.CUtensorMap: tensor_dtype = tmap_type_map[t.dtype] res, tensor_map = cbd.cuTensorMapEncodeTiled( tensor_dtype, - 2, + num_dims, t.data_ptr(), gmem_dims, - (gmem_outer_stride,), + gmem_strides, smem_dims, - (cbd.cuuint32_t(1), cbd.cuuint32_t(1)), + (cbd.cuuint32_t(1), ) * num_dims, cbd.CUtensorMapInterleave.CU_TENSOR_MAP_INTERLEAVE_NONE, swizzle_type, 
cbd.CUtensorMapL2promotion.CU_TENSOR_MAP_L2_PROMOTION_L2_256B, @@ -81,238 +60,90 @@ def make_2d_tma_copy_desc(t: torch.Tensor, return tensor_map -def make_2d_tma_desc(t: torch.Tensor, - gmem_inner_dim: int, gmem_outer_dim: int, gmem_outer_stride: int, +def make_tma_2d_desc(t: torch.Tensor, + gmem_inner_dim: int, gmem_outer_dim: int, smem_inner_dim: int, smem_outer_dim: int, - swizzle_type: cbd.CUtensorMapSwizzle = cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B) -> cbd.CUtensorMap: - gmem_dim = (cbd.cuuint64_t(gmem_inner_dim), cbd.cuuint64_t(gmem_outer_dim)) - smem_dim = (cbd.cuuint32_t(smem_inner_dim), cbd.cuuint32_t(smem_outer_dim)) - return make_2d_tma_copy_desc(t, gmem_dim, cbd.cuuint64_t(gmem_outer_stride * t.element_size()), smem_dim, swizzle_type) - - -def make_2d_tma_a_desc(gemm_type: GemmType, t: torch.Tensor, - shape_m: int, shape_k: int, m_stride: int, - block_m: int, block_k: int, - num_groups: int) -> cbd.CUtensorMap: - return make_2d_tma_desc(t, - shape_k, shape_m * (num_groups if gemm_type == GemmType.GroupedMasked else 1), m_stride, - block_k, block_m) - + gmem_outer_stride: int, + swizzle_mode: int) -> cbd.CUtensorMap: + # For swizzling pattern, multiple TMAs are required + if swizzle_mode != 0: + assert swizzle_mode % t.element_size() == 0 + smem_inner_dim = swizzle_mode // t.element_size() + + gmem_dims = (cbd.cuuint64_t(gmem_inner_dim), cbd.cuuint64_t(gmem_outer_dim)) + gmem_strides = (cbd.cuuint64_t(gmem_outer_stride * t.element_size()), ) + smem_dims = (cbd.cuuint32_t(smem_inner_dim), cbd.cuuint32_t(smem_outer_dim)) + return make_tma_xd_desc(t, gmem_dims, gmem_strides, smem_dims, swizzle_type_map[swizzle_mode]) + + +def make_tma_a_desc(major_type: MajorTypeAB, + t: torch.Tensor, + shape_m: int, shape_k: int, + block_m: int, block_k: int, + outer_stride: int, + num_groups: int, + swizzle_mode: int) -> cbd.CUtensorMap: + if num_groups > 1: + assert major_type == MajorTypeAB.KMajor + return make_tma_2d_desc(t, + *(shape_k, shape_m * 
num_groups)[::major_type.shape_direction()], + *(block_k, block_m)[::major_type.shape_direction()], + outer_stride, + swizzle_mode) + + +def make_tma_b_desc(major_type: MajorTypeAB, + t: torch.Tensor, + shape_n: int, shape_k: int, + block_n: int, block_k: int, + outer_stride: int, + num_groups: int, + swizzle_mode: int) -> cbd.CUtensorMap: + # `num_groups` is always applied into the outer dimensions + io_shapes = (shape_k, shape_n)[::major_type.shape_direction()] + io_shapes = (io_shapes[0], io_shapes[1] * num_groups) + + return make_tma_2d_desc(t, + *io_shapes, + *(block_k, block_n)[::major_type.shape_direction()], + outer_stride, + swizzle_mode) + + +def make_tma_cd_desc(major_type: MajorTypeCD, + t: torch.Tensor, + shape_m: int, shape_n: int, + block_m: int, block_n: int, + outer_stride: int, + num_groups: int, + swizzle_mode: int) -> cbd.CUtensorMap: + assert major_type == MajorTypeCD.NMajor + + # Swizzling requires the inner box dim to be less or equal than `kSwizzleCDMode` + # bytes, so `BLOCK_N * sizeof(T) / kSwizzleCDMode` TMA stores are required + layout_ad_m = 128 + return make_tma_2d_desc(t, + shape_n, shape_m * num_groups, + block_n, min(block_m, layout_ad_m), + outer_stride, + swizzle_mode) + + +def make_tma_sf_desc(major_type: MajorTypeAB, + t: torch.Tensor, + shape_mn: int, shape_k: int, + block_mn: int, block_k: int, + num_groups: int, + swizzle_mode: int) -> cbd.CUtensorMap: + assert major_type == MajorTypeAB.MNMajor + + # TODO: maybe swizzle SF as well + assert swizzle_mode == 0 -def make_2d_tma_b_desc(gemm_type: GemmType, t: torch.Tensor, - shape_n: int, shape_k: int, n_stride: int, - block_n: int, block_k: int, - num_groups: int) -> cbd.CUtensorMap: - return make_2d_tma_desc(t, - shape_k, shape_n * (num_groups if gemm_type != GemmType.Normal else 1), n_stride, - block_k, block_n) - - -def make_2d_tma_d_desc(gemm_type: GemmType, t: torch.Tensor, - shape_m: int, shape_n: int, m_stride: int, - block_m: int, block_n: int, - num_groups: int, - 
swizzle_mode: int) -> cbd.CUtensorMap: - # Swizzling requires the inner box dim to be less or equal than `kSwizzleDMode` - # bytes, so `BLOCK_N * sizeof(T) / kSwizzleDMode` TMA stores are required - return make_2d_tma_desc(t, - shape_n, shape_m * (num_groups if gemm_type == GemmType.GroupedMasked else 1), m_stride, - block_n if swizzle_mode == 0 else swizzle_mode // t.element_size(), block_m, - swizzle_type_map[swizzle_mode]) - - -def make_2d_tma_scales_desc(gemm_type: GemmType, t: torch.Tensor, - shape_mn: int, shape_k: int, - block_mn: int, block_k: int, - num_groups: int) -> cbd.CUtensorMap: # Make TMA aligned to 16 bytes shape_mn = get_tma_aligned_size(shape_mn, t.element_size()) - return make_2d_tma_desc(t, - shape_mn, (shape_k + block_k - 1) // block_k * (num_groups if gemm_type == GemmType.GroupedMasked else 1), shape_mn, + return make_tma_2d_desc(t, + shape_mn, ceil_div(shape_k, block_k * 4) * num_groups, block_mn, 1, - cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE) - - -class FP8GemmRuntime(Runtime): - def __init__(self, path: str) -> None: - super().__init__(path) - - @staticmethod - def generate(kwargs: Dict[str, Any]) -> str: - code = f''' -#ifdef __CUDACC_RTC__ -#include -#else -#include -#include -#endif - -#include -#include - -#include - -using namespace deep_gemm; - -static void __instantiate_kernel() {{ - auto ptr = reinterpret_cast(&fp8_gemm_kernel< - {kwargs['N']}, - {kwargs['K']}, - {kwargs['BLOCK_M']}, - {kwargs['BLOCK_N']}, - {kwargs['BLOCK_K']}, - {kwargs['BLOCK_N_PADDING']}, - {kwargs['SWIZZLE_D_MODE']}, - {kwargs['NUM_GROUPS']}, - {kwargs['NUM_STAGES']}, - {kwargs['NUM_TMA_THREADS']}, - {kwargs['NUM_MATH_THREADS_PER_GROUP']}, - {kwargs['NUM_TMA_MULTICAST']}, - {'true' if kwargs['IS_TMA_MULTICAST_ON_A'] else 'false'}, - GemmType::{kwargs['GEMM_TYPE']} - >); -}}; -''' - if int(os.getenv('DG_JIT_DEBUG', 0)): - print(f'Generated FP8 GEMM code:\n{code}') - return code - - # noinspection PyMethodOverriding - @staticmethod - def 
launch(kernel: cbd.CUkernel, kwargs: Dict[str, Any]) -> cbd.CUresult: - num_tma_threads = 128 - num_math_threads_per_group = 128 - - result = cbd.cuKernelSetAttribute(cbd.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - kwargs['SMEM_SIZE'], kernel, cbd.CUdevice(kwargs['DEVICE_INDEX']))[0] - assert result == cbd.CUresult.CUDA_SUCCESS, f'Failed to set max dynamic shared memory size: {result}' - - attr_val = cbd.CUlaunchAttributeValue() - attr_val.clusterDim.x = kwargs['NUM_TMA_MULTICAST'] - attr_val.clusterDim.y = 1 - attr_val.clusterDim.z = 1 - attr = cbd.CUlaunchAttribute() - attr.id = cbd.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION - attr.value = attr_val - - config = cbd.CUlaunchConfig() - config.numAttrs = 1 - config.attrs = [attr] - config.gridDimX = kwargs['NUM_SMS'] - config.gridDimY = 1 - config.gridDimZ = 1 - config.blockDimX = get_num_threads_per_sm(num_tma_threads, num_math_threads_per_group, kwargs['BLOCK_M']) - config.blockDimY = 1 - config.blockDimZ = 1 - config.sharedMemBytes = kwargs['SMEM_SIZE'] - config.hStream = kwargs['STREAM'] - - arg_values = ( - kwargs['SCALES_B'].data_ptr(), - kwargs['GROUPED_LAYOUT'].data_ptr(), - kwargs['M'], - kwargs['TENSOR_MAP_A'], - kwargs['TENSOR_MAP_B'], - kwargs['TENSOR_MAP_SCALES_A'], - kwargs['TENSOR_MAP_D'], - ) - arg_types = ( - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_uint32, - None, - None, - None, - None, - ) - return cbd.cuLaunchKernelEx(config, kernel, (arg_values, arg_types), 0) - - -class FP8WGradGemmRuntime(Runtime): - def __init__(self, path: str) -> None: - super().__init__(path) - - @staticmethod - def generate(kwargs: Dict[str, Any]) -> str: - code = f''' -#ifdef __CUDACC_RTC__ -#include -#else -#include -#include -#endif - -#include -#include - -#include - -using namespace deep_gemm; - -static void __instantiate_kernel() {{ - auto ptr = reinterpret_cast(&fp8_wgrad_gemm_kernel< - {kwargs['M']}, - {kwargs['N']}, - {kwargs['BLOCK_M']}, - {kwargs['BLOCK_N']}, - 
{kwargs['BLOCK_K']}, - {kwargs['NUM_STAGES']}, - {kwargs['NUM_LAST_STAGES']}, - {kwargs['NUM_TMA_THREADS']}, - {kwargs['NUM_MATH_THREADS_PER_GROUP']}, - {kwargs['NUM_TMA_MULTICAST']}, - {'true' if kwargs['IS_TMA_MULTICAST_ON_A'] else 'false'} - >); -}}; -''' - if int(os.getenv('DG_JIT_DEBUG', 0)): - print(f'Generated FP8 WGrad GEMM code:\n{code}') - return code - - # noinspection PyMethodOverriding - @staticmethod - def launch(kernel: cbd.CUkernel, kwargs: Dict[str, Any]) -> cbd.CUresult: - num_tma_threads = 128 - num_math_threads_per_group = 128 - - result = cbd.cuKernelSetAttribute(cbd.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - kwargs['SMEM_SIZE'], kernel, cbd.CUdevice(kwargs['DEVICE_INDEX']))[0] - assert result == cbd.CUresult.CUDA_SUCCESS, f'Failed to set max dynamic shared memory size: {result}' - - attr_val = cbd.CUlaunchAttributeValue() - attr_val.clusterDim.x = kwargs['NUM_TMA_MULTICAST'] - attr_val.clusterDim.y = 1 - attr_val.clusterDim.z = 1 - attr = cbd.CUlaunchAttribute() - attr.id = cbd.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION - attr.value = attr_val - - config = cbd.CUlaunchConfig() - config.numAttrs = 1 - config.attrs = [attr] - config.gridDimX = kwargs['NUM_SMS'] - config.gridDimY = 1 - config.gridDimZ = 1 - config.blockDimX = get_num_threads_per_sm(num_tma_threads, num_math_threads_per_group, kwargs['BLOCK_M']) - config.blockDimY = 1 - config.blockDimZ = 1 - config.sharedMemBytes = kwargs['SMEM_SIZE'] - config.hStream = kwargs['STREAM'] - - arg_values = ( - kwargs['K'], - kwargs['TENSOR_MAP_A'], - kwargs['TENSOR_MAP_B'], - kwargs['TENSOR_MAP_SCALES_A'], - kwargs['TENSOR_MAP_SCALES_B'], - kwargs['TENSOR_MAP_D'], - ) - arg_types = ( - ctypes.c_uint32, - None, - None, - None, - None, - None, - ) - return cbd.cuLaunchKernelEx(config, kernel, (arg_values, arg_types), 0) + shape_mn, + swizzle_mode) diff --git a/deep_gemm/jit_kernels/utils.py b/deep_gemm/jit_kernels/utils.py deleted file mode 100644 index 
c6da56b0..00000000 --- a/deep_gemm/jit_kernels/utils.py +++ /dev/null @@ -1,109 +0,0 @@ -import torch - -_num_sms = None - - -def set_num_sms(num_sms: int) -> None: - """ - Set the maximum SM count for all GEMM kernels to use. - - Arguments: - num_sms: the desired maximum SM count for all GEMM kernels to use. - """ - global _num_sms - assert 0 < num_sms <= torch.cuda.get_device_properties(device='cuda').multi_processor_count - _num_sms = num_sms - - -def get_num_sms() -> int: - """ - Get the current maximum limit of SM count for all GEMM kernels to use. - If the count is never specified, the function will return the number of device SMs. - - Returns: - Current maximum limit of SM count for all GEMM kernels to use. - """ - global _num_sms - if _num_sms is None: - _num_sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count - return _num_sms - - -def ceil_div(x: int, y: int) -> int: - """ - Perform ceiling division of two integers. - - Args: - x: the dividend. - y: the divisor. - - Returns: - The result of the ceiling division. - """ - return (x + y - 1) // y - - -def get_m_alignment_for_contiguous_layout(): - """ - When we do a grouped GEMM in contiguous format, LHS are grouped into several batches along the M axis. - Since we deal with exactly one sub-matrix of RHS for each GEMM block, batch sizes above should align well - with GEMM block shape. - - Returns: - Group-level alignment requirement for grouped contiguous layout, which is always 128. - """ - return 128 - - -def get_tma_aligned_size(x: int, element_size: int) -> int: - """ - Global memory address of TMA must be 16-byte aligned. - Since we use column-major layout for the LHS scaling tensor, - the M-axis of the LHS scaling tensor needs to be padded to a multiple of 16 bytes. - - Arguments: - x: original M-axis shape of the LHS scaling tensor. - element_size: element size of the LHS scaling tensor. - - Returns: - M-axis shape of the LHS scaling tensor after padding. 
- """ - tma_alignment_bytes = 16 - assert tma_alignment_bytes % element_size == 0 - alignment = tma_alignment_bytes // element_size - return ceil_div(x, alignment) * alignment - - -def get_col_major_tma_aligned_tensor(x: torch.Tensor) -> torch.Tensor: - """ - Returns TMA-aligned transposed format of the input tensor. `torch.transpose` will be called if necessary. - If the input tensor is already column-major layout and 16-byte aligned along the M axis - (thus meets the requirement of LHS scaling tensor in DeepGEMM), this function will do nothing. - - Arguments: - x: usually the LHS scaling tensor in GEMM. - - Returns: - The LHS scaling tensor of TMA-aligned transposed format. - """ - # NOTES: for the extreme performance, you may rewrite/fuse this function in CUDA - assert x.dim() in (2, 3) - remove_dim = False - m, n = x.shape[-2], x.shape[-1] - aligned_m = get_tma_aligned_size(m, x.element_size()) - if x.dim() == 2: - if x.stride(0) == 1 and x.stride(1) == aligned_m: - return x - x, remove_dim = x.unsqueeze(0), True - - b = x.shape[0] - - # The last kernel gives a column-major TMA aligned layout - if x.stride(0) == aligned_m * n and x.stride(1) == 1 and x.stride(2) == aligned_m: - return x.squeeze(0) if remove_dim else x - - # Normal layout requires transposing - aligned_x = torch.transpose(torch.empty((b, n, aligned_m), device=x.device, dtype=x.dtype), 1, 2) - aligned_x[:, :m, :] = x - aligned_x = aligned_x[:, :m, :] - return aligned_x.squeeze(0) if remove_dim else aligned_x diff --git a/deep_gemm/jit_kernels/wgrad_gemm.py b/deep_gemm/jit_kernels/wgrad_gemm.py deleted file mode 100644 index 00b8cd10..00000000 --- a/deep_gemm/jit_kernels/wgrad_gemm.py +++ /dev/null @@ -1,158 +0,0 @@ -import torch -from typing import List, Tuple - -from ..jit import build -from .runtime import ( - FP8WGradGemmRuntime, GemmType, - make_2d_tma_a_desc, make_2d_tma_b_desc, - make_2d_tma_d_desc, make_2d_tma_scales_desc) -from .gemm import get_best_configs -from .utils import ceil_div, 
get_num_sms, get_col_major_tma_aligned_tensor, get_tma_aligned_size - - -def wgrad_gemm_fp8_fp8_fp32_nt(lhs: Tuple[torch.Tensor, torch.Tensor], - rhs: Tuple[torch.Tensor, torch.Tensor], - out: torch.Tensor): - """ - Perform a weight gradient GEMM with FP8 inputs and FP32 output, with 1x128 LHS scaling and 1x128 RHS scaling. - Results will be accumulated into the output tensor. - - Requirements: - LHS, RHS, and output tensors must be contiguous in dimension 1, i.e., stride(1) = 1. - The stride(0) of LHS and RHS must be a multiple of 16, and the stride(0) of output must be a multiple of 4. - RHS and RHS scaling factors are required to be transposed. - The LHS scaling and RHS scaling tensor require a TMA-aligned transposed format. - If your input does not match the requirement, this function will do a transposing with a set of slow PyTorch operations. - - Arguments: - lhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[m, k]`, - the second element is an FP32 1x128 scaling tensor for LHS of shape `[m, ⌈k / 128⌉]`. - rhs: the first element is an FP8 tensor (typed `torch.float8_e4m3fn`) of shape `[n, k]`, - the second element is an FP32 1x128 scaling tensor for RHS of shape `[n, ⌈k / 128⌉]`. - out: the FP32 output tensor of shape `[m, n]`, which will be accumulated. 
- """ - lhs, lhs_scales = lhs - rhs, rhs_scales = rhs - m, k = lhs.shape - n, k_ = rhs.shape - m_, n_ = out.shape - - # Type and shape checks - assert m == m_ and n == n_ and k == k_ - assert n > 0 and m > 0 - assert lhs_scales.shape == (m, ceil_div(k, 128)) or lhs_scales.shape == (ceil_div(k, 128), m) - assert rhs_scales.shape == (n, ceil_div(k, 128)) or rhs_scales.shape == (ceil_div(k, 128), n) - assert lhs.dtype == torch.float8_e4m3fn and lhs_scales.dtype == torch.float32 - assert rhs.dtype == torch.float8_e4m3fn and rhs_scales.dtype == torch.float32 - assert out.dtype == torch.float - assert lhs.stride(1) == 1 and out.stride(1) == 1 and rhs.stride(1) == 1 - - # LHS and RHS scales must be transposed for TMA load - # NOTES: `get_col_major_tma_aligned_tensor` may launch a kernel if not processed by previous kernels - def get_valid_scales(scales: torch.Tensor, mn: int): - if scales.shape == (ceil_div(k, 128), mn): - # For k-grouped GEMMs - scales = scales.permute(1, 0) - assert get_tma_aligned_size(mn, 4) == scales.stride(1) == mn - else: - scales = get_col_major_tma_aligned_tensor(scales) - return scales - - lhs_scales = get_valid_scales(lhs_scales, m) - rhs_scales = get_valid_scales(rhs_scales, n) - - # Do nothing if `k` is zero - if k == 0: - return - - # K must be aligned to 128 - aligned_k = ceil_div(k, 128) * 128 - - # Auto-tuning with compilation - num_sms = get_num_sms() - num_sms, block_m, block_n, num_stages, tma_multicast_config, smem_config = get_best_configs( - m, n, aligned_k, 1, num_sms, is_fp32_out=True, is_wgrad=True) - num_last_stages = ceil_div(k, 128) % num_stages - block_k = 128 - num_tma_threads = 128 - num_math_threads_per_group = 128 - - tensor_map_a = make_2d_tma_a_desc(GemmType.Normal, lhs, m, k, lhs.stride(0), block_m, block_k, 1) - tensor_map_b = make_2d_tma_b_desc(GemmType.Normal, rhs, n, k, rhs.stride(0), block_n, block_k, 1) - tensor_map_d = make_2d_tma_d_desc(GemmType.Normal, out, m, n, out.stride(0), block_m, block_n, 1, 
smem_config[1]) - tensor_map_scales_a = make_2d_tma_scales_desc(GemmType.Normal, lhs_scales, m, k, block_m, block_k, 1) - tensor_map_scales_b = make_2d_tma_scales_desc(GemmType.Normal, rhs_scales, n, k, block_n, block_k, 1) - - kwargs = { - # Templated arguments - 'GEMM_TYPE': GemmType.Normal, - 'NUM_TMA_THREADS': num_tma_threads, - 'NUM_MATH_THREADS_PER_GROUP': num_math_threads_per_group, - 'M': m, 'N': n, 'K': aligned_k, - 'NUM_GROUPS': 1, - 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, - 'NUM_STAGES': num_stages, - 'NUM_LAST_STAGES': num_last_stages, - 'NUM_TMA_MULTICAST': tma_multicast_config[0], - 'IS_TMA_MULTICAST_ON_A': tma_multicast_config[1], - # Runtime arguments - 'NUM_SMS': num_sms, - 'SMEM_SIZE': smem_config[0], - 'TENSOR_MAP_A': tensor_map_a, - 'TENSOR_MAP_B': tensor_map_b, - 'TENSOR_MAP_SCALES_A': tensor_map_scales_a, - 'TENSOR_MAP_SCALES_B': tensor_map_scales_b, - 'TENSOR_MAP_D': tensor_map_d, - 'STREAM': torch.cuda.current_stream().cuda_stream, - 'DEVICE_INDEX': out.device.index - } - - # Generate, build and run the kernel - code = FP8WGradGemmRuntime.generate(kwargs) - runtime = build('wgrad_gemm_fp8_fp8_fp32_nt', code, FP8WGradGemmRuntime, kwargs) - runtime(**kwargs) - - -def k_grouped_wgrad_gemm_fp8_fp8_fp32_nt(lhs: Tuple[torch.Tensor, torch.Tensor], - rhs: Tuple[torch.Tensor, torch.Tensor], - out: torch.Tensor, - batch_sizes: List[int]): - """ - Perform a k-grouped weight gradient GEMM with FP8 inputs and FP32 output, with 1x128 LHS scaling and 1x128 RHS scaling. - Results will be accumulated into the output tensor. - - Requirements: - This function handles multiple batches with varying k-dimensions, processing each batch sequentially. - Each batch's LHS, RHS, and output tensors must be contiguous. - The RHS and RHS scaling factors are required to be transposed. - The LHS scaling and RHS scaling tensors require a TMA-aligned transposed format. 
- - Arguments: - lhs: The first element is a flattened FP8 tensor (typed `torch.float8_e4m3fn`) containing all batches of LHS data, - and the flattened shape is `[sum(m * k for k in batch_sizes)]`, where m is the number of rows. - The second element is an FP32 scaling tensor for LHS with shape `[⌈k / 128⌉ for k in batch_sizes), m]`, - representing the per-128-channel scaling factors. - rhs: The first element is a flattened FP8 tensor (typed `torch.float8_e4m3fn`) containing all batches of RHS data, - and the flattened shape is `[sum(n * k for k in batch_sizes)]`, where n is the number of rows. - The second element is an FP32 scaling tensor for RHS with shape `[⌈k / 128⌉ for k in batch_sizes), n]`, - representing the per-128-channel scaling factors. - out: The FP32 output tensor of shape [num_batches, m, n], which will be accumulated. - batch_sizes: A list of integers specifying the k-dimension for each batch. - """ - lhs, lhs_scales = lhs[0].view(-1), lhs[1] - rhs, rhs_scales = rhs[0].view(-1), rhs[1] - num_batches, m, n = out.shape - - lhs_offset, rhs_offset, scales_offset = 0, 0, 0 - - for i in range(num_batches): - k = batch_sizes[i] - lhs_slice = lhs[lhs_offset:lhs_offset + m * k].view(m, k) - rhs_slice = rhs[rhs_offset:rhs_offset + n * k].view(n, k) - lhs_scales_slice = lhs_scales[scales_offset:scales_offset + ceil_div(k, 128)] - rhs_scales_slice = rhs_scales[scales_offset:scales_offset + ceil_div(k, 128)] - wgrad_gemm_fp8_fp8_fp32_nt((lhs_slice, lhs_scales_slice), (rhs_slice, rhs_scales_slice), out[i]) - - lhs_offset += m * k - rhs_offset += n * k - scales_offset += ceil_div(k, 128) diff --git a/deep_gemm/testing/__init__.py b/deep_gemm/testing/__init__.py new file mode 100644 index 00000000..8abc1d91 --- /dev/null +++ b/deep_gemm/testing/__init__.py @@ -0,0 +1 @@ +from . 
import bench, numeric diff --git a/deep_gemm/utils.py b/deep_gemm/testing/bench.py similarity index 79% rename from deep_gemm/utils.py rename to deep_gemm/testing/bench.py index 55a9affa..5be63f9d 100644 --- a/deep_gemm/utils.py +++ b/deep_gemm/testing/bench.py @@ -1,8 +1,6 @@ import os import sys -import time import torch -import torch.distributed as dist def bench(fn, num_warmups: int = 5, num_tests: int = 10, @@ -77,8 +75,9 @@ def __exit__(self, *_): self.errnull_file.close() -def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: bool = False, - trace_path: str = None, barrier_comm_profiling: bool = False, flush_l2: bool = True, +def bench_kineto(fn, kernel_names, num_tests: int = 30, + suppress_kineto_output: bool = False, + trace_path: str = None, flush_l2: bool = True, with_multiple_kernels: bool = False): # Conflict with Nsight Systems using_nsys = int(os.environ.get('DG_NSYS_PROFILING', 0)) @@ -96,12 +95,6 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: profiler = torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) if not using_nsys else empty_suppress() with profiler: for i in range(2): - # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead - if barrier_comm_profiling: - lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda') - rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda') - lhs @ rhs - dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda')) for _ in range(num_tests): if flush_l2: torch.empty(flush_l2_size, dtype=torch.int, device='cuda').zero_() @@ -116,7 +109,7 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: # Parse the profiling table assert isinstance(kernel_names, str) or isinstance(kernel_names, tuple) - is_tupled = isinstance(kernel_names, tuple) + is_tuple = isinstance(kernel_names, tuple) prof_lines = 
profiler.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n') kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names assert all([isinstance(name, str) for name in kernel_names]) @@ -145,21 +138,4 @@ def bench_kineto(fn, kernel_names, num_tests: int = 30, suppress_kineto_output: break kernel_times.append(total_time / total_num) - return tuple(kernel_times) if is_tupled else kernel_times[0] - - -def calc_diff(x, y): - x, y = x.double(), y.double() - denominator = (x * x + y * y).sum() - sim = 2 * (x * y).sum() / denominator - return 1 - sim - - -def count_bytes(tensors): - total = 0 - for t in tensors: - if isinstance(t, tuple): - total += count_bytes(t) - else: - total += t.numel() * t.element_size() - return total + return tuple(kernel_times) if is_tuple else kernel_times[0] diff --git a/deep_gemm/testing/numeric.py b/deep_gemm/testing/numeric.py new file mode 100644 index 00000000..b7a026ab --- /dev/null +++ b/deep_gemm/testing/numeric.py @@ -0,0 +1,19 @@ +import torch +from typing import Iterable + + +def calc_diff(x: torch.Tensor, y: torch.Tensor): + x, y = x.double(), y.double() + denominator = (x * x + y * y).sum() + sim = 2 * (x * y).sum() / denominator + return 1 - sim + + +def count_bytes(tensors: Iterable[torch.Tensor]): + total = 0 + for t in tensors: + if isinstance(t, tuple) or isinstance(t, list): + total += count_bytes(t) + elif t is not None: + total += t.numel() * t.element_size() + return total diff --git a/deep_gemm/utils/__init__.py b/deep_gemm/utils/__init__.py new file mode 100644 index 00000000..e0654f06 --- /dev/null +++ b/deep_gemm/utils/__init__.py @@ -0,0 +1 @@ +from . 
import layout, math diff --git a/deep_gemm/utils/layout.py b/deep_gemm/utils/layout.py new file mode 100644 index 00000000..2e03fa07 --- /dev/null +++ b/deep_gemm/utils/layout.py @@ -0,0 +1,170 @@ +import enum +import torch +from typing import Tuple, Optional + +from .math import align, ceil_div +from ..jit.compiler import get_device_arch + + +class GemmType(enum.Enum): + Normal = 0 + GroupedContiguous = 1 + GroupedMasked = 2 + + def __str__(self) -> str: + return { + 0: 'GemmType::Normal', + 1: 'GemmType::GroupedContiguous', + 2: 'GemmType::GroupedMasked', + }[self.value] + + +class MajorTypeAB(enum.Enum): + KMajor = 0 + MNMajor = 1 + + def shape_direction(self): + return 1 if self.value == 0 else -1 + + def non_contiguous_dim(self): + return -2 if self.value == 0 else -1 + + def __str__(self) -> str: + return { + 0: 'cute::UMMA::Major::K', + 1: 'cute::UMMA::Major::MN' + }[self.value] + + +class MajorTypeCD(enum.Enum): + NMajor = 0 + MMajor = 1 + + def non_contiguous_dim(self): + return -2 if self.value == 0 else -1 + + +def major_check(t: torch.Tensor): + assert t.dim() in (2, 3) + if t.dim() == 3: + assert t.stride(0) == t.size(-2) * t.size(-1), 'Grouped dimension cannot have abnormal stride' + assert t.stride(-2) == 1 or t.stride(-1) == 1 + + +def get_major_type_ab(t: torch.Tensor): + major_check(t) + return MajorTypeAB.KMajor if t.stride(-1) == 1 else MajorTypeAB.MNMajor + + +def get_major_type_cd(t: torch.Tensor): + major_check(t) + return MajorTypeCD.NMajor if t.stride(-1) == 1 else MajorTypeCD.MMajor + + +def get_element_size(dtype: torch.dtype): + return { + torch.float8_e4m3fn: 1, + torch.bfloat16: 2, + torch.float: 4, + }[dtype] + + +def get_m_alignment_for_contiguous_layout(): + return 128 + + +def get_tma_aligned_size(x: int, element_size: int) -> int: + tma_alignment_bytes = 16 + assert tma_alignment_bytes % element_size == 0 + alignment = tma_alignment_bytes // element_size + return align(x, alignment) + + +def 
get_col_major_tma_aligned_packed_tensor(x: torch.Tensor) -> torch.Tensor: + # NOTES: for the extreme performance, you may rewrite/fuse this function in CUDA + assert x.dtype == torch.float and x.dim() in (2, 3) + + # First, convert into UE8M0 `uint8_t` + ue8m0_tensor = (x.view(torch.int) >> 23).to(torch.uint8) + + # Second, make padded packed tensors + mn, k = x.shape[-2], x.shape[-1] + remove_dim = False + if x.dim() == 2: + x, remove_dim = x.unsqueeze(0), True + b = x.shape[0] + aligned_mn = get_tma_aligned_size(mn, 4) + aligned_k = align(k, 4) + padded = torch.zeros((b, aligned_mn, aligned_k), device=x.device, dtype=torch.uint8) + padded[:, :mn, :k] = ue8m0_tensor + padded = padded.view(-1).view(dtype=torch.int).view(b, aligned_mn, aligned_k // 4) + + # Finally, transpose + transposed = torch.transpose(torch.empty((b, aligned_k // 4, aligned_mn), device=x.device, dtype=torch.int), 1, 2) + transposed[:, :, :] = padded + aligned_x = transposed[:, :mn, :] + return aligned_x.squeeze(0) if remove_dim else aligned_x + + +def check_sf_layout(sf: torch.Tensor, + mn: int, k: int, gran: Tuple[int, int], + num_groups: Optional[int], + tma_stride_check: bool = False, + type_check: Optional[torch.dtype] = None) -> torch.Tensor: + # Type check + if type_check is not None: + assert sf.dtype == type_check + + # Always do shape checks + assert sf.dtype in (torch.float, torch.int) + assert sf.dim() == int(num_groups is not None) + 2 + if num_groups is not None: + assert sf.size(-3) == num_groups + assert sf.size(-2) == ceil_div(mn, gran[0]) + assert sf.size(-1) == ceil_div(k, gran[1] * (1 if sf.dtype == torch.float else 4)) + + # TMA stride checks: TMA aligned and MN-major + if tma_stride_check: + if num_groups is not None: + assert sf.stride(-3) == sf.stride(-1) * sf.size(-1) + assert sf.stride(-2) == 1 + assert sf.stride(-1) == get_tma_aligned_size(mn, sf.element_size()) + + return sf + + +def transform_sf_into_required_layout(sf: torch.Tensor, + mn: int, k: int, + recipe: 
Tuple[int, int, int], + num_groups: Optional[int] = None, + is_sfa: bool = False): + gran = (recipe[0 if is_sfa else 1], recipe[2]) + + # Pre-transform checks + check_sf_layout(sf, mn=mn, k=k, gran=gran, num_groups=num_groups) + + # (FP32, 1, 128) on Hopper: transform to TMA-aligned and MN-major + if sf.dtype == torch.float and gran == (1, 128) and get_device_arch() == '90a': + raise NotImplementedError + + # (FP32, 1, 128) on SM100: transform to (INT, 1, 128), TMA-aligned and MN-major + if sf.dtype == torch.float and gran == (1, 128) and get_device_arch() == '100a': + sf = get_col_major_tma_aligned_packed_tensor(sf) + return check_sf_layout(sf, mn=mn, k=k, gran=(1, 128), num_groups=num_groups, tma_stride_check=True, type_check=torch.int) + + # (FP32, 128, 128) on Hopper: no need to transform, check shape and whatever-major + if sf.dtype == torch.float and gran == (128, 128) and get_device_arch() == '90a': + raise NotImplementedError + + # (FP32, 128, 128) on SM100: transform to (INT, 1, 128), TMA-aligned and MN-major + if sf.dtype == torch.float and gran == (128, 128) and get_device_arch() == '100a': + sf = sf.index_select(-2, torch.arange(mn, device=sf.device) // 128) + sf = get_col_major_tma_aligned_packed_tensor(sf) + return check_sf_layout(sf, mn=mn, k=k, gran=(1, 128), num_groups=num_groups, tma_stride_check=True, type_check=torch.int) + + # (INT, 1, 128) on SM100: transform to TMA-aligned and MN-major + if sf.dtype == torch.int and gran == (1, 128) and get_device_arch() == '100a': + # TODO: add transpose kernel if SF layout is not satisfied + return check_sf_layout(sf, mn=mn, k=k, gran=(1, 128), num_groups=num_groups, tma_stride_check=True, type_check=torch.int) + + assert False, f'Unknown cases: {sf.dtype=}, {gran=}, arch={get_device_arch()}' diff --git a/deep_gemm/utils/math.py b/deep_gemm/utils/math.py new file mode 100644 index 00000000..02684f81 --- /dev/null +++ b/deep_gemm/utils/math.py @@ -0,0 +1,46 @@ +import torch +from typing import Tuple + + +def
ceil_div(x: int, y: int) -> int: + """ + Perform ceiling division of two integers. + + Args: + x: the dividend. + y: the divisor. + + Returns: + The result of the ceiling division. + """ + return (x + y - 1) // y + + +def align(x: int, y: int) -> int: + return ceil_div(x, y) * y + + +def ceil_to_ue8m0(x: torch.Tensor): + assert x.view(-1).amax().item() > 0 + return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) + + +def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 and x.size(1) % 128 == 0 + m, n = x.shape + x_view = x.view(m, -1, 128) + x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) + sf = ceil_to_ue8m0(x_amax / 448.0) + return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf + + +def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 + m, n = x.shape + x_padded = torch.zeros((align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device) + x_padded[:m, :n] = x + x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) + x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) + sf = ceil_to_ue8m0(x_amax / 448.0) + x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(x_view.size(0), x_view.size(2)) diff --git a/indexing/main.cu b/indexing/main.cu index 5b15256a..674a6831 100644 --- a/indexing/main.cu +++ b/indexing/main.cu @@ -1,5 +1,8 @@ -#include "deep_gemm/fp8_gemm.cuh" -#include "deep_gemm/fp8_wgrad_gemm.cuh" +#include +#include +#include +#include +#include using namespace deep_gemm; diff --git a/install.sh b/install.sh new file mode 100755 index 00000000..4bd90a9b --- /dev/null +++ b/install.sh @@ -0,0 +1,12 @@ +# Change current directory into project root +original_dir=$(pwd) +script_dir=$(dirname "$0") +cd "$script_dir" + +# Remove old dist file, build, and install +rm -rf dist +python setup.py bdist_wheel +pip install 
dist/*.whl + +# Open users' original directory +cd "$original_dir" diff --git a/setup.py b/setup.py index b39efd03..52add160 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ import setuptools import shutil import subprocess +from setuptools import find_packages from setuptools.command.build_py import build_py from setuptools.command.develop import develop @@ -15,7 +16,6 @@ class PostDevelopCommand(develop): def run(self): - develop.run(self) self.make_jit_include_symlinks() @staticmethod @@ -37,9 +37,21 @@ def run(self): # First, prepare the include directories self.prepare_includes() - # Then run the regular build + # Second, make clusters' cache setting default into `envs.py` + self.generate_default_envs() + + # Finally, run the regular build build_py.run(self) + def generate_default_envs(self): + code = '# Pre-installed environment variables\n' + code += 'persistent_envs = dict()\n' + for name in ('DG_JIT_CACHE_HOME_DIR', 'DG_JIT_CACHE_SHARED_USERS'): + code += f"persistent_envs['{name}'] = '{os.environ[name]}'\n" if name in os.environ else '' + + with open(os.path.join(self.build_lib, 'deep_gemm', 'envs.py'), 'w') as f: + f.write(code) + def prepare_includes(self): # Create temporary build directory instead of modifying package directory build_include_dir = os.path.join(self.build_lib, 'deep_gemm/include') @@ -69,15 +81,16 @@ def prepare_includes(self): setuptools.setup( name='deep_gemm', - version='1.0.0' + revision, - packages=['deep_gemm', 'deep_gemm/jit', 'deep_gemm/jit_kernels'], + version='1.1.0' + revision, + packages=find_packages('.'), package_data={ 'deep_gemm': [ - 'include/deep_gemm/*', + 'include/deep_gemm/**/*', 'include/cute/**/*', 'include/cutlass/**/*', ] }, + zip_safe=False, cmdclass={ 'develop': PostDevelopCommand, 'build_py': CustomBuildPy, diff --git a/tests/generators.py b/tests/generators.py new file mode 100644 index 00000000..8f0484ac --- /dev/null +++ b/tests/generators.py @@ -0,0 +1,87 @@ +import random +import torch +from typing 
import Tuple + +from deep_gemm.utils.math import align, ceil_div, per_token_cast_to_fp8, per_block_cast_to_fp8 +from deep_gemm.utils.layout import MajorTypeAB, get_m_alignment_for_contiguous_layout + + +def enumerate_normal(): + for m in (128, 4096): + for k, n in [(7168, 2112), (1536, 24576), (512, 32768), (16384, 7168), (7168, 4096), (2048, 7168)]: + for major_a, major_b in ((MajorTypeAB.KMajor, MajorTypeAB.KMajor), (MajorTypeAB.KMajor, MajorTypeAB.MNMajor), + (MajorTypeAB.MNMajor, MajorTypeAB.KMajor), (MajorTypeAB.MNMajor, MajorTypeAB.MNMajor)): + for out_dtype in (torch.bfloat16, torch.float): + for accumulate in (False, ) if out_dtype == torch.bfloat16 else (False, True): + yield m, k, n, major_a, major_b, accumulate, out_dtype + + +def enumerate_grouped_contiguous(): + for num_groups, expected_m_per_group, k, n in ((4, 8192, 7168, 4096), (4, 8192, 2048, 7168), (8, 4096, 7168, 4096), (8, 4096, 2048, 7168)): + for major_a, major_b in ((MajorTypeAB.KMajor, MajorTypeAB.KMajor), (MajorTypeAB.KMajor, MajorTypeAB.MNMajor)): + yield num_groups, expected_m_per_group, k, n, major_a, major_b + + +def enumerate_grouped_masked(): + for num_groups, m in ((1, 1024), (2, 512), (4, 256)): + for k, n in ((7168, 4096), (2048, 7168), ): + yield num_groups, m, k, n + + +def generate_normal(m: int, k: int, n: int, + major_a: MajorTypeAB, major_b: MajorTypeAB, + accumulate: bool, out_dtype: torch.dtype): + a = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) + b = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) + c = torch.randn((m, n), device='cuda', dtype=out_dtype) * 64 if accumulate else None + d = torch.empty((m, n), device='cuda', dtype=out_dtype) + ref_d = (a.float() @ b.float().t() + (c if accumulate else 0)).to(out_dtype) + + a_fp8, b_fp8 = per_token_cast_to_fp8(a), per_block_cast_to_fp8(b) + a_fp8 = a_fp8 if major_a == MajorTypeAB.KMajor else (a_fp8[0].T.contiguous().T, a_fp8[1]) + b_fp8 = b_fp8 if major_b == MajorTypeAB.KMajor else 
(b_fp8[0].T.contiguous().T, b_fp8[1]) + return a_fp8, b_fp8, c, d, ref_d + + +def generate_grouped_contiguous(num_groups: int, expected_m_per_group: int, k: int, n: int, major_a: MajorTypeAB, major_b: MajorTypeAB) -> \ + Tuple[int, Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: + group_ms = [align(int(expected_m_per_group * random.uniform(0.7, 1.3)), get_m_alignment_for_contiguous_layout()) for _ in range(num_groups)] + m = sum(group_ms) + + a = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) + b = torch.randn((num_groups, n, k), device='cuda', dtype=torch.bfloat16) + m_indices = torch.empty(m, device='cuda', dtype=torch.int32) + d = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) + ref_d = torch.randn((m, n), device='cuda', dtype=torch.bfloat16) + + start = 0 + for i, group_m in enumerate(group_ms): + end = start + group_m + m_indices[start:end] = i + ref_d[start:end] = a[start:end] @ b[i].t() + start = end + + assert major_a == MajorTypeAB.KMajor + a_fp8 = per_token_cast_to_fp8(a) + b_fp8 = (torch.empty_like(b, dtype=torch.float8_e4m3fn), + torch.empty((num_groups, ceil_div(n, 128), ceil_div(k, 128)), device='cuda', dtype=torch.float)) + for i in range(num_groups): + b_fp8[0][i], b_fp8[1][i] = per_block_cast_to_fp8(b[i]) + b_fp8 = b_fp8 if major_b == MajorTypeAB.KMajor else (b_fp8[0].mT.contiguous().mT, b_fp8[1]) + return m, a_fp8, b_fp8, m_indices, d, ref_d + + +def generate_grouped_masked(num_groups: int, m: int, k: int, n: int) -> \ + Tuple[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor]: + a = torch.randn((num_groups, m, k), device='cuda', dtype=torch.bfloat16) + b = torch.randn((num_groups, n, k), device='cuda', dtype=torch.bfloat16) + d = torch.empty((num_groups, m, n), device='cuda', dtype=torch.bfloat16) + ref_d = torch.einsum('gmk,gnk->gmn', a, b) + + a_fp8 = (torch.empty_like(a, dtype=torch.float8_e4m3fn), 
torch.empty((num_groups, m, ceil_div(k, 128)), device='cuda', dtype=torch.float)) + b_fp8 = (torch.empty_like(b, dtype=torch.float8_e4m3fn), torch.empty((num_groups, ceil_div(n, 128), ceil_div(k, 128)), device='cuda', dtype=torch.float)) + for i in range(num_groups): + a_fp8[0][i], a_fp8[1][i] = per_token_cast_to_fp8(a[i]) + b_fp8[0][i], b_fp8[1][i] = per_block_cast_to_fp8(b[i]) + + return a_fp8, b_fp8, d, ref_d diff --git a/tests/test_core.py b/tests/test_core.py index 3b88539c..c3f4a29d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -5,293 +5,100 @@ import random import torch -from typing import List, Tuple import deep_gemm -from deep_gemm import bench_kineto, calc_diff, ceil_div, get_col_major_tma_aligned_tensor -from deep_gemm.jit_kernels.utils import get_m_alignment_for_contiguous_layout +from deep_gemm.utils.layout import MajorTypeAB +from deep_gemm.testing.bench import bench_kineto +from deep_gemm.testing.numeric import calc_diff, count_bytes - -def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - pad_size = (128 - (n % 128)) % 128 - x = torch.nn.functional.pad(x, (0, pad_size), value=0) if pad_size > 0 else x - x_view = x.view(m, -1, 128) - x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn) - return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1) - - -def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - assert x.dim() == 2 - m, n = x.shape - x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128), dtype=x.dtype, device=x.device) - x_padded[:m, :n] = x - x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) - x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (x_amax / 
448.0).view(x_view.size(0), x_view.size(2)) - - -def construct(m: int, k: int, n: int) -> \ - Tuple[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor]: - x = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) - y = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) - out = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) - ref_out = x @ y.t() - - x_fp8, y_fp8 = per_token_cast_to_fp8(x), per_block_cast_to_fp8(y) - # Transpose earlier so that the testing will not trigger transposing kernels - x_fp8 = (x_fp8[0], get_col_major_tma_aligned_tensor(x_fp8[1])) - return x_fp8, y_fp8, out, ref_out - - -def construct_contiguous_grouped(num_groups: int, expected_m_per_group: int, k: int, n: int) -> \ - Tuple[int, Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: - alignment = get_m_alignment_for_contiguous_layout() - group_ms = [int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups)] - m = sum([ceil_div(x, alignment) * alignment for x in group_ms]) - - x = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) - y = torch.randn((num_groups, n, k), device='cuda', dtype=torch.bfloat16) - m_indices = torch.empty(m, device='cuda', dtype=torch.int32) - out = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) - ref_out = torch.randn((m, n), device='cuda', dtype=torch.bfloat16) - - start = 0 - for i, group_m in enumerate(group_ms): - actual_end = start + group_m - aligned_end = start + ceil_div(group_m, alignment) * alignment - m_indices[start:actual_end] = i - m_indices[actual_end:aligned_end] = -1 - ref_out[start:aligned_end] = x[start:aligned_end] @ y[i].t() - start = aligned_end - ref_out = torch.where((m_indices == -1).unsqueeze(1), torch.zeros_like(ref_out), ref_out) - - assert m % 4 == 0, f'TMA alignment error: {m}' - x_fp8 = per_token_cast_to_fp8(x) - y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn), 
torch.empty((num_groups, ceil_div(n, 128), k // 128), device='cuda', dtype=torch.float)) - for i in range(num_groups): - y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i]) - - return m, x_fp8, y_fp8, m_indices, out, ref_out - - -def construct_masked_grouped(num_groups: int, max_m: int, expected_m_per_group: int, k: int, n: int) -> \ - Tuple[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: - x = torch.randn((num_groups, max_m, k), device='cuda', dtype=torch.bfloat16) - y = torch.randn((num_groups, n, k), device='cuda', dtype=torch.bfloat16) - out = torch.empty((num_groups, max_m, n), device='cuda', dtype=torch.bfloat16) - ref_out = torch.einsum('gmk,gnk->gmn', x, y) - - assert max_m % 4 == 0, f'TMA alignment error: {max_m}' - x_fp8 = (torch.empty_like(x, dtype=torch.float8_e4m3fn), torch.empty((num_groups, max_m, k // 128), device='cuda', dtype=torch.float)) - y_fp8 = (torch.empty_like(y, dtype=torch.float8_e4m3fn), torch.empty((num_groups, ceil_div(n, 128), k // 128), device='cuda', dtype=torch.float)) - for i in range(num_groups): - x_fp8[0][i], x_fp8[1][i] = per_token_cast_to_fp8(x[i]) - y_fp8[0][i], y_fp8[1][i] = per_block_cast_to_fp8(y[i]) - - # Transpose earlier so that the testing will not trigger transposing kernels - x_fp8 = (x_fp8[0], get_col_major_tma_aligned_tensor(x_fp8[1])) - - # Construct mask - masked_m = torch.empty((num_groups, ), device='cuda', dtype=torch.int) - for j in range(num_groups): - masked_m[j] = int(expected_m_per_group * random.uniform(0.7, 1.3)) - assert masked_m.amax().item() <= max_m - return x_fp8, y_fp8, masked_m, out, ref_out - - -def construct_wgrad(m: int, k: int, n: int) -> \ - Tuple[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: - x = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) - y = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) - residual = torch.randn((m, n), 
device='cuda', dtype=torch.float) * 10 - out = residual.clone() - ref_out = residual + (x.float() @ y.float().t()) - - x_fp8 = per_token_cast_to_fp8(x) - y_fp8 = per_token_cast_to_fp8(y) - - # NOTES: please do inplace add on the `out` later - return x_fp8, y_fp8, residual, out, ref_out - - -def construct_k_grouped_wgrad(m: int, n: int, k_sizes: List[int]) -> \ - Tuple[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor, List[int]]: - num_groups, total_k = len(k_sizes), sum(k_sizes) - - x_flat = torch.empty((m * total_k,), device='cuda', dtype=torch.bfloat16) - y_flat = torch.empty((n * total_k,), device='cuda', dtype=torch.bfloat16) - out = torch.zeros((num_groups, m, n), device='cuda', dtype=torch.float) - ref_out = torch.zeros((num_groups, m, n), device='cuda', dtype=torch.float) - - # Fill tensors with data and compute reference output - x_offset, y_offset = 0, 0 - for idx, k in enumerate(k_sizes): - x_chunk = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) - y_chunk = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) - - x_flat[x_offset:x_offset + m * k].copy_(x_chunk.flatten()) - y_flat[y_offset:y_offset + n * k].copy_(y_chunk.flatten()) - ref_out[idx] = x_chunk.float() @ y_chunk.float().t() - - x_offset += m * k - y_offset += n * k - - x_fp8_flat = torch.empty_like(x_flat, dtype=torch.float8_e4m3fn) - y_fp8_flat = torch.empty_like(y_flat, dtype=torch.float8_e4m3fn) - - total_scale_factors = sum(ceil_div(k, 128) for k in k_sizes) - x_scales = torch.empty((total_scale_factors, m), device='cuda', dtype=torch.float) - y_scales = torch.empty((total_scale_factors, n), device='cuda', dtype=torch.float) - - # Cast to FP8 and prepare scale factors - x_offset, y_offset, scale_offset = 0, 0, 0 - for k in k_sizes: - x_fp8_chunk, x_scale_chunk = per_token_cast_to_fp8(x_flat[x_offset:x_offset + m * k].view(m, k)) - y_fp8_chunk, y_scale_chunk = per_token_cast_to_fp8(y_flat[y_offset:y_offset + n * k].view(n, k)) - 
- x_fp8_flat[x_offset:x_offset + m * k].copy_(x_fp8_chunk.flatten()) - y_fp8_flat[y_offset:y_offset + n * k].copy_(y_fp8_chunk.flatten()) - - num_scales = ceil_div(k, 128) - x_scales[scale_offset:scale_offset + num_scales].copy_(x_scale_chunk.T) - y_scales[scale_offset:scale_offset + num_scales].copy_(y_scale_chunk.T) - - x_offset += m * k - y_offset += n * k - scale_offset += num_scales - - return (x_fp8_flat, x_scales), (y_fp8_flat, y_scales), out, ref_out, k_sizes +from generators import ( + enumerate_normal, enumerate_grouped_contiguous, enumerate_grouped_masked, + generate_normal, generate_grouped_contiguous, generate_grouped_masked, +) def test_gemm() -> None: print('Testing GEMM:') - for m in (64, 128, 4096): - for k, n in [(576, 7168), (7168, 2112), (1536, 24576), (512, 32768), (16384, 7168), (7168, 4096), (2048, 7168)]: - x_fp8, y_fp8, out, ref_out = construct(m, k, n) - deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out) - diff = calc_diff(out, ref_out) - assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}' + for m, k, n, major_a, major_b, accumulate, out_dtype in enumerate_normal(): + major_opt = 'N' if major_a == MajorTypeAB.KMajor else 'T' + major_opt += 'T' if major_b == MajorTypeAB.KMajor else 'N' + out_opt = 'FP32' if out_dtype == torch.float else 'BF16' + acc_opt = f'accumulate={int(accumulate)}' - # noinspection PyShadowingNames - def test_func(): - deep_gemm.gemm_fp8_fp8_bf16_nt(x_fp8, y_fp8, out) + a, b, c, d, ref_d = generate_normal(m, k, n, major_a, major_b, accumulate, out_dtype) + deep_gemm.fp8_gemm_nt(a, b, d, c=c) + diff = calc_diff(d, ref_d) + assert diff < 0.001, f'{m=}, {k=}, {n=}, {major_opt=}, {diff:.5f}' - t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) - print(f' > Perf (m={m:5}, n={n:5}, k={k:5}): {t * 1e6:4.0f} us | ' - f'throughput: {2 * m * n * k / t / 1e12:4.0f} TFLOPS, ' - f'{(m * k + k * n + m * n * 2) / 1e9 / t:4.0f} GB/s') + # noinspection PyShadowingNames + def test_func(): + deep_gemm.fp8_gemm_nt(a, b, 
d) + + t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) + print(f' > Perf (m={m:5}, n={n:5}, k={k:5}, MemLayout={major_opt}, {out_opt}, {acc_opt}):' + f'{t * 1e6:4.0f} us | ' + f'{2 * m * n * k / t / 1e12:4.0f} TFLOPS | ' + f'{count_bytes((a, b, c, d)) / 1e9 / t:4.0f} GB/s') print() def test_m_grouped_gemm_contiguous() -> None: print('Testing grouped contiguous GEMM:') - for num_groups, expected_m_per_group, k, n in ((4, 8192, 7168, 4096), (4, 8192, 2048, 7168), - (8, 4096, 7168, 4096), (8, 4096, 2048, 7168), - (32, 256, 7168, 4096), (32, 256, 2048, 7168)): - # NOTES: we should mask the unfilled part before calculating difference - m, x_fp8, y_fp8, m_indices, out, ref_out = construct_contiguous_grouped(num_groups, expected_m_per_group, k, n) - deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(x_fp8, y_fp8, out, m_indices) - out = torch.where((m_indices == -1).unsqueeze(1), torch.zeros_like(out), out) - diff = calc_diff(out, ref_out) - assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}' + for num_groups, expected_m_per_group, k, n, major_a, major_b in enumerate_grouped_contiguous(): + # TODO: make a stronger test + major_opt = 'N' if major_a == MajorTypeAB.KMajor else 'T' + major_opt += 'T' if major_b == MajorTypeAB.KMajor else 'N' + + m, a, b, m_indices, d, ref_d = generate_grouped_contiguous(num_groups, expected_m_per_group, k, n, major_a, major_b) + deep_gemm.m_grouped_fp8_gemm_nt_contiguous(a, b, d, m_indices) + diff = calc_diff(d, ref_d) + assert diff < 0.001, f'{m=}, {k=}, {n=}, {major_opt}, {diff:.5f}' # noinspection PyShadowingNames def test_func(): - deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(x_fp8, y_fp8, out, m_indices) + deep_gemm.m_grouped_fp8_gemm_nt_contiguous(a, b, d, m_indices) t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) - valid_m = (m_indices != -1).sum().item() - print(f' > Perf ({num_groups=:2}, {expected_m_per_group=:4}, n={n:4}, k={k:4}): {t * 1e6:4.0f} us | ' - f'throughput: {2 * valid_m 
* n * k / t / 1e12:4.0f} TFLOPS, ' - f'{(valid_m * k + num_groups * k * n + valid_m * n * 2) / 1e9 / t:4.0f} GB/s') + print(f' > Perf ({num_groups=}, m={m:5}, n={n:5}, k={k:5}, MemLayout={major_opt}): ' + f'{t * 1e6:4.0f} us | ' + f'{2 * m * n * k / t / 1e12:4.0f} TFLOPS | ' + f'{count_bytes((a, b, d)) / 1e9 / t:4.0f} GB/s') print() def test_m_grouped_gemm_masked() -> None: print('Testing grouped masked GEMM:') - for num_groups, expected_m_per_group in ((1, 1024), (2, 512), (4, 256)): - for k, n in ((7168, 4096), (2048, 7168), ): - # Test correctness - for i in range(10): - x_fp8, y_fp8, masked_m, out, ref_out = construct_masked_grouped(num_groups, 4096, expected_m_per_group, k, n) - deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(x_fp8, y_fp8, out, masked_m, expected_m_per_group) - for j in range(num_groups): - diff = calc_diff(out[j, :masked_m[j].item()], ref_out[j, :masked_m[j].item()]) - assert diff < 0.001, f'{expected_m_per_group=}, {k=}, {n=}, {j=}, masked_m={masked_m[j]}, {num_groups=}, {diff:.5f}' - - # noinspection PyShadowingNames - def test_func(): - deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(x_fp8, y_fp8, out, masked_m, expected_m_per_group) - - # Test performance with fixed shapes - # noinspection PyUnboundLocalVariable - valid_m = masked_m.sum().item() - t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) - print(f' > Perf ({num_groups=}, expected_m_per_group={expected_m_per_group:4}, n={n:4}, k={k:4}): {t * 1e6:4.0f} us | ' - f'throughput: {2 * valid_m * n * k / t / 1e12:4.0f} TFLOPS, ' - f'{(valid_m * k + num_groups * k * n + valid_m * n * 2) / 1e9 / t:4.0f} GB/s') - print() - - -def test_wgrad_gemm(): - print('Testing weight gradient GEMM:') + # TODO: merge Hopper's tests + for num_groups, m, k, n in enumerate_grouped_masked(): + # Test correctness + masked_m_candidates = list(filter(lambda candidate: candidate <= m, (128, 256, 384))) + for i in range(10): + a, b, d, ref_d = generate_grouped_masked(num_groups, m, k, n) + 
masked_m = torch.empty((num_groups, ), device='cuda', dtype=torch.int) + for j in range(num_groups): + masked_m[j] = random.choice(masked_m_candidates) + expected_m = min(int(masked_m.float().mean()) + 1, m) + deep_gemm.fp8_m_grouped_gemm_nt_masked(a, b, d, masked_m, expected_m) + for j in range(num_groups): + diff = calc_diff(d[j, :masked_m[j].item()], ref_d[j, :masked_m[j].item()]) + assert diff < 0.001, f'{m=}, {k=}, {n=}, {j=}, masked_m={masked_m[j]}, {num_groups=}, {diff:.5f}' + + # Construct full cases + a, b, d, ref_d = generate_grouped_masked(num_groups, m, k, n) + masked_m = torch.ones((num_groups, ), device='cuda', dtype=torch.int) * m - for k in (4096, 8192): - for m, n in ((7168, 2112), (1536, 24576), (512, 32768), (16384, 7168), (7168, 4096), (2048, 7168)): - # Test correctness - x_fp8, y_fp8, residual, out, ref_out = construct_wgrad(m, k, n) - deep_gemm.wgrad_gemm_fp8_fp8_fp32_nt(x_fp8, y_fp8, out) - diff = calc_diff(out, ref_out) - assert diff < 0.001, f'{m=}, {k=}, {n=}, {diff:.5f}' - - # Construct new tensors only once to avoid L2 cache acceleration (creating them puts them in L2) - x_fp8, y_fp8, residual, out, ref_out = construct_wgrad(m, k, n) - - # noinspection PyShadowingNames - def test_func(): - deep_gemm.wgrad_gemm_fp8_fp8_fp32_nt(x_fp8, y_fp8, out) - - t = bench_kineto(test_func, 'fp8_wgrad_gemm', suppress_kineto_output=True) - print(f' > Performance (m={m:5}, n={n:5}, k={k:5}): {t * 1e6:4.0f} us | ' - f'throughput: {2 * m * n * k / t / 1e12:4.0f} TFLOPS, ' - f'{(m * k + k * n + m * n * 2) / 1e9 / t:4.0f} GB/s') - print() - - -def test_k_grouped_wgrad_gemm(): - print('Testing grouped weight gradient GEMM:') - - for num_groups, base_k in ((4, 4096), (4, 8192), (8, 4096)): - for m, n in ((7168, 4096), (2048, 7168)): - # Vary k sizes around base_k - k_sizes = [base_k + random.randint(-1, 1) * 128 for _ in range(num_groups - 1)] - k_sizes.append(base_k * num_groups - sum(k_sizes)) - - # Test correctness - x_fp8, y_fp8, out, ref_out, k_sizes = 
construct_k_grouped_wgrad(m, n, k_sizes) - deep_gemm.k_grouped_wgrad_gemm_fp8_fp8_fp32_nt(x_fp8, y_fp8, out, k_sizes) - - for idx in range(num_groups): - diff = calc_diff(out[idx], ref_out[idx]) - assert diff < 0.001, f'{num_groups=}, {m=}, {n=}, k={k_sizes[idx]}, batch={idx}, {diff:.5f}' + # noinspection PyShadowingNames + def test_func(): + deep_gemm.fp8_m_grouped_gemm_nt_masked(a, b, d, masked_m, m) - # Construct new tensors to avoid L2 cache acceleration - x_fp8, y_fp8, out, ref_out, k_sizes = construct_k_grouped_wgrad(m, n, k_sizes) - total_k = sum(k_sizes) - - def test_func(): - deep_gemm.k_grouped_wgrad_gemm_fp8_fp8_fp32_nt(x_fp8, y_fp8, out, k_sizes) - - t = bench_kineto(test_func, 'fp8_wgrad_gemm', suppress_kineto_output=True, with_multiple_kernels=True) * num_groups - print(f' > Performance ({num_groups=}, m={m:5}, n={n:5}, avg_k={total_k//num_groups:5}): {t * 1e6:4.0f} us | ' - f'throughput: {2 * num_groups * m * n * (total_k/num_groups) / t / 1e12:4.0f} TFLOPS, ' - f'{(m * total_k + n * total_k + num_groups * m * n * 2) / 1e9 / t:4.0f} GB/s') + # Test performance with fixed shapes + t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) + print(f' > Perf ({num_groups=}, m_per_group={m:4}, n={n:4}, k={k:4}): ' + f'{t * 1e6:4.0f} us | ' + f'{2 * num_groups * m * n * k / t / 1e12:4.0f} TFLOPS | ' + f'{count_bytes((a, b, d)) / 1e9 / t:4.0f} GB/s') print() @@ -307,6 +114,3 @@ def test_func(): test_gemm() test_m_grouped_gemm_contiguous() test_m_grouped_gemm_masked() - - test_wgrad_gemm() - test_k_grouped_wgrad_gemm() diff --git a/third-party/cutlass b/third-party/cutlass index eefa1713..b244379d 160000 --- a/third-party/cutlass +++ b/third-party/cutlass @@ -1 +1 @@ -Subproject commit eefa171318b79cbe2e78514d4cce5cd0fe919d0c +Subproject commit b244379d9b15574e07b73b814b88bd2233f0b3ce From cc416ee4faf0533a9263c2de814e5565f56ca1cc Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Thu, 12 Jun 2025 16:10:00 +0800 
Subject: [PATCH 2/4] Update layout.py --- deep_gemm/utils/layout.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/deep_gemm/utils/layout.py b/deep_gemm/utils/layout.py index 2e03fa07..5536da1b 100644 --- a/deep_gemm/utils/layout.py +++ b/deep_gemm/utils/layout.py @@ -140,8 +140,14 @@ def transform_sf_into_required_layout(sf: torch.Tensor, is_sfa: bool = False): gran = (recipe[0 if is_sfa else 1], recipe[2]) - # Pre-transform checks - check_sf_layout(sf, mn=mn, k=k, gran=gran, num_groups=num_groups) + should_skip_transform = ( + (sf.dtype == torch.int and gran == (1, 128) and get_device_arch() == '100a') + or (sf.dtype == torch.int and gran == (128, 128) and get_device_arch() == '100a') + ) + + if not should_skip_transform: + # Pre-transform checks + check_sf_layout(sf, mn=mn, k=k, gran=gran, num_groups=num_groups) # (FP32, 1, 128) on Hopper: transform to TMA-aligned and MN-major if sf.dtype == torch.float and gran == (1, 128) and get_device_arch() == '90a': @@ -162,8 +168,7 @@ def transform_sf_into_required_layout(sf: torch.Tensor, sf = get_col_major_tma_aligned_packed_tensor(sf) return check_sf_layout(sf, mn=mn, k=k, gran=(1, 128), num_groups=num_groups, tma_stride_check=True, type_check=torch.int) - # (INT, 1, 128) on SM100: transform to TMA-aligned and MN-major - if sf.dtype == torch.int and gran == (1, 128) and get_device_arch() == '100a': + if should_skip_transform: # TODO: add transpose kernel if SF layout is not satisfied return check_sf_layout(sf, mn=mn, k=k, gran=(1, 128), num_groups=num_groups, tma_stride_check=True, type_check=torch.int) From c52b1c1a78df3ea73526887b1360cb47c97b71e9 Mon Sep 17 00:00:00 2001 From: Ray Wang Date: Thu, 17 Jul 2025 20:07:53 -0700 Subject: [PATCH 3/4] Optimize performance, Add SM90 support, Add 1D2D SM100 support --- .gitmodules | 5 +- CMakeLists.txt | 55 +- README.md | 156 ++--- {indexing => csrc/indexing}/main.cu | 2 + csrc/jit/cache.hpp | 31 + csrc/jit/compiler.hpp | 172 ++++++ 
csrc/jit/device_runtime.hpp | 50 ++ csrc/jit/kernel_runtime.hpp | 139 +++++ csrc/jit_kernels/heuristics/common.hpp | 298 ++++++++++ csrc/jit_kernels/heuristics/sm100.hpp | 144 +++++ csrc/jit_kernels/heuristics/sm90.hpp | 115 ++++ csrc/jit_kernels/impls/runtime_utils.hpp | 173 ++++++ .../jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp | 351 ++++++++++++ .../jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp | 242 ++++++++ csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp | 255 +++++++++ csrc/jit_kernels/impls/smxx_layout.hpp | 199 +++++++ csrc/python_api.cpp | 402 +++++++++++++ csrc/utils/exception.hpp | 58 ++ csrc/utils/format.hpp | 6 + csrc/utils/hash.hpp | 35 ++ csrc/utils/layout.hpp | 100 ++++ csrc/utils/math.hpp | 25 + csrc/utils/system.hpp | 70 +++ deep_gemm/__init__.py | 38 +- deep_gemm/config.py | 28 - deep_gemm/dispatch.py | 189 ------- .../include/deep_gemm/common/scheduler.cuh | 162 ++++-- .../include/deep_gemm/common/sm100_utils.cuh | 17 +- .../include/deep_gemm/common/sm90_utils.cuh | 136 ++++- deep_gemm/include/deep_gemm/common/types.hpp | 17 + deep_gemm/include/deep_gemm/common/utils.cuh | 33 +- .../deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh | 153 +++-- .../deep_gemm/impls/sm100_fp8_gemm_1d2d.cuh | 532 ++++++++++++++++++ .../deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh | 440 ++++++++++++++- .../include/deep_gemm/impls/smxx_layout.cuh | 139 +++++ deep_gemm/jit/__init__.py | 2 - deep_gemm/jit/compiler.py | 317 ----------- deep_gemm/jit/runtime.py | 114 ---- deep_gemm/jit/scripts/__init__.py | 1 - deep_gemm/jit/scripts/sm90_interleave_ffma.py | 137 ----- deep_gemm/jit_kernels/__init__.py | 5 - deep_gemm/jit_kernels/heuristics/__init__.py | 5 - deep_gemm/jit_kernels/heuristics/common.py | 49 -- .../heuristics/sm100_heuristics.py | 171 ------ .../jit_kernels/heuristics/sm90_heuristics.py | 0 deep_gemm/jit_kernels/impls/__init__.py | 7 - .../jit_kernels/impls/sm100_bf16_gemm.py | 0 .../jit_kernels/impls/sm100_fp8_gemm_1d1d.py | 339 ----------- 
deep_gemm/jit_kernels/impls/sm90_bf16_gemm.py | 0 .../jit_kernels/impls/sm90_fp8_gemm_1d1d.py | 0 .../jit_kernels/impls/sm90_fp8_gemm_1d2d.py | 0 deep_gemm/jit_kernels/runtime.py | 149 ----- deep_gemm/testing/__init__.py | 2 + deep_gemm/testing/bench.py | 2 +- deep_gemm/testing/numeric.py | 6 +- deep_gemm/utils/__init__.py | 4 +- deep_gemm/utils/layout.py | 186 +----- deep_gemm/utils/math.py | 30 +- develop.sh | 25 + figures/design.png | Bin 571421 -> 0 bytes install.sh | 5 +- setup.py | 52 +- tests/generators.py | 219 +++++-- tests/test_core.py | 166 ++++-- tests/test_jit.py | 98 ---- tests/test_layout.py | 104 ++++ 66 files changed, 4940 insertions(+), 2222 deletions(-) rename {indexing => csrc/indexing}/main.cu (76%) create mode 100644 csrc/jit/cache.hpp create mode 100644 csrc/jit/compiler.hpp create mode 100644 csrc/jit/device_runtime.hpp create mode 100644 csrc/jit/kernel_runtime.hpp create mode 100644 csrc/jit_kernels/heuristics/common.hpp create mode 100644 csrc/jit_kernels/heuristics/sm100.hpp create mode 100644 csrc/jit_kernels/heuristics/sm90.hpp create mode 100644 csrc/jit_kernels/impls/runtime_utils.hpp create mode 100644 csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp create mode 100644 csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp create mode 100644 csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp create mode 100644 csrc/jit_kernels/impls/smxx_layout.hpp create mode 100644 csrc/python_api.cpp create mode 100644 csrc/utils/exception.hpp create mode 100644 csrc/utils/format.hpp create mode 100644 csrc/utils/hash.hpp create mode 100644 csrc/utils/layout.hpp create mode 100644 csrc/utils/math.hpp create mode 100644 csrc/utils/system.hpp delete mode 100644 deep_gemm/config.py delete mode 100644 deep_gemm/dispatch.py create mode 100644 deep_gemm/include/deep_gemm/common/types.hpp create mode 100644 deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d2d.cuh create mode 100644 deep_gemm/include/deep_gemm/impls/smxx_layout.cuh delete mode 100644 
deep_gemm/jit/__init__.py delete mode 100644 deep_gemm/jit/compiler.py delete mode 100644 deep_gemm/jit/runtime.py delete mode 100644 deep_gemm/jit/scripts/__init__.py delete mode 100644 deep_gemm/jit/scripts/sm90_interleave_ffma.py delete mode 100644 deep_gemm/jit_kernels/__init__.py delete mode 100644 deep_gemm/jit_kernels/heuristics/__init__.py delete mode 100644 deep_gemm/jit_kernels/heuristics/common.py delete mode 100644 deep_gemm/jit_kernels/heuristics/sm100_heuristics.py delete mode 100644 deep_gemm/jit_kernels/heuristics/sm90_heuristics.py delete mode 100644 deep_gemm/jit_kernels/impls/__init__.py delete mode 100644 deep_gemm/jit_kernels/impls/sm100_bf16_gemm.py delete mode 100644 deep_gemm/jit_kernels/impls/sm100_fp8_gemm_1d1d.py delete mode 100644 deep_gemm/jit_kernels/impls/sm90_bf16_gemm.py delete mode 100644 deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d1d.py delete mode 100644 deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d2d.py delete mode 100644 deep_gemm/jit_kernels/runtime.py create mode 100755 develop.sh delete mode 100644 figures/design.png delete mode 100644 tests/test_jit.py create mode 100644 tests/test_layout.py diff --git a/.gitmodules b/.gitmodules index d16e9335..dd976a23 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third-party/cutlass"] path = third-party/cutlass - url = https://github.com/NVIDIA/cutlass.git + url = git@github.com:NVIDIA/cutlass.git +[submodule "third-party/fmt"] + path = third-party/fmt + url = git@github.com:fmtlib/fmt.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 240f6b17..ab20d622 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,44 +1,33 @@ # NOTES: current just for CMake-based IDE (e.g. 
CLion) indexing, the real compilation is done via JIT -# TODO: add CUDA utils' library via CMake cmake_minimum_required(VERSION 3.10) project(deep_gemm LANGUAGES CXX CUDA) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 17) set(CMAKE_VERBOSE_MAKEFILE ON) -find_package(CUDAToolkit REQUIRED) -find_package(pybind11 REQUIRED) - -file(WRITE ${CMAKE_BINARY_DIR}/test_cuda.cu "extern \"C\" __global__ void testKernel() { }") -execute_process( - COMMAND ${CUDA_NVCC_EXECUTABLE} ${CMAKE_CUDA_FLAGS} -gencode arch=compute_90a,code=sm_90a -o ${CMAKE_BINARY_DIR}/test_cuda.o -c ${CMAKE_BINARY_DIR}/test_cuda.cu - RESULT_VARIABLE NVCC_RESULT - OUTPUT_VARIABLE NVCC_OUTPUT - ERROR_VARIABLE NVCC_ERROR_OUTPUT - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} -) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC -Wno-psabi") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC -Wno-psabi") +set(CUDA_SEPARABLE_COMPILATION ON) +list(APPEND CUDA_NVCC_FLAGS "-DENABLE_FAST_DEBUG") +list(APPEND CUDA_NVCC_FLAGS "-O3") +list(APPEND CUDA_NVCC_FLAGS "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage") -if (NVCC_RESULT EQUAL "0") - set(NVCC_SUPPORTS_SM90 TRUE) - message(STATUS "NVCC supports SM90") -else() - message(STATUS "NVCC does not support SM90") -endif() +set(USE_SYSTEM_NVTX on) +set(CUDA_ARCH_LIST "9.0" CACHE STRING "List of CUDA architectures to compile") +set(TORCH_CUDA_ARCH_LIST "${CUDA_ARCH_LIST}") -if (NVCC_SUPPORTS_SM90) - set(TORCH_CUDA_ARCH_LIST "8.6" CACHE STRING "Add arch tag 90a to NVCC" FORCE) - list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_90a,code=sm_90a") -endif() +find_package(CUDAToolkit REQUIRED) +find_package(pybind11 REQUIRED) find_package(Torch REQUIRED) -include_directories(deep_gemm/include third-party/cutlass/include third-party/cutlass/tools/util/include) -include_directories(${CUDA_TOOLKIT_ROOT_DIR}/include ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS}) -link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib) 
+set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CUDA_STANDARD 20) + +include_directories(deep_gemm/include third-party/cutlass/include third-party/cutlass/tools/util/include third-party/fmt/include) +include_directories(${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/include ${TORCH_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS}) +link_directories(${TORCH_INSTALL_PREFIX}/lib ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs) -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -fPIC") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3 -fPIC -DNDEBUG") -set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -O3 -std=c++17 -DNDEBUG --ptxas-options=--register-usage-level=10") +# The main Python API entrance +pybind11_add_module(deep_gemm_cpp csrc/python_api.cpp) +target_link_libraries(deep_gemm_cpp PRIVATE ${TORCH_LIBRARIES} torch_python cuda) -cuda_add_library(example_gemm STATIC indexing/main.cu) +# Enable kernel code indexing with CMake-based IDEs +cuda_add_library(deep_gemm_indexing_cuda STATIC csrc/indexing/main.cu) diff --git a/README.md b/README.md index e1df304a..01e1f540 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,18 @@ # DeepGEMM -DeepGEMM is a library designed for clean and efficient FP8 General Matrix Multiplications (GEMMs) with fine-grained scaling, as proposed in [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3). It supports both normal and Mix-of-Experts (MoE) grouped GEMMs. Written in CUDA, the library has no compilation need during installation, by compiling all kernels at runtime using a lightweight Just-In-Time (JIT) module. +DeepGEMM is a library designed for clean and efficient General Matrix Multiplications (GEMMs). It supports FP8 and BF16 (working in progress) for both normal and Mix-of-Experts (MoE) grouped scenarios. Written in CUDA, the library has no kernel compilation need during installation, by compiling all kernels at runtime using a lightweight Just-In-Time (JIT) module. 
-DeepGEMM leverages some concepts from [CUTLASS](https://github.com/nvidia/cutlass) and [CuTe](https://github.com/NVIDIA/cutlass/tree/main/include/cute), it avoids heavy reliance on their templates or algebras. Instead, the library is designed for simplicity, with only one core kernel function. This makes it a clean and accessible resource for learning SM90 and SM100 FP8 matrix multiplication and optimization techniques. +DeepGEMM leverages some concepts from [CUTLASS](https://github.com/nvidia/cutlass) and [CuTe](https://github.com/NVIDIA/cutlass/tree/main/include/cute), it avoids heavy reliance on their templates or algebras. Instead, the library is designed for simplicity, with only a limited number of core kernel functions. This makes it a clean and accessible resource for learning NVIDIA GPU kernel optimization techniques. Despite its lightweight design, DeepGEMM's performance matches or exceeds expert-tuned libraries across various matrix shapes. ## News +- 2025.07.20: DeepGEMM now supports both SM90/SM100, and has a full refactor with a low-CPU-overhead JIT CPP module. + - NVRTC and post-compilation SASS optimization are all disabled + - NVRTC will be supported later + - As NVCC 12.9 will automatically do the FFMA interleaving, all post optimizations will be no longer supported + - Please see [#112](https://github.com/deepseek-ai/DeepGEMM/pull/112) for more details - 2025.05.14: DeepGEMM now offers weight gradient kernels for dense and MoE backward! See [#95](https://github.com/deepseek-ai/DeepGEMM/pull/95) for details. - 2025.05.07: DeepGEMM now supports NVRTC with up to 10x compilation speedup! See [#94](https://github.com/deepseek-ai/DeepGEMM/pull/94) for details. Please use `DG_JIT_USE_NVRTC=1` to enable it (may have performance loss with some cases). - 2025.04.18: DeepGEMM now achieves up to **1550 TFLOPS** on H800! 
See [#74](https://github.com/deepseek-ai/DeepGEMM/pull/74), [#78](https://github.com/deepseek-ai/DeepGEMM/pull/78), [#81](https://github.com/deepseek-ai/DeepGEMM/pull/81), [#86](https://github.com/deepseek-ai/DeepGEMM/pull/86) and [340d988](https://github.com/deepseek-ai/DeepGEMM/commit/340d9880f4a418d943d34260d20a79f41f4c0526) for details. @@ -16,25 +21,22 @@ Despite its lightweight design, DeepGEMM's performance matches or exceeds expert - [x] More correctness tests for grouped-contiguous layout - [x] Shared memory swizzling for output -- [ ] Larger block size on N (up to 256) - [x] MoE scheduler with TMA multicast compatibility - [x] Fix TMA multicast compatibility for indivisible shapes - [x] Skip useless computation on M -- [x] NVRTC as a faster compiler -- [ ] Stolen JIT cache +- [ ] NVRTC as a faster compiler - [ ] Sanitizer for testing - [x] Weight gradient kernels for dense models - [x] Weight gradient kernels for MoE models - [ ] Better `get_best_configs` modeling -- [ ] Utility kernels for MoE models (maybe with [tile-lang](https://github.com/tile-ai/tilelang)) - [ ] CUDA PDL support -- [ ] More scaling granularity support via templates - [ ] Larger TMA multicast size for some shapes - [x] MMA template refactor with CUTLASS -- [ ] Optimizations for power efficiency - [x] Remove shape limitations on N and K - [ ] BF16 kernels - [ ] Split/stream-k optimizations +- [ ] Ampere kernels +- [ ] Polish docs ## Quick start @@ -42,33 +44,36 @@ Despite its lightweight design, DeepGEMM's performance matches or exceeds expert - NVIDIA SM90 or SM100 architecture GPU - Python 3.8 or higher +- Compilers with C++20 support - CUDA Toolkit: - - CUDA 12.3 or higher for SM90 - - **We highly recommend 12.8 or higher for the best performance** - - CUDA 12.8 or higher for SM100 + - CUDA 12.3 or higher for SM90 + - **We highly recommend 12.9 or higher for the best performance** + - CUDA 12.9 or higher for SM100 - PyTorch 2.1 or higher -- CUTLASS 3.6 or higher (could be cloned by 
Git submodule) +- CUTLASS 4.0 or higher (could be cloned by Git submodule) +- `{fmt}` library (could be cloned by Git submodule) ### Development ```bash # Submodule must be cloned git clone --recursive git@github.com:deepseek-ai/DeepGEMM.git +cd DeepGEMM -# Install DeepGEMM -python setup.py install +# Link some essential includes and build the CPP JIT module +cat develop.sh +./develop.sh -# Test JIT compilation -python tests/test_jit.py - -# Test all GEMM implements (normal, contiguous-grouped and masked-grouped) +# Test all GEMM implements +python tests/test_layout.py python tests/test_core.py ``` ### Installation ```bash -python setup.py install +cat install.sh +./install.sh ``` Then, import `deep_gemm` in your Python project, and enjoy! @@ -77,27 +82,24 @@ Then, import `deep_gemm` in your Python project, and enjoy! #### Notices -This library provides optimized GEMM kernels for NVIDIA GPUs. The input shape layout is NT (non-transposed LHS, transposed RHS). While the SM90 implementation supports only the NT memory layout (row-major, col-major), the SM100 implementation supports all memory layouts (NT, TN, NN, TT). +This library provides optimized GEMM kernels for NVIDIA GPUs with a naming convention: `D = C + A @ B`. The input shape layout is NT (non-transposed A, transposed B). While the SM90 implementation supports only the NT memory layout (row-major, col-major), the SM100 implementation supports all memory layouts (NT, TN, NN, TT). For example, `fp8_gemm_nt` will do a `D = C + A @ B.T` For both architectures, the LHS scaling factor is required to have a TMA-aligned and transposed layout. And the data format for the scaling factor of SM90 and SM100 is different: - SM90 requires scaling factors in FP32 format. - -- SM100 requires scaling factors in [UE8M0](https://docs.nvidia.com/cuda/parallel-thread-execution/#alternate-floating-point-data-formats) format. 
+- SM100 requires scaling factors in packed [UE8M0](https://docs.nvidia.com/cuda/parallel-thread-execution/#alternate-floating-point-data-formats) format, which packs 4 UE8M0 into a single `torch.int`. Please note that operations like input transposition or FP8 casting must be handled separately by the user, please implement or fuse them into prior kernels independently. While the library provides some simple PyTorch utility functions, these may result in slower performance, but our primary focus is on optimizing the GEMM kernels themselves. #### Normal dense GEMMs (non-grouped) -To perform a basic non-grouped FP8 GEMM, call the `fp8_gemm_nt` function. For more details, please refer to the function documentation. +To perform a basic non-grouped FP8 GEMM, call the `fp8_gemm_{nt, nn, tn, tt}` function. For more details, please refer to the function documentation. #### Grouped GEMMs (contiguous layout) -Unlike traditional grouped GEMMs in CUTLASS, DeepGEMM groups only the M-axis, while N and K must remain fixed. This design is tailored for scenarios where experts in an MoE model share the same shape. +Unlike traditional grouped GEMMs in CUTLASS, DeepGEMM groups only the M-axis, while N and K must remain fixed. This design is tailored for scenarios where experts in an MoE model share the same shape. For training forward passes or inference prefilling, where each expert may process a varying number of tokens, we concatenate these tokens into a single tensor, referred to as the "contiguous" layout. Note that each expert segment must be aligned to the GEMM M block size (`get_mk_alignment_for_contiguous_layout()`). For more information, please refer to the `m_grouped_fp8_gemm_{nt, nn}_contiguous` function documentation. -For training forward passes or inference prefilling, where each expert may process a varying number of tokens, we concatenate these tokens into a single tensor, referred to as the "contiguous" layout. 
Note that each expert segment must be aligned to the GEMM M block size (`get_m_alignment_for_contiguous_layout()`). - -For more information, please refer to the `m_grouped_fp8_gemm_nt_contiguous` function documentation. +We also provide a K-axis-grouped API for MoE weight backward (with M and N must remain fixed), please refer to `k_grouped_fp8_gemm_tn_contiguous` for more information. #### Grouped GEMMs (masked layout) @@ -110,93 +112,31 @@ Use `fp8_m_grouped_gemm_nt_masked` for this purpose and consult the relevant doc The library provides some utility functions besides the above kernels: - `deep_gemm.set_num_sms`: set the maximum SM count to use -- `deep_gemm.get_num_sms`: get the current SM maximum count -- `deep_gemm.get_m_alignment_for_contiguous_layout`: get the group-level alignment requirement for grouped contiguous layout +- `deep_gemm.get_num_sms`: get the current SM maximum count (return the device SM count if not set) +- `deep_gemm.transform_sf_into_required_layout`: transform scaling factors into required layout - `deep_gemm.get_tma_aligned_size`: get the required TMA alignment size -- `deep_gemm.get_col_major_tma_aligned_tensor`: get a column-major TMA-aligned tensor +- `deep_gemm.get_mk_alignment_for_contiguous_layout`: get the group-level alignment requirement for grouped contiguous layout +- `deep_gemm.get_mn_major_tma_aligned_tensor`: get a MN-major TMA-aligned tensor +- `deep_gemm.get_mn_major_tma_aligned_packed_ue8m0_tensor`: get a MN-major TMA-aligned tensor (with packing FP32 into UE8M0) +- `deep_gemm.get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor`: K-grouped GEMM packing kernel The library also provides some environment variables, which may be useful: - General - - `DG_JIT_DEBUG`: `0` or `1`, print more JIT debugging information, `0` by default + - `DG_JIT_DEBUG`: `0` or `1`, print more JIT debugging information, `0` by default - JIT cache related - - `DG_JIT_CACHE_DIR`: string, the cache directory to store compiled kernels, 
`$HOME/.deep_gemm` by default - - `DG_JIT_DISABLE_CACHE`: `0` or `1`, disable the use of cache directory, `0` by default + - `DG_JIT_CACHE_DIR`: string, the cache directory to store compiled kernels, `$HOME/.deep_gemm` by default - NVCC/NVRTC selections - - `DG_JIT_USE_NVRTC`: `0` or `1`, use NVRTC instead of NVCC, faster compilation but maybe have lower performance for some cases, `0` by default - - `DG_JIT_NVCC_COMPILER`: string, specified NVCC compiler path; will find in `torch.utils.cpp_extension.CUDA_HOME` by default + - `DG_JIT_USE_NVRTC`: `0` or `1`, use NVRTC instead of NVCC, faster compilation but maybe have lower performance for some cases, `0` by default + - `DG_JIT_NVCC_COMPILER`: string, specified NVCC compiler path; will find in `torch.utils.cpp_extension.CUDA_HOME` by default - Compiler options - - `DG_JIT_OVERRIDE_CPP_STANDARD`: integer (e.g., `20`), support for some old version GCC compiler, `20` by default - - `DG_JIT_PTXAS_VERBOSE`: `0` or `1`, show detailed PTXAS compiler output, `0` by default - - `DG_JIT_PRINT_REG_REUSE`: `0` or `1`, print FFMA-interleaving details, `0` by default - - `DG_JIT_PRINT_COMPILER_COMMAND`: `0` or `1`, print NVCC compilation command, `0` by default -- Post optimization - - `DG_JIT_DISABLE_FFMA_INTERLEAVE`: `0` or `1`, disable FFMA-interleaving optimization, `0` by default (only valid for SM90) + - `DG_JIT_PTXAS_VERBOSE`: `0` or `1`, show detailed PTXAS compiler output, `0` by default + - `DG_JIT_PRINT_COMPILER_COMMAND`: `0` or `1`, print NVCC compilation command, `0` by default - Heuristic selection - - `DG_PRINT_CONFIGS`: `0` or `1`, print selected configs for each shape, `0` by default -- Testing - - `DG_NSYS_PROFILING`: `0` or `1`, Nsight-system compatible testing, `0` by default + - `DG_PRINT_CONFIGS`: `0` or `1`, print selected configs for each shape, `0` by default For additional examples and details, please refer to [the test code](tests/test_core.py) or review the corresponding Python documentation. 
-## Optimizations - -We indicate the techniques excluded from CUTLASS with 🐳. - -#### Persistent warp-specialization - -Following the CUTLASS design, the kernels in DeepGEMM are warp-specialized, enabling overlapping data movement, tensor-core MMA instructions, and CUDA-core promotion. A simplified figure illustrating this process is shown below: - -![design](figures/design.png) - -#### TMA features - -The [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/hopper-tuning-guide/index.html#tensor-memory-accelerator) (TMA) is a new hardware feature introduced by the Hopper architecture, designed for faster and asynchronous data movement. Specifically, we utilize TMA for: - -- TMA load for LHS, LHS scaling factors, and RHS matrices -- TMA store for the output matrix -- TMA multicast (automatically decide LHS or RHS to broadcast) -- TMA descriptor prefetching - -#### Common detail optimizations - -- Utilization of the [`stmatrix`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-store-instruction-stmatrix) PTX instruction -- [Register count control](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg) tailored for different warpgroups -- Less bank conflicts via 3D TMA or swizzling -- Larger block sizes (up to 256x128 🐳) -- Overlapping as much as possible, e.g., overlapping TMA store and non-TMA RHS scaling factor load 🐳 - -#### A unified and optimized block scheduler - -- [One scheduler](deep_gemm/include/deep_gemm/scheduler.cuh) for all non-grouped and grouped kernels -- [Rasterization](https://github.com/NVIDIA/cutlass/blob/eefa171318b79cbe2e78514d4cce5cd0fe919d0c/media/docs/efficient_gemm.md#threadblock-rasterization) to enhance L2 cache reuse - -#### Fully JIT design 🐳 - -DeepGEMM employs a fully [Just-In-Time](deep_gemm/jit) (JIT) design, with no compilation required at installation. All kernels are compiled at runtime using a lightweight JIT implementation. 
This approach offers several advantages: - -- GEMM shapes, block sizes, and the number of pipeline stages are treated as compile-time constants - - Saving registers - - Compilers may do more optimizations -- Automatic selection of block sizes, number of warpgroups, optimal pipeline stages, and TMA cluster size - - But without auto-tuning, the optimal one is deterministically selected -- Full unrolling of the MMA pipelines, providing compilers with more optimization opportunities - - Very important for small shapes - - Refer to `launch_k_iterations` in [the kernel file](deep_gemm/include/deep_gemm/fp8_gemm.cuh) for details - -Overall, JIT significantly improves performance for small shapes, similar to the approach of the [Triton](https://github.com/triton-lang/triton/) compiler. - -#### Unaligned block sizes 🐳 - -For certain shapes, block sizes aligned to powers of 2 can lead to underutilized SMs. For instance, with `M=256, N=7168`, a typical block size assignment of `BLOCK_M=128, BLOCK_N=128` results in only `(256 / 128) * (7168 / 128) = 112` out of 132 SMs being utilized. To address this, we support unaligned block sizes like 112, enabling `(256 / 128) * (7168 / 112) = 128` SMs to work in such scenarios. Implementing this technique alongside fine-grained scaling requires careful optimization but ultimately delivers performance gains. - -#### FFMA SASS interleaving 🐳 - -We observe a performance improvement in [the CUTLASS FP8 kernel](https://github.com/NVIDIA/cutlass/tree/main/examples/54_hopper_fp8_warp_specialized_gemm) between NVCC 12.2 and 12.3. By comparing the compiled SASS, we discover that one bit in [a series of `FADD` instructions](https://github.com/NVIDIA/cutlass/blob/eefa171318b79cbe2e78514d4cce5cd0fe919d0c/include/cutlass/gemm/collective/fp8_accumulation.hpp#L73) is flipped in an interleaving pattern. 
-After referencing some open-source [CUDA assembler](https://github.com/cloudcores/CuAssembler/blob/96a9f72baf00f40b9b299653fcef8d3e2b4a3d49/CuAsm/CuControlCode.py#L46) implementations, we identified that this bit controls `yield`, which may enhance warp-level parallelism (just a guess, yielding the current warp and let other warps work). - -To leverage this, we develop [a similar script](deep_gemm/jit/interleave_ffma.py) to modify the `FFMA` instructions in the compiled binary. Besides simply modifying the `yield` bit, we also flip the `reuse` bit (registers cannot be reused if the warp is yielded). This adjustment improves performance (10%+ in some cases) for fine-grained scaling FP8 GEMMs by creating more opportunities to overlap MMA instructions with promotion `FFMA` instructions. - ## Acknowledgement DeepGEMM is inspired by the [CUTLASS](https://github.com/nvidia/cutlass) project. Thanks and respect to the developers! @@ -204,15 +144,3 @@ DeepGEMM is inspired by the [CUTLASS](https://github.com/nvidia/cutlass) project ## License This code repository is released under [the MIT License](LICENSE). 
- -## Citation - -```bibtex -@misc{deepgemm2025, - title={DeepGEMM: clean and efficient FP8 GEMM kernels with fine-grained scaling}, - author={Chenggang Zhao and Liang Zhao and Jiashi Li and Zhean Xu}, - year={2025}, - publisher = {GitHub}, - howpublished = {\url{https://github.com/deepseek-ai/DeepGEMM}}, -} -``` \ No newline at end of file diff --git a/indexing/main.cu b/csrc/indexing/main.cu similarity index 76% rename from indexing/main.cu rename to csrc/indexing/main.cu index 674a6831..a05b59c8 100644 --- a/indexing/main.cu +++ b/csrc/indexing/main.cu @@ -3,6 +3,8 @@ #include #include #include +#include +#include using namespace deep_gemm; diff --git a/csrc/jit/cache.hpp b/csrc/jit/cache.hpp new file mode 100644 index 00000000..fde9aab9 --- /dev/null +++ b/csrc/jit/cache.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include + +#include "kernel_runtime.hpp" + +namespace deep_gemm { + +class KernelRuntimeCache { + std::unordered_map> cache; + +public: + // TODO: consider cache capacity + KernelRuntimeCache() = default; + + std::shared_ptr get(const std::filesystem::path& dir_path) { + // Hit the runtime cache + if (const auto& iterator = cache.find(dir_path); iterator != cache.end()) + return iterator->second; + + if (KernelRuntime::check_validity(dir_path)) + return cache[dir_path] = std::make_shared(dir_path); + return nullptr; + } +}; + +static auto kernel_runtime_cache = std::make_shared(); + +} // namespace deep_gemm diff --git a/csrc/jit/compiler.hpp b/csrc/jit/compiler.hpp new file mode 100644 index 00000000..4296b358 --- /dev/null +++ b/csrc/jit/compiler.hpp @@ -0,0 +1,172 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "../utils/exception.hpp" +#include "../utils/format.hpp" +#include "../utils/hash.hpp" +#include "../utils/system.hpp" +#include "cache.hpp" +#include "device_runtime.hpp" + +namespace deep_gemm { + +class Compiler { + std::string library_version; + std::filesystem::path library_root_path; + + 
std::string get_library_version() const { + // Recursively walk through all subdirectories and update hash + std::stringstream ss; + for (const auto& entry: std::filesystem::recursive_directory_iterator(library_include_path / "deep_gemm")) { + if (entry.is_regular_file() and entry.path().extension() == ".cuh") { + std::ifstream file(entry.path(), std::ios::binary); + std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + ss << content; + } + } + return get_hex_digest(ss.str()); + } + +public: + std::string signature, flags; + std::filesystem::path library_include_path; + std::filesystem::path cache_dir_path; + + explicit Compiler(const std::filesystem::path& library_root_path) { + // Static library paths + this->library_root_path = library_root_path; + this->library_include_path = library_root_path / "include"; + this->library_version = get_library_version(); + + // Cache settings + cache_dir_path = std::filesystem::path(get_env("HOME")) / ".deep_gemm"; + if (const auto& env_cache_dir_path = get_env("DG_JIT_CACHE_DIR"); not env_cache_dir_path.empty()) + cache_dir_path = env_cache_dir_path; + + // The compiler flags applied to all derived compilers + signature = "unknown-compiler"; + std::string ptxas_flags = "--ptxas-options=--register-usage-level=10"; + if (get_env("DG_JIT_PTXAS_VERBOSE", 0)) + ptxas_flags += ",--verbose"; + flags = fmt::format("-std=c++20 --diag-suppress=39,161,174,177,186,940 {}", ptxas_flags); + } + + virtual ~Compiler() = default; + + std::filesystem::path make_tmp_dir() const { + return make_dirs(cache_dir_path / "tmp"); + } + + std::filesystem::path get_tmp_file_path() const { + return make_tmp_dir() / get_uuid(); + } + + void put(const std::filesystem::path& path, const std::string& data) const { + const auto tmp_file_path = get_tmp_file_path(); + + // Write into the temporary file + std::ofstream out(tmp_file_path, std::ios::binary); + DG_HOST_ASSERT(out.write(data.data(), data.size())); + out.close(); + + // 
Atomically replace + std::filesystem::rename(tmp_file_path, path); + } + + std::shared_ptr build(const std::string& name, const std::string& code) const { + const auto kernel_signature = fmt::format("{}$${}$${}$${}$${}", name, library_version, signature, flags, code); + const auto dir_path = cache_dir_path / "cache" / fmt::format("kernel.{}.{}", name, get_hex_digest(kernel_signature)); + + // Hit the runtime cache + if (const auto& runtime = kernel_runtime_cache->get(dir_path); runtime != nullptr) + return runtime; + + // Create the kernel directory + make_dirs(dir_path); + + // Compile into a temporary CUBIN + const auto tmp_cubin_path = get_tmp_file_path(); + compile(code, dir_path, tmp_cubin_path); + + // Replace into the cache directory + make_dirs(dir_path); + std::filesystem::rename(tmp_cubin_path, dir_path / "kernel.cubin"); + + // Put into the runtime cache + const auto& runtime = kernel_runtime_cache->get(dir_path); + DG_HOST_ASSERT(runtime != nullptr); + return runtime; + } + + virtual void compile(const std::string &code, const std::filesystem::path& dir_path, const std::filesystem::path &cubin_path) const = 0; +}; + +class NVCCCompiler final: public Compiler { + std::filesystem::path nvcc_path; + + std::pair get_nvcc_version() const { + DG_HOST_ASSERT(std::filesystem::exists(nvcc_path)); + + // Call the version command + const auto& command = std::string(nvcc_path) + " --version"; + const auto& [return_code, output] = call_external_command(command); + DG_HOST_ASSERT(return_code == 0); + + // The version should be at least 12.3, for the best performance with 12.9 + int major, minor; + std::smatch match; + DG_HOST_ASSERT(std::regex_search(output, match, std::regex(R"(release (\d+\.\d+))"))); + std::sscanf(match[1].str().c_str(), "%d.%d", &major, &minor); + DG_HOST_ASSERT((major > 12 or (major == 12 and minor >= 3)) and "NVCC version should be >= 12.3"); + if (major < 12 or (major == 12 and minor < 9)) + printf("Warning: please use at least NVCC 12.9 for 
the best DeepGEMM performance"); + return {major, minor}; + } + +public: + NVCCCompiler(const std::filesystem::path& library_root_path, + const std::filesystem::path& cuda_home_path_by_torch): + Compiler(library_root_path) { + // Override the compiler signature + nvcc_path = cuda_home_path_by_torch / "bin" / "nvcc"; + if (const auto& env_nvcc_path = get_env("DG_JIT_NVCC_COMPILER"); not env_nvcc_path.empty()) + nvcc_path = env_nvcc_path; + const auto& [nvcc_major, nvcc_minor] = get_nvcc_version(); + signature = fmt::format("NVCC{}.{}", nvcc_major, nvcc_minor); + + // The override the compiler flags + flags = fmt::format("{} -I{} --gpu-architecture=sm_{}a " + "--compiler-options=-fPIC,-O3,-fconcepts,-Wno-deprecated-declarations,-Wno-abi " + "-cubin -O3 --expt-relaxed-constexpr --expt-extended-lambda", + flags, library_include_path.c_str(), device_runtime->get_arch()); + } + + void compile(const std::string &code, const std::filesystem::path& dir_path, const std::filesystem::path &cubin_path) const override { + // Write the code into the cache directory + const auto& code_path = dir_path / "kernel.cu"; + put(code_path, code); + + // Compile + const auto& command = fmt::format("{} {} -o {} {}", nvcc_path.c_str(), code_path.c_str(), cubin_path.c_str(), flags); + if (get_env("DG_JIT_DEBUG", 0) or get_env("DG_JIT_PRINT_COMPILER_COMMAND", 0)) + printf("Running NVCC command: %s", command.c_str()); + const auto& [return_code, output] = call_external_command(command); + if (return_code != 0) { + printf("NVCC compilation failed: %s", output.c_str()); + DG_HOST_ASSERT(false and "NVCC compilation failed"); + } + + // Print PTXAS log + if (get_env("DG_JIT_DEBUG", 0) or get_env("DG_JIT_PTXAS_VERBOSE", 0)) + printf("%s", output.c_str()); + } +}; + +static std::shared_ptr compiler = nullptr; + +} // namespace deep_gemm diff --git a/csrc/jit/device_runtime.hpp b/csrc/jit/device_runtime.hpp new file mode 100644 index 00000000..c3237da8 --- /dev/null +++ b/csrc/jit/device_runtime.hpp 
@@ -0,0 +1,50 @@ +#pragma once + +#include + +#include "../utils/exception.hpp" + +namespace deep_gemm { + +class DeviceRuntime { + int num_sms = 0; + std::shared_ptr cached_prop; + +public: + explicit DeviceRuntime() = default; + + std::shared_ptr get_prop() { + if (cached_prop == nullptr) + cached_prop = std::make_shared(*at::cuda::getCurrentDeviceProperties()); + return cached_prop; + } + + std::pair get_arch_pair() { + const auto prop = get_prop(); + return {prop->major, prop->minor}; + } + + int get_arch() { + const auto& [major, minor] = get_arch_pair(); + return major * 10 + minor; + } + + int get_arch_major() { + return get_arch_pair().first; + } + + void set_num_sms(const int& new_num_sms) { + DG_HOST_ASSERT(0 <= new_num_sms and new_num_sms <= get_prop()->multiProcessorCount); + num_sms = new_num_sms; + } + + int get_num_sms() { + if (num_sms == 0) + num_sms = get_prop()->multiProcessorCount; + return num_sms; + } +}; + +static auto device_runtime = std::make_shared(); + +} // namespace deep_gemm diff --git a/csrc/jit/kernel_runtime.hpp b/csrc/jit/kernel_runtime.hpp new file mode 100644 index 00000000..ac95f99a --- /dev/null +++ b/csrc/jit/kernel_runtime.hpp @@ -0,0 +1,139 @@ +#pragma once + +#include +#include + +#include "../utils/exception.hpp" +#include "../utils/format.hpp" +#include "../utils/system.hpp" +#include "device_runtime.hpp" + +namespace deep_gemm { + +struct LaunchArgs { + std::pair grid_dim; + int num_threads; + int smem_size; + int cluster_dim; + + LaunchArgs(const int& grid_dim_x, const int& num_threads, const int& smem_size = 0, const int& cluster_dim = 1): + grid_dim({grid_dim_x, 1}), num_threads(num_threads), smem_size(smem_size), cluster_dim(cluster_dim) {} + + LaunchArgs(const std::pair& grid_dim, const int& num_threads, const int& smem_size = 0, const int& cluster_dim = 1): + grid_dim(grid_dim), num_threads(num_threads), smem_size(smem_size), cluster_dim(cluster_dim) {} +}; + +template +concept HasLaunchArgs = requires (const T& 
t) { + { t.launch_args } -> std::convertible_to; +}; + +class KernelRuntime final { +public: + static std::filesystem::path cuda_home; + + cudaLibrary_t library; + cudaKernel_t kernel; + + explicit KernelRuntime(const std::filesystem::path& dir_path) { + // NOLINT(*-pro-type-member-init) + const auto& cuobjdump_path = cuda_home / "bin" / "cuobjdump"; + const auto& cubin_path = dir_path / "kernel.cubin"; + if (get_env("DG_JIT_DEBUG")) + printf("Loading CUBIN: %s\n", cubin_path.c_str()); + + // Find the only symbol + // TODO: use kernel enumeration for newer drivers + const std::vector illegal_names = {"vprintf", "__instantiate_kernel", "__internal", "__assertfail"}; + const auto& [exit_code, symbols] = call_external_command(fmt::format("{} -symbols {}", cuobjdump_path.c_str(), cubin_path.c_str())); + DG_HOST_ASSERT(exit_code == 0); + std::istringstream iss(symbols); + std::vector symbol_names; + for (std::string line; std::getline(iss, line); ) { + if (line.find("STT_FUNC") == 0 and std::ranges::none_of(illegal_names, [&](const auto& name) { return line.find(name) != std::string::npos; })) { + const auto& last_space = line.rfind(' '); + symbol_names.push_back(line.substr(last_space + 1)); + } + } + if (get_env("DG_JIT_DEBUG")) { + printf("Symbol names: "); + for (const auto& symbol: symbol_names) + printf("%s, ", symbol.c_str()); + printf("\n"); + } + + // Load from the library + DG_HOST_ASSERT(symbol_names.size() == 1); + DG_CUDA_RUNTIME_CHECK(cudaLibraryLoadFromFile(&library, cubin_path.c_str(), nullptr, nullptr, 0, nullptr, nullptr, 0)); + DG_CUDA_RUNTIME_CHECK(cudaLibraryGetKernel(&kernel, library, symbol_names[0].c_str())); + } + + static void set_cuda_home(const std::string& cuda_home_path_by_torch) { + cuda_home = cuda_home_path_by_torch; + } + + static bool check_validity(const std::filesystem::path& dir_path) { + return std::filesystem::exists(dir_path / "kernel.cu") and + std::filesystem::exists(dir_path / "kernel.cubin"); + } + + ~KernelRuntime() 
noexcept(false) { + const auto& error = cudaLibraryUnload(library); + DG_HOST_ASSERT(error == cudaSuccess or error == cudaErrorCudartUnloading); + } +}; + +// Declare after defining +decltype(KernelRuntime::cuda_home) KernelRuntime::cuda_home; + +template +class LaunchRuntime { +public: + template requires HasLaunchArgs + static std::string generate(const Args& args) { + const auto& code = Derived::generate_impl(args); + if (get_env("DG_JIT_DEBUG", 0)) + printf("Generated kernel code: %s\n", code.c_str()); + return code; + } + + template requires HasLaunchArgs + static void launch(const std::shared_ptr& kernel_runtime, const Args& args) { + const auto& kernel = kernel_runtime->kernel; + const auto& stream = at::cuda::getCurrentCUDAStream(); + const LaunchArgs& launch_args = args.launch_args; + + // Set dynamic shared memory size + if (launch_args.smem_size > 0) + DG_CUDA_RUNTIME_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, launch_args.smem_size)); + + // Launch config + cudaLaunchConfig_t config; + config.gridDim = {static_cast(launch_args.grid_dim.first), + static_cast(launch_args.grid_dim.second), + 1}; + config.blockDim = {static_cast(launch_args.num_threads), 1, 1}; + config.dynamicSmemBytes = launch_args.smem_size; + config.stream = stream; + config.numAttrs = 0; + + // Clusters + cudaLaunchAttribute attr; + if (launch_args.cluster_dim > 1) { + attr.id = cudaLaunchAttributeClusterDimension; + attr.val.clusterDim = {static_cast(launch_args.cluster_dim), 1, 1}; + config.attrs = &attr; + config.numAttrs = 1; + } + + // Launch in the derived class + if (get_env("DG_JIT_DEBUG")) { + printf("Launch kernel with {%d, %d} x %d, shared memory: %d bytes, cluster: %d, stream: %ld\n", + launch_args.grid_dim.first, launch_args.grid_dim.second, launch_args.num_threads, + launch_args.smem_size, launch_args.cluster_dim, stream.id()); + } + Derived::launch_impl(kernel, config, args); + } +}; + +} // namespace deep_gemm diff --git 
a/csrc/jit_kernels/heuristics/common.hpp b/csrc/jit_kernels/heuristics/common.hpp new file mode 100644 index 00000000..b5a8b61c --- /dev/null +++ b/csrc/jit_kernels/heuristics/common.hpp @@ -0,0 +1,298 @@ +#pragma once + +#include "../../utils/math.hpp" + +namespace deep_gemm { + +struct MulticastConfig { + int num_multicast; + bool is_multicast_on_a; + + MulticastConfig(const int& num_multicast, const bool& is_multicast_on_a): + num_multicast(num_multicast), is_multicast_on_a(is_multicast_on_a) { + DG_HOST_ASSERT(1 <= num_multicast and num_multicast <= 2); + } +}; + +struct SharedMemoryConfig { + int smem_size; + int swizzle_a_mode; + int swizzle_b_mode; + int swizzle_cd_mode; +}; + +struct ThreadConfig { + int num_threads; + + // SM90 + int num_tma_threads; + int num_math_threads; + + // SM100 + int num_non_epilogue_threads; + int num_epilogue_threads; + + static ThreadConfig sm90(const int& num_tma_threads, + const int& num_math_threads) { + auto config = ThreadConfig(); + config.num_threads = num_tma_threads + num_math_threads; + config.num_tma_threads = num_tma_threads; + config.num_math_threads = num_math_threads; + return config; + } + + static ThreadConfig sm100(const int& num_non_epilogue_threads, + const int& num_epilogue_threads) { + auto config = ThreadConfig(); + config.num_threads = num_non_epilogue_threads + num_epilogue_threads; + config.num_non_epilogue_threads = num_non_epilogue_threads; + config.num_epilogue_threads = num_epilogue_threads; + return config; + } +}; + +struct GemmConfig { + // Templated configs + GemmType gemm_type; + KernelType kernel_type; + at::ScalarType ab_dtype, cd_dtype; + cute::UMMA::Major major_a; + cute::UMMA::Major major_b; + bool with_accumulation; + int block_m, block_n, block_k; + int num_stages, num_last_stages; + + // Runtime configs + int num_sms; + + // Structured configs + MulticastConfig multicast_config; + SharedMemoryConfig smem_config; + ThreadConfig thread_config; +}; + +static bool is_multicast_legal(const 
int& shape_dim, const int& block_dim, + const int& num_multicast, const int& num_sms, + const bool& require_divisible) { + const bool& divisible = ceil_div(shape_dim, block_dim) % num_multicast == 0 or not require_divisible; + return divisible and num_sms % num_multicast == 0; +} + +static int get_swizzle_mode(const int& block_size, const int& elem_size) { + // `> 0` means interleaving + // 16B actually means non-swizzling (but interleaving) + for (const int& mode: {128, 64, 32, 16}) { + if ((block_size * elem_size) % mode == 0) + return mode; + } + DG_HOST_UNREACHABLE("Unreachable"); +} + +template +static SharedMemoryConfig get_smem_config(const KernelType& kernel_type, + const int& m, const int& n, const int& k, + const int& block_m, const int& block_n, const int& block_k, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const at::ScalarType& ab_dtype, const at::ScalarType& cd_dtype, + const int& num_stages, const MulticastConfig& multicast_config) { + const int& ab_elem_size = static_cast(c10::elementSize(ab_dtype)); + const int& cd_elem_size = static_cast(c10::elementSize(cd_dtype)); + + const int& load_block_m = ArchSpec::get_ab_load_block_m(multicast_config, block_m); + const int& load_block_n = ArchSpec::get_ab_load_block_n(multicast_config, block_n); + const int& swizzle_a_mode = get_swizzle_mode(major_a == cute::UMMA::Major::K ? block_k : load_block_m, ab_elem_size); + const int& swizzle_b_mode = get_swizzle_mode(major_b == cute::UMMA::Major::K ? 
block_k : load_block_n, ab_elem_size); + const int& swizzle_cd_mode = get_swizzle_mode(block_n, cd_elem_size); + + // Different archs have different epilogue pipelines + const int& smem_cd = ArchSpec::get_smem_cd_size(kernel_type, block_m, block_n, swizzle_cd_mode, cd_dtype); + + // A/B shared memory + const int& smem_a_per_stage = load_block_m * block_k * ab_elem_size; + const int& smem_b_per_stage = load_block_n * block_k * ab_elem_size; + + // SF shared memory + const auto& [smem_sfa_per_stage, smem_sfb_per_stage] = + ArchSpec::get_sf_smem_size_per_stage(kernel_type, block_m, block_n, block_k, ab_dtype, cd_dtype); + const int& smem_extra_sfb = ArchSpec::get_extra_sfb_smem_size(m, n, k, block_m, block_n, block_k); + + // M-barriers and tensor memory pointers + const int& smem_barrier = ArchSpec::get_barrier_smem_size(num_stages); + const int& smem_tmem_ptr = ArchSpec::get_tmem_ptr_smem_size(); + + // Sum them up + int smem_size = 0; + smem_size += smem_cd; + smem_size += num_stages * smem_a_per_stage; + smem_size += num_stages * smem_b_per_stage; + smem_size += num_stages * smem_sfa_per_stage; + smem_size += num_stages * smem_sfb_per_stage; + smem_size += smem_extra_sfb; + smem_size += smem_barrier; + smem_size += smem_tmem_ptr; + + return SharedMemoryConfig { + .smem_size = smem_size, + .swizzle_a_mode = swizzle_a_mode, + .swizzle_b_mode = swizzle_b_mode, + .swizzle_cd_mode = swizzle_cd_mode, + }; +} + +template +static GemmConfig get_best_config(const GemmType& gemm_type, const KernelType& kernel_type, + const int& m, const int& n, const int& k, const int& num_groups, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const at::ScalarType& ab_dtype, const at::ScalarType& cd_dtype, + const bool& with_accumulation, const int& num_sms) { + DG_HOST_ASSERT(ab_dtype == torch::kFloat8_e4m3fn); + DG_HOST_ASSERT(cd_dtype == torch::kBFloat16 or cd_dtype == torch::kFloat); + + // Select M/N block sizes + // TODO: support `% 16 == 8` block size on SM90 
+ const auto& block_ms = gemm_type == GemmType::MGroupedContiguous ? + std::vector{get_mk_alignment_for_contiguous_layout()} : std::vector{64, 128, 256}; + std::vector block_ns; + for (int i = 16; i <= 256; i += 16) + block_ns.push_back(i); + + // K block size is selected in a fixed manner + const auto& block_k = 128 / static_cast(c10::elementSize(ab_dtype)); + + // Some util functions + const auto& get_num_blocks = [=](const int& block_m, const int& block_n) { + return ceil_div(m, block_m) * ceil_div(n, block_n) * num_groups; + }; + const auto& get_num_waves = [=](const int& block_m, const int& block_n) { + return ceil_div(get_num_blocks(block_m, block_n), num_sms); + }; + const auto& get_last_wave_util = [=](const int& block_m, const int& block_n) { + const auto& num_last_blocks = get_num_blocks(block_m, block_n) % num_sms; + return num_last_blocks == 0 ? num_sms : num_last_blocks; + }; + + // Decide block sizes by waves + int best_block_m = 0, best_block_n = 0; + int best_num_waves = 0, best_last_util = 0; + for (const auto& block_m: block_ms) { + for (const auto& block_n: block_ns) { + const int& num_waves = get_num_waves(block_m, block_n); + const auto& last_util = get_last_wave_util(block_m, block_n); + if (not ArchSpec::is_block_size_legal(kernel_type, major_a, major_b, ab_dtype, cd_dtype, block_m, block_n)) + continue; + + bool success = false; + if (best_block_m == 0 or best_block_n == 0 or num_waves < best_num_waves) { + success = true; + } else if (num_waves == best_num_waves) { + // Check last wave utilization + success = last_util > best_last_util; + if (last_util == best_last_util) { + // Case 1: same `block_m`, smaller `block_n` (wasted) + success |= block_m == best_block_m and block_n < best_block_n; + // Case 2: same `block_n`, smaller `block_m` (wasted) + success |= block_n == best_block_n and block_m < best_block_m; + // Case 3: different for both `block_m` and `block_n`, larger `block_n` is better + success |= block_m != best_block_m and block_n 
> best_block_n; + } + } + + // Replace with the new config if successful + if (success) { + best_block_m = block_m, best_block_n = block_n; + best_num_waves = num_waves, best_last_util = last_util; + } + } + } + DG_HOST_ASSERT(best_block_m > 0 and best_block_n > 0); + + // Decide the number of TMA multicasts and whether broadcast on A + MulticastConfig best_multicast_config = {1, true}; + const auto& [is_legal_on_a, is_legal_on_b] = ArchSpec::get_multicast_legality( + gemm_type, m, n, best_block_m, best_block_n, num_sms); + const bool is_legal[2] = {is_legal_on_a, is_legal_on_b}; + bool order[2] = {false, true}; + if (best_block_m > best_block_n) + std::swap(order[0], order[1]); + for (const bool& is_multicast_on_a: order) { + if (m >= 512 and is_legal[static_cast(is_multicast_on_a)]) { + best_multicast_config = {2, is_multicast_on_a}; + break; + } + } + + // Always pick the largest number of stage + constexpr int smem_capacity = ArchSpec::smem_capacity; + int best_num_stages = 0; + SharedMemoryConfig best_smem_config; + for (int num_stages = std::min(12, ceil_div(k, block_k)); num_stages > 0; -- num_stages) { + if (not ArchSpec::is_num_stages_legal(ab_dtype, cd_dtype, num_stages, best_block_m, best_block_n, block_k)) + continue; + + best_smem_config = get_smem_config(kernel_type, + m, n, k, + best_block_m, best_block_n, block_k, + major_a, major_b, + ab_dtype, cd_dtype, + num_stages, best_multicast_config); + if (best_smem_config.smem_size <= smem_capacity) { + best_num_stages = num_stages; + break; + } + } + DG_HOST_ASSERT(best_num_stages != 0); + + // Recompute the minimal number of SMs required + // NOTES: less L2 cache usage and less GPU frequency drop + int num_min_sms = num_sms; + if (ArchSpec::should_minimize_num_sms()) { + num_min_sms = ceil_div(ceil_div(m, best_block_m) * ceil_div(n, best_block_n) * num_groups, best_num_waves); + num_min_sms = align(num_min_sms, best_multicast_config.num_multicast); + DG_HOST_ASSERT(num_min_sms <= num_sms); + } + + const 
auto& config = GemmConfig { + .gemm_type = gemm_type, + .kernel_type = kernel_type, + .ab_dtype = ab_dtype, + .cd_dtype = cd_dtype, + .major_a = major_a, + .major_b = major_b, + .with_accumulation = with_accumulation, + .block_m = best_block_m, + .block_n = best_block_n, + .block_k = block_k, + .num_stages = best_num_stages, + .num_last_stages = ceil_div(k, block_k) % best_num_stages, + .num_sms = num_min_sms, + .multicast_config = best_multicast_config, + // ReSharper disable once CppLocalVariableMightNotBeInitialized + .smem_config = best_smem_config, + .thread_config = ArchSpec::get_thread_config(kernel_type, best_block_m, best_block_n) + }; + + // Print configs for the first time + if (get_env("DG_JIT_DEBUG") or get_env("DG_PRINT_CONFIGS")) { + auto key = std::make_tuple(gemm_type, kernel_type, m, n, k, num_groups, major_a, major_b, + ab_dtype, cd_dtype, with_accumulation, num_sms); + static std::set printed; + if (not printed.contains(key)) { + printf("Gemm type: %d, kernel type: %d, M: %d, N: %d, K: %d, groups: %d, " + "A major: %d, B major: %d, AB dtype: %s, CD dtype: %s, accumulation: %d, " + "SM limit: %d -> block M: %d, block N: %d, block K: %d, stages: %d, last stages: %d, " + "SMs: %d, multicast: %d, multicast on A: %d, shared memory: %d bytes, swizzle A: %d, " + "swizzle B: %d, swizzle CD: %d, threads: %d\n", + static_cast(gemm_type), static_cast(kernel_type), m, n, k, num_groups, + static_cast(major_a), static_cast(major_b), c10::toString(ab_dtype), c10::toString(cd_dtype), + static_cast(with_accumulation), num_sms, best_block_m, best_block_n, block_k, + best_num_stages, config.num_last_stages, num_min_sms, best_multicast_config.num_multicast, + static_cast(best_multicast_config.is_multicast_on_a), + best_smem_config.smem_size, best_smem_config.swizzle_a_mode, best_smem_config.swizzle_b_mode, + best_smem_config.swizzle_cd_mode, config.thread_config.num_threads); + printed.insert(key); + } + } + return config; +} + +} // namespace deep_gemm diff --git 
a/csrc/jit_kernels/heuristics/sm100.hpp b/csrc/jit_kernels/heuristics/sm100.hpp new file mode 100644 index 00000000..722c3d1e --- /dev/null +++ b/csrc/jit_kernels/heuristics/sm100.hpp @@ -0,0 +1,144 @@ +#pragma once + +#include +// Reuse some types in the JIT modules +#include + +#include "common.hpp" +#include "../../utils/exception.hpp" + +namespace deep_gemm { + +struct SM100ArchSpec { + static constexpr int smem_capacity = 232448; + + static int get_ab_load_block_m(const MulticastConfig& config, const int& block_m) { + return block_m / (config.is_multicast_on_a ? config.num_multicast : 1); + } + + static int get_ab_load_block_n(const MulticastConfig& config, const int& block_n) { + return block_n / (config.is_multicast_on_a ? 1 : config.num_multicast); + } + + static int get_cd_store_block_m(const int& block_m) { + constexpr int layout_ad_m = 128; + return std::min(block_m, layout_ad_m); + } + + static int get_cd_store_block_n(const int& block_n) { + return block_n; + } + + static std::pair get_sf_uttcp_aligned_block_sizes( + const int& block_m, const int& block_n, const at::ScalarType& ab_dtype) { + constexpr int num_utccp_aligned_elems = 128; + DG_HOST_ASSERT(block_m % num_utccp_aligned_elems == 0); + switch (ab_dtype) { + case torch::kBFloat16: return {0, 0}; + case torch::kFloat8_e4m3fn: return {align(block_m, num_utccp_aligned_elems), align(block_n, num_utccp_aligned_elems)}; + default: DG_HOST_UNREACHABLE("Unknown dtype"); + } + } + + static bool is_block_size_legal(const KernelType& kernel_type, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const at::ScalarType& ab_dtype, const at::ScalarType& cd_dtype, + const int& block_m, const int& block_n) { + // Layout A/D does not support `block_m == 64` and `block_n % 16 != 0` + if (block_m == 64 or block_n % 16 != 0) + return false; + + // Performance is lower with 1D1D and `block_m == 256` + if (kernel_type == KernelType::Kernel1D1D and major_b == cute::UMMA::Major::K and block_m != 
128) + return false; + + // 1D2D kernels' maximum block N is 128 + // 1D2D kernels require more friendly block Ns + if (kernel_type == KernelType::Kernel1D2D and (block_n > 128 or 128 % block_n != 0)) + return false; + + // Check tensor memory validity + int sf_block_m = 0, sf_block_n = 0; + if (kernel_type == KernelType::Kernel1D1D) { + const auto& [sf_block_m_, sf_block_n_] = get_sf_uttcp_aligned_block_sizes(block_m, block_n, ab_dtype); + sf_block_m = sf_block_m_, sf_block_n = sf_block_n_; + } + if (((2 * block_n) + (sf_block_m / 32) + (sf_block_n / 32)) > 512) + return false; + + // NOTES: when B is MN-major, we restrict `block_n` to multiples of 64, + // since TMA performance degrades when `swizzle_b <= 32B` (i.e., when `block_ns % 64 != 0`), even with 3D TMA + return major_b == cute::UMMA::Major::K or block_n % 64 == 0; + } + + static bool is_num_stages_legal(const at::ScalarType& ab_dtype, const at::ScalarType& cd_dtype, + const int& num_stages, + const int& block_m, const int& block_n, const int& block_k) { + return true; + } + + static bool should_minimize_num_sms() { + return false; + } + + static std::pair get_multicast_legality(const GemmType& gemm_type, + const int& m, const int& n, const int& block_m, const int& block_n, + const int& num_sms) { + // TODO: support other layouts + return { + is_multicast_legal(m, block_m, 2, num_sms, true) and (gemm_type == GemmType::Normal or gemm_type == GemmType::KGroupedContiguous), + false, + }; + } + + static ThreadConfig get_thread_config(const KernelType& kernel_type, + const int& block_m, const int& block_n) { + return ThreadConfig::sm100(128, kernel_type == KernelType::Kernel1D1D ? 128 : block_m); + } + + static int get_smem_cd_size(const KernelType& kernel_type, + const int& block_m, const int& block_n, + const int& swizzle_cd_mode, + const at::ScalarType& cd_dtype) { + constexpr static int layout_ad_m = 128; + return (kernel_type == KernelType::Kernel1D1D ? 
std::min(block_m, layout_ad_m) : block_m) * swizzle_cd_mode * 2; + } + + static std::pair get_sf_smem_size_per_stage(const KernelType& kernel_type, + const int& block_m, const int& block_n, const int& block_k, + const at::ScalarType& ab_dtype, const at::ScalarType& cd_dtype) { + if (ab_dtype == torch::kBFloat16) + return {0, 0}; + + int smem_sfa_per_stage = 0; + int smem_sfb_per_stage = 0; + if (kernel_type == KernelType::Kernel1D1D) { + const auto [sf_block_m, sf_block_n] = get_sf_uttcp_aligned_block_sizes(block_m, block_n, ab_dtype); + smem_sfa_per_stage = sf_block_m * 4; + smem_sfb_per_stage = sf_block_n * 4; + } else { + smem_sfa_per_stage = block_m * 4; + smem_sfb_per_stage = 0; + } + return {smem_sfa_per_stage, smem_sfb_per_stage}; + } + + static int get_extra_sfb_smem_size(const int& m, const int& n, const int& k, + const int& block_m, const int& block_n, const int& block_k) { + return 0; + } + + static int get_barrier_smem_size(const int& num_stages) { + // TODO: remove SF barriers for BF16 GEMMs + // TMA full/empty barriers, with-SF full barriers, tensor memory full/empty barriers + // NOTES: 1D2D kernel will not use the with-SF full barriers + // NOTES: some shapes may only have 1 epilogue stage, but we still allocate space for 2 stages + return num_stages * 8 * 3 + 2 * 8 * 2; + } + + static int get_tmem_ptr_smem_size() { + return 4; + } +}; + +} // namespace deep_gemm diff --git a/csrc/jit_kernels/heuristics/sm90.hpp b/csrc/jit_kernels/heuristics/sm90.hpp new file mode 100644 index 00000000..a1cb5b4b --- /dev/null +++ b/csrc/jit_kernels/heuristics/sm90.hpp @@ -0,0 +1,115 @@ +#pragma once + +#include +// Reuse some types in the JIT modules +#include + +#include "common.hpp" + +namespace deep_gemm { + +struct SM90ArchSpec { + static constexpr int smem_capacity = 232448; + + static int get_ab_load_block_m(const MulticastConfig& multicast_config, const int& block_m) { + return block_m; + } + + static int get_ab_load_block_n(const MulticastConfig& 
multicast_config, const int& block_n) { + return block_n; + } + + static int get_cd_store_block_m(const int& block_m) { + return block_m; + } + + static int get_cd_store_block_n(const int& block_n) { + return block_n; + } + + static bool is_block_size_legal(const KernelType& kernel_type, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const at::ScalarType& ab_dtype, const at::ScalarType& cd_dtype, + const int& block_m, const int& block_n) { + // FP32 output does not support `block_m == 256` + if (cd_dtype == at::kFloat and block_m == 256) + return false; + + // Must be some fixed block N selections + // NOTE: `!= ... and != ...` (not `or`) — reject only when block N is none of the allowed values; + // with `or` the condition is a tautology and every block N > 128 would be rejected + if (block_n > 128 and kernel_type == KernelType::Kernel1D1D and (block_n != 136 and block_n != 152)) + return false; + if (block_n > 128 and kernel_type == KernelType::Kernel1D2D and (block_n != 144 and block_n != 160)) + return false; + + // Avoid bank conflicts for FP32 output + if (cd_dtype == torch::kFloat and block_n % 16 == 0) + return false; + + // The block sizes cannot be too large (for enough registers), so at least one dim less than 128 + return block_m <= 128 or block_n <= 128; + } + + static bool is_num_stages_legal(const at::ScalarType& ab_dtype, const at::ScalarType& cd_dtype, + const int& num_stages, + const int& block_m, const int& block_n, const int& block_k) { + // Unrolling both stages and `num_former_iters` will cause large code size + if (ab_dtype == torch::kFloat8_e4m3fn and block_k % block_n != 0 and block_k / std::gcd(block_n, block_k) <= 4) + return num_stages <= 4; + return true; + } + + static bool should_minimize_num_sms() { + return true; + } + + static std::pair get_multicast_legality(const GemmType& gemm_type, + const int& m, const int& n, const int& block_m, const int& block_n, + const int& num_sms) { + return { + is_multicast_legal(n, block_n, 2, num_sms, gemm_type == GemmType::MGroupedMasked), + is_multicast_legal(m, block_m, 2, num_sms, false) and gemm_type != GemmType::MGroupedMasked, + }; + } + + static
ThreadConfig get_thread_config(const KernelType& kernel_type, + const int& block_m, const int& block_n) { + return ThreadConfig::sm90(128, (block_m == 64 ? 1 : 2) * 128); + } + + static int get_smem_cd_size(const KernelType& kernel_type, + const int& block_m, const int& block_n, + const int& swizzle_cd_mode, const at::ScalarType& cd_dtype) { + return block_m * block_n * static_cast(c10::elementSize(cd_dtype)); + } + + static std::pair get_sf_smem_size_per_stage(const KernelType& kernel_type, + const int& block_m, const int& block_n, const int& block_k, + const at::ScalarType& ab_dtype, const at::ScalarType& cd_dtype) { + if (ab_dtype == torch::kBFloat16) + return {0, 0}; + + int smem_sfa_per_stage = block_m * static_cast(sizeof(float)); + int smem_sfb_per_stage = 0; + // TODO: figure out here + if (kernel_type == KernelType::Kernel1D1D) + smem_sfb_per_stage = align(block_n * 4, block_k); + return {smem_sfa_per_stage, smem_sfb_per_stage}; + } + + static int get_extra_sfb_smem_size(const int& m, const int& n, const int& k, + const int& block_m, const int& block_n, const int& block_k) { + const auto& use_uniform_sfb = block_k % block_n == 0 ? 
1 : 2; + return align(ceil_div(k, block_k) * static_cast(sizeof(float)) * use_uniform_sfb, 8); + } + + static int get_barrier_smem_size(const int& num_stages) { + // For 1D1D kernels, there is an extra barrier for accumulation + return (num_stages + 1) * 8 * 2; + } + + static int get_tmem_ptr_smem_size() { + return 0; + } +}; + +} // namespace deep_gemm diff --git a/csrc/jit_kernels/impls/runtime_utils.hpp b/csrc/jit_kernels/impls/runtime_utils.hpp new file mode 100644 index 00000000..ed9c5305 --- /dev/null +++ b/csrc/jit_kernels/impls/runtime_utils.hpp @@ -0,0 +1,173 @@ +#pragma once + +#include +#include + +#include "../../utils/math.hpp" +#include "../../utils/exception.hpp" + +namespace deep_gemm { + +static std::pair get_inner_outer_dims(const cute::UMMA::Major& major, const int& k, const int& mn) { + return major == cute::UMMA::Major::K ? std::make_pair(k, mn) : std::make_pair(mn, k); +} + +static int get_non_contiguous_dim(const cute::UMMA::Major& major) { + return major == cute::UMMA::Major::K ? 
-2 : -1; +} + +static int get_compiled_dim(const int& dim, const char& name, const std::string& compiled_dims) { + for (const char& c: compiled_dims) { + if (name == c) + return dim; + } + return 0; +} + +static std::string to_string(const cute::UMMA::Major& major) { + switch (major) { + case cute::UMMA::Major::K: return "cute::UMMA::Major::K"; + case cute::UMMA::Major::MN: return "cute::UMMA::Major::MN"; + } + DG_HOST_UNREACHABLE("Unknown major"); +} + +static std::string to_string(const GemmType& type) { + switch (type) { + case GemmType::Normal: return "GemmType::Normal"; + case GemmType::MGroupedContiguous: return "GemmType::MGroupedContiguous"; + case GemmType::MGroupedMasked: return "GemmType::MGroupedMasked"; + case GemmType::KGroupedContiguous: return "GemmType::KGroupedContiguous"; + } + DG_HOST_UNREACHABLE("Unknown GEMM type"); +} + +static std::string to_string(const at::ScalarType& dtype) { + switch (dtype) { + case torch::kInt: return "int"; + case torch::kFloat: return "float"; + case torch::kBFloat16: return "cutlass::bfloat16_t"; + default: DG_HOST_UNREACHABLE("Unsupported dtype"); + } +} + +static CUtensorMapDataType aten_dtype_to_tensor_map_dtype(const at::ScalarType& dtype) { + switch (dtype) { + case torch::kInt: return CU_TENSOR_MAP_DATA_TYPE_INT32; + case torch::kFloat: return CU_TENSOR_MAP_DATA_TYPE_FLOAT32; + case torch::kBFloat16: return CU_TENSOR_MAP_DATA_TYPE_BFLOAT16; + case torch::kFloat8_e4m3fn: return CU_TENSOR_MAP_DATA_TYPE_UINT8; + default: DG_HOST_UNREACHABLE("Unsupported dtype"); + } +} + +static CUtensorMapSwizzle mode_into_tensor_map_swizzle(const int& mode) { + switch (mode) { + case 0: return CU_TENSOR_MAP_SWIZZLE_NONE; + case 16: return CU_TENSOR_MAP_SWIZZLE_NONE; + case 32: return CU_TENSOR_MAP_SWIZZLE_32B; + case 64: return CU_TENSOR_MAP_SWIZZLE_64B; + case 128: return CU_TENSOR_MAP_SWIZZLE_128B; + default: DG_HOST_UNREACHABLE("Unsupported swizzling mode"); + } +} + +static CUtensorMap make_tma_2d_desc(const torch::Tensor& 
t, + int gmem_inner_dim, int gmem_outer_dim, + int smem_inner_dim, int smem_outer_dim, + const int& gmem_outer_stride, + const int& swizzle_mode) { + const auto& elem_size = static_cast(t.element_size()); + if (swizzle_mode != 0) + smem_inner_dim = swizzle_mode / elem_size; + + CUtensorMap tensor_map; + const cuuint64_t gmem_dims[2] = {static_cast(gmem_inner_dim), static_cast(gmem_outer_dim)}; + const cuuint32_t smem_dims[2] = {static_cast(smem_inner_dim), static_cast(smem_outer_dim)}; + const cuuint64_t gmem_strides[1] = {static_cast(gmem_outer_stride * elem_size), }; + const cuuint32_t elem_strides[2] = {1, 1}; + if (get_env("DG_JIT_DEBUG")) { + printf("Making TMA desc: global memory: %d %d, shared memory: %d %d, outer stride: %d, swizzle: %d, elem size: %d\n", + gmem_inner_dim, gmem_outer_dim, smem_inner_dim, smem_outer_dim, + gmem_outer_stride, swizzle_mode, elem_size); + } + DG_CUDA_DRIVER_CHECK(cuTensorMapEncodeTiled( + &tensor_map, aten_dtype_to_tensor_map_dtype(t.scalar_type()), + 2, t.data_ptr(), gmem_dims, gmem_strides, smem_dims, elem_strides, + CU_TENSOR_MAP_INTERLEAVE_NONE, mode_into_tensor_map_swizzle(swizzle_mode), + CU_TENSOR_MAP_L2_PROMOTION_L2_256B, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE)); + return tensor_map; +} + +static CUtensorMap make_tma_a_desc(const cute::UMMA::Major& major, + const torch::Tensor& t, + const int& shape_m, const int& shape_k, + const int& block_m, const int& block_k, + const int& outer_stride, + const int& num_groups, + const int& swizzle_mode) { + if (num_groups > 1) + DG_HOST_ASSERT(major == cute::UMMA::Major::K); + const auto& [gmem_inner_dim, gmem_outer_dim] = get_inner_outer_dims(major, shape_k, shape_m * num_groups); + const auto& [smem_inner_dim, smem_outer_dim] = get_inner_outer_dims(major, block_k, block_m); + return make_tma_2d_desc(t, + gmem_inner_dim, gmem_outer_dim, + smem_inner_dim, smem_outer_dim, + outer_stride, + swizzle_mode); +} + +static CUtensorMap make_tma_b_desc(const cute::UMMA::Major& major, + const 
torch::Tensor& t, + const int& shape_n, const int& shape_k, + const int& block_n, const int& block_k, + const int& outer_stride, + const int& num_groups, + const int& swizzle_mode) { + const auto& [gmem_inner_dim, gmem_outer_dim] = get_inner_outer_dims(major, shape_k, shape_n); + const auto& [smem_inner_dim, smem_outer_dim] = get_inner_outer_dims(major, block_k, block_n); + + // `num_groups` is always applied into the outer dimensions + return make_tma_2d_desc(t, + gmem_inner_dim, gmem_outer_dim * num_groups, + smem_inner_dim, smem_outer_dim, + outer_stride, + swizzle_mode); +} + +static CUtensorMap make_tma_cd_desc(const torch::Tensor& t, + const int& shape_m, const int& shape_n, + const int& block_m, const int& block_n, + const int& outer_stride, + const int& num_groups, + const int& swizzle_mode) { + + // Swizzling requires the inner box dim to be less or equal than `kSwizzleCDMode` + // bytes, so `BLOCK_N * sizeof(T) / kSwizzleCDMode` TMA stores are required + return make_tma_2d_desc(t, + shape_n, shape_m * num_groups, + block_n, block_m, + outer_stride, + swizzle_mode); +} + +static CUtensorMap make_tma_sf_desc(const cute::UMMA::Major& major, + const torch::Tensor& t, + int shape_mn, int shape_k, + const int& block_mn, const int& block_k, + const int& num_groups, + const int& swizzle_mode) { + DG_HOST_ASSERT(major == cute::UMMA::Major::MN); + + // TODO: maybe swizzle SF as well + DG_HOST_ASSERT(swizzle_mode == 0); + + shape_mn = get_tma_aligned_size(shape_mn, static_cast(t.element_size())); + return make_tma_2d_desc(t, + shape_mn, ceil_div(shape_k, block_k * (t.scalar_type() == torch::kFloat ? 
1 : 4)) * num_groups, + block_mn, 1, + shape_mn, + swizzle_mode); +} + +} // namespace deep_gemm diff --git a/csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp b/csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp new file mode 100644 index 00000000..fe8887e4 --- /dev/null +++ b/csrc/jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp @@ -0,0 +1,351 @@ +#pragma once + +#include + +#include "../../jit/compiler.hpp" +#include "../../jit/kernel_runtime.hpp" +#include "../../utils/exception.hpp" +#include "../../utils/format.hpp" +#include "../../utils/math.hpp" +#include "../heuristics/sm100.hpp" +#include "runtime_utils.hpp" + +namespace deep_gemm { + +class SM100FP8Gemm1D1DRuntime final: public LaunchRuntime { +public: + struct Args { + int m, n, k, num_groups; + const std::string& compiled_dims; + + GemmConfig gemm_config; + LaunchArgs launch_args; + + void* grouped_layout; + CUtensorMap tensor_map_a; + CUtensorMap tensor_map_b; + CUtensorMap tensor_map_sfa; + CUtensorMap tensor_map_sfb; + CUtensorMap tensor_map_c; + CUtensorMap tensor_map_d; + }; + + static std::string generate_impl(const Args& args) { + return fmt::format(R"( +#ifdef __CUDACC_RTC__ +#include +#else +#include +#include +#endif + +#include + +using namespace deep_gemm; + +static void __instantiate_kernel() {{ + auto ptr = reinterpret_cast(&sm100_fp8_gemm_1d1d_impl< + {}, {}, + {}, {}, {}, + {}, {}, {}, + {}, + {}, {}, {}, + {}, {}, + {}, {}, + {}, {}, + {}, + {}, {} + >); +}}; +)", + to_string(args.gemm_config.major_a), to_string(args.gemm_config.major_b), + get_compiled_dim(args.m, 'm', args.compiled_dims), get_compiled_dim(args.n, 'n', args.compiled_dims), get_compiled_dim(args.k, 'k', args.compiled_dims), + args.gemm_config.block_m, args.gemm_config.block_n, args.gemm_config.block_k, + args.num_groups, + args.gemm_config.smem_config.swizzle_a_mode, args.gemm_config.smem_config.swizzle_b_mode, args.gemm_config.smem_config.swizzle_cd_mode, + args.gemm_config.num_stages, args.gemm_config.num_last_stages, + 
args.gemm_config.thread_config.num_non_epilogue_threads, args.gemm_config.thread_config.num_epilogue_threads, + args.gemm_config.multicast_config.num_multicast, args.gemm_config.multicast_config.is_multicast_on_a, + to_string(args.gemm_config.gemm_type), + args.gemm_config.with_accumulation, + to_string(args.gemm_config.cd_dtype)); + } + + static void launch_impl(const cudaKernel_t& kernel, const cudaLaunchConfig_t& config, Args args) { + // TODO: optimize `args` copy + DG_CUDA_RUNTIME_CHECK(cudaLaunchKernelEx(&config, kernel, + args.grouped_layout, args.m, args.n, args.k, + args.tensor_map_a, args.tensor_map_b, + args.tensor_map_sfa, args.tensor_map_sfb, + args.tensor_map_c, args.tensor_map_d)); + } +}; + +static void sm100_fp8_gemm_1d1d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const std::optional& c, + const torch::Tensor& d, + const int& m, const int& n, const int& k, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + const auto& aligned_k = align(k, 128); + const auto& config = get_best_config( + GemmType::Normal, KernelType::Kernel1D1D, + m, n, k, 1, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), c.has_value(), + device_runtime->get_num_sms()); + + const auto& cd = c.value_or(d); + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM100ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + static_cast(a.stride(get_non_contiguous_dim(major_a))), 1, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM100ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), 1, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + 
SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), 1, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_c = make_tma_cd_desc(cd, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(cd.stride(-2)), 1, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, 1, 0); + const auto& tensor_map_sfb = make_tma_sf_desc(cute::UMMA::Major::MN, sfb, n, k, + config.block_n, config.block_k, 1, 0); + + // Duplicate the accumulator if necessary + if (c.has_value()) { + if (c->data_ptr() == d.data_ptr()) { + DG_HOST_ASSERT(c->sizes() == d.sizes() and c->strides() == d.strides()); + } else { + // ReSharper disable once CppExpressionWithoutSideEffects + d.copy_(c.value()); + } + } + + // Launch + const SM100FP8Gemm1D1DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + .num_groups = 1, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .grouped_layout = nullptr, + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_sfa = tensor_map_sfa, + .tensor_map_sfb = tensor_map_sfb, + .tensor_map_c = tensor_map_c, + .tensor_map_d = tensor_map_d + }; + const auto& code = SM100FP8Gemm1D1DRuntime::generate(args); + const auto& runtime = compiler->build("sm100_fp8_gemm_1d1d", code); + SM100FP8Gemm1D1DRuntime::launch(runtime, args); +} + +static void sm100_m_grouped_fp8_gemm_contiguous_1d1d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const torch::Tensor& d, + const torch::Tensor& m_indices, + const int& num_groups, const int& m, const int& n, const int& k, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const 
std::string& compiled_dims) { + const auto& aligned_k = align(k, 128); + const auto& config = get_best_config( + GemmType::MGroupedContiguous, KernelType::Kernel1D1D, + m, n, k, num_groups, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), false, + device_runtime->get_num_sms()); + + // Create tensor descriptors + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM100ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + static_cast(a.stride(get_non_contiguous_dim(major_a))), 1, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM100ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), num_groups, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), 1, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, 1, 0); + const auto& tensor_map_sfb = make_tma_sf_desc(cute::UMMA::Major::MN, sfb, n, k, + config.block_n, config.block_k, num_groups, 0); + + // Launch kernel + const SM100FP8Gemm1D1DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + .num_groups = num_groups, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .grouped_layout = m_indices.data_ptr(), + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_sfa = tensor_map_sfa, + .tensor_map_sfb = tensor_map_sfb, + .tensor_map_c = tensor_map_d, + .tensor_map_d = tensor_map_d + }; + const auto& code = SM100FP8Gemm1D1DRuntime::generate(args); + const 
auto& runtime = compiler->build("sm100_m_grouped_fp8_gemm_contiguous_1d1d", code); + SM100FP8Gemm1D1DRuntime::launch(runtime, args); +} + +static void sm100_fp8_m_grouped_gemm_masked_1d1d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const torch::Tensor& d, + const torch::Tensor& masked_m, + const int& num_groups, const int& m, const int& n, const int& k, + const int& expected_m, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + const auto& aligned_k = align(k, 128); + const auto& config = get_best_config( + GemmType::MGroupedMasked, KernelType::Kernel1D1D, + expected_m, n, k, num_groups, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), false, + device_runtime->get_num_sms()); + + // Create tensor descriptors + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM100ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + static_cast(a.stride(get_non_contiguous_dim(major_a))), num_groups, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM100ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), num_groups, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), num_groups, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, num_groups, 0); + const auto& tensor_map_sfb = make_tma_sf_desc(cute::UMMA::Major::MN, sfb, n, k, + config.block_n, config.block_k, num_groups, 0); + + // Launch kernel + const SM100FP8Gemm1D1DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + 
.num_groups = num_groups, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .grouped_layout = masked_m.data_ptr(), + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_sfa = tensor_map_sfa, + .tensor_map_sfb = tensor_map_sfb, + .tensor_map_c = tensor_map_d, + .tensor_map_d = tensor_map_d + }; + const auto& code = SM100FP8Gemm1D1DRuntime::generate(args); + const auto& runtime = compiler->build("sm100_fp8_m_grouped_gemm_masked_1d1d", code); + SM100FP8Gemm1D1DRuntime::launch(runtime, args); +} + +static void fp8_k_grouped_gemm_1d1d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const std::optional& c, + const torch::Tensor& d, + const int& m, const int& n, + const std::vector& ks, const torch::Tensor& ks_tensor, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + DG_HOST_ASSERT(major_a == cute::UMMA::Major::MN and major_b == cute::UMMA::Major::MN); + + int sum_k = 0, sum_sf_k = 0; + for (const auto& k: ks) { + sum_k += k, sum_sf_k += ceil_div(k, 512); + DG_HOST_ASSERT(k % 128 == 0); + } + const auto& num_groups = static_cast(ks.size()); + + // Get config using max K for better performance + const auto& max_k = *std::ranges::max_element(ks); + const auto& config = get_best_config( + GemmType::KGroupedContiguous, KernelType::Kernel1D1D, + m, n, max_k, num_groups, cute::UMMA::Major::MN, cute::UMMA::Major::MN, + torch::kFloat8_e4m3fn, d.scalar_type(), c.has_value(), + device_runtime->get_num_sms()); + + // Create tensor descriptors + const auto& cd = c.value_or(d); + const auto& tensor_map_a = make_tma_a_desc(cute::UMMA::Major::MN, a, m, sum_k, + SM100ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + static_cast(a.stride(0)), 1, + 
config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(cute::UMMA::Major::MN, b, n, sum_k, + SM100ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(0)), 1, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(1)), num_groups, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_c = make_tma_cd_desc(cd, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(cd.stride(1)), num_groups, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, sum_sf_k * 512, + config.block_m, config.block_k, num_groups, 0); + const auto& tensor_map_sfb = make_tma_sf_desc(cute::UMMA::Major::MN, sfb, n, sum_sf_k * 512, + config.block_n, config.block_k, num_groups, 0); + + // Duplicate the accumulator if necessary + if (c.has_value()) { + DG_HOST_ASSERT(c->data_ptr() == d.data_ptr()); + DG_HOST_ASSERT(c->sizes() == d.sizes() and c->strides() == d.strides()); + } + + // Launch kernel + const SM100FP8Gemm1D1DRuntime::Args& args = { + .m = m, .n = n, .k = sum_k, + .num_groups = num_groups, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .grouped_layout = ks_tensor.data_ptr(), + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_sfa = tensor_map_sfa, + .tensor_map_sfb = tensor_map_sfb, + .tensor_map_c = tensor_map_c, + .tensor_map_d = tensor_map_d + }; + const auto& code = SM100FP8Gemm1D1DRuntime::generate(args); + const auto& runtime = compiler->build("sm100_fp8_k_grouped_gemm_1d1d", code); + 
SM100FP8Gemm1D1DRuntime::launch(runtime, args); +} + +} // namespace deep_gemm diff --git a/csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp b/csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp new file mode 100644 index 00000000..02478a09 --- /dev/null +++ b/csrc/jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp @@ -0,0 +1,242 @@ +#pragma once + +#include + +#include "../../jit/compiler.hpp" +#include "../../jit/kernel_runtime.hpp" +#include "../../utils/exception.hpp" +#include "../../utils/format.hpp" +#include "../../utils/math.hpp" +#include "../heuristics/sm100.hpp" +#include "runtime_utils.hpp" + +namespace deep_gemm { + +class SM100FP8Gemm1D2DRuntime final: public LaunchRuntime { +public: + struct Args { + int m, n, k, num_groups; + const std::string& compiled_dims; + + GemmConfig gemm_config; + LaunchArgs launch_args; + + void *sfb, *grouped_layout; + CUtensorMap tensor_map_a; + CUtensorMap tensor_map_b; + CUtensorMap tensor_map_d; + CUtensorMap tensor_map_sfa; + }; + + static std::string generate_impl(const Args& args) { + return fmt::format(R"( +#ifdef __CUDACC_RTC__ +#include +#else +#include +#include +#endif + +#include + +using namespace deep_gemm; + +static void __instantiate_kernel() {{ + auto ptr = reinterpret_cast(&sm100_fp8_gemm_1d2d_impl< + {}, {}, + {}, {}, {}, + {}, {}, {}, + {}, + {}, {}, {}, + {}, {}, + {}, {}, + {}, {}, + {}, {} + >); +}}; +)", + to_string(args.gemm_config.major_a), to_string(args.gemm_config.major_b), + get_compiled_dim(args.m, 'm', args.compiled_dims), get_compiled_dim(args.n, 'n', args.compiled_dims), get_compiled_dim(args.k, 'k', args.compiled_dims), + args.gemm_config.block_m, args.gemm_config.block_n, args.gemm_config.block_k, + args.num_groups, + args.gemm_config.smem_config.swizzle_a_mode, args.gemm_config.smem_config.swizzle_b_mode, args.gemm_config.smem_config.swizzle_cd_mode, + args.gemm_config.num_stages, args.gemm_config.num_last_stages, + args.gemm_config.thread_config.num_non_epilogue_threads, 
args.gemm_config.thread_config.num_epilogue_threads, + args.gemm_config.multicast_config.num_multicast, args.gemm_config.multicast_config.is_multicast_on_a, + to_string(args.gemm_config.gemm_type), + to_string(args.gemm_config.cd_dtype)); + } + + static void launch_impl(const cudaKernel_t& kernel, const cudaLaunchConfig_t& config, Args args) { + // TODO: optimize `args` copy + DG_CUDA_RUNTIME_CHECK(cudaLaunchKernelEx(&config, kernel, + args.sfb, args.grouped_layout, + args.m, args.n, args.k, + args.tensor_map_a, args.tensor_map_b, + args.tensor_map_d, args.tensor_map_sfa)); + } +}; + +static void sm100_fp8_gemm_1d2d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const std::optional& c, + const torch::Tensor& d, + const int& m, const int& n, const int& k, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + DG_HOST_ASSERT(not c.has_value()); + + const auto& aligned_k = align(k, 128); + const auto& config = get_best_config( + GemmType::Normal, KernelType::Kernel1D2D, + m, n, k, 1, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), c.has_value(), + device_runtime->get_num_sms()); + + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM100ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + static_cast(a.stride(get_non_contiguous_dim(major_a))), 1, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM100ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), 1, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), 1, + config.smem_config.swizzle_cd_mode); + const auto& 
tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, 1, 0); + + // Launch + const SM100FP8Gemm1D2DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + .num_groups = 1, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .sfb = sfb.data_ptr(), + .grouped_layout = nullptr, + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_d = tensor_map_d, + .tensor_map_sfa = tensor_map_sfa, + }; + const auto& code = SM100FP8Gemm1D2DRuntime::generate(args); + const auto& runtime = compiler->build("sm100_fp8_gemm_1d2d", code); + SM100FP8Gemm1D2DRuntime::launch(runtime, args); +} + +static void sm100_m_grouped_fp8_gemm_contiguous_1d2d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const torch::Tensor& d, + const torch::Tensor& m_indices, + const int& num_groups, const int& m, const int& n, const int& k, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + const auto& aligned_k = align(k, 128); + const auto& config = get_best_config( + GemmType::MGroupedContiguous, KernelType::Kernel1D2D, + m, n, k, num_groups, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), false, + device_runtime->get_num_sms()); + + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM100ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + static_cast(a.stride(get_non_contiguous_dim(major_a))), 1, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM100ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), num_groups, + config.smem_config.swizzle_b_mode); + 
const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), 1, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, 1, 0); + + // Launch + const SM100FP8Gemm1D2DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + .num_groups = num_groups, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .sfb = sfb.data_ptr(), + .grouped_layout = m_indices.data_ptr(), + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_d = tensor_map_d, + .tensor_map_sfa = tensor_map_sfa, + }; + const auto& code = SM100FP8Gemm1D2DRuntime::generate(args); + const auto& runtime = compiler->build("sm100_m_grouped_fp8_gemm_contiguous_1d2d", code); + SM100FP8Gemm1D2DRuntime::launch(runtime, args); +} + +static void sm100_fp8_m_grouped_gemm_masked_1d2d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const torch::Tensor& d, + const torch::Tensor& masked_m, + const int& num_groups, const int& m, const int& n, const int& k, + const int& expected_m, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + const auto& aligned_k = align(k, 128); + const auto& config = get_best_config( + GemmType::MGroupedMasked, KernelType::Kernel1D2D, + expected_m, n, k, num_groups, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), false, + device_runtime->get_num_sms()); + + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM100ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + 
static_cast(a.stride(get_non_contiguous_dim(major_a))), num_groups, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM100ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), num_groups, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM100ArchSpec::get_cd_store_block_m(config.block_m), + SM100ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), num_groups, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, num_groups, 0); + + // Launch + const SM100FP8Gemm1D2DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + .num_groups = num_groups, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .sfb = sfb.data_ptr(), + .grouped_layout = masked_m.data_ptr(), + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_d = tensor_map_d, + .tensor_map_sfa = tensor_map_sfa, + }; + const auto& code = SM100FP8Gemm1D2DRuntime::generate(args); + const auto& runtime = compiler->build("sm100_fp8_m_grouped_gemm_masked_1d2d", code); + SM100FP8Gemm1D2DRuntime::launch(runtime, args); +} + +} // namespace deep_gemm diff --git a/csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp b/csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp new file mode 100644 index 00000000..2909ef3b --- /dev/null +++ b/csrc/jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp @@ -0,0 +1,255 @@ +#pragma once + +#include + +#include "../../jit/compiler.hpp" +#include "../../jit/kernel_runtime.hpp" +#include "../../utils/exception.hpp" +#include "../../utils/format.hpp" +#include "../heuristics/sm90.hpp" +#include "runtime_utils.hpp" + 
+namespace deep_gemm { + +class SM90FP8Gemm1D2DRuntime final: public LaunchRuntime { +public: + struct Args { + int m, n, k, num_groups; + const std::string& compiled_dims; + + GemmConfig gemm_config; + LaunchArgs launch_args; + + void *sfb, *grouped_layout; + CUtensorMap tensor_map_a; + CUtensorMap tensor_map_b; + CUtensorMap tensor_map_d; + CUtensorMap tensor_map_sfa; + }; + + static std::string generate_impl(const Args& args) { + return fmt::format(R"( +#ifdef __CUDACC_RTC__ +#include +#else +#include +#include +#endif + +#include + +using namespace deep_gemm; + +static void __instantiate_kernel() {{ + auto ptr = reinterpret_cast(&sm90_fp8_gemm_1d2d_impl< + {}, {}, {}, + {}, + {}, {}, {}, + {}, + {}, {}, + {}, {}, + {}, {}, + {} + >); +}}; +)", + // TODO: add CD dtype + get_compiled_dim(args.m, 'm', args.compiled_dims), get_compiled_dim(args.n, 'n', args.compiled_dims), get_compiled_dim(args.k, 'k', args.compiled_dims), + args.num_groups, + args.gemm_config.block_m, args.gemm_config.block_n, args.gemm_config.block_k, + args.gemm_config.smem_config.swizzle_cd_mode, + args.gemm_config.num_stages, args.gemm_config.num_last_stages, + args.gemm_config.thread_config.num_tma_threads, args.gemm_config.thread_config.num_math_threads, + args.gemm_config.multicast_config.num_multicast, args.gemm_config.multicast_config.is_multicast_on_a, + to_string(args.gemm_config.gemm_type)); + } + + static void launch_impl(const cudaKernel_t& kernel, const cudaLaunchConfig_t& config, Args args) { + // TODO: optimize `args` copy + DG_CUDA_RUNTIME_CHECK(cudaLaunchKernelEx(&config, kernel, + args.sfb, args.grouped_layout, + args.m, args.n, args.k, + args.tensor_map_a, args.tensor_map_b, + args.tensor_map_d, args.tensor_map_sfa)); + } +}; + +static void sm90_fp8_gemm_1d2d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const std::optional& c, + const torch::Tensor& d, + const int& m, const int& n, const int& k, + const 
cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + DG_HOST_ASSERT(not c.has_value() and d.scalar_type() == torch::kBFloat16); + DG_HOST_ASSERT(major_a == cute::UMMA::Major::K and major_b == cute::UMMA::Major::K); + + const auto& aligned_k = align(k, 128); + const auto& config = get_best_config( + GemmType::Normal, KernelType::Kernel1D2D, + m, n, k, 1, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), c.has_value(), + device_runtime->get_num_sms()); + + // Requires no TMA splits + DG_HOST_ASSERT(config.smem_config.swizzle_a_mode == config.block_k); + DG_HOST_ASSERT(config.smem_config.swizzle_b_mode == config.block_k); + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM90ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + static_cast(a.stride(get_non_contiguous_dim(major_a))), 1, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM90ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), 1, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM90ArchSpec::get_cd_store_block_m(config.block_m), + SM90ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), 1, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, 1, 0); + + // Launch + const SM90FP8Gemm1D2DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + .num_groups = 1, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .sfb = sfb.data_ptr(), + .grouped_layout = nullptr, + .tensor_map_a = tensor_map_a, + .tensor_map_b = 
tensor_map_b, + .tensor_map_d = tensor_map_d, + .tensor_map_sfa = tensor_map_sfa, + }; + const auto& code = SM90FP8Gemm1D2DRuntime::generate(args); + const auto& runtime = compiler->build("sm90_fp8_gemm_1d2d", code); + SM90FP8Gemm1D2DRuntime::launch(runtime, args); +} + +static void sm90_m_grouped_fp8_gemm_contiguous_1d2d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const torch::Tensor& d, + const torch::Tensor& m_indices, + const int& num_groups, const int& m, const int& n, const int& k, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + DG_HOST_ASSERT(d.scalar_type() == torch::kBFloat16); + DG_HOST_ASSERT(major_a == cute::UMMA::Major::K and major_b == cute::UMMA::Major::K); + + const auto& aligned_k = align(k, 128); + const auto& config = get_best_config( + GemmType::MGroupedContiguous, KernelType::Kernel1D2D, + m, n, k, num_groups, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), false, + device_runtime->get_num_sms()); + + // Requires no TMA splits + DG_HOST_ASSERT(config.smem_config.swizzle_a_mode == config.block_k); + DG_HOST_ASSERT(config.smem_config.swizzle_b_mode == config.block_k); + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM90ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + config.block_k, + static_cast(a.stride(get_non_contiguous_dim(major_a))), 1, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM90ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), num_groups, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM90ArchSpec::get_cd_store_block_m(config.block_m), + SM90ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), 1, + config.smem_config.swizzle_cd_mode); + 
const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, 1, 0); + + // Launch + const SM90FP8Gemm1D2DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + .num_groups = num_groups, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .sfb = sfb.data_ptr(), + .grouped_layout = m_indices.data_ptr(), + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_d = tensor_map_d, + .tensor_map_sfa = tensor_map_sfa, + }; + const auto& code = SM90FP8Gemm1D2DRuntime::generate(args); + const auto& runtime = compiler->build("sm90_m_grouped_fp8_gemm_contiguous_1d2d", code); + SM90FP8Gemm1D2DRuntime::launch(runtime, args); +} + +static void sm90_fp8_m_grouped_gemm_masked_1d2d(const torch::Tensor& a, const torch::Tensor& sfa, + const torch::Tensor& b, const torch::Tensor& sfb, + const torch::Tensor& d, + const torch::Tensor& masked_m, + const int& num_groups, const int& m, const int& n, const int& k, + const int& expected_m, + const cute::UMMA::Major& major_a, const cute::UMMA::Major& major_b, + const std::string& compiled_dims) { + const auto& aligned_k = align(k, 128); + DG_HOST_ASSERT(d.scalar_type() == torch::kBFloat16); + DG_HOST_ASSERT(major_a == cute::UMMA::Major::K and major_b == cute::UMMA::Major::K); + + const auto& config = get_best_config( + GemmType::MGroupedMasked, KernelType::Kernel1D2D, + expected_m, n, k, num_groups, major_a, major_b, + torch::kFloat8_e4m3fn, d.scalar_type(), false, + device_runtime->get_num_sms()); + + // Requires no TMA splits + DG_HOST_ASSERT(config.smem_config.swizzle_a_mode == config.block_k); + DG_HOST_ASSERT(config.smem_config.swizzle_b_mode == config.block_k); + const auto& tensor_map_a = make_tma_a_desc(major_a, a, m, k, + SM90ArchSpec::get_ab_load_block_m(config.multicast_config, config.block_m), + 
config.block_k, + static_cast(a.stride(get_non_contiguous_dim(major_a))), num_groups, + config.smem_config.swizzle_a_mode); + const auto& tensor_map_b = make_tma_b_desc(major_b, b, n, k, + SM90ArchSpec::get_ab_load_block_n(config.multicast_config, config.block_n), + config.block_k, + static_cast(b.stride(get_non_contiguous_dim(major_b))), num_groups, + config.smem_config.swizzle_b_mode); + const auto& tensor_map_d = make_tma_cd_desc(d, m, n, + SM90ArchSpec::get_cd_store_block_m(config.block_m), + SM90ArchSpec::get_cd_store_block_n(config.block_n), + static_cast(d.stride(-2)), num_groups, + config.smem_config.swizzle_cd_mode); + const auto& tensor_map_sfa = make_tma_sf_desc(cute::UMMA::Major::MN, sfa, m, k, + config.block_m, config.block_k, num_groups, 0); + + // Launch + const SM90FP8Gemm1D2DRuntime::Args& args = { + .m = m, .n = n, .k = aligned_k, + .num_groups = num_groups, + .compiled_dims = compiled_dims, + .gemm_config = config, + .launch_args = LaunchArgs(config.num_sms, config.thread_config.num_threads, + config.smem_config.smem_size, + config.multicast_config.num_multicast), + .sfb = sfb.data_ptr(), + .grouped_layout = masked_m.data_ptr(), + .tensor_map_a = tensor_map_a, + .tensor_map_b = tensor_map_b, + .tensor_map_d = tensor_map_d, + .tensor_map_sfa = tensor_map_sfa, + }; + const auto& code = SM90FP8Gemm1D2DRuntime::generate(args); + const auto& runtime = compiler->build("sm90_fp8_m_grouped_gemm_masked_1d2d", code); + SM90FP8Gemm1D2DRuntime::launch(runtime, args); +} + +} // namespace deep_gemm diff --git a/csrc/jit_kernels/impls/smxx_layout.hpp b/csrc/jit_kernels/impls/smxx_layout.hpp new file mode 100644 index 00000000..9d6e3021 --- /dev/null +++ b/csrc/jit_kernels/impls/smxx_layout.hpp @@ -0,0 +1,199 @@ +#pragma once + +#include + +#include "../../jit/kernel_runtime.hpp" +#include "../../utils/exception.hpp" +#include "../../utils/format.hpp" +#include "../../utils/math.hpp" +#include "../../utils/layout.hpp" + +namespace deep_gemm { + +class 
TransposeAndPackFP32IntoUE8M0Runtime final: public LaunchRuntime { +public: + struct Args { + int mn, sf_k; + int block_mn; + void *sf, *out; + + LaunchArgs launch_args; + }; + + static std::string generate_impl(const Args& args) { + return fmt::format(R"( +#ifdef __CUDACC_RTC__ +#include +#else +#include +#include +#endif + +#include + +using namespace deep_gemm; + +static void __instantiate_kernel() {{ + auto ptr = reinterpret_cast(&transpose_and_pack_fp32_into_ue8m0< + {}, {}, {} + >); +}}; +)", args.launch_args.num_threads, args.block_mn, args.sf_k); + } + + static void launch_impl(const cudaKernel_t& kernel, const cudaLaunchConfig_t& config, Args args) { + DG_CUDA_RUNTIME_CHECK(cudaLaunchKernelEx(&config, kernel, args.sf, args.out, static_cast(args.mn))); + } +}; + +class PackFP32IntoUE8M0Runtime final: public LaunchRuntime { +public: + struct Args { + int num_groups, mn, sf_k, packed_sf_k; + int block_mn, block_packed_sf_k; + void *sf, *out, *ks; + + LaunchArgs launch_args; + }; + + static std::string generate_impl(const Args& args) { + return fmt::format(R"( +#ifdef __CUDACC_RTC__ +#include +#else +#include +#include +#endif + +#include + +using namespace deep_gemm; + +static void __instantiate_kernel() {{ + auto ptr = reinterpret_cast(&pack_fp32_into_ue8m0< + {}, {}, {}, {} + >); +}}; +)", args.num_groups, args.launch_args.num_threads, args.block_mn, args.block_packed_sf_k); + } + + static void launch_impl(const cudaKernel_t& kernel, const cudaLaunchConfig_t& config, Args args) { + DG_CUDA_RUNTIME_CHECK(cudaLaunchKernelEx(&config, kernel, + args.sf, args.out, args.ks, args.mn, args.sf_k, args.packed_sf_k)); + } +}; + +static std::tuple preprocess_sf(const torch::Tensor& sf) { + // NOTES: for the extreme performance, you may rewrite/fuse this function in CUDA + const auto& dim = sf.dim(); + DG_HOST_ASSERT(dim == 2 or dim == 3); + DG_HOST_ASSERT(sf.scalar_type() == torch::kFloat); + const auto& batched_sf = dim == 2 ? 
sf.unsqueeze(0) : sf; + + const auto& [num_groups, mn, sf_k] = get_shape<3>(batched_sf); + const auto& tma_aligned_mn = get_tma_aligned_size(mn, static_cast(sf.element_size())); + return {dim, num_groups, mn, sf_k, tma_aligned_mn, batched_sf}; +} + +static torch::Tensor get_mn_major_tma_aligned_tensor(const torch::Tensor& sf) { + const auto& [dim, num_groups, mn, sf_k, tma_aligned_mn, batched_sf] = preprocess_sf(sf); + + // The last kernel already gives a column-major TMA aligned layout + if ((batched_sf.stride(0) == tma_aligned_mn * sf_k or dim == 2) and batched_sf.stride(1) == 1 and batched_sf.stride(2) == tma_aligned_mn) + return (dim == 2) ? batched_sf.squeeze(0) : batched_sf; + + // Normal layout requires transposing + auto aligned_sf = torch::empty_strided({num_groups, tma_aligned_mn, sf_k}, {tma_aligned_mn * sf_k, 1, tma_aligned_mn}, batched_sf.options()); + aligned_sf = aligned_sf.slice(1, 0, mn).copy_(batched_sf); + return (dim == 2) ? aligned_sf.squeeze(0) : aligned_sf; +} + +static torch::Tensor get_mn_major_tma_aligned_packed_ue8m0_tensor(const torch::Tensor& sf) { + const auto& [dim, num_groups, mn, sf_k, tma_aligned_mn, batched_sf] = preprocess_sf(sf); + const auto& packed_sf_k = ceil_div(sf_k, 4); + const auto& out = torch::empty_strided({num_groups, mn, packed_sf_k}, + {packed_sf_k * tma_aligned_mn, 1, tma_aligned_mn}, + at::TensorOptions().device(batched_sf.device()).dtype(torch::kInt)); + DG_HOST_ASSERT(num_groups == 1 or (mn * sf_k) % 4 == 0); + + // Launch the kernel + if (batched_sf.is_contiguous()) { + constexpr int block_mn = 48; + constexpr int num_threads = 512; + const TransposeAndPackFP32IntoUE8M0Runtime::Args& args = { + .mn = mn, + .sf_k = sf_k, + .block_mn = block_mn, + .sf = batched_sf.data_ptr(), + .out = out.data_ptr(), + .launch_args = LaunchArgs({ceil_div(mn, block_mn), num_groups}, num_threads, block_mn * sf_k * 4) + }; + + const auto& code = TransposeAndPackFP32IntoUE8M0Runtime::generate(args); + const auto& runtime = 
compiler->build("transpose_and_pack_fp32_into_ue8m0", code); + TransposeAndPackFP32IntoUE8M0Runtime::launch(runtime, args); + } else { + DG_HOST_ASSERT(mn % 4 == 0 and num_groups == 1); + DG_HOST_ASSERT(batched_sf.stride(1) == 1 and batched_sf.stride(2) == mn); + + constexpr int block_mn = 128; + constexpr int block_packed_sf_k = 16; + constexpr int num_threads = 512; + const PackFP32IntoUE8M0Runtime::Args& args = { + .num_groups = 1, + .mn = mn, + .sf_k = sf_k, + .packed_sf_k = packed_sf_k, + .block_mn = block_mn, + .block_packed_sf_k = block_packed_sf_k, + .sf = batched_sf.data_ptr(), + .out = out.data_ptr(), + .ks = nullptr, + .launch_args = LaunchArgs({ceil_div(mn, block_mn), ceil_div(packed_sf_k, block_packed_sf_k)}, num_threads) + }; + + const auto& code = PackFP32IntoUE8M0Runtime::generate(args); + const auto& runtime = compiler->build("pack_fp32_into_ue8m0", code); + PackFP32IntoUE8M0Runtime::launch(runtime, args); + } + return (dim == 2) ? out.squeeze(0) : out; +} + +static torch::Tensor get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(const torch::Tensor& sf, + const torch::Tensor& ks_tensor, + const std::vector& ks) { + const auto& [sf_k, mn] = get_shape<2>(sf); + const auto& num_groups = static_cast(ks.size()); + + int ref_sf_k = 0, packed_sf_k = 0; + for (const auto& k: ks) + ref_sf_k += ceil_div(k, 128), packed_sf_k += ceil_div(k, 512); + DG_HOST_ASSERT(sf.is_contiguous()); + DG_HOST_ASSERT(ref_sf_k == sf_k); + DG_HOST_ASSERT(num_groups <= 128 and mn % 4 == 0); + + const auto& out = torch::empty({packed_sf_k, mn}, at::TensorOptions().device(sf.device()).dtype(torch::kInt)); + + constexpr int block_mn = 128; + constexpr int block_packed_sf_k = 16; + constexpr int num_threads = 512; + const PackFP32IntoUE8M0Runtime::Args& args = { + .num_groups = num_groups, + .mn = mn, + .sf_k = sf_k, + .packed_sf_k = packed_sf_k, + .block_mn = block_mn, + .block_packed_sf_k = block_packed_sf_k, + .sf = sf.data_ptr(), + .out = out.data_ptr(), + .ks = 
ks_tensor.data_ptr(), + .launch_args = LaunchArgs({ceil_div(mn, block_mn), ceil_div(packed_sf_k, block_packed_sf_k)}, num_threads) + }; + + const auto& code = PackFP32IntoUE8M0Runtime::generate(args); + const auto& runtime = compiler->build("pack_fp32_into_ue8m0", code); + PackFP32IntoUE8M0Runtime::launch(runtime, args); + return out; +} + +} // namespace deep_gemm diff --git a/csrc/python_api.cpp b/csrc/python_api.cpp new file mode 100644 index 00000000..e1e916f2 --- /dev/null +++ b/csrc/python_api.cpp @@ -0,0 +1,402 @@ +#include +#include + +#include "jit/compiler.hpp" +#include "jit/device_runtime.hpp" +#include "utils/layout.hpp" + +#include "jit_kernels/impls/smxx_layout.hpp" +#include "jit_kernels/impls/sm90_fp8_gemm_1d2d.hpp" +#include "jit_kernels/impls/sm100_fp8_gemm_1d1d.hpp" +#include "jit_kernels/impls/sm100_fp8_gemm_1d2d.hpp" + +#ifndef TORCH_EXTENSION_NAME +#define TORCH_EXTENSION_NAME deep_gemm_cpp +#endif + +namespace deep_gemm { +torch::Tensor transform_sf_into_required_layout(const torch::Tensor& sf, + const int& mn, const int& k, + const std::optional& num_groups, + const std::tuple& recipe, + const bool& is_sfa, + const bool& disable_ue8m0_cast) { + const auto& gran_mn = is_sfa ? 
std::get<0>(recipe) : std::get<1>(recipe); + const auto& gran_k = std::get<2>(recipe); + const auto& arch_major = device_runtime->get_arch_major(); + + // Pre-transform checks + check_sf_layout(sf, mn, k, gran_mn, gran_k, num_groups); + + // (FP32, 1, 128) on SM90: transform to TMA-aligned and MN-major + if (sf.scalar_type() == torch::kFloat and gran_mn == 1 and gran_k == 128 and (arch_major == 9 or disable_ue8m0_cast)) + return get_mn_major_tma_aligned_tensor(sf); + + // (FP32, 1, 128) on SM100: transform to (INT, 1, 128), TMA-aligned and MN-major + if (sf.scalar_type() == torch::kFloat and gran_mn == 1 and gran_k == 128 and arch_major == 10) { + DG_HOST_ASSERT(not disable_ue8m0_cast); + return get_mn_major_tma_aligned_packed_ue8m0_tensor(sf); + } + + // (FP32, 128, 128) on SM90: no need to transform, check shape and contiguous + if (sf.scalar_type() == torch::kFloat and gran_mn == 128 and gran_k == 128 and (arch_major == 9 or disable_ue8m0_cast)) + return check_sf_layout(sf, mn, k, gran_mn, gran_k, num_groups, false, true, torch::kFloat); + + // (FP32, 128, 128) on SM100: transform to (INT, 1, 128), TMA-aligned and MN-major + if (sf.scalar_type() == torch::kFloat and gran_mn == 128 and gran_k == 128 and arch_major == 10) { + DG_HOST_ASSERT(not disable_ue8m0_cast); + const auto& broadcasted = sf.index_select(-2, torch::arange(mn, at::TensorOptions().device(sf.device())).floor_divide_(128)); + return get_mn_major_tma_aligned_packed_ue8m0_tensor(broadcasted); + } + + // (INT, 1, 128) on SM100: transform to TMA-aligned and MN-major + if (sf.scalar_type() == torch::kInt and gran_mn == 1 and gran_k == 128 and arch_major == 10) + return check_sf_layout(sf, mn, k, gran_mn, gran_k, num_groups, true, false, torch::kInt); + + DG_HOST_UNREACHABLE("Unknown SF transformation"); +} + +torch::Tensor transform_k_grouped_sf_into_required_layout(const torch::Tensor& sf, + const std::vector& ks, + const torch::Tensor& ks_tensor, + const std::tuple& recipe) { + 
DG_HOST_ASSERT(sf.dim() == 2); + DG_HOST_ASSERT(recipe == std::make_tuple(1, 1, 128)); + const auto& arch_major = device_runtime->get_arch_major(); + + // FP32 on SM90 + if (sf.scalar_type() == torch::kFloat and arch_major == 9) + DG_HOST_UNREACHABLE("Unimplemented"); + + // FP32 on SM100 + if (sf.scalar_type() == torch::kFloat and arch_major == 10) + return get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(sf, ks_tensor, ks); + + // INT on SM100 + if (sf.scalar_type() == torch::kInt and arch_major == 10) + DG_HOST_UNREACHABLE("Unimplemented"); + + DG_HOST_UNREACHABLE("Unknown cases"); +} + +void fp8_gemm_nt(const std::pair& a, + const std::pair& b, + const torch::Tensor& d, + const std::optional& c, + std::optional> recipe, + const std::string& compiled_dims, + const bool& disable_ue8m0_cast) { + // Shape must be `[M, K] @ [N, K].T` + const auto& major_a = get_major_type_ab(a.first); + const auto& major_b = get_major_type_ab(b.first); + if (fp8_requires_k_major()) { + DG_HOST_ASSERT(major_a == cute::UMMA::Major::K); + DG_HOST_ASSERT(major_b == cute::UMMA::Major::K); + } + + // C/D must be N-major + check_major_type_cd(d); + + // Type and shape checks + const auto& [m , k ] = get_shape<2>(a.first); + const auto& [n , k_] = get_shape<2>(b.first); + const auto& [m_, n_] = get_shape<2>(d); + DG_HOST_ASSERT(m == m_ and n == n_ and k == k_); + DG_HOST_ASSERT(n > 0 and k > 0); + DG_HOST_ASSERT(a.first.scalar_type() == torch::kFloat8_e4m3fn); + DG_HOST_ASSERT(b.first.scalar_type() == torch::kFloat8_e4m3fn); + DG_HOST_ASSERT(d.scalar_type() == torch::kBFloat16 or d.scalar_type() == torch::kFloat); + + // Check C as well + if (c.has_value()) { + check_major_type_cd(c.value()); + DG_HOST_ASSERT(d.scalar_type() == torch::kFloat); + DG_HOST_ASSERT(c.value().scalar_type() == torch::kFloat); + } + + // Do nothing if the problem is empty + if (m == 0) + return; + + // Transform SFA and SFB into compute-required layout + if (not recipe.has_value()) + recipe = 
get_default_recipe(a.second.scalar_type(), b.second.scalar_type()); + const auto& sfa = transform_sf_into_required_layout(a.second, m, k, std::nullopt, recipe.value(), true, disable_ue8m0_cast); + const auto& sfb = transform_sf_into_required_layout(b.second, n, k, std::nullopt, recipe.value(), false, disable_ue8m0_cast); + + // Dispatch into different implements + const auto& arch_major = device_runtime->get_arch_major(); + if (arch_major == 9 and sfa.scalar_type() == torch::kFloat) { + sm90_fp8_gemm_1d2d(a.first, sfa, b.first, sfb, c, d, m, n, k, major_a, major_b, compiled_dims); + } else if (arch_major == 10 and sfa.scalar_type() == torch::kInt) { + sm100_fp8_gemm_1d1d(a.first, sfa, b.first, sfb, c, d, m, n, k, major_a, major_b, compiled_dims); + } else if (arch_major == 10 and sfa.scalar_type() == torch::kFloat) { + sm100_fp8_gemm_1d2d(a.first, sfa, b.first, sfb, c, d, m, n, k, major_a, major_b, compiled_dims); + } else { + DG_HOST_UNREACHABLE("Unknown kernel or scaling factor types"); + } +} + +void fp8_gemm_nn(const std::pair& a, + const std::pair& b, + const torch::Tensor& d, + const std::optional& c, + const std::optional>& recipe, + const std::string& compiled_dims, + const bool& disable_ue8m0_cast) { + fp8_gemm_nt(a, {b.first.transpose(0, 1), b.second.transpose(0, 1)}, + d, c, recipe, compiled_dims, disable_ue8m0_cast); +} + +void fp8_gemm_tn(const std::pair& a, + const std::pair& b, + const torch::Tensor& d, + const std::optional& c, + const std::optional>& recipe, + const std::string& compiled_dims, + const bool& disable_ue8m0_cast) { + fp8_gemm_nt({a.first.transpose(0, 1), a.second.transpose(0, 1)}, + {b.first.transpose(0, 1), b.second.transpose(0, 1)}, + d, c, recipe, compiled_dims, disable_ue8m0_cast); +} + +void fp8_gemm_tt(const std::pair& a, + const std::pair& b, + const torch::Tensor& d, + const std::optional& c, + const std::optional>& recipe, + const std::string& compiled_dims, + const bool& disable_ue8m0_cast) { + 
fp8_gemm_nt({a.first.transpose(0, 1), a.second.transpose(0, 1)}, b, + d, c, recipe, compiled_dims, disable_ue8m0_cast); +} + +void m_grouped_fp8_gemm_nt_contiguous(const std::pair& a, + const std::pair& b, + const torch::Tensor& d, + const torch::Tensor& m_indices, + std::optional> recipe, + const std::string& compiled_dims, + const bool& disable_ue8m0_cast) { + // Shape must be `[M, K] @ [G, N, K].mT` + const auto& major_a = get_major_type_ab(a.first); + const auto& major_b = get_major_type_ab(b.first); + DG_HOST_ASSERT(major_a == cute::UMMA::Major::K); + if (fp8_requires_k_major()) + DG_HOST_ASSERT(major_b == cute::UMMA::Major::K); + DG_HOST_ASSERT(m_indices.is_contiguous()); + + // Type and shape checks + const auto& [m, k] = get_shape<2>(a.first); + const auto& [num_groups, n, k_] = get_shape<3>(b.first); + const auto& [m_, n_] = get_shape<2>(d); + const auto& m__ = static_cast(m_indices.numel()); + DG_HOST_ASSERT(m == m_ and m == m__ and n == n_ and k == k_); + DG_HOST_ASSERT(n > 0 and k > 0 and num_groups > 0); + DG_HOST_ASSERT(a.first.scalar_type() == torch::kFloat8_e4m3fn); + DG_HOST_ASSERT(b.first.scalar_type() == torch::kFloat8_e4m3fn); + DG_HOST_ASSERT(d.scalar_type() == torch::kBFloat16); + DG_HOST_ASSERT(m_indices.scalar_type() == torch::kInt); + + // D must be N-major + check_major_type_cd(d); + + // Do nothing if empty + if (m == 0) + return; + + // Transform SFA and SFB into compute-required layout + if (not recipe.has_value()) + recipe = get_default_recipe(a.second.scalar_type(), b.second.scalar_type()); + const auto& sfa = transform_sf_into_required_layout(a.second, m, k, std::nullopt, recipe.value(), true, disable_ue8m0_cast); + const auto& sfb = transform_sf_into_required_layout(b.second, n, k, num_groups, recipe.value(), false, disable_ue8m0_cast); + + // Dispatch implementation + const auto& arch_major = device_runtime->get_arch_major(); + if (arch_major == 9 and sfa.scalar_type() == torch::kFloat) { + 
sm90_m_grouped_fp8_gemm_contiguous_1d2d(a.first, sfa, b.first, sfb, d, m_indices, + num_groups, m, n, k, major_a, major_b, compiled_dims); + } else if (arch_major == 10 and sfa.scalar_type() == torch::kInt) { + sm100_m_grouped_fp8_gemm_contiguous_1d1d(a.first, sfa, b.first, sfb, d, m_indices, + num_groups, m, n, k, major_a, major_b, compiled_dims); + } else if (arch_major == 10 and sfa.scalar_type() == torch::kFloat) { + sm100_m_grouped_fp8_gemm_contiguous_1d2d(a.first, sfa, b.first, sfb, d, m_indices, + num_groups, m, n, k, major_a, major_b, compiled_dims); + } else { + DG_HOST_UNREACHABLE("Unknown kernel or scaling factor types"); + } +} + +void m_grouped_fp8_gemm_nn_contiguous(const std::pair& a, + const std::pair& b, + const torch::Tensor& d, + const torch::Tensor& m_indices, + const std::optional>& recipe, + const std::string& compiled_dims, + const bool& disable_ue8m0_cast) { + m_grouped_fp8_gemm_nt_contiguous(a, {b.first.transpose(1, 2), b.second.transpose(1, 2)}, + d, m_indices, recipe, compiled_dims, disable_ue8m0_cast); +} + +void fp8_m_grouped_gemm_nt_masked(const std::pair& a, + const std::pair& b, + const torch::Tensor& d, + const torch::Tensor& masked_m, + const int& expected_m, + std::optional> recipe, + const std::string& compiled_dims, + const bool& disable_ue8m0_cast) { + // Shape must be `[G, M, K] @ [G, N, K].mT` + const auto& major_a = get_major_type_ab(a.first); + const auto& major_b = get_major_type_ab(b.first); + DG_HOST_ASSERT(major_a == cute::UMMA::Major::K and major_b == cute::UMMA::Major::K); + DG_HOST_ASSERT(masked_m.is_contiguous()); + + // Type and shape checks + const auto& [num_groups, m, k] = get_shape<3>(a.first); + const auto& [num_groups_, n, k_] = get_shape<3>(b.first); + const auto& [num_groups__, m_, n_] = get_shape<3>(d); + const auto& num_groups___ = static_cast(masked_m.numel()); + DG_HOST_ASSERT(num_groups == num_groups_ and num_groups == num_groups__ and num_groups == num_groups___); + DG_HOST_ASSERT(m == m_ and n == n_ 
and k == k_); + DG_HOST_ASSERT(expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0); + DG_HOST_ASSERT(a.first.scalar_type() == torch::kFloat8_e4m3fn); + DG_HOST_ASSERT(b.first.scalar_type() == torch::kFloat8_e4m3fn); + DG_HOST_ASSERT(d.scalar_type() == torch::kBFloat16); + DG_HOST_ASSERT(masked_m.scalar_type() == torch::kInt); + + // D must be N-major + check_major_type_cd(d); + + // Transform scaling factors + if (not recipe.has_value()) + recipe = get_default_recipe(a.second.scalar_type(), b.second.scalar_type()); + const auto& sfa = transform_sf_into_required_layout(a.second, m, k, num_groups, recipe.value(), true, disable_ue8m0_cast); + const auto& sfb = transform_sf_into_required_layout(b.second, n, k, num_groups, recipe.value(), false, disable_ue8m0_cast); + + // Dispatch implementation + const auto& arch_major = device_runtime->get_arch_major(); + if (arch_major == 9 and sfa.scalar_type() == torch::kFloat) { + sm90_fp8_m_grouped_gemm_masked_1d2d(a.first, sfa, b.first, sfb, d, masked_m, + num_groups, m, n, k, expected_m, major_a, major_b, compiled_dims); + } else if (arch_major == 10 and sfa.scalar_type() == torch::kInt) { + sm100_fp8_m_grouped_gemm_masked_1d1d(a.first, sfa, b.first, sfb, d, masked_m, + num_groups, m, n, k, expected_m, major_a, major_b, compiled_dims); + } else if (arch_major == 10 and sfa.scalar_type() == torch::kFloat) { + sm100_fp8_m_grouped_gemm_masked_1d2d(a.first, sfa, b.first, sfb, d, masked_m, + num_groups, m, n, k, expected_m, major_a, major_b, compiled_dims); + } else { + DG_HOST_UNREACHABLE("Unsupported kernel or scaling factor types"); + } +} + +void k_grouped_fp8_gemm_tn_contiguous(const std::pair& a, + const std::pair& b, + const torch::Tensor& d, + const std::vector& ks, + const torch::Tensor& ks_tensor, + const std::optional& c, + const std::tuple& recipe, + const std::string& compiled_dims) { + // Must be 1D1D kernel + DG_HOST_ASSERT(recipe == std::make_tuple(1, 1, 128)); + + // Contiguity checks + 
DG_HOST_ASSERT(a.first.is_contiguous()); + DG_HOST_ASSERT(b.first.is_contiguous()); + DG_HOST_ASSERT(d.is_contiguous()); + if (c.has_value()) { + DG_HOST_ASSERT(c.value().scalar_type() == torch::kFloat); + DG_HOST_ASSERT(c.value().is_contiguous()); + } + + // Do nothing if empty + if (std::accumulate(ks.begin(), ks.end(), 0) == 0) + return; + + // Transform SF with padding + const auto& [_, m] = get_shape<2>(a.first); + const auto& [__, n] = get_shape<2>(b.first); + const auto& sfa = transform_k_grouped_sf_into_required_layout(a.second, ks, ks_tensor, recipe); + const auto& sfb = transform_k_grouped_sf_into_required_layout(b.second, ks, ks_tensor, recipe); + + // Dispatch implementation + const auto& arch_major = device_runtime->get_arch_major(); + if (arch_major == 10) { + fp8_k_grouped_gemm_1d1d(a.first, sfa, b.first, sfb, c, d, m, n, ks, ks_tensor, + cute::UMMA::Major::MN, cute::UMMA::Major::MN, compiled_dims); + } else { + DG_HOST_UNREACHABLE("Unsupported architecture"); + } +} + +} // namespace deep_gemm + +// ReSharper disable once CppParameterMayBeConstPtrOrRef +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + using namespace deep_gemm; + + m.doc() = "DeepGEMM C++ library"; + + // Runtime + m.def("get_num_sms", [&]() { + return device_runtime->get_num_sms(); + }); + m.def("set_num_sms", [&](const int& new_num_sms) { + device_runtime->set_num_sms(new_num_sms); + }); + + // JIT + m.def("init", [&](const std::string& library_root_path, const std::string& cuda_home_path_by_torch) { + DG_HOST_ASSERT(get_env("DG_JIT_USE_NVRTC", 0) == 0 and "Currently only support NVCC"); + compiler = std::make_shared(library_root_path, cuda_home_path_by_torch); + KernelRuntime::set_cuda_home(cuda_home_path_by_torch); + }); + + // Stable kernel APIs with automatic arch/layout dispatch + m.def("fp8_gemm_nt", &fp8_gemm_nt, + py::arg("a"), py::arg("b"), py::arg("d"), + py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt, + py::arg("compiled_dims") = "nk", + 
py::arg("disable_ue8m0_cast") = false); + m.def("fp8_gemm_nn", &fp8_gemm_nn, + py::arg("a"), py::arg("b"), py::arg("d"), + py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt, + py::arg("compiled_dims") = "nk", + py::arg("disable_ue8m0_cast") = false); + m.def("fp8_gemm_tn", &fp8_gemm_tn, + py::arg("a"), py::arg("b"), py::arg("d"), + py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt, + py::arg("compiled_dims") = "mn", + py::arg("disable_ue8m0_cast") = false); + m.def("fp8_gemm_tt", &fp8_gemm_tt, + py::arg("a"), py::arg("b"), py::arg("d"), + py::arg("c") = std::nullopt, py::arg("recipe") = std::nullopt, + py::arg("compiled_dims") = "mn", + py::arg("disable_ue8m0_cast") = false); + m.def("m_grouped_fp8_gemm_nt_contiguous", &m_grouped_fp8_gemm_nt_contiguous, + py::arg("a"), py::arg("b"), py::arg("d"), py::arg("m_indices"), + py::arg("recipe") = std::nullopt, py::arg("compiled_dims") = "nk", + py::arg("disable_ue8m0_cast") = false); + m.def("m_grouped_fp8_gemm_nn_contiguous", &m_grouped_fp8_gemm_nn_contiguous, + py::arg("a"), py::arg("b"), py::arg("d"), py::arg("m_indices"), + py::arg("recipe") = std::nullopt, py::arg("compiled_dims") = "nk", + py::arg("disable_ue8m0_cast") = false); + m.def("fp8_m_grouped_gemm_nt_masked", &fp8_m_grouped_gemm_nt_masked, + py::arg("a"), py::arg("b"), py::arg("d"), py::arg("masked_m"), + py::arg("expected_m"), py::arg("recipe") = std::nullopt, + py::arg("compiled_dims") = "nk", py::arg("disable_ue8m0_cast") = false); + m.def("k_grouped_fp8_gemm_tn_contiguous", &k_grouped_fp8_gemm_tn_contiguous, + py::arg("a"), py::arg("b"), py::arg("d"), py::arg("ks"), + py::arg("ks_tensor"), py::arg("c") = std::nullopt, + py::arg("recipe") = std::make_tuple(1, 1, 128), + py::arg("compiled_dims") = "mn"); + m.def("transform_sf_into_required_layout", &transform_sf_into_required_layout); + + // Raw kernels or functions + m.def("get_tma_aligned_size", &get_tma_aligned_size); + m.def("get_mk_alignment_for_contiguous_layout", 
&get_mk_alignment_for_contiguous_layout); + m.def("get_mn_major_tma_aligned_tensor", &get_mn_major_tma_aligned_tensor); + m.def("get_mn_major_tma_aligned_packed_ue8m0_tensor", &get_mn_major_tma_aligned_packed_ue8m0_tensor); + m.def("get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor", &get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor); +} diff --git a/csrc/utils/exception.hpp b/csrc/utils/exception.hpp new file mode 100644 index 00000000..493e4807 --- /dev/null +++ b/csrc/utils/exception.hpp @@ -0,0 +1,58 @@ +#pragma once + +#include +#include + +namespace deep_gemm { + +class DGException final : public std::exception { + std::string message = {}; + +public: + explicit DGException(const char *name, const char* file, const int line, const std::string& error) { + message = std::string("Failed: ") + name + " error " + file + ":" + std::to_string(line) + " '" + error + "'"; + } + + const char *what() const noexcept override { + return message.c_str(); + } +}; + +#ifndef DG_STATIC_ASSERT +#define DG_STATIC_ASSERT(cond, ...) 
static_assert(cond, __VA_ARGS__) +#endif + +#ifndef DG_HOST_ASSERT +#define DG_HOST_ASSERT(cond) \ +do { \ + if (not (cond)) { \ + throw DGException("Assertion", __FILE__, __LINE__, #cond); \ + } \ +} while (0) +#endif + +#ifndef DG_HOST_UNREACHABLE +#define DG_HOST_UNREACHABLE(reason) (throw DGException("Assertion", __FILE__, __LINE__, reason)) +#endif + +#ifndef DG_CUDA_DRIVER_CHECK +#define DG_CUDA_DRIVER_CHECK(cmd) \ +do { \ + const auto& e = (cmd); \ + if (e != CUDA_SUCCESS) { \ + throw DGException("CUDA driver", __FILE__, __LINE__, ""); \ + } \ +} while (0) +#endif + +#ifndef DG_CUDA_RUNTIME_CHECK +#define DG_CUDA_RUNTIME_CHECK(cmd) \ +do { \ + const auto& e = (cmd); \ + if (e != cudaSuccess) { \ + throw DGException("CUDA runtime", __FILE__, __LINE__, std::to_string(static_cast(e))); \ + } \ +} while (0) +#endif + +} // namespace deep_gemm diff --git a/csrc/utils/format.hpp b/csrc/utils/format.hpp new file mode 100644 index 00000000..bf617372 --- /dev/null +++ b/csrc/utils/format.hpp @@ -0,0 +1,6 @@ +#pragma once + +// Just a wrapper for the `fmt` headers +#define FMT_HEADER_ONLY +#include +#include diff --git a/csrc/utils/hash.hpp b/csrc/utils/hash.hpp new file mode 100644 index 00000000..fad1231f --- /dev/null +++ b/csrc/utils/hash.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include + +namespace deep_gemm { + +static uint64_t fnv1a(const std::string& data, const uint64_t& seed) { + uint64_t h = seed; + const uint64_t& prime = 0x100000001b3ull; + for (const char& c: data) { + h ^= static_cast(c); + h *= prime; + } + return h; +} + +static std::string get_hex_digest(const std::string& data) { + const auto& state_0 = fnv1a(data, 0xc6a4a7935bd1e995ull); + const auto& state_1 = fnv1a(data, 0x9e3779b97f4a7c15ull); + + // Split-mix 64 + const auto& split_mix = [](uint64_t z) { + z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9ull; + z = (z ^ (z >> 27)) * 0x94d049bb133111ebull; + return z ^ (z >> 31); + }; + + std::ostringstream oss; + oss << std::hex << std::setfill('0') + << 
std::setw(16) << split_mix(state_0) + << std::setw(16) << split_mix(state_1); + return oss.str(); +} + +} // namespace deep_gemm diff --git a/csrc/utils/layout.hpp b/csrc/utils/layout.hpp new file mode 100644 index 00000000..47d46c47 --- /dev/null +++ b/csrc/utils/layout.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include +#include + +#include "math.hpp" +#include "exception.hpp" +#include "../jit/device_runtime.hpp" + +namespace deep_gemm { + +// Major-ness stuffs +static void major_check(const torch::Tensor& t) { + const auto dim = t.dim(); + DG_HOST_ASSERT(dim == 2 or dim == 3); + if (dim == 3) + DG_HOST_ASSERT(t.stride(0) == t.size(-2) * t.size(-1)); + DG_HOST_ASSERT(t.stride(-2) == 1 or t.stride(-1) == 1); +} + +static cute::UMMA::Major get_major_type_ab(const torch::Tensor& t) { + major_check(t); + return t.stride(-1) == 1 ? cute::UMMA::Major::K : cute::UMMA::Major::MN; +} + +static void check_major_type_cd(const torch::Tensor& t) { + // NOTES: the library only supports row-major output layouts + major_check(t); + DG_HOST_ASSERT(t.stride(-1) == 1); +} + +static bool fp8_requires_k_major() { + return device_runtime->get_arch_major() == 9; +} + +// Tensor utils +template +static auto get_shape(const torch::Tensor& t) { + return [&t] (std::index_sequence) { + return std::make_tuple(static_cast(t.sizes()[Is])...); + }(std::make_index_sequence()); +} + +// Recipe +static std::tuple +get_default_recipe(const torch::ScalarType& sfa_dtype, const torch::ScalarType& sfb_dtype) { + const auto& arch_major = device_runtime->get_arch_major(); + if (arch_major == 9) { + DG_HOST_ASSERT(sfa_dtype == torch::kFloat and sfb_dtype == torch::kFloat); + return {1, 128, 128}; + } else if (arch_major == 10) { + DG_HOST_ASSERT(sfb_dtype == torch::kFloat or sfb_dtype == torch::kInt); + return sfb_dtype == torch::kFloat ? 
+ std::make_tuple(1, 128, 128): // Legacy format or 1D2D kernels + std::make_tuple(1, 1, 128); // 1D1D kernels + } + DG_HOST_UNREACHABLE("Unknown recipe"); +} + +// SF layouts +static torch::Tensor check_sf_layout(const torch::Tensor& sf, + const int& mn, const int& k, + const int& gran_mn, const int& gran_k, + const std::optional& num_groups, + const bool& tma_stride_check = false, + const bool& contiguous_check = false, + const std::optional& type_check = std::nullopt) { + // Type check + if (type_check.has_value()) + DG_HOST_ASSERT(sf.scalar_type() == type_check.value()); + + // Always do shape checks + const auto& sf_dtype = sf.scalar_type(); + DG_HOST_ASSERT(sf_dtype == torch::kFloat or sf_dtype == torch::kInt); + DG_HOST_ASSERT(sf.dim() == static_cast(num_groups.has_value()) + 2); + if (num_groups.has_value()) + DG_HOST_ASSERT(sf.size(-3) == num_groups.value()); + DG_HOST_ASSERT(sf.size(-2) == ceil_div(mn, gran_mn)); + DG_HOST_ASSERT(sf.size(-1) == ceil_div(k, gran_k * (sf_dtype == torch::kFloat ? 
1 : 4))); + + // TMA stride checks: TMA aligned and MN-major + if (tma_stride_check) { + if (num_groups.has_value()) + DG_HOST_ASSERT(sf.stride(-3) == sf.stride(-1) * sf.size(-1)); + DG_HOST_ASSERT(sf.stride(-2) == 1); + DG_HOST_ASSERT(sf.stride(-1) == get_tma_aligned_size(mn, sf.element_size())); + } + + // Hopper SFB must be contiguous + if (contiguous_check) + DG_HOST_ASSERT(sf.is_contiguous()); + return sf; +} + +// Value matrix layout +static int get_mk_alignment_for_contiguous_layout() { + return 128; +} + +} // namespace deep_gemm diff --git a/csrc/utils/math.hpp b/csrc/utils/math.hpp new file mode 100644 index 00000000..264d2d10 --- /dev/null +++ b/csrc/utils/math.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include + +#include "exception.hpp" + +namespace deep_gemm { + +template +static T ceil_div(const T& a, const T& b) { + return (a + b - 1) / b; +} + +template +static constexpr T align(const T& a, const T& b) { + return ceil_div(a, b) * b; +} + +static int get_tma_aligned_size(const int& x, const int& element_size) { + constexpr int kNumTMAAlignmentBytes = 16; + DG_HOST_ASSERT(kNumTMAAlignmentBytes % element_size == 0); + return align(x, kNumTMAAlignmentBytes / element_size); +} + +} // namespace deep_gemm diff --git a/csrc/utils/system.hpp b/csrc/utils/system.hpp new file mode 100644 index 00000000..7189b7f1 --- /dev/null +++ b/csrc/utils/system.hpp @@ -0,0 +1,70 @@ +#pragma once + +#include +#include +#include + +#include "exception.hpp" + +namespace deep_gemm { + +// ReSharper disable once CppNotAllPathsReturnValue +template +static dtype_t get_env(const std::string& name, const dtype_t& default_value = dtype_t()) { + const auto& c_str = std::getenv(name.c_str()); + if (c_str == nullptr) + return default_value; + + // Read the env and convert to the desired type + if constexpr (std::is_same_v) { + return std::string(c_str); + } else if constexpr (std::is_same_v) { + int value; + std::sscanf(c_str, "%d", &value); + return value; + } else { + 
DG_HOST_ASSERT(false and "Unexpected type"); + } +} + +static std::tuple call_external_command(std::string command) { + command = command + " 2>&1"; + const auto& deleter = [](FILE* f) { if (f) pclose(f); }; + std::unique_ptr pipe(popen(command.c_str(), "r"), deleter); + DG_HOST_ASSERT(pipe != nullptr); + + std::array buffer; + std::string output; + while (fgets(buffer.data(), buffer.size(), pipe.get())) + output += buffer.data(); + const auto& exit_code = WEXITSTATUS(pclose(pipe.release())); + return {exit_code, output}; +} + +static std::filesystem::path make_dirs(const std::filesystem::path& path) { + // OK if existed + std::error_code capture; + const bool& created = std::filesystem::create_directories(path, capture); + DG_HOST_ASSERT(created or capture.value() == 0); + if (created and get_env("DG_JIT_DEBUG")) + printf("Create directory: %s\n", path.c_str()); + return path; +} + +static std::string get_uuid() { + static std::random_device rd; + static std::mt19937 gen([]() { + return rd() ^ std::chrono::steady_clock::now().time_since_epoch().count(); + }()); + static std::uniform_int_distribution dist; + + std::stringstream ss; + ss << getpid() << "-" + << std::hex << std::setfill('0') + << std::setw(8) << dist(gen) << "-" + << std::setw(8) << dist(gen) << "-" + << std::setw(8) << dist(gen); + return ss.str(); +} + +} // deep_gemm diff --git a/deep_gemm/__init__.py b/deep_gemm/__init__.py index a2bbfe21..17e7a330 100644 --- a/deep_gemm/__init__.py +++ b/deep_gemm/__init__.py @@ -1,4 +1,6 @@ import os +import torch +import torch.utils.cpp_extension # Set some default environment provided at setup try: @@ -10,20 +12,30 @@ except ImportError: pass -# All modules -from . 
import ( - dispatch, - jit, - jit_kernels, - testing, - utils +# Import functions from the CPP module +import deep_gemm_cpp +deep_gemm_cpp.init( + os.path.dirname(os.path.abspath(__file__)), # Library root directory path + torch.utils.cpp_extension.CUDA_HOME # CUDA home ) -# All kernels -from .dispatch import * +# Configs +from deep_gemm_cpp import ( + set_num_sms, + get_num_sms +) -# Some useful utils -from .utils.layout import ( - get_device_arch, - get_m_alignment_for_contiguous_layout, +# Kernels +from deep_gemm_cpp import ( + fp8_gemm_nt, fp8_gemm_nn, + fp8_gemm_tn, fp8_gemm_tt, + m_grouped_fp8_gemm_nt_contiguous, + m_grouped_fp8_gemm_nn_contiguous, + fp8_m_grouped_gemm_nt_masked, + k_grouped_fp8_gemm_tn_contiguous ) + +# Some utils +from . import testing +from . import utils +from .utils import * diff --git a/deep_gemm/config.py b/deep_gemm/config.py deleted file mode 100644 index fe3d959d..00000000 --- a/deep_gemm/config.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch - -_num_sms = None - -def set_num_sms(num_sms: int) -> None: - """ - Set the maximum SM count for all GEMM kernels to use. - - Arguments: - num_sms: the desired maximum SM count for all GEMM kernels to use. - """ - global _num_sms - assert 0 < num_sms <= torch.cuda.get_device_properties(device='cuda').multi_processor_count - _num_sms = num_sms - - -def get_num_sms() -> int: - """ - Get the current maximum limit of SM count for all GEMM kernels to use. - If the count is never specified, the function will return the number of device SMs. - - Returns: - Current maximum limit of SM count for all GEMM kernels to use. 
- """ - global _num_sms - if _num_sms is None: - _num_sms = torch.cuda.get_device_properties(device='cuda').multi_processor_count - return _num_sms diff --git a/deep_gemm/dispatch.py b/deep_gemm/dispatch.py deleted file mode 100644 index c22506a4..00000000 --- a/deep_gemm/dispatch.py +++ /dev/null @@ -1,189 +0,0 @@ -import functools -import torch -from typing import Tuple, Optional - -# TODO: add Ampere Triton/tile-lang kernels -from .jit.compiler import get_device_arch -from .jit_kernels.impls import ( - sm90_bf16_gemm, - sm100_bf16_gemm, - sm90_fp8_gemm_1d1d, - sm90_fp8_gemm_1d2d, - sm100_fp8_gemm_1d1d, -) -from .utils.layout import ( - MajorTypeAB, MajorTypeCD, - get_major_type_ab, get_major_type_cd, - transform_sf_into_required_layout -) - - -@functools.lru_cache(maxsize=None) -def must_be_k_major() -> bool: - return { - '90a': True, - '100a': False, - }[get_device_arch()] - - -@functools.lru_cache(maxsize=None) -def get_default_recipe(sfa_dtype: torch.dtype, sfb_dtype: torch.dtype) -> Tuple[int, int, int]: - assert sfa_dtype in (torch.float, torch.int) - return { - ('90a', torch.float): (1, 128, 128), - ('100a', torch.float): (1, 128, 128), - ('100a', torch.int): (1, 1, 128), - }[(get_device_arch(), sfb_dtype)] - - -def fp8_gemm_nt(a: Tuple[torch.Tensor, torch.Tensor], - b: Tuple[torch.Tensor, torch.Tensor], - d: torch.Tensor, - c: Optional[torch.Tensor] = None, - recipe: Optional[Tuple[int, int, int]] = None, - compiled_dims: str = 'nk') -> None: - """ - Perform `d = c + (a @ b)`. - TODO: add more docs. 
- """ - # Compiled dims can be upper cases - compiled_dims = compiled_dims.lower() - - # NOTES: shape must be `[M, K] @ [N, K].T` - major_a = get_major_type_ab(a[0]) - major_b = get_major_type_ab(b[0]) - if must_be_k_major(): - assert major_a == major_b == MajorTypeAB.KMajor - - a, sfa = a - b, sfb = b - m, k = a.shape - n, k_ = b.shape - m_, n_ = d.shape - - # Type and shape checks - assert m == m_ and n == n_ and k == k_ - assert n > 0 and k > 0 - assert a.dtype == torch.float8_e4m3fn - assert b.dtype == torch.float8_e4m3fn - assert d.dtype in (torch.bfloat16, torch.float) - - # D must be N-major - assert get_major_type_cd(d) == MajorTypeCD.NMajor - - # Check C as well - if c is not None: - assert c.dtype == d.dtype == torch.float - assert get_major_type_cd(c) == MajorTypeCD.NMajor - - # Do nothing if the problem is empty - if m == 0: - return - - # Transform SFA and SFB into compute-required layout - recipe = get_default_recipe(sfa.dtype, sfb.dtype) if recipe is None else recipe - sfa = transform_sf_into_required_layout(sfa, mn=m, k=k, recipe=recipe, is_sfa=True) - sfb = transform_sf_into_required_layout(sfb, mn=n, k=k, recipe=recipe, is_sfa=False) - - impl = { - '100a': functools.partial(sm100_fp8_gemm_1d1d.fp8_gemm_nt, - major_a=major_a, major_b=major_b, major_cd=MajorTypeCD.NMajor, - compiled_dims=compiled_dims) - }[get_device_arch()] - impl(a, sfa, b, sfb, c, d) - - -def m_grouped_fp8_gemm_nt_contiguous(a: Tuple[torch.Tensor, torch.Tensor], - b: Tuple[torch.Tensor, torch.Tensor], - d: torch.Tensor, - m_indices: torch.Tensor, - recipe: Optional[Tuple[int, int, int]] = None, - compiled_dims: str = 'nk') -> None: - # Compiled dims can be upper cases - compiled_dims = compiled_dims.lower() - - # NOTES: shape must be `[M, K] @ [G, N, K].mT` - major_a = get_major_type_ab(a[0]) - major_b = get_major_type_ab(b[0]) - assert major_a == MajorTypeAB.KMajor - if must_be_k_major(): - assert major_b == MajorTypeAB.KMajor - assert m_indices.is_contiguous() - - a, sfa = a - 
b, sfb = b - m, k = a.shape - num_groups, n, k_ = b.shape - m_, n_ = d.shape - m__ = m_indices.numel() - - # Type and shape checks - assert m == m_ == m__ and n == n_ and k == k_ - assert n > 0 and k > 0 and num_groups > 0 - assert a.dtype == torch.float8_e4m3fn - assert b.dtype == torch.float8_e4m3fn - assert d.dtype == torch.bfloat16 - assert m_indices.dtype == torch.int32 - - # D must be N-major - assert get_major_type_cd(d) == MajorTypeCD.NMajor - - # Do nothing if the problem is empty - if m == 0: - return - - # Transform SFA and SFB into compute-required layout - recipe = get_default_recipe(sfa.dtype, sfb.dtype) if recipe is None else recipe - sfa = transform_sf_into_required_layout(sfa, mn=m, k=k, recipe=recipe, is_sfa=True) - sfb = transform_sf_into_required_layout(sfb, mn=n, k=k, recipe=recipe, num_groups=num_groups, is_sfa=False) - - impl = { - '100a': functools.partial(sm100_fp8_gemm_1d1d.m_grouped_fp8_gemm_nt_contiguous, major_a=major_a, major_b=major_b, compiled_dims=compiled_dims) - }[get_device_arch()] - impl(a, sfa, b, sfb, d, m_indices) - - -def fp8_m_grouped_gemm_nt_masked(a: Tuple[torch.Tensor, torch.Tensor], - b: Tuple[torch.Tensor, torch.Tensor], - d: torch.Tensor, - masked_m: torch.Tensor, - expected_m: int, - recipe: Optional[Tuple[int, int, int]] = None, - compiled_dims: str = 'nk') -> None: - # Compiled dims can be upper cases - compiled_dims = compiled_dims.lower() - - # NOTES: shape must be `[G, M, K] @ [G, N, K].mT` - major_a = get_major_type_ab(a[0]) - major_b = get_major_type_ab(b[0]) - assert major_a == major_b == MajorTypeAB.KMajor - assert masked_m.is_contiguous() - - a, sfa = a - b, sfb = b - num_groups, m, k = a.shape - num_groups_, n, k_ = b.shape - num_groups__, m_, n_ = d.shape - num_groups___ = masked_m.numel() - - # Type and shape checks - assert num_groups == num_groups_ == num_groups__ == num_groups___ - assert m == m_ and n == n_ and k == k_ - assert expected_m > 0 and m > 0 and n > 0 and k > 0 and num_groups > 0 - assert 
a.dtype == torch.float8_e4m3fn - assert b.dtype == torch.float8_e4m3fn - assert d.dtype == torch.bfloat16 - assert masked_m.dtype == torch.int32 - - # D must be N-major - assert get_major_type_cd(d) == MajorTypeCD.NMajor - - # Transform SFA and SFB into compute-required layout - recipe = get_default_recipe(sfa.dtype, sfb.dtype) if recipe is None else recipe - sfa = transform_sf_into_required_layout(sfa, mn=m, k=k, recipe=recipe, num_groups=num_groups, is_sfa=True) - sfb = transform_sf_into_required_layout(sfb, mn=n, k=k, recipe=recipe, num_groups=num_groups, is_sfa=False) - - impl = { - '100a': functools.partial(sm100_fp8_gemm_1d1d.fp8_m_grouped_gemm_nt_masked, major_a=major_a, major_b=major_b, compiled_dims=compiled_dims) - }[get_device_arch()] - impl(a, sfa, b, sfb, d, masked_m, expected_m) diff --git a/deep_gemm/include/deep_gemm/common/scheduler.cuh b/deep_gemm/include/deep_gemm/common/scheduler.cuh index d5c7f1c0..8ce8aa09 100644 --- a/deep_gemm/include/deep_gemm/common/scheduler.cuh +++ b/deep_gemm/include/deep_gemm/common/scheduler.cuh @@ -1,13 +1,14 @@ #pragma once +#include #include namespace deep_gemm { -enum class GemmType { - Normal, - GroupedContiguous, - GroupedMasked +enum class KGroupedIndexType { + MN, + K, + SF_K, }; #pragma clang diagnostic push @@ -26,22 +27,35 @@ struct Scheduler { uint32_t num_m_blocks; uint32_t num_n_blocks; + // For SM90 multicast checks + uint32_t num_blocks_in_group; + bool is_peer_cta_alive = true; + // For grouped GEMM int* grouped_layout; + uint32_t current_group_idx; // Only used for masked layout - uint32_t curr_group_idx, curr_cumsum; + uint32_t current_m_cumsum; + // Only used for k-grouped layout + uint32_t current_shape_k, current_num_valid_groups, current_k_cumsum, current_sf_k_cumsum; + // ReSharper disable once CppPossiblyUninitializedMember __device__ __forceinline__ explicit Scheduler(const uint32_t& shape_m, const uint32_t& shape_n, int* grouped_layout = nullptr) { num_m_blocks = ceil_div(shape_m, BLOCK_M); 
num_n_blocks = ceil_div(shape_n, BLOCK_N); if constexpr (kGemmType == GemmType::Normal) { num_blocks = num_m_blocks * num_n_blocks; - } else if (kGemmType == GemmType::GroupedContiguous) { + } else if (kGemmType == GemmType::MGroupedContiguous) { num_blocks = num_m_blocks * num_n_blocks; this->grouped_layout = grouped_layout; - } else if (kGemmType == GemmType::GroupedMasked) { - curr_group_idx = curr_cumsum = 0; + } else if (kGemmType == GemmType::MGroupedMasked) { + current_group_idx = current_m_cumsum = 0; + this->grouped_layout = grouped_layout; + } else if (kGemmType == GemmType::KGroupedContiguous) { + current_group_idx = current_num_valid_groups = 0; + current_k_cumsum = current_sf_k_cumsum = 0; + current_shape_k = __ldg(grouped_layout + current_group_idx); this->grouped_layout = grouped_layout; } } @@ -50,68 +64,148 @@ struct Scheduler { DG_STATIC_ASSERT(kNum1DBlocksPerGroup % kNumMulticast == 0, "Invalid group size"); // Swizzle for better L2 usages - // TODO: unify these 2 branches + const auto& primary_num_blocks = kIsMulticastOnA ? num_n_blocks : num_m_blocks; + const auto& secondary_num_blocks = kIsMulticastOnA ? 
num_m_blocks : num_n_blocks; + const auto& num_blocks_per_group = secondary_num_blocks * kNum1DBlocksPerGroup; + const auto& group_idx = block_idx / num_blocks_per_group; + auto first_block_idx = group_idx * kNum1DBlocksPerGroup; + auto in_group_idx = block_idx % num_blocks_per_group; + num_blocks_in_group = min(kNum1DBlocksPerGroup, primary_num_blocks - first_block_idx); + + // Fix unaligned TMA multicast + // NOTES: for SM90 only, as SM90 can dynamically disable TMA multicast + // while SM100 uses 2-CTA, which can not be dynamically disabled +#if __CUDA_ARCH__ < 1000 + if (kNumMulticast > 1 and num_blocks_in_group % 2 != 0) { + if (in_group_idx < (num_blocks_in_group ^ 1) * secondary_num_blocks) { + num_blocks_in_group = num_blocks_in_group ^ 1; + } else { + in_group_idx = in_group_idx - (num_blocks_in_group ^ 1) * secondary_num_blocks; + first_block_idx += num_blocks_in_group ^ 1; + num_blocks_in_group = 1; + } + } +#endif + + // Convert to final M/N block indices if constexpr (kIsMulticastOnA) { - auto num_blocks_per_group = num_m_blocks * kNum1DBlocksPerGroup; - auto group_idx = block_idx / num_blocks_per_group; - auto first_n_block_idx = group_idx * kNum1DBlocksPerGroup; - auto num_n_blocks_in_group = min(kNum1DBlocksPerGroup, num_n_blocks - first_n_block_idx); - auto in_group_idx = block_idx % num_blocks_per_group; - m_block_idx = in_group_idx / num_n_blocks_in_group; - n_block_idx = first_n_block_idx + in_group_idx % num_n_blocks_in_group; + m_block_idx = in_group_idx / num_blocks_in_group; + n_block_idx = first_block_idx + in_group_idx % num_blocks_in_group; } else { - auto num_blocks_per_group = num_n_blocks * kNum1DBlocksPerGroup; - auto group_idx = block_idx / num_blocks_per_group; - auto first_m_block_idx = group_idx * kNum1DBlocksPerGroup; - auto num_m_blocks_in_group = min(kNum1DBlocksPerGroup, num_m_blocks - first_m_block_idx); - auto in_group_idx = block_idx % num_blocks_per_group; - m_block_idx = first_m_block_idx + in_group_idx % 
num_m_blocks_in_group; - n_block_idx = in_group_idx / num_m_blocks_in_group; + m_block_idx = first_block_idx + in_group_idx % num_blocks_in_group; + n_block_idx = in_group_idx / num_blocks_in_group; } } - template + template __device__ __forceinline__ uint32_t get_global_idx(const uint32_t shape_dim, const uint32_t block_size, const uint32_t& block_idx, const uint32_t& m_block_idx = 0) { if constexpr (kGemmType == GemmType::Normal) { return block_idx * block_size; - } else if (kGemmType == GemmType::GroupedContiguous) { - auto offset = kWithGroupOffset ? __ldg(grouped_layout + m_block_idx * BLOCK_M) : 0; + } else if constexpr (kGemmType == GemmType::MGroupedContiguous) { + const auto offset = kWithGroupOffset ? std::max(0, __ldg(grouped_layout + m_block_idx * BLOCK_M)) : 0; return offset * shape_dim + block_idx * block_size; - } else if (kGemmType == GemmType::GroupedMasked) { - auto offset = kWithGroupOffset ? curr_group_idx : 0; + } else if constexpr (kGemmType == GemmType::MGroupedMasked) { + const auto offset = kWithGroupOffset ? 
current_group_idx : 0; return offset * shape_dim + block_idx * block_size; + } else if constexpr (kGemmType == GemmType::KGroupedContiguous) { + auto offset = 0; + if constexpr (kWithGroupOffset) { + if constexpr (kIndexType == KGroupedIndexType::MN) + offset = current_group_idx * shape_dim; + else if constexpr (kIndexType == KGroupedIndexType::K) + offset = current_k_cumsum; + else if constexpr (kIndexType == KGroupedIndexType::SF_K) + offset = current_sf_k_cumsum; + } + return offset + block_idx * block_size; } } __device__ __forceinline__ bool get_next_block(uint32_t& m_block_idx, uint32_t& n_block_idx) { const auto next_block_idx = (++ current_iter) * gridDim.x + blockIdx.x; - if constexpr (kGemmType == GemmType::GroupedMasked) { + if constexpr (kGemmType == GemmType::MGroupedMasked) { while (true) { // End of the task - if (curr_group_idx == kNumGroups) + if (current_group_idx == kNumGroups) return false; // Within current group - num_m_blocks = ceil_div(static_cast(__ldg(grouped_layout + curr_group_idx)), BLOCK_M); - auto current_m_block_cumsum = curr_cumsum + num_m_blocks; + num_m_blocks = ceil_div(static_cast(__ldg(grouped_layout + current_group_idx)), BLOCK_M); + const auto current_m_block_cumsum = current_m_cumsum + num_m_blocks; if (next_block_idx < current_m_block_cumsum * num_n_blocks) break; // Move to check the next group - curr_group_idx ++, curr_cumsum = current_m_block_cumsum; + current_group_idx ++, current_m_cumsum = current_m_block_cumsum; } - get_swizzled_block_idx(next_block_idx - curr_cumsum * num_n_blocks, m_block_idx, n_block_idx); + get_swizzled_block_idx(next_block_idx - current_m_cumsum * num_n_blocks, m_block_idx, n_block_idx); + } else if (kGemmType == GemmType::KGroupedContiguous) { + while (true) { + // End of the task + if (current_group_idx == kNumGroups) + return false; + + // Within current group + if (current_shape_k > 0 and next_block_idx < (current_num_valid_groups + 1) * num_m_blocks * num_n_blocks) + break; + + // Move to 
check the next group + if (current_shape_k > 0) { + current_k_cumsum += current_shape_k; + current_sf_k_cumsum += ceil_div(current_shape_k, 512u); + current_num_valid_groups ++; + } + if ((++ current_group_idx) != kNumGroups) + current_shape_k = __ldg(grouped_layout + current_group_idx); + } + + get_swizzled_block_idx(next_block_idx - current_num_valid_groups * num_m_blocks * num_n_blocks, m_block_idx, n_block_idx); } else { if (next_block_idx >= num_blocks) return false; + // For SM90 only + // NOTES: we don't have to set `is_peer_cta_alive` for masked grouped GEMM, as it must be aligned + is_peer_cta_alive = kNum1DBlocksPerGroup % kNumMulticast == 0 or // Always aligned on N (constant bypass) + num_m_blocks % kNumMulticast == 0 or // Always aligned on M (constant bypass) + (next_block_idx ^ 1) < num_blocks; // Peer CTA in bound get_swizzled_block_idx(next_block_idx, m_block_idx, n_block_idx); } return true; } + + // For SM90 only + __device__ __forceinline__ bool is_tma_multicast_valid(const uint32_t& m_block_idx) const { + if (num_blocks_in_group == 1) + return false; + if constexpr (kGemmType == GemmType::Normal or kGemmType == GemmType::MGroupedMasked) { + return true; + } else { + DG_STATIC_ASSERT(kGemmType == GemmType::MGroupedContiguous, "Invalid Gemm type"); + if constexpr (kIsMulticastOnA) { + return true; + } else { + const auto& group_idx = __ldg(grouped_layout + m_block_idx * BLOCK_M); + const auto& peer_group_idx = __ldg(grouped_layout + (m_block_idx ^ 1) * BLOCK_M); + return group_idx == peer_group_idx; + } + } + } + + // For SM90 only + // ReSharper disable once CppNotAllPathsReturnValue + __device__ __forceinline__ bool is_computation_valid(const uint32_t& m_block_idx, const uint32_t& m_offset) const { + if constexpr (kGemmType == GemmType::Normal) { + return true; + } else if constexpr (kGemmType == GemmType::MGroupedContiguous) { + return __ldg(grouped_layout + m_offset + m_block_idx * BLOCK_M) >= 0; + } else if constexpr (kGemmType == 
GemmType::MGroupedMasked) { + return m_offset + m_block_idx * BLOCK_M < __ldg(grouped_layout + current_group_idx); + } + } }; #pragma clang diagnostic pop diff --git a/deep_gemm/include/deep_gemm/common/sm100_utils.cuh b/deep_gemm/include/deep_gemm/common/sm100_utils.cuh index 671b0779..2016a79a 100644 --- a/deep_gemm/include/deep_gemm/common/sm100_utils.cuh +++ b/deep_gemm/include/deep_gemm/common/sm100_utils.cuh @@ -92,9 +92,22 @@ constexpr static cute::UMMA::LayoutType to_umma_layout_type() { if constexpr (kSwizzleMode == 128) return cute::UMMA::LayoutType::SWIZZLE_128B; } +template +__device__ __forceinline__ +constexpr uint32_t get_umma_desc_stride_k() { + return kMajorMode == cute::UMMA::Major::K ? 1 : get_inner_block_atom_size(); +} + +template +__device__ __forceinline__ +uint32_t advance_umma_desc_lo(const uint32_t& base, const uint32_t& offset, const uint32_t& k_idx) { + return base + ((offset + k_idx * get_umma_desc_stride_k()) >> 4u); +} + template __device__ __forceinline__ cute::UMMA::SmemDescriptor make_umma_desc(dtype_t* base_smem_ptr, uint32_t mn_idx, uint32_t k_idx) { + const uint32_t stride_k = get_umma_desc_stride_k(); if constexpr (kMajorMode == cute::UMMA::Major::K) { // NOTES: for K-major layout, the swizzle must be 128B (also, atom index must be 0), as `BLOCK_K` is always 128 DG_STATIC_ASSERT(kSwizzleMode == BLOCK_K * sizeof(dtype_t), "Unexpected value"); @@ -105,7 +118,7 @@ cute::UMMA::SmemDescriptor make_umma_desc(dtype_t* base_smem_ptr, uint32_t mn_id const uint32_t stride_byte_offset = 8 * BLOCK_K * sizeof(dtype_t); const uint32_t leading_byte_offset = 0; return make_smem_desc(to_umma_layout_type(), - base_smem_ptr + mn_idx * BLOCK_K + k_idx, + base_smem_ptr + mn_idx * BLOCK_K + k_idx * stride_k, stride_byte_offset, leading_byte_offset); } else { constexpr uint32_t BLOCK_MN_ATOM = get_inner_block_atom_size(); @@ -124,7 +137,7 @@ cute::UMMA::SmemDescriptor make_umma_desc(dtype_t* base_smem_ptr, uint32_t mn_id if constexpr (kSwizzleMode == 
16) swap(stride_byte_offset, leading_byte_offset); return make_smem_desc(to_umma_layout_type(), - base_smem_ptr + mn_idx * BLOCK_K + BLOCK_MN_ATOM * k_idx, + base_smem_ptr + mn_idx * BLOCK_K + k_idx * stride_k, stride_byte_offset, leading_byte_offset); } } diff --git a/deep_gemm/include/deep_gemm/common/sm90_utils.cuh b/deep_gemm/include/deep_gemm/common/sm90_utils.cuh index 05ed0ba5..e0160636 100644 --- a/deep_gemm/include/deep_gemm/common/sm90_utils.cuh +++ b/deep_gemm/include/deep_gemm/common/sm90_utils.cuh @@ -1,9 +1,64 @@ #pragma once -#include +#include +#include +#include namespace deep_gemm::sm90 { +template +struct FP8MMA { + + template + __forceinline__ __device__ static void call_fma_impl(uint64_t const& desc_a, uint64_t const& desc_b, float* d, bool scale_d, std::index_sequence) { + using namespace cute::SM90::GMMA; + MMA::fma(desc_a, desc_b, d[Idx]..., (scale_d ? ScaleOut::One : ScaleOut::Zero)); + } + + __forceinline__ __device__ static void wgmma(uint64_t const& desc_a, uint64_t const& desc_b, float* d, bool scale_d) { + call_fma_impl(desc_a, desc_b, d, scale_d, std::make_index_sequence{}); + } + + static constexpr int M = 64; + static constexpr int N = N_; + static constexpr int K = 32; + static constexpr int kNumAccum = M * N / 128; +}; + +template +struct FP8MMASelector { + + static constexpr auto select_mma() { + using namespace cute::SM90::GMMA; + if constexpr (N == 16) return MMA_64x16x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 24) return MMA_64x24x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 32) return MMA_64x32x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 40) return MMA_64x40x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 48) return MMA_64x48x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 56) return MMA_64x56x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 64) return MMA_64x64x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 72) return MMA_64x72x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 80) return MMA_64x80x32_F32E4M3E4M3_SS_TN(); + if constexpr (N 
== 88) return MMA_64x88x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 96) return MMA_64x96x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 104) return MMA_64x104x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 112) return MMA_64x112x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 120) return MMA_64x120x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 128) return MMA_64x128x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 136) return MMA_64x136x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 144) return MMA_64x144x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 152) return MMA_64x152x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 160) return MMA_64x160x32_F32E4M3E4M3_SS_TN(); + if constexpr (N == 192) return MMA_64x192x32_F32E4M3E4M3_SS_TN(); + } + + static constexpr auto select_type() { + return FP8MMA(); + } + + using type = decltype(select_type()); +}; + template struct SM90_U32x2_STSM_N { __device__ __forceinline__ static void @@ -14,4 +69,83 @@ struct SM90_U32x2_STSM_N { } }; +__forceinline__ __device__ void warpgroup_arrive() { + asm volatile("wgmma.fence.sync.aligned;\n" ::: "memory"); +} + +__forceinline__ __device__ void warpgroup_commit_batch() { + asm volatile("wgmma.commit_group.sync.aligned;\n" ::: "memory"); +} + +__forceinline__ __device__ void warpgroup_fence_operand(float& reg) { + asm volatile("" : "+f"(reg) :: "memory"); +} + +template +__forceinline__ __device__ void warpgroup_wait() { + DG_STATIC_ASSERT(N >= 0 and N <= 7, "WGMMA wait: N must be in range [0, 7]"); + asm volatile("wgmma.wait_group.sync.aligned %0;\n" :: "n"(N) : "memory"); +} + +// TODO: replace with CUTLASS solution +union GmmaDescriptor { + __host__ __device__ constexpr GmmaDescriptor() noexcept: desc_(0) {} + + __host__ __device__ constexpr GmmaDescriptor(uint64_t desc) noexcept: desc_(desc) {} + + __host__ __device__ constexpr GmmaDescriptor(GmmaDescriptor const &t) noexcept: desc_(t.desc_) {} + + __host__ __device__ constexpr GmmaDescriptor(GmmaDescriptor &&t) noexcept: desc_(t.desc_) {} + + __host__ 
__device__ constexpr GmmaDescriptor &operator=(GmmaDescriptor const &t) noexcept { + desc_ = t.desc_; + return *this; + } + + __host__ __device__ constexpr GmmaDescriptor &operator=(GmmaDescriptor &&t) noexcept { + desc_ = t.desc_; + return *this; + } + + uint64_t desc_; + uint32_t reg32_[2]; + uint16_t reg16_[4]; + + struct { + uint16_t start_address_: 14, : 2; + uint16_t leading_byte_offset_: 14, : 2; + uint16_t stride_byte_offset_: 14, : 2; + uint8_t : 1, base_offset_: 3, : 4; + uint8_t : 6, layout_type_: 2; + } bitfield; + + // Decay to an `uint64_t` + __host__ __device__ constexpr operator uint64_t() const noexcept { return desc_; } +}; + +template +__device__ GmmaDescriptor make_smem_desc(PointerType smem_ptr, const int& layout_type, + const int& leading_byte_offset = 0, + const int& stride_byte_offset = 1024) { + GmmaDescriptor desc; + const auto& uint_ptr = static_cast(__cvta_generic_to_shared(smem_ptr)); + desc.bitfield.start_address_ = uint_ptr >> 4; + desc.bitfield.layout_type_ = layout_type; + desc.bitfield.leading_byte_offset_ = leading_byte_offset >> 4; + desc.bitfield.stride_byte_offset_ = stride_byte_offset >> 4; + desc.bitfield.base_offset_ = 0; + return desc; +} + +__device__ __forceinline__ void +tma_copy(void const* desc_ptr, uint64_t* barrier_ptr, void* smem_ptr, + const uint32_t& crd_0, const uint32_t& crd_1, const uint32_t& num_tma_multicast) { + constexpr auto cache_hint = static_cast(cute::TMA::CacheHintSm90::EVICT_NORMAL); + if (num_tma_multicast == 1) { + cute::SM90_TMA_LOAD_2D::copy(desc_ptr, barrier_ptr, cache_hint, smem_ptr, crd_0, crd_1); + } else if (cute::block_rank_in_cluster() == 0) { + cute::SM90_TMA_LOAD_MULTICAST_2D::copy(desc_ptr, barrier_ptr, (1 << num_tma_multicast) - 1, cache_hint, smem_ptr, crd_0, crd_1); + } +} + } // namespace `deep_gemm::sm90` diff --git a/deep_gemm/include/deep_gemm/common/types.hpp b/deep_gemm/include/deep_gemm/common/types.hpp new file mode 100644 index 00000000..7e879533 --- /dev/null +++ 
b/deep_gemm/include/deep_gemm/common/types.hpp @@ -0,0 +1,17 @@ +#pragma once + +namespace deep_gemm { + +enum class GemmType { + Normal = 0, + MGroupedContiguous = 1, + MGroupedMasked = 2, + KGroupedContiguous = 3, +}; + +enum class KernelType { + Kernel1D1D = 0, + Kernel1D2D = 1, +}; + +} // namespace deep_gemm diff --git a/deep_gemm/include/deep_gemm/common/utils.cuh b/deep_gemm/include/deep_gemm/common/utils.cuh index 127d80b6..a4ab6a34 100644 --- a/deep_gemm/include/deep_gemm/common/utils.cuh +++ b/deep_gemm/include/deep_gemm/common/utils.cuh @@ -36,16 +36,39 @@ do { \ namespace deep_gemm { +template +struct PatternVisitor { + FuncT func; + + __device__ __host__ + explicit PatternVisitor(FuncT&& func): func(std::forward(func)) {} + + __device__ __host__ + auto operator [](const uint32_t& i) { + return func(i); + } +}; + +template +__device__ __host__ T ceil_div(T a, T b) { + return (a + b - 1) / b; +} + template -__device__ __host__ constexpr T ceil_div(T a, T b) { +__device__ __host__ constexpr T constexpr_ceil_div(T a, T b) { return (a + b - 1) / b; } template -__device__ __host__ constexpr T align(T a, T b) { +__device__ __host__ T align(T a, T b) { return ceil_div(a, b) * b; } +template +__device__ __host__ constexpr T constexpr_align(T a, T b) { + return constexpr_ceil_div(a, b) * b; +} + template __device__ __host__ constexpr T constexpr_gcd(T a, T b) { return b == 0 ? 
a : constexpr_gcd(b, a % b); @@ -82,6 +105,12 @@ __device__ __forceinline__ float4 ld_shared(const float4* ptr) { return ret; } +__device__ __forceinline__ uint4 ld_shared(const uint4* ptr) { + uint4 ret; + asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : "l"(ptr)); + return ret; +} + __device__ __forceinline__ float ld_shared(const float* ptr) { float ret; asm volatile("ld.shared.f32 %0, [%1];" : "=f"(ret) : "l"(ptr)); diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh index 12bbc20c..360719aa 100644 --- a/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d1d.cuh @@ -74,8 +74,8 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, constexpr uint32_t SMEM_CD_SIZE = SMEM_CD_SIZE_PER_STAGE * kNumTMAStoreStages; constexpr uint32_t SMEM_A_SIZE_PER_STAGE = LOAD_BLOCK_M * BLOCK_K * sizeof(__nv_fp8_e4m3); constexpr uint32_t SMEM_B_SIZE_PER_STAGE = LOAD_BLOCK_N * BLOCK_K * sizeof(__nv_fp8_e4m3); - constexpr uint32_t SF_BLOCK_M = align(BLOCK_M, kNumUTCCPAlignedElems); - constexpr uint32_t SF_BLOCK_N = align(BLOCK_N, kNumUTCCPAlignedElems); + constexpr uint32_t SF_BLOCK_M = constexpr_align(BLOCK_M, kNumUTCCPAlignedElems); + constexpr uint32_t SF_BLOCK_N = constexpr_align(BLOCK_N, kNumUTCCPAlignedElems); constexpr uint32_t SMEM_SFA_SIZE_PER_STAGE = SF_BLOCK_M * sizeof(uint32_t); constexpr uint32_t SMEM_SFB_SIZE_PER_STAGE = SF_BLOCK_N * sizeof(uint32_t); DG_STATIC_ASSERT(SMEM_CD_SIZE % 1024 == 0, "Shared memory of A/B must be aligned to 1024 bytes"); @@ -111,14 +111,6 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, uint32_t* smem_sfa[kNumStages]; uint32_t* smem_sfb[kNumStages]; - // TMA Barrier for both divisible and non-divisible cases - Barrier* full_barriers[kNumStages]; - Barrier* empty_barriers[kNumStages]; - Barrier* with_sf_full_barriers[kNumStages]; - Barrier* 
tmem_full_barriers[kNumEpilogueStages]; - Barrier* tmem_empty_barriers[kNumEpilogueStages]; - Barrier* accumulation_full_barrier; - // Fill D/A/B pointers #pragma unroll for (uint32_t i = 0; i < kNumTMAStoreStages; ++ i) @@ -142,21 +134,14 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, SMEM_CD_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE) + kNumStages * (SMEM_SFA_SIZE_PER_STAGE + SMEM_SFB_SIZE_PER_STAGE)); - #pragma unroll - for (uint32_t i = 0; i < kNumStages; ++ i) { - full_barriers[i] = barrier_start_ptr + i; - empty_barriers[i] = barrier_start_ptr + kNumStages + i; - with_sf_full_barriers[i] = barrier_start_ptr + kNumStages * 2 + i; - } - #pragma unroll - for (uint32_t i = 0; i < kNumEpilogueStages; ++ i) { - tmem_full_barriers[i] = barrier_start_ptr + kNumStages * 3 + i; - tmem_empty_barriers[i] = barrier_start_ptr + kNumStages * 3 + kNumEpilogueStages + i; - } - accumulation_full_barrier = barrier_start_ptr + kNumStages * 3 + kNumEpilogueStages * 2; + auto full_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (i); }); + auto empty_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (kNumStages + i); }); + auto with_sf_full_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (kNumStages * 2 + i); }); + auto tmem_full_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (kNumStages * 3 + i); }); + auto tmem_empty_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (kNumStages * 3 + kNumEpilogueStages + i); }); // Fill the tensor memory pointer - auto tmem_ptr_in_smem = reinterpret_cast(barrier_start_ptr + kNumStages * 3 + kNumEpilogueStages * 2 + 1); + auto tmem_ptr_in_smem = reinterpret_cast(barrier_start_ptr + kNumStages * 3 + kNumEpilogueStages * 2); DG_STATIC_ASSERT(32 <= kNumTmemCols and kNumTmemCols <= 512, "Invalid tensor memory columns"); // Initialize barriers @@ -176,8 +161,6 @@ 
sm100_fp8_gemm_1d1d_impl(int* grouped_layout, // Arrive only at the leader CTA tmem_empty_barriers[i]->init(kNumMulticast * kNumEpilogueThreads); } - if constexpr (kWithAccumulation) - accumulation_full_barrier->init(1); // Make initialized barrier visible in async proxy cutlass::arch::fence_view_async_shared(); @@ -188,18 +171,27 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, } kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads(); + // Block scheduler + uint32_t m_block_idx, n_block_idx; + auto scheduler = Scheduler(shape_m, shape_n, grouped_layout); + // For pipeline unrolling struct DivisibleK {}; struct NotDivisibleK {}; - const uint32_t num_iterations = ceil_div(shape_k, kNumStages * BLOCK_K); - auto launch_k_iterations = [=](const auto& func) { - if constexpr (kNumLastStages == 0) { - for (uint32_t k_iter = 0; k_iter < num_iterations; ++ k_iter) - func(k_iter, DivisibleK{}, k_iter == num_iterations - 1); + uint32_t phase = 0; + auto launch_k_iterations = [&](const auto& func) { + const uint32_t current_shape_k = (kGemmType == GemmType::KGroupedContiguous ? scheduler.current_shape_k : shape_k); + const uint32_t num_iterations = ceil_div(current_shape_k, kNumStages * BLOCK_K); + const uint32_t num_last_stages = ceil_div(current_shape_k, BLOCK_K) % kNumStages; + + // TODO: refactor here + if (num_last_stages == 0) { + for (uint32_t k_iter = 0; k_iter < num_iterations; ++ k_iter, phase ^= 1) + func(k_iter, DivisibleK{}, k_iter == num_iterations - 1, num_last_stages); } else { - for (uint32_t k_iter = 0; k_iter < num_iterations - 1; ++ k_iter) - func(k_iter, DivisibleK{}, false); - func(num_iterations - 1, NotDivisibleK{}, true); + for (uint32_t k_iter = 0; k_iter < num_iterations - 1; ++ k_iter, phase ^= 1) + func(k_iter, DivisibleK{}, false, num_last_stages); + func(num_iterations - 1, NotDivisibleK{}, true, num_last_stages), phase ^= 1; } }; @@ -209,38 +201,35 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, accum_stage_idx == 0 ? 
func(0) : func(1); }; - // Block scheduler - uint32_t m_block_idx, n_block_idx; - auto scheduler = Scheduler(shape_m, shape_n, grouped_layout); - // Dispatch warps into different roles if (warp_idx == 0) { // TMA load warp // Persistently schedule over blocks while (scheduler.get_next_block(m_block_idx, n_block_idx)) { - launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter) { + launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter, uint32_t num_last_stages) { constexpr bool kHasDivisibleStages = std::is_same_v; - constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? kNumStages : kNumLastStages; - DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + const uint32_t kNumInnerStages = kHasDivisibleStages ? kNumStages : num_last_stages; #pragma unroll for (uint32_t s = 0; s < kNumInnerStages; ++ s) { // Wait consumer release - empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); + empty_barriers[s]->wait(phase ^ 1); // Compute offsets // NOTES: the group is always concatenated with the outer dimension - uint32_t m_idx = scheduler.get_global_idx<(kGemmType != GemmType::GroupedContiguous)>( + uint32_t m_idx = scheduler.template get_global_idx<(kGemmType == GemmType::MGroupedMasked), KGroupedIndexType::MN> ( shape_m, BLOCK_M, m_block_idx); - uint32_t n_idx = scheduler.get_global_idx<(kMajorB == cute::UMMA::Major::K)>( + uint32_t n_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::K), KGroupedIndexType::MN> ( shape_n, BLOCK_N, n_block_idx, m_block_idx); // NOTES: `k_idx` is actually the k index default for K-major, while `k_b_idx` may be MN-major - // And for all grouped GEMMs, A must be K-majored - DG_STATIC_ASSERT(kGemmType == GemmType::Normal or kMajorA == cute::UMMA::Major::K, "Invalid major"); + // And for all m-grouped GEMMs, A must be K-majored + DG_STATIC_ASSERT(kGemmType == GemmType::Normal or kGemmType == GemmType::KGroupedContiguous or kMajorA == 
cute::UMMA::Major::K, "Invalid major"); uint32_t k_block_idx = k_iter * kNumStages + s; uint32_t k_idx = k_block_idx * BLOCK_K; - uint32_t k_b_idx = scheduler.get_global_idx<(kMajorB == cute::UMMA::Major::MN)>( + uint32_t k_a_idx = scheduler.template get_global_idx<(kMajorA == cute::UMMA::Major::MN), KGroupedIndexType::K> ( + shape_k, BLOCK_K, k_block_idx, m_block_idx); + uint32_t k_b_idx = scheduler.template get_global_idx<(kMajorB == cute::UMMA::Major::MN), KGroupedIndexType::K> ( shape_k, BLOCK_K, k_block_idx, m_block_idx); // Add 2 CTA offsets @@ -252,9 +241,9 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, // Issue TMAs if (cute::elect_one_sync()) { if constexpr (kMajorA == cute::UMMA::Major::K) - tma_copy(&tensor_map_a, full_barriers[s], smem_a[s], k_idx, m_idx); + tma_copy(&tensor_map_a, full_barriers[s], smem_a[s], k_a_idx, m_idx); if constexpr (kMajorA == cute::UMMA::Major::MN) - tma_copy(&tensor_map_a, full_barriers[s], smem_a[s], m_idx, k_idx); + tma_copy(&tensor_map_a, full_barriers[s], smem_a[s], m_idx, k_a_idx); if constexpr (kMajorB == cute::UMMA::Major::K) tma_copy(&tensor_map_b, full_barriers[s], smem_b[s], k_b_idx, n_idx); if constexpr (kMajorB == cute::UMMA::Major::MN) @@ -267,9 +256,9 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, const uint32_t sf_stage_in_group_idx = (k_iter * kNumStages + s) % kNumSFStagesPerLoad; if (sf_stage_in_group_idx == 0 and cute::elect_one_sync()) { tma_copy(&tensor_map_sfa, full_barriers[s], smem_sfa[s], m_block_idx * BLOCK_M, - scheduler.get_global_idx<(kGemmType != GemmType::GroupedContiguous)>(shape_sf_k, 1, ceil_div(k_idx, BLOCK_K * kNumSFStagesPerLoad))); + scheduler.template get_global_idx<(kGemmType != GemmType::MGroupedContiguous), KGroupedIndexType::SF_K>(shape_sf_k, 1, ceil_div(k_idx, BLOCK_K * kNumSFStagesPerLoad))); tma_copy(&tensor_map_sfb, full_barriers[s], smem_sfb[s], n_block_idx * BLOCK_N, - scheduler.get_global_idx(shape_sf_k, 1, ceil_div(k_idx, BLOCK_K * kNumSFStagesPerLoad), m_block_idx)); + 
scheduler.template get_global_idx(shape_sf_k, 1, ceil_div(k_idx, BLOCK_K * kNumSFStagesPerLoad), m_block_idx)); num_arrival_bytes += (BLOCK_M + BLOCK_N) * sizeof(uint32_t); } @@ -281,7 +270,7 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, // Wait unaligned cases #pragma unroll for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { - empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); + empty_barriers[s]->wait(phase ^ 1); if (cute::elect_one_sync()) full_barriers[s]->arrive(); } @@ -300,6 +289,12 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, UMMA_M, UMMA_N, kMajorA, kMajorB>(); auto sf_desc = make_sf_desc(nullptr); + DG_STATIC_ASSERT(kNumStages <= 32, "Too many stages"); + auto a_desc = make_umma_desc(smem_a[0], 0, 0); + auto b_desc = make_umma_desc(smem_b[0], 0, 0); + uint32_t a_desc_lo = lane_idx < kNumStages ? a_desc.lo + lane_idx * SMEM_A_SIZE_PER_STAGE / 16 : 0u; + uint32_t b_desc_lo = lane_idx < kNumStages ? b_desc.lo + lane_idx * SMEM_B_SIZE_PER_STAGE / 16 : 0u; + // Checks for MMA instructions // NOTES: CUTLASS does not have such checks except the MMA traits, but we are not using these traits DG_STATIC_ASSERT((UMMA_M == 64 and UMMA_N % 8 == 0 and 8 <= UMMA_N and UMMA_N <= 256) or @@ -333,15 +328,14 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, }; // Launch MMAs - launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter) { + launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter, uint32_t num_last_stages) { constexpr bool kHasDivisibleStages = std::is_same_v; - constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? kNumStages : kNumLastStages; - DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + const uint32_t kNumInnerStages = kHasDivisibleStages ? 
kNumStages : num_last_stages; #pragma unroll for (uint32_t s = 0; s < kNumInnerStages; ++ s) { // Wait TMA and SF-transpose arrival - with_sf_full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + with_sf_full_barriers[s]->wait(phase); tcgen05_after_thread_sync(); // Do SF copy at certain stages @@ -352,7 +346,7 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, cute::SM100_UTCCP_4x32dp128bit_1cta, cute::SM100_UTCCP_4x32dp128bit_2cta>; // SFA and SFB copy - // TODO: preprocess shared memory descriptor + // TODO: process shared memory descriptor by addition #pragma unroll for (uint32_t i = 0; i < SF_BLOCK_M / kNumUTCCPAlignedElems; ++ i) { auto smem_ptr = smem_sfa[s] + i * kNumUTCCPAlignedElems; @@ -374,14 +368,15 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, cutlass::float_ue8m0_t, UMMA_M, UMMA_N, kMajorA, kMajorB>, cute::SM100_MMA_MXF8F6F4_2x1SM_SS>; + const auto& runtime_instr_desc = make_runtime_instr_desc_with_sf_id(instr_desc, sf_stage_in_group_idx); + const auto& a_desc_base_lo = __shfl_sync(0xffffffff, a_desc_lo, s); + const auto& b_desc_base_lo = __shfl_sync(0xffffffff, b_desc_lo, s); #pragma unroll for (uint32_t k = 0; k < BLOCK_K / UMMA_K; ++ k) { - auto b_desc = make_umma_desc(smem_b[s], 0, k * UMMA_K); - // TODO: optimize runtime instruction creation + b_desc.lo = advance_umma_desc_lo(b_desc_base_lo, 0, k * UMMA_K); #pragma unroll for (uint32_t w = 0; w < kNumMWaves; ++ w) { - auto a_desc = make_umma_desc(smem_a[s], w * LAYOUT_AD_M, k * UMMA_K); - auto runtime_instr_desc = make_runtime_instr_desc_with_sf_id(instr_desc, sf_stage_in_group_idx); + a_desc.lo = advance_umma_desc_lo(a_desc_base_lo, w * LAYOUT_AD_M * BLOCK_K, k * UMMA_K); cute_mma_t::fma(a_desc, b_desc, accum_stage_idx * kNumMWaves * BLOCK_N + w * BLOCK_N, k_iter > 0 or s > 0 or k > 0, @@ -392,14 +387,14 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, } // Commit to the mbarrier object - tcgen05_before_thread_sync(); + // No explicit 
`tcgen05.fence::before_thread_sync` is needed, as this is implicitly performed by `tcgen05.commit` empty_barrier_arrive(s, is_last_iter and s == kNumInnerStages - 1); } // Wait unaligned cases #pragma unroll for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { - with_sf_full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + with_sf_full_barriers[s]->wait(phase); empty_barrier_arrive(s, false); } }); @@ -420,15 +415,14 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, }; while (scheduler.get_next_block(m_block_idx, n_block_idx)) { - launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter) { + launch_k_iterations([&](uint32_t k_iter, auto type, bool is_last_iter, uint32_t num_last_stages) { constexpr bool kHasDivisibleStages = std::is_same_v; - constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? kNumStages : kNumLastStages; - DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + const uint32_t kNumInnerStages = kHasDivisibleStages ? 
kNumStages : num_last_stages; #pragma unroll for (uint32_t s = 0; s < kNumInnerStages; ++ s) { // Wait TMA arrival - full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + full_barriers[s]->wait(phase); // Transpose for UTCCP at certain stages const uint32_t sf_stage_in_group_idx = (k_iter * kNumStages + s) % kNumSFStagesPerLoad; @@ -450,7 +444,7 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, // Wait unaligned cases #pragma unroll for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { - full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + full_barriers[s]->wait(phase); with_sf_full_barriers[s]->arrive(0u); } }); @@ -508,16 +502,9 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, // The pipeline stage const auto tma_stage_idx = iter_idx % kNumTMAStoreStages; - const auto m_idx = scheduler.get_global_idx<(kGemmType != GemmType::GroupedContiguous)>(shape_m, BLOCK_M, m_block_idx) + w * LAYOUT_AD_M; + const auto m_idx = scheduler.template get_global_idx<(kGemmType != GemmType::MGroupedContiguous), KGroupedIndexType::MN>(shape_m, BLOCK_M, m_block_idx) + w * LAYOUT_AD_M; const auto n_idx = n_block_idx * BLOCK_N + s * STORE_BLOCK_N; - // Issue accumulation TMA - if (kWithAccumulation and epilogue_thread_idx == 0) { - tma_copy( - &tensor_map_c, accumulation_full_barrier, smem_cd[tma_stage_idx], n_idx, m_idx); - accumulation_full_barrier->arrive_and_expect_tx(STORE_BLOCK_M * kSwizzleCDMode); - } - // Store into shared memory #pragma unroll for (uint32_t i = 0; i < STORE_BLOCK_N / kNumElemsPerBankGroup; ++ i) { @@ -549,18 +536,6 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, cute::SM100_TMEM_LOAD_32dp32b4x::copy(tmem_addr, values[0], values[1], values[2], values[3]); cutlass::arch::fence_view_async_tmem_load(); - if constexpr (kWithAccumulation) { - // Wait TMA arrival before the first accumulation - if (i == 0) - accumulation_full_barrier->wait((scheduler.current_iter * (kNumMWaves * kNumStores) + iter_idx) & 1); 
- - // Load the same position and add - auto c_values = ld_shared(reinterpret_cast(smem_ptr)); - *reinterpret_cast(&values[0]) += c_values.x; - *reinterpret_cast(&values[1]) += c_values.y; - *reinterpret_cast(&values[2]) += c_values.z; - *reinterpret_cast(&values[3]) += c_values.w; - } st_shared(smem_ptr, values[0], values[1], values[2], values[3]); } else { // For BF16 output, read, cast and store @@ -589,7 +564,9 @@ sm100_fp8_gemm_1d1d_impl(int* grouped_layout, cute::tma_store_fence(); cutlass::arch::NamedBarrier(kNumEpilogueThreads).sync(); if (epilogue_thread_idx == 0) { - cute::SM90_TMA_STORE_2D::copy(&tensor_map_d, smem_cd[tma_stage_idx], n_idx, m_idx); + using cute_tma_t = std::conditional_t; + cute_tma_t::copy(&tensor_map_d, smem_cd[tma_stage_idx], n_idx, m_idx); cute::tma_store_arrive(); } } diff --git a/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d2d.cuh b/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d2d.cuh new file mode 100644 index 00000000..dcfeed9d --- /dev/null +++ b/deep_gemm/include/deep_gemm/impls/sm100_fp8_gemm_1d2d.cuh @@ -0,0 +1,532 @@ +#pragma once +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunknown-attributes" + +#include +#include + +#include +#include +#include + +namespace deep_gemm { + +using namespace deep_gemm::sm100; + +template +__global__ void __launch_bounds__(kNumNonEpilogueThreads + kNumEpilogueThreads, 1) +sm100_fp8_gemm_1d2d_impl(float* sfb, int* grouped_layout, + uint32_t shape_m, uint32_t shape_n, uint32_t shape_k, + const __grid_constant__ CUtensorMap tensor_map_a, + const __grid_constant__ CUtensorMap tensor_map_b, + const __grid_constant__ CUtensorMap tensor_map_d, + const __grid_constant__ CUtensorMap tensor_map_sfa) { +#if (defined(__CUDA_ARCH__) and (__CUDA_ARCH__ >= 1000)) or defined(__CLION_IDE__) + using Barrier = cutlass::arch::ClusterTransactionBarrier; + + // Scaling checks + DG_STATIC_ASSERT(BLOCK_K == 128, "Only support per-128-channel FP8 scaling"); + 
DG_STATIC_ASSERT(constexpr_ceil_div(BLOCK_N, BLOCK_K) == 1 or (constexpr_gcd(BLOCK_N, BLOCK_K) == BLOCK_N - BLOCK_K), "Too much B scales in a single block"); + + // Configs + constexpr uint32_t LAYOUT_AD_M = 128; + constexpr uint32_t kNumMWaves = BLOCK_M / LAYOUT_AD_M; + constexpr uint32_t kNumTMAStoreStages = 2; + DG_STATIC_ASSERT(BLOCK_K == 128, "Invalid block K"); + DG_STATIC_ASSERT(BLOCK_M % LAYOUT_AD_M == 0 and 2 % kNumMWaves == 0, "Invalid block M"); + DG_STATIC_ASSERT(BLOCK_M == kNumEpilogueThreads, "Invalid block M"); + + // Overwrite shape constants if the compiler gives + shape_m = SHAPE_M != 0 ? SHAPE_M : shape_m; + shape_n = SHAPE_N != 0 ? SHAPE_N : shape_n; + shape_k = SHAPE_K != 0 ? SHAPE_K : shape_k; + const auto shape_k_scales = ceil_div(shape_k, BLOCK_K); + + // Utils + bool is_leader_cta = cute::block_rank_in_cluster() == 0; + const auto warp_idx = cutlass::canonical_warp_idx_sync(); + const auto lane_idx = get_lane_idx(); + + // Align to 1024 bytes for swizzle-128B + extern __shared__ __align__(1024) uint8_t smem_buffer[]; + + // 2-CTA MMA + constexpr uint32_t LOAD_BLOCK_M = BLOCK_M / (kIsMulticastOnA ? kNumMulticast: 1); + constexpr uint32_t LOAD_BLOCK_N = BLOCK_N / (kIsMulticastOnA ? 
1 : kNumMulticast); + constexpr uint32_t STORE_BLOCK_M = std::min(BLOCK_M, LAYOUT_AD_M); + constexpr uint32_t STORE_BLOCK_N = kSwizzleCDMode / sizeof(cd_dtype_t); + DG_STATIC_ASSERT(not kIsMulticastOnA or kNumMulticast == 1, "Invalid multicast"); + DG_STATIC_ASSERT(LOAD_BLOCK_M == BLOCK_M and BLOCK_M % LAYOUT_AD_M == 0, "Only support tensor memory layout A/D"); + DG_STATIC_ASSERT(kNumMulticast == 1 or kNumMulticast == 2, "Only support 1/2 multicast"); + + // Share memory sizes + // NOTES: do not use `LOAD_BLOCK_M` for SFA, as we need full SFA for promotion + constexpr bool kMustUseUniformedSFB = (BLOCK_K % BLOCK_N == 0); + constexpr uint32_t SMEM_CD_SIZE_PER_STAGE = BLOCK_M * kSwizzleCDMode; + constexpr uint32_t SMEM_CD_SIZE = SMEM_CD_SIZE_PER_STAGE * kNumTMAStoreStages; + constexpr uint32_t SMEM_A_SIZE_PER_STAGE = LOAD_BLOCK_M * BLOCK_K * sizeof(__nv_fp8_e4m3); + constexpr uint32_t SMEM_B_SIZE_PER_STAGE = LOAD_BLOCK_N * BLOCK_K * sizeof(__nv_fp8_e4m3); + constexpr uint32_t SMEM_SFA_SIZE_PER_STAGE = BLOCK_M * sizeof(float); + DG_STATIC_ASSERT(SMEM_CD_SIZE % 1024 == 0, "Shared memory of A/B must be aligned to 1024 bytes"); + DG_STATIC_ASSERT(kNumTMAStoreStages >= 1, "Invalid number of TMA stages"); + + // Must have 2 epilogue stages + constexpr uint32_t kNumEpilogueStages = 2; + + // Real tensor memory size and offsets + constexpr uint32_t kNumAccumTmemCols = kNumEpilogueStages * kNumMWaves * BLOCK_N; + constexpr uint32_t kNumTmemCols = get_num_aligned_tmem_cols(); + + // Prefetch TMA descriptors at the very beginning + if (threadIdx.x == 0) { + cute::prefetch_tma_descriptor(&tensor_map_a); + cute::prefetch_tma_descriptor(&tensor_map_b); + cute::prefetch_tma_descriptor(&tensor_map_d); + cute::prefetch_tma_descriptor(&tensor_map_sfa); + } + + // Data on shared memory (layout as ordered below) + cd_dtype_t* smem_cd[kNumTMAStoreStages]; + cutlass::float_e4m3_t* smem_a[kNumStages]; + cutlass::float_e4m3_t* smem_b[kNumStages]; + float* smem_sfa[kNumStages]; + + // Fill 
D/A/B pointers + #pragma unroll + for (uint32_t i = 0; i < kNumTMAStoreStages; ++ i) + smem_cd[i] = reinterpret_cast(smem_buffer + i * SMEM_CD_SIZE_PER_STAGE); + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + smem_a[i] = reinterpret_cast(smem_buffer + SMEM_CD_SIZE + i * SMEM_A_SIZE_PER_STAGE); + smem_b[i] = reinterpret_cast(smem_buffer + SMEM_CD_SIZE + kNumStages * SMEM_A_SIZE_PER_STAGE + i * SMEM_B_SIZE_PER_STAGE); + } + + // Fill SFA/SFB + auto sf_start_ptr = smem_buffer + SMEM_CD_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE); + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) + smem_sfa[i] = reinterpret_cast(sf_start_ptr + i * SMEM_SFA_SIZE_PER_STAGE); + + // Fill barriers + auto barrier_start_ptr = reinterpret_cast(smem_buffer + + SMEM_CD_SIZE + + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE) + + kNumStages * SMEM_SFA_SIZE_PER_STAGE); + auto full_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (i); }); + auto empty_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (kNumStages + i); }); + auto tmem_full_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (kNumStages * 2 + i); }); + auto tmem_empty_barriers = PatternVisitor([=](const uint32_t& i) { return barrier_start_ptr + (kNumStages * 2 + kNumEpilogueStages + i); }); + + // Fill the tensor memory pointer + auto tmem_ptr_in_smem = reinterpret_cast(barrier_start_ptr + kNumStages * 2 + kNumEpilogueStages * 2); + DG_STATIC_ASSERT(32 <= kNumTmemCols and kNumTmemCols <= 512, "Invalid tensor memory columns"); + + // Initialize barriers + if (threadIdx.x == 0) { + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + // Arrive at all CTAs + full_barriers[i]->init(1); + empty_barriers[i]->init(kNumMulticast * kNumEpilogueThreads / 32); + } + #pragma unroll + for (uint32_t i = 0; i < kNumEpilogueStages; ++ i) { + // Arrive at all CTAs + 
tmem_full_barriers[i]->init(1); + // Arrive only at the leader CTA + tmem_empty_barriers[i]->init(kNumMulticast * kNumEpilogueThreads); + } + + // Make initialized barrier visible in async proxy + cutlass::arch::fence_view_async_shared(); + cutlass::arch::fence_barrier_init(); + } else if (threadIdx.x >= 32 and threadIdx.x < 64) { + // Allocate tensor memory + cute::TMEM::Allocator1Sm().allocate(kNumTmemCols, tmem_ptr_in_smem); + } + kNumMulticast > 1 ? cute::cluster_sync() : __syncthreads(); + + // For pipeline unrolling + struct DivisibleK {}; + struct NotDivisibleK {}; + const uint32_t num_iterations = ceil_div(shape_k, kNumStages * BLOCK_K); + auto launch_k_iterations = [=](const auto& func) { + if constexpr (kNumLastStages == 0) { + for (uint32_t k_iter = 0; k_iter < num_iterations; ++ k_iter) + func(k_iter, DivisibleK{}); + } else { + for (uint32_t k_iter = 0; k_iter < num_iterations - 1; ++ k_iter) + func(k_iter, DivisibleK{}); + func(num_iterations - 1, NotDivisibleK{}); + } + }; + + // Block scheduler + uint32_t m_block_idx, n_block_idx; + auto scheduler = Scheduler(shape_m, shape_n, grouped_layout); + + // Register configurations + constexpr uint32_t kNumNonEpilogueRegisters = 64; + constexpr uint32_t kNumEpilogueRegisters = 216; + DG_STATIC_ASSERT(kNumNonEpilogueRegisters * kNumNonEpilogueThreads + kNumEpilogueRegisters * kNumEpilogueThreads <= 65535, "Too many registers"); + + // Dispatch warps into different roles + if (warp_idx == 0) { + // Adjust registers + cutlass::arch::warpgroup_reg_dealloc(); + + // TMA load warp + // Persistently schedule over blocks + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + launch_k_iterations([&](uint32_t k_iter, auto type) { + constexpr bool kHasDivisibleStages = std::is_same_v; + constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? 
kNumStages : kNumLastStages; + DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + + #pragma unroll + for (uint32_t s = 0; s < kNumInnerStages; ++ s) { + // Wait consumer release + empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); + + // Compute offsets + // NOTES: the group is always concatenated with the outer dimension + uint32_t m_idx = scheduler.get_global_idx<(kGemmType != GemmType::MGroupedContiguous)>( + shape_m, BLOCK_M, m_block_idx); + uint32_t n_idx = scheduler.get_global_idx<(kMajorB == cute::UMMA::Major::K)>( + shape_n, BLOCK_N, n_block_idx, m_block_idx); + + // NOTES: `k_idx` is actually the k index default for K-major, while `k_b_idx` may be MN-major + // And for all grouped GEMMs, A must be K-majored + DG_STATIC_ASSERT(kGemmType == GemmType::Normal or kMajorA == cute::UMMA::Major::K, "Invalid major"); + uint32_t k_block_idx = k_iter * kNumStages + s; + uint32_t k_idx = k_block_idx * BLOCK_K; + uint32_t k_b_idx = scheduler.get_global_idx<(kMajorB == cute::UMMA::Major::MN)>( + shape_k, BLOCK_K, k_block_idx, m_block_idx); + + // Add 2 CTA offsets + if constexpr (kNumMulticast > 1) { + m_idx += kIsMulticastOnA ? (cute::block_rank_in_cluster() * LOAD_BLOCK_M) : 0; + n_idx += kIsMulticastOnA ? 
0 : (cute::block_rank_in_cluster() * LOAD_BLOCK_N); + } + + // Issue TMAs + if (cute::elect_one_sync()) { + if constexpr (kMajorA == cute::UMMA::Major::K) + tma_copy(&tensor_map_a, full_barriers[s], smem_a[s], k_idx, m_idx); + if constexpr (kMajorA == cute::UMMA::Major::MN) + tma_copy(&tensor_map_a, full_barriers[s], smem_a[s], m_idx, k_idx); + if constexpr (kMajorB == cute::UMMA::Major::K) + tma_copy(&tensor_map_b, full_barriers[s], smem_b[s], k_b_idx, n_idx); + if constexpr (kMajorB == cute::UMMA::Major::MN) + tma_copy(&tensor_map_b, full_barriers[s], smem_b[s], n_idx, k_b_idx); + + // Issue SFA TMA + tma_copy( + &tensor_map_sfa, full_barriers[s], + smem_sfa[s], m_block_idx * BLOCK_M, + scheduler.get_global_idx<(kGemmType != GemmType::MGroupedContiguous)>(shape_k_scales, 1, k_block_idx)); + } + + // Arrive at full barriers + constexpr uint32_t kNumArrivalBytes = SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SFA_SIZE_PER_STAGE; + if (is_leader_cta and cute::elect_one_sync()) + full_barriers[s]->arrive_and_expect_tx(kNumArrivalBytes * kNumMulticast); + } + + // Wait unaligned cases + #pragma unroll + for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { + empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); + if (is_leader_cta and cute::elect_one_sync()) + full_barriers[s]->arrive(); + } + }); + } + } else if (warp_idx == 1 and is_leader_cta) { + // Adjust registers + cutlass::arch::warpgroup_reg_dealloc(); + + // MMA issue warp + // NOTES: only the leader CTA will do this + // Make instruction descriptor + // TODO: refactor `UMMA_M` calculation + constexpr uint32_t UMMA_M = LAYOUT_AD_M * (kIsMulticastOnA ? 1 : kNumMulticast); + constexpr uint32_t UMMA_N = BLOCK_N * (kIsMulticastOnA ? 
kNumMulticast : 1); + constexpr uint32_t UMMA_K = 32 / sizeof(cutlass::float_e4m3_t); + auto instr_desc = cute::UMMA::make_instr_desc(); + auto runtime_instr_desc = cute::UMMA::make_runtime_instr_desc(instr_desc); + + // Checks for MMA instructions + // NOTES: CUTLASS does not have such checks except the MMA traits, but we are not using these traits + DG_STATIC_ASSERT((UMMA_M == 64 and UMMA_N % 8 == 0 and 8 <= UMMA_N and UMMA_N <= 256) or + (UMMA_M == 128 and UMMA_N % 16 == 0 and 16 <= UMMA_N and UMMA_N <= 256) or + (UMMA_M == 256 and UMMA_N % 16 == 0 and 16 <= UMMA_N and UMMA_N <= 256), + "Invalid MMA instruction shape"); + + // Persistently schedule over blocks + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + // Launch MMAs + launch_k_iterations([&](uint32_t k_iter, auto type) { + constexpr bool kHasDivisibleStages = std::is_same_v; + constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? kNumStages : kNumLastStages; + DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + + #pragma unroll + for (uint32_t s = 0; s < kNumStages; ++ s) { + // Wait TMA full + auto iter_idx = scheduler.current_iter * num_iterations + k_iter; + full_barriers[s]->wait(iter_idx & 1); + + // Wait tensor memory empty + auto accum_stage_idx = (iter_idx * kNumStages + s) % kNumEpilogueStages; + auto accum_stage_phase = ((iter_idx * kNumStages + s) / kNumEpilogueStages) & 1; + tmem_empty_barriers[accum_stage_idx]->wait(accum_stage_phase ^ 1); + + // Issue UMMA in the leader CTA + if (s < kNumInnerStages) { + using cute_mma_t = std::conditional_t; + tcgen05_after_thread_sync(); + #pragma unroll + for (uint32_t k = 0; k < BLOCK_K / UMMA_K; ++ k) { + auto b_desc = make_umma_desc(smem_b[s], 0, k * UMMA_K); + #pragma unroll + for (uint32_t w = 0; w < kNumMWaves; ++ w) { + auto a_desc = make_umma_desc(smem_a[s], w * LAYOUT_AD_M, k * UMMA_K); + cute_mma_t::fma(a_desc, b_desc, + accum_stage_idx * kNumMWaves * BLOCK_N + w * BLOCK_N, + k > 0, + 
runtime_instr_desc); + } + } + tcgen05_before_thread_sync(); + } + + // Commit to the TMA empty and tensor memory full barrier + auto umma_arrive = [](const uint64_t* barrier) { + if constexpr (kNumMulticast == 1) { + cutlass::arch::umma_arrive(barrier); + } else { + constexpr uint16_t kCTAMask = (1 << kNumMulticast) - 1; + cutlass::arch::umma_arrive_multicast_2x1SM(barrier, kCTAMask); + } + }; + umma_arrive(reinterpret_cast(tmem_full_barriers[accum_stage_idx])); + } + }); + } + } else if (warp_idx < kNumNonEpilogueThreads / 32) { + // Adjust registers + cutlass::arch::warpgroup_reg_dealloc(); + } else if (warp_idx >= kNumNonEpilogueThreads / 32) { + // Adjust registers + cutlass::arch::warpgroup_reg_alloc(); + + // Epilogue warp groups + const auto epilogue_thread_idx = threadIdx.x - kNumNonEpilogueThreads; + const auto epilogue_thread_idx_in_warpgroup = epilogue_thread_idx % 128; + const auto epilogue_warp_idx = warp_idx - (kNumNonEpilogueThreads / 32); + const auto epilogue_warpgroup_idx = epilogue_thread_idx / 128; + + // NOTES: tensor memory addresses are simplified, as the hardware will ignore the warp index bits, + // i.e., no need for `tmem_ptr |= (epilogue_warp_idx * 32) << 16`. 
+ // NOTES: we also forbid two CTAs to share the same SM and its tensor memory + DG_TRAP_ONLY_DEVICE_ASSERT(ld_shared(tmem_ptr_in_smem) == 0); + + // TMA checks + constexpr uint32_t kNumBankGroupBytes = 16; + constexpr uint32_t kNumElemsPerBankGroup = kNumBankGroupBytes / sizeof(cd_dtype_t); + DG_STATIC_ASSERT(kSwizzleCDMode > 0, "TMA D must be swizzled"); + DG_STATIC_ASSERT(STORE_BLOCK_N % kNumElemsPerBankGroup == 0, "Invalid swizzling"); + + // Persistently schedule over blocks + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + constexpr uint32_t kNumElemsPerLDTM = 16; + DG_STATIC_ASSERT(kNumElemsPerLDTM == 16 and BLOCK_N % kNumElemsPerLDTM == 0 and BLOCK_K % kNumElemsPerLDTM == 0, "Invalid LDTM width"); + + // SFB stuffs + uint32_t num_former_iters = BLOCK_N, num_full_iters = BLOCK_N; + if constexpr (not kMustUseUniformedSFB) { + num_former_iters = min(BLOCK_N, BLOCK_K - ((n_block_idx * BLOCK_N) % BLOCK_K)); + num_full_iters = min(shape_n - n_block_idx * BLOCK_N, BLOCK_N); + } + num_former_iters /= kNumElemsPerLDTM, num_full_iters /= kNumElemsPerLDTM; + const auto sfb_offset = scheduler.get_global_idx(ceil_div(shape_n, BLOCK_K), 0, 0, m_block_idx); + const auto sfb_ptr = sfb + (sfb_offset + ((n_block_idx * BLOCK_N) / BLOCK_K)) * shape_k_scales; + + // Launch promotion + float accum[BLOCK_N] = {0}; + launch_k_iterations([&](uint32_t k_iter, auto type) { + constexpr bool kHasDivisibleStages = std::is_same_v; + constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? kNumStages : kNumLastStages; + DG_STATIC_ASSERT(kNumInnerStages != 0, "Invalid number of inner stages"); + + #pragma unroll + for (uint32_t s = 0; s < kNumStages; ++ s) { + // Load SFB + float sf_0 = 0, sf_1 = 0; + if (s < kNumInnerStages) { + const auto k_block_idx = k_iter * kNumStages + s; + sf_0 = __ldg(sfb_ptr + k_block_idx); + sf_1 = num_former_iters < num_full_iters ? 
__ldg(sfb_ptr + k_block_idx + shape_k_scales) : 0; + } + + // Wait UMMA arrival + auto iter_idx = scheduler.current_iter * num_iterations + k_iter; + auto accum_stage_idx = (iter_idx * kNumStages + s) % kNumEpilogueStages; + auto accum_stage_phase = ((iter_idx * kNumStages + s) / kNumEpilogueStages) & 1; + tmem_full_barriers[accum_stage_idx]->wait(accum_stage_phase); + tcgen05_after_thread_sync(); + + // Commit to the TMA empty barrier for all CTAs after loading SFA + float sfa = s < kNumInnerStages ? ld_shared(smem_sfa[s] + epilogue_thread_idx) : 0; + sf_0 *= sfa, sf_1 *= sfa; + __syncwarp(); + if (lane_idx < kNumMulticast) + empty_barriers[s]->arrive(lane_idx); + __syncwarp(); + + // Do promotion like the SM90 kernel + if (s < kNumInnerStages) { + uint32_t values[kNumElemsPerLDTM]; + #pragma unroll + for (uint32_t i = 0; i < BLOCK_N / kNumElemsPerLDTM; ++ i) { + // Load from tensor memory + cute::SM100_TMEM_LOAD_32dp32b16x::copy( + accum_stage_idx * kNumMWaves * BLOCK_N + epilogue_warpgroup_idx * BLOCK_N + i * kNumElemsPerLDTM, + values[ 0], values[ 1], values[ 2], values[ 3], + values[ 4], values[ 5], values[ 6], values[ 7], + values[ 8], values[ 9], values[10], values[11], + values[12], values[13], values[14], values[15]); + cutlass::arch::fence_view_async_tmem_load(); + + // Promote + const auto sf = (kMustUseUniformedSFB or i < num_former_iters) ? 
sf_0 : sf_1; + #pragma unroll + for (uint32_t j = 0; j < kNumElemsPerLDTM; ++ j) + accum[i * kNumElemsPerLDTM + j] += *reinterpret_cast(&values[j]) * sf; + } + } + + // Commit to the tensor memory empty barrier (only at the leader CTA) + tcgen05_before_thread_sync(); + tmem_empty_barriers[accum_stage_idx]->arrive(0u); + } + }); + + // Flush TMA stores + // NOTES: for the first store, we have to flush all previous TMA, + // as we don't share pipeline stages between two blocks + if (epilogue_thread_idx_in_warpgroup == 0) + cute::tma_store_wait<0>(); + cutlass::arch::NamedBarrier(STORE_BLOCK_M, epilogue_warpgroup_idx).sync(); + + // Write shared memory + DG_STATIC_ASSERT(BLOCK_N % STORE_BLOCK_N == 0, "Invalid block sizes"); + + // Epilogue store and addition + // Issue every swizzled atom and pipeline: store shared, add C, and TMA store + constexpr uint32_t kNumStores = BLOCK_N / STORE_BLOCK_N; + #pragma unroll + for (uint32_t s = 0; s < kNumStores; ++ s) { + // Wait shared memory to be released + if (s >= kNumTMAStoreStages) { + if (epilogue_thread_idx_in_warpgroup == 0) + cute::tma_store_wait(); + cutlass::arch::NamedBarrier(STORE_BLOCK_M, epilogue_warpgroup_idx).sync(); + } + + // The pipeline stage + const auto tma_stage_idx = s % kNumTMAStoreStages; + const auto m_idx = scheduler.get_global_idx<(kGemmType != GemmType::MGroupedContiguous)>(shape_m, BLOCK_M, m_block_idx); + const auto n_idx = n_block_idx * BLOCK_N + s * STORE_BLOCK_N; + const auto local_smem_cd = smem_cd[tma_stage_idx] + epilogue_warpgroup_idx * STORE_BLOCK_M * STORE_BLOCK_N; + + // Store into shared memory + #pragma unroll + for (uint32_t i = 0; i < STORE_BLOCK_N / kNumElemsPerBankGroup; ++ i) { + // Calculate the index of the bank group to be written in the atom + auto bank_group_index = i + lane_idx * (kSwizzleCDMode / kNumBankGroupBytes); + + // Reshape the atom in another view and swizzle + // - original: `(LAYOUT_AD_M, kSwizzleCDMode / kNumBankGroupBytes)` + // - new: `(LAYOUT_AD_M * 
kSwizzleCDMode / kNumBankGroupBytes / 8, 8)` + // NOTES: "8" is the number of bank groups, "16" is the swizzling pattern + constexpr bool kHasShortcut = (kSwizzleCDMode / kNumBankGroupBytes) == 8; + auto row = kHasShortcut ? (i / 8 + lane_idx) : (bank_group_index / 8); + auto col = kHasShortcut ? (i) : (bank_group_index % 8); + col ^= row % (kSwizzleCDMode / 16); + + // Source and destination memory address + auto smem_ptr = reinterpret_cast(smem_cd[tma_stage_idx]) + // Base pointer + epilogue_warp_idx * 32 * kSwizzleCDMode + // Warp offset + row * (kNumBankGroupBytes * 8) + col * kNumBankGroupBytes; // In-atom offset + + // Load from tensor memory, store into shared memory + // NOTES: if you want to do accumulation, please notice that you need two accumulation barriers + const auto offset = s * STORE_BLOCK_N + i * kNumElemsPerBankGroup; + if constexpr (std::is_same_v) { + // For FP32 output, read and store + DG_STATIC_ASSERT(kNumElemsPerBankGroup == 4, "Invalid type"); + st_shared(smem_ptr, + *reinterpret_cast(&accum[offset + 0]), + *reinterpret_cast(&accum[offset + 1]), + *reinterpret_cast(&accum[offset + 2]), + *reinterpret_cast(&accum[offset + 3])); + } else { + // For BF16 output, read, cast and store + DG_STATIC_ASSERT(kNumElemsPerBankGroup == 8 and std::is_same_v, "Invalid type"); + st_shared(smem_ptr, + cast_into_bf16_and_pack(accum[offset + 0], accum[offset + 1]), + cast_into_bf16_and_pack(accum[offset + 2], accum[offset + 3]), + cast_into_bf16_and_pack(accum[offset + 4], accum[offset + 5]), + cast_into_bf16_and_pack(accum[offset + 6], accum[offset + 7])); + } + } + + // Synchronize all threads and issue TMA + cute::tma_store_fence(); + cutlass::arch::NamedBarrier(STORE_BLOCK_M, epilogue_warpgroup_idx).sync(); + if (epilogue_thread_idx_in_warpgroup == 0) { + cute::SM90_TMA_STORE_2D::copy( + &tensor_map_d, local_smem_cd, + n_idx, m_idx + epilogue_warpgroup_idx * STORE_BLOCK_M); + cute::tma_store_arrive(); + } + } + } + + // Flush all stages in the pipeline 
to make TMA stores visible to the next kernel + // TODO: do we actually need this? + if (epilogue_thread_idx_in_warpgroup == 0) + cute::tma_store_wait<0>(); + + // Deallocate tensor memory by warp 1 + // NOTES: warp 0 is waiting TMA store + // TODO: do we need 2 SM allocation? + if (epilogue_warp_idx == 1) + cute::TMEM::Allocator1Sm().free(0, kNumTmemCols); + } + + // To safely deconstruct all barriers, we need a cluster sync + // TODO: optimize it by another round of barrier waits + if constexpr (kNumMulticast > 1) + cute::cluster_sync(); +#else + if (blockIdx.x == 0 and threadIdx.x == 0) + DG_DEVICE_ASSERT(false and "This kernel only support sm_100a/sm_101a"); +#endif +} + +}; // namespace deep_gemm + +#pragma clang diagnostic pop diff --git a/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh b/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh index 28b5399a..6fff0252 100644 --- a/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh +++ b/deep_gemm/include/deep_gemm/impls/sm90_fp8_gemm_1d2d.cuh @@ -1,3 +1,441 @@ #pragma once -// TODO: add implement \ No newline at end of file +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunknown-attributes" + +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace deep_gemm { + +using namespace deep_gemm::sm90; + +template +__device__ __host__ void outer_launch_k_iterations(const auto& inner_launch_k_iterations, const auto& func, uint32_t num_former_iters) { + if (num_former_iters == kNumFormerIters) { + inner_launch_k_iterations(func, cute::Int{}); + return; + } + + if constexpr (kNumFormerIters + kGap <= kEnd) + outer_launch_k_iterations(inner_launch_k_iterations, func, num_former_iters); +} + +template +__global__ void __launch_bounds__(kNumTMAThreads + kNumMathThreads, 1) +sm90_fp8_gemm_1d2d_impl(float* sfb, int* grouped_layout, + uint32_t shape_m, uint32_t shape_n, uint32_t shape_k, + const __grid_constant__ CUtensorMap tensor_map_a, + const 
__grid_constant__ CUtensorMap tensor_map_b, + const __grid_constant__ CUtensorMap tensor_map_d, + const __grid_constant__ CUtensorMap tensor_map_sfa) { +#if (defined(__CUDA_ARCH__) and (__CUDA_ARCH__ >= 900)) or defined(__CLION_IDE__) + // Scaling checks + DG_STATIC_ASSERT(BLOCK_K == 128, "Only support per-128-channel FP8 scaling"); + DG_STATIC_ASSERT(constexpr_ceil_div(BLOCK_N, BLOCK_K) == 1 or (constexpr_gcd(BLOCK_N, BLOCK_K) == BLOCK_N - BLOCK_K), "Too much B scales in a single block"); + + // Types + using WGMMA = typename FP8MMASelector::type; + using Barrier = cutlass::arch::ClusterTransactionBarrier; + DG_STATIC_ASSERT(BLOCK_M % WGMMA::M == 0, "Invalid block size"); + + // Overwrite shape constants if the compiler gives + shape_m = SHAPE_M != 0 ? SHAPE_M : shape_m; + shape_n = SHAPE_N != 0 ? SHAPE_N : shape_n; + shape_k = SHAPE_K != 0 ? SHAPE_K : shape_k; + + // Shared memory + static constexpr bool kMustUseUniformedScaleB = (BLOCK_K % BLOCK_N == 0); + static constexpr uint32_t SMEM_D_SIZE = BLOCK_M * BLOCK_N * sizeof(__nv_bfloat16); + static constexpr uint32_t SMEM_A_SIZE_PER_STAGE = BLOCK_M * BLOCK_K * sizeof(__nv_fp8_e4m3); + static constexpr uint32_t SMEM_B_SIZE_PER_STAGE = BLOCK_N * BLOCK_K * sizeof(__nv_fp8_e4m3); + static constexpr uint32_t SMEM_SFA_SIZE_PER_STAGE = BLOCK_M * sizeof(float); + const uint32_t& shape_k_scales = ceil_div(shape_k, BLOCK_K); + const uint32_t& smem_sfb_size = align(shape_k_scales * (kMustUseUniformedScaleB ? 
1 : 2) * sizeof(float), sizeof(Barrier)); + + // Configs + constexpr uint32_t kFullKOfAllStages = kNumStages * BLOCK_K; + const uint32_t num_iterations = ceil_div(shape_k, kFullKOfAllStages); + const uint32_t warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); + const uint32_t lane_idx = get_lane_idx(); + + // Prefetch TMA descriptors at the very beginning + if (threadIdx.x == kNumMathThreads) { + // NOTES: `reinterpret_cast` must be here, or NVRTC will fail + cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_a)); + cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_b)); + cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_sfa)); + cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_d)); + } + __syncwarp(); + + // Align to 1024 bytes for swizzle-128B + extern __shared__ __align__(1024) uint8_t smem_buffer[]; + DG_STATIC_ASSERT(SMEM_D_SIZE % 1024 == 0, "Shared memory of A/B must be aligned to 1024 bytes"); + + // Data on shared memory + auto smem_d = reinterpret_cast<__nv_bfloat16*>(smem_buffer); + __nv_fp8_e4m3* smem_a[kNumStages]; + __nv_fp8_e4m3* smem_b[kNumStages]; + float* smem_sfa[kNumStages]; + float* smem_sfb; + + // TMA Barrier for both divisible and non-divisible cases + Barrier* full_barriers[kNumStages]; + Barrier* empty_barriers[kNumStages]; + + // Fill shared memory pointers + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + smem_a[i] = reinterpret_cast<__nv_fp8_e4m3*>(smem_buffer + SMEM_D_SIZE + i * SMEM_A_SIZE_PER_STAGE); + smem_b[i] = reinterpret_cast<__nv_fp8_e4m3*>(smem_buffer + SMEM_D_SIZE + kNumStages * SMEM_A_SIZE_PER_STAGE + i * SMEM_B_SIZE_PER_STAGE); + smem_sfa[i] = reinterpret_cast(smem_buffer + SMEM_D_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE) + i * SMEM_SFA_SIZE_PER_STAGE); + } + smem_sfb = reinterpret_cast(smem_buffer + SMEM_D_SIZE + kNumStages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SFA_SIZE_PER_STAGE)); + + // Fill barriers + auto 
barrier_start_ptr = reinterpret_cast(reinterpret_cast(smem_sfb) + smem_sfb_size); + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + full_barriers[i] = barrier_start_ptr + i; + empty_barriers[i] = barrier_start_ptr + kNumStages + i; + } + + // Initialize barriers + DG_STATIC_ASSERT(kNumTMAMulticast <= 32, "Too many TMA multicast"); + if (threadIdx.x == kNumMathThreads) { + // NOTES: we always use `lane_idx` to arrive for the `lane_idx`-th CTA in the cluster, + // even with TMA multicast disabled, we want to make the behavior aligned + #pragma unroll + for (uint32_t i = 0; i < kNumStages; ++ i) { + full_barriers[i]->init(1); + empty_barriers[i]->init(kNumTMAMulticast * kNumMathThreads / 32); + } + + // Make initialized barrier visible in async proxy + cutlass::arch::fence_view_async_shared(); + cutlass::arch::fence_barrier_init(); + } + + // Synchronize all threads to make barrier visible in normal memory model + (kNumTMAMulticast > 1) ? cute::cluster_sync() : __syncthreads(); + + // For pipeline unrolling + struct DivisibleK {}; + struct NotDivisibleK {}; + struct SkipComputation {}; + struct NotSkipComputation {}; + auto launch_k_iterations = [=](const auto& func, bool skip_computation, uint32_t num_former_iters) { + constexpr bool kShouldOptimize = BLOCK_K / constexpr_gcd(BLOCK_K, BLOCK_N) <= 4 and not kMustUseUniformedScaleB; + constexpr uint32_t kGap = constexpr_gcd(BLOCK_K, BLOCK_N) / 8; + constexpr uint32_t kEnd = kShouldOptimize ? 
BLOCK_K / 8 : 0; + + // NOTES: for too-many branches (> 5), we disable this optimization + // Otherwise, the compiler must know the dynamic variable `num_former_iters`'s real value + outer_launch_k_iterations<0, kGap, kEnd>([=](const auto& func, auto num_former_iters_type) { + if (skip_computation) { + for (uint32_t k_iter = 0; k_iter < num_iterations; ++ k_iter) + func(k_iter, DivisibleK{}, SkipComputation{}, num_former_iters_type); + } else if (shape_k % kFullKOfAllStages == 0) { + for (uint32_t k_iter = 0; k_iter < num_iterations; ++ k_iter) + func(k_iter, DivisibleK{}, NotSkipComputation{}, num_former_iters_type); + } else { + for (uint32_t k_iter = 0; k_iter < num_iterations - 1; ++ k_iter) + func(k_iter, DivisibleK{}, NotSkipComputation{}, num_former_iters_type); + func(num_iterations - 1, NotDivisibleK{}, NotSkipComputation{}, num_former_iters_type); + } + }, func, kShouldOptimize ? num_former_iters : 0); + }; + + // Register reconfigurations + constexpr uint32_t kNumTMARegisters = 40; + constexpr uint32_t kNumMathRegisters = 232; + + // Block scheduler + uint32_t m_block_idx, n_block_idx; + auto scheduler = Scheduler(shape_m, shape_n, grouped_layout); + + if (threadIdx.x >= kNumMathThreads) { + // TMA warp-group for loading data + cutlass::arch::warpgroup_reg_dealloc(); + + // NOTES: only one thread (or warp) will be used + if (threadIdx.x == kNumMathThreads) { + // Persistently schedule over blocks + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + launch_k_iterations([&](uint32_t k_iter, auto divisible_type, auto _, auto __) { + constexpr bool kHasDivisibleStages = std::is_same_v; + constexpr uint32_t kNumInnerStages = kHasDivisibleStages ? kNumStages : kNumLastStages; + + // Assign TMA multicast number into A and B + // NOTES: there may be additional odd rows/columns or cases where multicast is not possible. 
+ const bool is_tma_multicast_valid = scheduler.is_tma_multicast_valid(m_block_idx); + const uint32_t num_tma_multicast_a = (kIsTMAMulticastOnA and is_tma_multicast_valid) ? kNumTMAMulticast : 1; + const uint32_t num_tma_multicast_b = (not kIsTMAMulticastOnA and is_tma_multicast_valid) ? kNumTMAMulticast : 1; + DG_STATIC_ASSERT(kNumTMAMulticast <= 2, "Scheduler does not support > 2 TMA multicast"); + + // NOTES: unrolling and `kNumInnerStages` are vital for performance, NVCC will try to eliminate all + // shared memory pointers, e.g. `full_barriers` registers, if all the access indices are constant + #pragma unroll + for (uint32_t s = 0; s < kNumInnerStages; ++ s) { + // Wait consumer release + empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); + + // Issue TMA A + constexpr bool kWithGroupOffsetA = kGemmType == GemmType::MGroupedMasked; + auto& full_barrier = *full_barriers[s]; + uint32_t k_idx = k_iter * kFullKOfAllStages + s * BLOCK_K; + tma_copy(&tensor_map_a, reinterpret_cast(&full_barrier), + smem_a[s], k_idx, scheduler.get_global_idx(shape_m, BLOCK_M, m_block_idx), + num_tma_multicast_a); + tma_copy(&tensor_map_sfa, reinterpret_cast(&full_barrier), + smem_sfa[s], m_block_idx * BLOCK_M, + scheduler.get_global_idx(shape_k_scales, 1, k_idx / BLOCK_K), + num_tma_multicast_a); + + // Issue TMA B + tma_copy(&tensor_map_b, reinterpret_cast(&full_barrier), + smem_b[s], k_idx, scheduler.get_global_idx(shape_n, BLOCK_N, n_block_idx, m_block_idx), + num_tma_multicast_b); + full_barrier.arrive_and_expect_tx(SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SFA_SIZE_PER_STAGE); + } + + // Wait unaligned cases + #pragma unroll + for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { + empty_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter + 1) & 1); + full_barriers[s]->arrive(); + } + }, false, 0); + } + + // To safely deconstruct distributed shared barriers, we need another round of empty waits + if constexpr 
(kNumTMAMulticast > 1) { + #pragma unroll + for (uint32_t s = 0; s < kNumStages; ++ s) + empty_barriers[s]->wait((scheduler.current_iter * num_iterations + 1) & 1); + } + } + } else { + // Math warp-groups for WGMMA + cutlass::arch::warpgroup_reg_alloc(); + + // NOTES: use `__shfl_sync` to encourage NVCC to use unified registers + const auto math_wg_idx = __shfl_sync(0xffffffff, threadIdx.x / 128, 0); + const auto r_0 = warp_idx * 16 + lane_idx / 4, r_1 = r_0 + 8; + + // Persistently schedule over blocks + while (scheduler.get_next_block(m_block_idx, n_block_idx)) { + // Decide the number of scales B to load + DG_TRAP_ONLY_DEVICE_ASSERT(shape_n % 8 == 0); + uint32_t num_former_iters = BLOCK_N / 8, num_full_iters = num_former_iters; + if constexpr (not kMustUseUniformedScaleB) { + num_former_iters = min(BLOCK_N, BLOCK_K - n_block_idx * BLOCK_N % BLOCK_K) / 8; + num_full_iters = min(shape_n - n_block_idx * BLOCK_N, BLOCK_N) / 8; + } + uint32_t num_sfb = shape_k_scales * (num_former_iters >= num_full_iters ? 1 : 2); + + // Load B scales with math warp-groups + // NOTES: except the first warp, we want to overlap loading B scales with TMA stores between tasks + if (threadIdx.x >= 32) { + auto num_previous_lines = scheduler.get_global_idx(ceil_div(shape_n, BLOCK_K), 0, 0, m_block_idx); + auto local_sfb = sfb + (num_previous_lines + ((n_block_idx * BLOCK_N) / BLOCK_K)) * shape_k_scales; + #pragma unroll + for (uint32_t i = threadIdx.x - 32; i < num_sfb; i += kNumMathThreads - 32) + st_shared(smem_sfb + i, __ldg(local_sfb + i)); + } + cutlass::arch::NamedBarrier(kNumMathThreads).sync(); + + // Accumulation for WGMMA or CUDA promotion + constexpr uint32_t WAVE_BLOCK_M = WGMMA::M * (BLOCK_M <= 64 ? 
1 : 2); + DG_STATIC_ASSERT(BLOCK_M % WAVE_BLOCK_M == 0, "Invalid block sizes"); + float accum[WGMMA::kNumAccum], final_accum[WGMMA::kNumAccum * (BLOCK_M / WAVE_BLOCK_M)] = {0}; + + // Empty barrier arrival + auto empty_barrier_arrive = [&](uint32_t s) { + if constexpr (kNumTMAMulticast == 1) { + lane_idx == 0 ? empty_barriers[s]->arrive() : void(); + } else { + auto target_cta = scheduler.is_peer_cta_alive ? lane_idx : cute::block_rank_in_cluster(); + lane_idx < kNumTMAMulticast ? empty_barriers[s]->arrive(target_cta) : void(); + } + }; + + // Launch MMAs + launch_k_iterations([&](uint32_t k_iter, auto divisible_type, auto skip_type, auto _) { + constexpr bool kSkipComputation = std::is_same_v; + constexpr bool kHasDivisibleStages = std::is_same_v; + constexpr uint32_t kNumInnerStages = kSkipComputation ? 0 : (kHasDivisibleStages ? kNumStages : kNumLastStages); + + #pragma unroll + for (uint32_t s = 0; s < kNumInnerStages; ++ s) { + // Read B scales + float scale_b_0 = ld_shared(smem_sfb + k_iter * kNumStages + s), scale_b_1; + // NOTES: even some blocks do not need to read the second row, but we still load one to align with other blocks + if constexpr (not kMustUseUniformedScaleB) + scale_b_1 = ld_shared(smem_sfb + k_iter * kNumStages + s + shape_k_scales); + + // Wait TMA arrivals + full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + + // TODO: remove some useless computation for unaligned Ms + #pragma unroll + for (uint32_t local_idx = 0; local_idx < BLOCK_M / WAVE_BLOCK_M; ++ local_idx) { + auto m_offset = local_idx * WAVE_BLOCK_M; + + // Read A scales + // NOTES: all shared memory read must be prior to `warpgroup_arrive` to avoid next scheduled block polluting the results + auto scale_a_0 = ld_shared(smem_sfa[s] + r_0 + m_offset); + auto scale_a_1 = ld_shared(smem_sfa[s] + r_1 + m_offset); + + // Commit WGMMA instructions + #pragma unroll + for (uint32_t i = 0; i < WGMMA::kNumAccum; ++ i) + warpgroup_fence_operand(accum[i]); + 
warpgroup_arrive(); + #pragma unroll + for (uint32_t k = 0; k < BLOCK_K / WGMMA::K; ++ k) { + auto desc_a = make_smem_desc(smem_a[s] + (math_wg_idx * WGMMA::M + m_offset) * BLOCK_K + k * WGMMA::K, 1); + auto desc_b = make_smem_desc(smem_b[s] + k * WGMMA::K, 1); + WGMMA::wgmma(desc_a, desc_b, accum, k); + } + warpgroup_commit_batch(); + #pragma unroll + for (uint32_t i = 0; i < WGMMA::kNumAccum; ++ i) + warpgroup_fence_operand(accum[i]); + warpgroup_wait<0>(); + + // Notify barrier arrival at the last warpgroup wave + if (local_idx == BLOCK_M / WAVE_BLOCK_M - 1) + empty_barrier_arrive(s); + + // Promote with scales + // NOTES: making it as predicates is very important for performance, comparing to two loops + float scale_0_0 = scale_a_0 * scale_b_0, scale_1_0 = scale_a_1 * scale_b_0; + float scale_0_1, scale_1_1; + if constexpr (not kMustUseUniformedScaleB) + scale_0_1 = scale_a_0 * scale_b_1, scale_1_1 = scale_a_1 * scale_b_1; + + auto shifted_accum = final_accum + WGMMA::kNumAccum * local_idx; + #pragma unroll + for (uint32_t i = 0; i < WGMMA::kNumAccum / 4; ++ i) { + // NOTES: for unrolled `num_former_iters` cases, we expect the compiler to automatically make it a constant + bool predicate = kMustUseUniformedScaleB or i < num_former_iters; + shifted_accum[i * 4 + 0] += (predicate ? scale_0_0 : scale_0_1) * accum[i * 4 + 0]; + shifted_accum[i * 4 + 1] += (predicate ? scale_0_0 : scale_0_1) * accum[i * 4 + 1]; + shifted_accum[i * 4 + 2] += (predicate ? scale_1_0 : scale_1_1) * accum[i * 4 + 2]; + shifted_accum[i * 4 + 3] += (predicate ? 
scale_1_0 : scale_1_1) * accum[i * 4 + 3]; + } + } + } + + // Wait unaligned cases + #pragma unroll + for (uint32_t s = kNumInnerStages; s < kNumStages; ++ s) { + full_barriers[s]->wait((scheduler.current_iter * num_iterations + k_iter) & 1); + empty_barrier_arrive(s); + } + }, not scheduler.is_computation_valid(m_block_idx, math_wg_idx * WGMMA::M), num_former_iters); + + // TMA checks + constexpr uint32_t kNumElemBytes = sizeof(nv_bfloat16); + constexpr uint32_t TMA_D_BLOCK_N = kSwizzleDMode == 0 ? BLOCK_N : (kSwizzleDMode / kNumElemBytes); + constexpr uint32_t WGMMA_M_PER_WARP = WGMMA::M / 4; + DG_STATIC_ASSERT(BLOCK_M % 8 == 0, "Invalid swizzling atom"); + DG_STATIC_ASSERT(BLOCK_N % TMA_D_BLOCK_N == 0 and BLOCK_N / TMA_D_BLOCK_N <= 32, + "Unaligned TMA store or too many TMA store instructions"); + DG_STATIC_ASSERT(TMA_D_BLOCK_N % 8 == 0, "Invalid TMA block N"); + + // Wait last TMA store to be finished + if (threadIdx.x < BLOCK_N / TMA_D_BLOCK_N) + cute::tma_store_wait<0>(); + cutlass::arch::NamedBarrier(kNumMathThreads).sync(); + + // Write back to shared memory using STSM and issue TMA stores + DG_STATIC_ASSERT(WGMMA::kNumAccum % 4 == 0, "Invalid STSM x2 vectorization"); + #pragma unroll + for (uint32_t local_idx = 0; local_idx < BLOCK_M / WAVE_BLOCK_M; ++ local_idx) { + auto m_offset = local_idx * WAVE_BLOCK_M; + auto shifted_accum = final_accum + WGMMA::kNumAccum * local_idx; + #pragma unroll + for (auto i = 0; i < WGMMA::kNumAccum / 4; ++ i) { + // Swizzle or padding into the correct address + uint8_t* smem_ptr = nullptr; + if constexpr (kSwizzleDMode > 0) { + // Calculate the swizzling atom offset and in-atom offset + constexpr uint32_t kNumBankGroupBytes = 16; + auto atom_offset = i / (TMA_D_BLOCK_N / 8), in_atom_offset = i % (TMA_D_BLOCK_N / 8); + + // Calculate the index of the bank group to be written in the atom + auto bank_group_index = in_atom_offset + lane_idx * (kSwizzleDMode / kNumBankGroupBytes); + + // Reshape the atom in another view and 
swizzle + // - original: `(BLOCK_M, kSwizzleDMode / kNumBankGroupBytes)` + // - new: `(BLOCK_M * kSwizzleDMode / kNumBankGroupBytes / 8, 8)` + constexpr bool kHasShortcut = (kSwizzleDMode / kNumBankGroupBytes) == 8; + auto row = kHasShortcut ? (in_atom_offset / 8 + lane_idx) : (bank_group_index / 8); + auto col = kHasShortcut ? (in_atom_offset) : (bank_group_index % 8); + col ^= row % (kSwizzleDMode / 16); + + // Add back into the base pointer + // NOTES: think twice before modifying this, as changes may affect the number of instructions + smem_ptr = reinterpret_cast(smem_d) + // Base pointer + warp_idx * (WGMMA_M_PER_WARP * kSwizzleDMode) + // Warp offset + m_offset * kSwizzleDMode + // Wave offset + atom_offset * BLOCK_M * kSwizzleDMode + // Swizzle atom offset (constants) + row * (kNumBankGroupBytes * 8) + col * kNumBankGroupBytes; // In-atom offset + } else { + // No swizzling, just padding + smem_ptr = reinterpret_cast(smem_d + (m_offset + warp_idx * WGMMA_M_PER_WARP + lane_idx) * BLOCK_N + i * 8); + } + + // NOTES: only 16 lanes' addresses are used + SM90_U32x2_STSM_N::copy( + __float22bfloat162_rn({shifted_accum[i * 4 + 0], shifted_accum[i * 4 + 1]}), + __float22bfloat162_rn({shifted_accum[i * 4 + 2], shifted_accum[i * 4 + 3]}), + smem_ptr + ); + } + } + cute::tma_store_fence(); + cutlass::arch::NamedBarrier(kNumMathThreads).sync(); + + // Use TMA store to write back to global memory + // TODO: compatible with FP32 output + constexpr bool kWithGroupOffsetD = kGemmType == GemmType::MGroupedMasked; + DG_STATIC_ASSERT(kNumMathThreads >= BLOCK_N / TMA_D_BLOCK_N, "Too many TMA blocks"); + if (threadIdx.x < BLOCK_N / TMA_D_BLOCK_N) { + auto in_block_n_offset = threadIdx.x * TMA_D_BLOCK_N; + auto smem_ptr = smem_d + in_block_n_offset * BLOCK_M; + cute::SM90_TMA_STORE_2D::copy(&tensor_map_d, smem_ptr, + n_block_idx * BLOCK_N + in_block_n_offset, + scheduler.get_global_idx(shape_m, BLOCK_M, m_block_idx)); + cute::tma_store_arrive(); + } + __syncwarp(); + } + } +#else 
+ if (blockIdx.x == 0 and threadIdx.x == 0) + DG_DEVICE_ASSERT(false and "This kernel only support sm_90a"); +#endif +} + +}; // namespace deep_gemm + +#pragma clang diagnostic pop diff --git a/deep_gemm/include/deep_gemm/impls/smxx_layout.cuh b/deep_gemm/include/deep_gemm/impls/smxx_layout.cuh new file mode 100644 index 00000000..5b979a8c --- /dev/null +++ b/deep_gemm/include/deep_gemm/impls/smxx_layout.cuh @@ -0,0 +1,139 @@ +#pragma once + +#include + +#include + +namespace deep_gemm { + +// NOTES: the two kernels below always pack the K dimension + +template +__global__ void transpose_and_pack_fp32_into_ue8m0(float* sf, uint32_t* out, const uint32_t mn) { + extern __shared__ uint32_t smem_buffer[]; + + // Shapes and strides + constexpr auto kNumPackedSFK = constexpr_ceil_div(SF_K, 4u); + constexpr auto kNumTMAAlignedElems = static_cast(16 / sizeof(int)); + const auto in_block_mn = min(BLOCK_MN, mn - blockIdx.x * BLOCK_MN); + const auto tma_aligned_mn = align(mn, kNumTMAAlignedElems); + + // Shift into the group + sf = sf + static_cast(blockIdx.y) * mn * SF_K; + out = out + static_cast(blockIdx.y) * tma_aligned_mn * kNumPackedSFK; + + // Load FP32 SFs + DG_STATIC_ASSERT(BLOCK_MN % 4 == 0, "Invalid block size"); + const auto local_sf = reinterpret_cast(sf + static_cast(blockIdx.x) * (BLOCK_MN * SF_K)); + const auto num_values = in_block_mn * SF_K; + const auto num_uint4 = num_values / 4; + #pragma unroll + for (uint32_t i = threadIdx.x; i < num_uint4; i += kNumThreads) { + const auto& [x, y, z, w] = __ldg(reinterpret_cast(local_sf) + i); + st_shared(reinterpret_cast(smem_buffer) + i, x, y, z, w); + } + + // Fill unaligned values as well + if (const auto unaligned_idx = num_uint4 * 4 + threadIdx.x; unaligned_idx < num_values) + st_shared(smem_buffer + unaligned_idx, __ldg(local_sf + unaligned_idx)); + __syncthreads(); + + // Pack into UE8M0 and store + #pragma unroll + for (uint32_t i = threadIdx.x; i < (kNumPackedSFK * BLOCK_MN); i += kNumThreads) { + const auto 
sf_k_pack_idx = i / BLOCK_MN, mn_idx = i % BLOCK_MN; + + // Load shared memory + uint32_t values[4]; + #pragma unroll + for (uint32_t j = 0; j < 4; ++ j) { + const auto sf_k_idx = sf_k_pack_idx * 4 + j; + values[j] = sf_k_idx < SF_K ? ld_shared(smem_buffer + mn_idx * SF_K + sf_k_idx) : 0; + } + + // Pack and store + uint32_t packed = 0; + packed |= (values[0] >> 23u); + packed |= (values[1] >> 15u); + packed |= (values[2] >> 7u); + packed |= (values[3] << 1u); + if (const auto global_mn_idx = blockIdx.x * BLOCK_MN + mn_idx; global_mn_idx < mn) + out[sf_k_pack_idx * tma_aligned_mn + global_mn_idx] = packed; + } +} + +template +__global__ void pack_fp32_into_ue8m0(float* sf, uint32_t* out, uint32_t* ks, + const uint32_t mn, uint32_t sf_k, const uint32_t packed_sf_k) { + // Always packing the K dimension + // NOTES: should also assert `mn % 4 == 0` at launch + DG_STATIC_ASSERT(kTransposed, "Currently only support transposed SFs (MN-major)"); + DG_STATIC_ASSERT(BLOCK_MN % 4 == 0, "Invalid block sizes"); + DG_STATIC_ASSERT(BLOCK_PACKED_SF_K == kNumThreads / 32, "Invalid block sizes"); + + // Shapes and strides + const auto in_block_mn = min(BLOCK_MN, mn - blockIdx.x * BLOCK_MN); + const auto in_block_mn_uint4 = in_block_mn / 4; + const auto in_block_packed_sf_k = min(BLOCK_PACKED_SF_K, packed_sf_k - blockIdx.y * BLOCK_PACKED_SF_K); + + // Shift into the right block along MN + sf += blockIdx.x * BLOCK_MN; + out += blockIdx.x * BLOCK_MN; + + // Each warp is responsible for a packed row + const auto warp_idx = threadIdx.x / 32; + const auto lane_idx = get_lane_idx(); + const auto packed_sf_k_idx = static_cast(blockIdx.y) * BLOCK_PACKED_SF_K + warp_idx; + if (warp_idx >= in_block_packed_sf_k) + return; + + // Make an offset on the input + uint32_t input_offset = 0; + if constexpr (kNumGroups > 1) { + // Load each group's size + DG_STATIC_ASSERT(kNumGroups <= 128, "Too many groups"); + uint32_t group_ks[4]; + #pragma unroll + for (uint32_t i = 0; i < 4; ++ i) { + const auto 
group_idx = lane_idx * 4 + i; + group_ks[i] = group_idx < kNumGroups ? __ldg(ks + group_idx) : 0; + } + __syncwarp(); + + // Make the offset + sf_k = 0; + auto sum_packed_sf_k = 0; + #pragma unroll + for (uint32_t i = 0; i < kNumGroups; ++ i) { + const auto sf_k_in_group = __shfl_sync(0xffffffff, group_ks[i % 4] / 128, i / 4); + sf_k += sf_k_in_group; + sum_packed_sf_k += ceil_div(sf_k_in_group, 4u); + if (packed_sf_k_idx < sum_packed_sf_k) + break; + if (const auto remainder = sf_k_in_group % 4; remainder > 0) + input_offset += 4 - remainder; + } + } + + for (uint32_t mn_idx = get_lane_idx(); mn_idx < in_block_mn_uint4; mn_idx += 32) { + // Load + uint4 values[4]; + #pragma unroll + for (uint32_t j = 0; j < 4; ++ j) { + values[j] = make_uint4(0, 0, 0, 0); + if (const auto sf_k_idx = packed_sf_k_idx * 4 + j - input_offset; sf_k_idx < sf_k) + values[j] = __ldg(reinterpret_cast(sf + sf_k_idx * mn) + mn_idx); + } + + // Pack and store + uint4 packed; + packed.x = (values[0].x >> 23u) | (values[1].x >> 15u) | (values[2].x >> 7u) | (values[3].x << 1u); + packed.y = (values[0].y >> 23u) | (values[1].y >> 15u) | (values[2].y >> 7u) | (values[3].y << 1u); + packed.z = (values[0].z >> 23u) | (values[1].z >> 15u) | (values[2].z >> 7u) | (values[3].z << 1u); + packed.w = (values[0].w >> 23u) | (values[1].w >> 15u) | (values[2].w >> 7u) | (values[3].w << 1u); + reinterpret_cast(out + packed_sf_k_idx * mn)[mn_idx] = packed; + } +} + +} // namespace deep_gemm diff --git a/deep_gemm/jit/__init__.py b/deep_gemm/jit/__init__.py deleted file mode 100644 index 3fcc714d..00000000 --- a/deep_gemm/jit/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .compiler import get_nvcc_compiler, build, NVCCCompiler, NVRTCCompiler -from .runtime import Runtime, pytypes_to_ctypes diff --git a/deep_gemm/jit/compiler.py b/deep_gemm/jit/compiler.py deleted file mode 100644 index 3306077d..00000000 --- a/deep_gemm/jit/compiler.py +++ /dev/null @@ -1,317 +0,0 @@ -import functools -import getpass -import 
hashlib -import os -import re -import subprocess -import time -import torch -import uuid -from typing import Any, Dict, List, Tuple, Type - -import cuda.bindings -import cuda.bindings.nvrtc as nvrtc -from torch.utils.cpp_extension import CUDA_HOME - -from .scripts import sm90_interleave_ffma -from .runtime import Runtime, RuntimeCache - -runtime_cache = RuntimeCache() - - -@functools.lru_cache(maxsize=None) -def get_device_arch(): - major, minor = torch.cuda.get_device_capability() - suffix = 'a' if major >= 9 else '' - return f'{major * 10 + minor}{suffix}' - - -def hash_to_hex(s: str) -> str: - md5 = hashlib.md5() - md5.update(s.encode('utf-8')) - return md5.hexdigest()[0:12] - - -@functools.lru_cache(maxsize=None) -def get_jit_include_dir() -> str: - return os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'include') - - -@functools.lru_cache(maxsize=None) -def get_deep_gemm_version() -> str: - md5 = hashlib.md5() - - # Update include directories - include_dir = os.path.join(get_jit_include_dir(), 'deep_gemm') - assert os.path.exists(include_dir), f'Cannot find GEMM include directory {include_dir}' - # Recursively walk through all subdirectories - for root, dirs, files in os.walk(include_dir): - for filename in filter(lambda x: x.endswith('.cuh'), sorted(files)): - filepath = os.path.join(root, filename) - with open(filepath, 'rb') as f: - md5.update(f.read()) - - # Update post-compilation scripts - script_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'scripts') - for filename in filter(lambda x: x.endswith('.py'), sorted(os.listdir(script_dir))): - with open(os.path.join(script_dir, filename), 'rb') as f: - md5.update(f.read()) - return md5.hexdigest()[0:12] - - -@functools.lru_cache(maxsize=None) -def get_nvcc_compiler() -> Tuple[str, str]: - paths = [] - if os.getenv('DG_JIT_NVCC_COMPILER'): - paths.append(os.getenv('DG_JIT_NVCC_COMPILER')) - paths.append(os.path.join(CUDA_HOME, 'bin', 'nvcc')) - - # Try to find the first 
available NVCC compiler - least_version_required = '12.3' - version_pattern = re.compile(r'release (\d+\.\d+)') - for path in paths: - if os.path.exists(path): - command = [path, '--version'] - result = subprocess.run(command, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, text=True) - match = version_pattern.search(result.stdout) - version = match.group(1) - assert match, f'Cannot get the version of NVCC compiler {path}' - assert version >= least_version_required, f'NVCC {path} version {version} is lower than {least_version_required}' - return path, version - raise RuntimeError('Cannot find any available NVCC compiler') - - -@functools.lru_cache(maxsize=None) -def get_default_user_dir(): - if 'DG_JIT_CACHE_DIR' in os.environ: - path = os.getenv('DG_JIT_CACHE_DIR') - os.makedirs(path, exist_ok=True) - return path - - # By default, the user home directory is `~` - path = os.path.expanduser('~') - - # For a cluster environment, we may use a shared directory - # e.g., `/cluster/shared/user_0`, `/cluster/shared/user_1` - if 'DG_JIT_CACHE_HOME_DIR' in os.environ: - path = os.path.join(os.environ['DG_JIT_CACHE_HOME_DIR'], getpass.getuser()) - return os.path.join(path, '.deep_gemm') - - -@functools.lru_cache(maxsize=None) -def get_default_cache_dir(): - return os.path.join(get_default_user_dir(), 'cache') - - -def make_default_tmp_dir(): - tmp_dir = os.path.join(get_default_user_dir(), 'tmp') - os.makedirs(tmp_dir, exist_ok=True) - return tmp_dir - - -def get_shared_cache_dirs(name: str): - if 'DG_JIT_CACHE_HOME_DIR' in os.environ and 'DG_JIT_CACHE_SHARED_USERS' in os.environ: - return [os.path.join(os.environ['DG_JIT_CACHE_HOME_DIR'], user, 'cache', name) - for user in os.environ['DG_JIT_CACHE_SHARED_USERS'].split(':')] - return [] - - -def put(path, data): - # Write and do POSIX atomic replace - tmp_file_path = os.path.join(make_default_tmp_dir(), f'file.tmp.{str(uuid.uuid4())}.{hash_to_hex(path)}') - with open(tmp_file_path, 'wb' if isinstance(data, bytes) else 'w') 
as f: - f.write(data) - os.replace(tmp_file_path, path) - - -class Compiler: - @classmethod - def signature(cls) -> str: - pass - - @staticmethod - def __version__() -> Tuple[int, int]: - pass - - @classmethod - def compile(cls, name: str, code: str, target_path: str) -> None: - pass - - @staticmethod - def flags() -> List[str]: - cpp_standard = int(os.getenv('DG_JIT_OVERRIDE_CPP_STANDARD', 20)) - return [f'-std=c++{cpp_standard}', - '--ptxas-options=--register-usage-level=10' + - (',--verbose' if 'DG_JIT_PTXAS_VERBOSE' in os.environ else ''), - # Suppress some unnecessary warnings, such as unused variables for certain `constexpr` branch cases - '--diag-suppress=39,161,174,177,940'] - - @staticmethod - def include_dirs() -> List[str]: - return [get_jit_include_dir()] - - @classmethod - def build(cls, name: str, code: str, runtime_cls: Type[Runtime], kwargs: Dict[str, Any] = None) -> Runtime: - # Compiler flags - flags = cls.flags() - - # Build signature - # TODO: refactor post-process scripts if we have more in the future (or remove `< 12.9` support) - enable_sass_opt = cls.__version__() <= (12, 8) and get_device_arch() == '90a' and not int(os.getenv('DG_JIT_DISABLE_FFMA_INTERLEAVE', 0)) - signature = f'{name}$${get_deep_gemm_version()}$${cls.signature()}$${flags}$${enable_sass_opt}$${code}' - name = f'kernel.{name}.{hash_to_hex(signature)}' - path = os.path.join(get_default_cache_dir(), name) - - # Check runtime cache or file system hit - # NOTES: also try to use other users' cache - global runtime_cache - for possible_path in [path, *get_shared_cache_dirs(name)]: - cached_runtime = runtime_cache.get(possible_path, runtime_cls, name, kwargs) - if cached_runtime is not None: - if int(os.getenv('DG_JIT_DEBUG', 0)): - print(f'Using cached JIT runtime {name} during build') - return cached_runtime - - # Compile into a temporary CU file - os.makedirs(path, exist_ok=True) - cubin_path = os.path.join(path, 'kernel.cubin') - tmp_cubin_path = 
os.path.join(make_default_tmp_dir(), f'nvcc.tmp.{str(uuid.uuid4())}.{hash_to_hex(cubin_path)}.cubin') - - start_time = time.time() - cls.compile(name, code, tmp_cubin_path) - end_time = time.time() - elapsed_time = end_time - start_time - if int(os.getenv('DG_JIT_DEBUG', 0)): - print(f'Compilation of JIT runtime {name} took {elapsed_time:.2f} seconds.') - - # Interleave FFMA reuse (SM90 only) - if enable_sass_opt: - sm90_interleave_ffma.process(tmp_cubin_path) - - # Atomic replace files - os.replace(tmp_cubin_path, cubin_path) - - # Put cache and return - runtime = runtime_cache.get(path, runtime_cls, name, kwargs, force_enable_cache=True) - assert runtime is not None - return runtime - - -class NVCCCompiler(Compiler): - @staticmethod - def __version__() -> Tuple[int, int]: - _, version = get_nvcc_compiler() - major, minor = map(int, version.split('.')) - return major, minor - - @classmethod - def signature(cls) -> str: - return f'{get_nvcc_compiler()[0]}+{cls.__version__()}' - - @classmethod - def flags(cls) -> List[str]: - cxx_flags = ['-fPIC', '-O3', '-fconcepts', '-Wno-deprecated-declarations', '-Wno-abi'] - return [*super().flags(), *[f'-I{d}' for d in cls.include_dirs()], - f'--gpu-architecture=sm_{get_device_arch()}', - '-cubin', '-O3', '--expt-relaxed-constexpr', '--expt-extended-lambda', - f'--compiler-options={",".join(cxx_flags)}'] - - @classmethod - def compile(cls, name: str, code: str, target_path: str) -> None: - # Write the code - path = os.path.join(get_default_cache_dir(), name) - src_path = os.path.join(path, 'kernel.cu') - put(src_path, code) - command = [get_nvcc_compiler()[0], - src_path, '-o', target_path, - *cls.flags()] - if int(os.getenv('DG_JIT_DEBUG', 0)) or int(os.getenv('DG_JIT_PRINT_COMPILER_COMMAND', 0)): - print(f'Compiling JIT runtime {name} with command {command}') - - result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - if result.returncode != 0: - print(f'NVCC compilation failed: stdout: 
{result.stdout}, stderr: {result.stderr}') - assert False, f'Failed to compile {src_path}' - - # Print PTXAS log - if int(os.getenv('DG_JIT_DEBUG', 0)) or int(os.getenv('DG_JIT_PTXAS_VERBOSE', 0)): - print(result.stderr) - - -class NVRTCCompiler(Compiler): - @staticmethod - def __version__() -> Tuple[int, int]: - res, major, minor = nvrtc.nvrtcVersion() - if res != nvrtc.nvrtcResult.NVRTC_SUCCESS: - # Failed to get the actual NVRTC version, use cuda-bindings version instead - major, minor = map(int, cuda.bindings.__version__.split('.')[:2]) - return major, minor - - @classmethod - def signature(cls) -> str: - return f'nvrtc+{cls.__version__()}' - - @staticmethod - def include_dirs() -> List[str]: - if CUDA_HOME is None: - raise RuntimeError('CUDA_HOME is required for NVRTC compilation') - return [get_jit_include_dir(), os.path.join(CUDA_HOME, 'include')] - - @classmethod - def flags(cls) -> List[str]: - flags = [*super().flags(), *[f'-I{d}' for d in cls.include_dirs()], - f'--gpu-architecture=sm_{get_device_arch()}', '-default-device'] - # NOTES: PCH is vital for compilation speed - if cls.__version__() >= (12, 8): - flags += ['--pch'] - if int(os.getenv('DG_JIT_DEBUG', 0)): - flags += ['--pch-verbose=true'] - return flags - - @classmethod - def compile(cls, name: str, code: str, target_path: str) -> None: - assert int(os.getenv('DG_JIT_PTXAS_VERBOSE', 0)) == 0, '`ptxas --verbose` is not compatible with NVRTC' - - # Create program - code_bytes = bytes(code, 'utf-8') - result, program = nvrtc.nvrtcCreateProgram( - code_bytes, bytes(name, 'utf-8'), 0, [], []) - assert result == nvrtc.nvrtcResult.NVRTC_SUCCESS, f'Failed to create program: {result}' - - # Compile - options = [bytes(flag, 'utf-8') for flag in cls.flags()] - if int(os.getenv('DG_JIT_DEBUG', 0)) or int(os.getenv('DG_JIT_PRINT_COMPILER_COMMAND', 0)): - print(f'Compiling JIT runtime {name} with options: {options}') - compile_result = nvrtc.nvrtcCompileProgram(program, len(options), options)[0] - - # Print 
compiler log - if int(os.getenv('DG_JIT_DEBUG', 0)) or compile_result != nvrtc.nvrtcResult.NVRTC_SUCCESS: - result, log_size = nvrtc.nvrtcGetProgramLogSize(program) - assert result == nvrtc.nvrtcResult.NVRTC_SUCCESS, f'Failed to get program log size: {result}' - - log_bytes = bytes(log_size) - result = nvrtc.nvrtcGetProgramLog(program, log_bytes)[0] - assert result == nvrtc.nvrtcResult.NVRTC_SUCCESS, f'Failed to get program log: {result}' - print(f'Compiler log: {log_bytes.decode("utf-8")}') - - # Exit if failed - assert compile_result == nvrtc.nvrtcResult.NVRTC_SUCCESS, f'Failed to compile program: {compile_result}' - - # Create CUBIN - result, cubin_size = nvrtc.nvrtcGetCUBINSize(program) - assert result == nvrtc.nvrtcResult.NVRTC_SUCCESS, f'Failed to get CUBIN size: {result}' - cubin_bytes = bytes(cubin_size) - result = nvrtc.nvrtcGetCUBIN(program, cubin_bytes)[0] - assert result == nvrtc.nvrtcResult.NVRTC_SUCCESS, f'Failed to get CUBIN: {result}' - - # Write into the file system - put(target_path, cubin_bytes) - - # Destroy handler - assert nvrtc.nvrtcDestroyProgram(program)[0] == nvrtc.nvrtcResult.NVRTC_SUCCESS, f'Failed to destroy program: {result}' - - -def build(name: str, code: str, runtime_cls: Type[Runtime], kwargs: Dict[str, Any] = None) -> Runtime: - compiler_cls = NVRTCCompiler if int(os.getenv('DG_JIT_USE_NVRTC', 0)) else NVCCCompiler - return compiler_cls.build(name, code, runtime_cls, kwargs) diff --git a/deep_gemm/jit/runtime.py b/deep_gemm/jit/runtime.py deleted file mode 100644 index e8f980ae..00000000 --- a/deep_gemm/jit/runtime.py +++ /dev/null @@ -1,114 +0,0 @@ -import os -import subprocess -import time -import torch -import cuda.bindings.driver as cbd - -from typing import Any, Dict, Optional, Type -from torch.utils.cpp_extension import CUDA_HOME - - -class Runtime: - def __init__(self, path: str) -> None: - self.path = path - self.lib = None - self.kernel = None - assert self.is_path_valid(self.path) - - @staticmethod - def 
is_path_valid(path: str) -> bool: - # Exists and is a directory - if not os.path.exists(path) or not os.path.isdir(path): - return False - - # Contains all necessary files - files = ['kernel.cubin'] - return all(os.path.exists(os.path.join(path, file)) for file in files) - - @staticmethod - def generate(kwargs: Dict[str, Any]) -> str: - raise NotImplemented - - @staticmethod - def launch(kernel: cbd.CUkernel, kwargs: Dict[str, Any]) -> cbd.CUresult: - raise NotImplemented - - def __call__(self, **kwargs) -> cbd.CUresult: - # Load CUBIN - if self.kernel is None: - start_time = time.time_ns() - - # Load CUBIN - path = bytes(os.path.join(self.path, 'kernel.cubin'), 'utf-8') - result, self.lib = cbd.cuLibraryLoadFromFile(path, [], [], 0, [], [], 0) - assert result == cbd.CUresult.CUDA_SUCCESS, f'Failed to load library: {result}' - - # Extract the kernel name - # TODO: use `cuda-bindings` API to do this (requires at least 12.8) - command = [f'{CUDA_HOME}/bin/cuobjdump', '-symbols', path] - result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - assert result.returncode == 0 - illegal_names = ['vprintf', '__instantiate_kernel', '__internal', '__assertfail'] - check_illegal = lambda line: any([name in line for name in illegal_names]) - kernel_names = [line.split()[-1] for line in result.stdout.splitlines() - if line.startswith('STT_FUNC') and not check_illegal(line)] - assert len(kernel_names) == 1, f'Too many kernels in the library: {kernel_names}' - - # Load kernel from the library - result, self.kernel = cbd.cuLibraryGetKernel(self.lib, bytes(kernel_names[0], encoding='utf-8')) - assert result == cbd.CUresult.CUDA_SUCCESS, f'Failed to load kernel: {result}' - - end_time = time.time_ns() - elapsed_time = (end_time - start_time) / 1e6 - if int(os.getenv('DG_JIT_DEBUG', 0)): - print(f'Loading JIT runtime {self.path} took {elapsed_time:.2f} ms.') - - # noinspection PyArgumentList - return self.launch(self.kernel, kwargs) - - def 
__del__(self) -> None: - if self.lib is not None: - res = cbd.cuLibraryUnload(self.lib)[0] - if res != cbd.CUresult.CUDA_SUCCESS: - raise Exception(f'Failed to unload library {self.path}: {res}') - - -class RuntimeCache: - def __init__(self) -> None: - self.cache = {} - - def __setitem__(self, path: str, runtime: Runtime) -> None: - self.cache[path] = runtime - - def get(self, path: str, runtime_cls: Type[Runtime], - name: str = '', kwargs: Dict[str, Any] = None, - force_enable_cache: bool = False) -> Optional[Runtime]: - # In Python runtime - if path in self.cache: - return self.cache[path] - - # Already compiled - use_cache = force_enable_cache or not int(os.getenv('DG_JIT_DISABLE_CACHE', 0)) - if use_cache and os.path.exists(path) and Runtime.is_path_valid(path): - # Print heuristic for the first time - if name and (int(os.getenv('DG_JIT_DEBUG', 0)) or int(os.getenv('DG_PRINT_CONFIGS', 0))): - simplified_kwargs = dict() - for key, value in kwargs.items() if kwargs is not None else dict().items(): - value = f'torch.Tensor<{value.dtype}>' if isinstance(value, torch.Tensor) else value - value = f'cuda.bindings.driver.CUtensorMap' if isinstance(value, cbd.CUtensorMap) else value - simplified_kwargs[key] = value - print(f'Put kernel {name} with {simplified_kwargs} into runtime cache') - - runtime = runtime_cls(path) - self.cache[path] = runtime - return runtime - return None - - -# Map some common Python types into C types -pytypes_to_ctypes = { - True: 'true', - False: 'false', - torch.bfloat16: 'cutlass::bfloat16_t', - torch.float: 'float' -} diff --git a/deep_gemm/jit/scripts/__init__.py b/deep_gemm/jit/scripts/__init__.py deleted file mode 100644 index 1661a07a..00000000 --- a/deep_gemm/jit/scripts/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import sm90_interleave_ffma diff --git a/deep_gemm/jit/scripts/sm90_interleave_ffma.py b/deep_gemm/jit/scripts/sm90_interleave_ffma.py deleted file mode 100644 index 7899a221..00000000 --- a/deep_gemm/jit/scripts/sm90_interleave_ffma.py +++ /dev/null @@ -1,137 +0,0 @@ -import argparse -import mmap -import os -import re -import subprocess -from torch.utils.cpp_extension import CUDA_HOME - - -def run_cuobjdump(file_path): - command = [f'{CUDA_HOME}/bin/cuobjdump', '-sass', file_path] - result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - assert result.returncode == 0 - return result.stdout - - -def extract_ffma(sass): - lines = sass.splitlines() - collected = [] - current = [] - - arch_name, func_name = 'N/A', 'N/A' - skip_next_line = False - for line in lines: - if 'code for' in line: - arch_name = line.lstrip().lstrip('code for ').rstrip() - elif 'Function :' in line: - func_name = line.lstrip().lstrip('Function :').rstrip() - elif 'FFMA' in line: - current.append(line) - skip_next_line = True - elif skip_next_line: - current.append(line) - skip_next_line = False - else: - if len(current) >= 16: - assert len(current) % 2 == 0 - collected.append((f'{arch_name}::{func_name}', current)) - current = [] - - if int(os.getenv('DG_JIT_PRINT_REG_REUSE', 0)): - print(f'Found {len(collected)} FFMA segments') - return collected - - -def extract_hex_from_line(line): - match = re.search(r'/\*\s*(0x[0-9a-fA-F]+)\s*\*/', line) - assert match - return int(match.group(1), 16) - - -def validate(m, offset, le_bytes, num_lines): - assert len(le_bytes) == num_lines // 2 - assert m[offset:offset + 16] == le_bytes[0] - for i in range(1, num_lines // 2): - if m[offset + i * 16:offset + i * 16 + 16] != le_bytes[i]: - return False - return True - - -def parse_registers(line): - line = re.sub(r'/\*.*?\*/', '', line) - line = line.replace(';', '') - tokens = line.strip().split(',') - registers = [] - for token in tokens: - token = token.strip() - words 
= token.split() - for word in words: - if word.startswith('R'): - reg = word.split('.')[0] - registers.append(reg) - return registers - - -def modify_segment(m, name, ffma_lines): - num_lines = (len(ffma_lines) * 9 // 16) // 2 * 2 - assert num_lines % 2 == 0 - - le_bytes, new_le_bytes = [], [] - reused_list = [] - dst_reg_set = set() - last_reused, last_dst_reg = False, '' - num_changed = 0 - for i in range(num_lines // 2): - dst_reg = parse_registers(ffma_lines[i * 2])[-2] - low_line, high_line = ffma_lines[i * 2], ffma_lines[i * 2 + 1] - low_hex, high_hex = extract_hex_from_line(low_line), extract_hex_from_line(high_line) - le_bytes.append(low_hex.to_bytes(8, 'little') + high_hex.to_bytes(8, 'little')) - reused = (high_hex & 0x0800000000000000) != 0 - if reused: - is_first_occurred = dst_reg not in dst_reg_set - if is_first_occurred or (last_reused and dst_reg == last_dst_reg): - # Modify the `reuse` and `yield` bits - assert high_hex & 0x0800200000000000, f'{hex(high_hex)}' - high_hex ^= 0x0800200000000000 - reused = False - num_changed += 1 - else: - reused_list.append(i) - dst_reg_set.add(dst_reg) - new_le_bytes.append(low_hex.to_bytes(8, 'little') + high_hex.to_bytes(8, 'little')) - last_reused, last_dst_reg = reused, dst_reg - if int(os.getenv('DG_JIT_PRINT_REG_REUSE', 0)): - print(f' > segment `{name}` new reused list ({num_changed} changed): {reused_list}') - - # Find the offset - offsets = [] - offset = m.find(le_bytes[0]) - while offset != -1: - offsets.append(offset) - offset = m.find(le_bytes[0], offset + 1) - offsets = list(filter(lambda x: validate(m, x, le_bytes, num_lines), offsets)) - - # Replace with `new_le_bytes` - for offset in offsets: - for i in range(num_lines // 2): - m[offset + i * 16:offset + i * 16 + 16] = new_le_bytes[i] - - -def process(path): - if int(os.getenv('DG_JIT_PRINT_REG_REUSE', 0)): - print(f'Processing {path}') - output = run_cuobjdump(path) - segments = extract_ffma(output) - with open(path, 'r+b') as f: - mm = 
mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_WRITE) - for segment in segments: - modify_segment(mm, *segment) - mm.close() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Interleave FFMA reg reuse') - parser.add_argument('--so', help='Path to the SO file') - args = parser.parse_args() - - process(args.so) diff --git a/deep_gemm/jit_kernels/__init__.py b/deep_gemm/jit_kernels/__init__.py deleted file mode 100644 index e6c320de..00000000 --- a/deep_gemm/jit_kernels/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from . import ( - heuristics, - impls, - runtime -) diff --git a/deep_gemm/jit_kernels/heuristics/__init__.py b/deep_gemm/jit_kernels/heuristics/__init__.py deleted file mode 100644 index 24ec067e..00000000 --- a/deep_gemm/jit_kernels/heuristics/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from . import ( - common, - sm90_heuristics, - sm100_heuristics -) diff --git a/deep_gemm/jit_kernels/heuristics/common.py b/deep_gemm/jit_kernels/heuristics/common.py deleted file mode 100644 index bf683afb..00000000 --- a/deep_gemm/jit_kernels/heuristics/common.py +++ /dev/null @@ -1,49 +0,0 @@ -from ...jit.compiler import get_device_arch -from ...utils.math import ceil_div - - -class MulticastConfig: - def __init__(self, num_multicast: int, is_multicast_on_a: bool): - self.num_multicast = num_multicast - self.is_multicast_on_a = is_multicast_on_a - - def get_ab_load_block_m(self, block_m: int): - # NOTES: this for >= SM100 only - assert get_device_arch() != '90a' - return block_m // (self.num_multicast if self.is_multicast_on_a else 1) - - def get_ab_load_block_n(self, block_n: int): - # NOTES: this for >= SM100 only - assert get_device_arch() != '90a' - return block_n // (1 if self.is_multicast_on_a else self.num_multicast) - - -class SharedMemoryConfig: - def __init__(self, smem_size: int, swizzle_a_mode: int, swizzle_b_mode: int, swizzle_cd_mode: int): - self.smem_size = smem_size - self.swizzle_a_mode = swizzle_a_mode - self.swizzle_b_mode = 
swizzle_b_mode - # NOTES: sometimes the default swizzling pattern maybe not compatible (e.g., FP32 output) - self.swizzle_cd_mode = swizzle_cd_mode - # TODO: swizzle SF as well - self.swizzle_sf_mode = 0 - - assert self.swizzle_a_mode != 0 - assert self.swizzle_b_mode != 0 - assert self.swizzle_cd_mode > 16 - assert self.swizzle_sf_mode == 0 - - -def is_multicast_legal(shape_dim: int, block_dim: int, num_multicast: int, num_sms: int, - require_divisible: bool = False) -> bool: - divisible = ceil_div(shape_dim, block_dim) % num_multicast == 0 or not require_divisible - return divisible and num_sms % num_multicast == 0 - - -def get_swizzle_mode(block_size: int, elem_size: int) -> int: - # `> 0` means interleaving - # 16B actually means non-swizzling (but interleaving) - for mode_bytes in (128, 64, 32, 16): - if (block_size * elem_size) % mode_bytes == 0: - return mode_bytes - assert False, 'Invalid mode' diff --git a/deep_gemm/jit_kernels/heuristics/sm100_heuristics.py b/deep_gemm/jit_kernels/heuristics/sm100_heuristics.py deleted file mode 100644 index 071c61bd..00000000 --- a/deep_gemm/jit_kernels/heuristics/sm100_heuristics.py +++ /dev/null @@ -1,171 +0,0 @@ -import functools -import torch -from typing import Tuple - -from .common import ( - MulticastConfig, SharedMemoryConfig, - is_multicast_legal, get_swizzle_mode -) -from ...utils.math import align, ceil_div -from ...utils.layout import ( - GemmType, MajorTypeAB, MajorTypeCD, - get_element_size, get_m_alignment_for_contiguous_layout -) - - -def get_sf_aligned_block_sizes(block_m: int, block_n: int, ab_dtype: torch.dtype): - num_utccp_aligned_elems = 128 - assert block_m % num_utccp_aligned_elems == 0 - return { - torch.bfloat16: (0, 0), - torch.float8_e4m3fn: (align(block_m, num_utccp_aligned_elems), align(block_n, num_utccp_aligned_elems)), - }[ab_dtype] - - -def is_tmem_size_legal(block_m: int, block_n: int, ab_dtype: torch.float): - # M waves or epilogue stages (* 2), SFA and SFB - sf_block_m, sf_block_n = 
get_sf_aligned_block_sizes(block_m, block_n, ab_dtype) - return ((2 * block_n) + (sf_block_m // 32) + (sf_block_n // 32)) <= 512 - - -def get_smem_config(block_m: int, block_n: int, block_k: int, - major_a: MajorTypeAB, major_b: MajorTypeAB, major_d: MajorTypeCD, - ab_dtype: torch.dtype, cd_dtype: torch.dtype, - num_stages: int, multicast_config: MulticastConfig) -> SharedMemoryConfig: - assert major_d == MajorTypeCD.NMajor - - ab_elem_size = get_element_size(ab_dtype) - cd_elem_size = get_element_size(cd_dtype) - - load_block_m = multicast_config.get_ab_load_block_m(block_m) - load_block_n = multicast_config.get_ab_load_block_n(block_n) - swizzle_a_mode = get_swizzle_mode(block_k if major_a == MajorTypeAB.KMajor else load_block_m, ab_elem_size) - swizzle_b_mode = get_swizzle_mode(block_k if major_b == MajorTypeAB.KMajor else load_block_n, ab_elem_size) - swizzle_cd_mode = get_swizzle_mode(block_n if major_d == MajorTypeCD.NMajor else block_m, cd_elem_size) - - # 2 stages of STSM and TMA store - # TODO: consider other layouts - layout_ad_m = 128 - smem_d = min(block_m, layout_ad_m) * swizzle_cd_mode * 2 - - # A/B shared memory - smem_a_per_stage = load_block_m * block_k * ab_elem_size - smem_b_per_stage = load_block_n * block_k * ab_elem_size - - # SF shared memory must be aligned to UTCCP - # Each stage must prefetch next 4 stages' SF (including the current) - sf_block_m, sf_block_n = get_sf_aligned_block_sizes(block_m, block_n, ab_dtype) - smem_scales_a_per_stage = sf_block_m * 4 - smem_scales_b_per_stage = sf_block_n * 4 - - # TODO: remove SF barriers for BF16 GEMMs - # TMA full/empty barriers, with-SF full barriers, tensor memory full/empty barriers, accumulation full barrier - # NOTES: some shapes may only have 1 epilogue stage, but we still allocate space for 2 stages - # NOTES: cases without accumulation will not use the accumulation full barrier - smem_barrier = num_stages * 8 * 3 + 2 * 8 * 2 + 8 - smem_tmem_ptr = 4 - - # Sum them up - smem_size = 0 - 
smem_size += smem_d - smem_size += num_stages * smem_a_per_stage - smem_size += num_stages * smem_b_per_stage - smem_size += num_stages * smem_scales_a_per_stage - smem_size += num_stages * smem_scales_b_per_stage - smem_size += smem_barrier - smem_size += smem_tmem_ptr - - return SharedMemoryConfig(smem_size, swizzle_a_mode, swizzle_b_mode, swizzle_cd_mode) - - -@functools.lru_cache(maxsize=None) -def get_best_configs(gemm_type: GemmType, - m: int, n: int, k: int, num_groups: int, - major_a: MajorTypeAB, major_b: MajorTypeAB, major_d: MajorTypeCD, - ab_dtype: torch.dtype, cd_dtype: torch.dtype, - num_sms: int) -> \ - Tuple[int, int, int, int, int, MulticastConfig, SharedMemoryConfig]: - assert ab_dtype == torch.float8_e4m3fn - assert cd_dtype in (torch.bfloat16, torch.float) - - # `BLOCK_M` and `BLOCK_N` are selected according to MMA instructions - if gemm_type == GemmType.GroupedContiguous: - block_ms = (get_m_alignment_for_contiguous_layout(), ) - else: - block_ms = (128, ) if major_b == MajorTypeAB.KMajor else (128, 256) - # NOTES: some `% 32 == 16` cases are not compatible with 2-CTA TMA swizzling - block_ns = tuple(range(16, 257, 16)) if major_b == MajorTypeAB.KMajor else tuple(range(32, 257, 32)) - - # `BLOCK_K` is selected in a fixed manner - block_k = 128 // get_element_size(ab_dtype) - - fix_wave_saturate = lambda x: num_sms if x == 0 else x - get_num_waves = lambda bm, bn: (ceil_div(ceil_div(m, bm) * ceil_div(n, bn) * num_groups, num_sms) if bm else None) - get_last_wave_util = lambda bm, bn: fix_wave_saturate((ceil_div(m, bm) * ceil_div(n, bn) * num_groups) % num_sms) - - # Decide block sizes by waves - # TODO: move block size search into `common.py` - best_block_m, best_block_n = None, None - for block_m in block_ms: - for block_n in block_ns: - success = False - num_waves, best_num_waves = get_num_waves(block_m, block_n), get_num_waves(best_block_m, best_block_n) - if best_block_m is None or best_block_n is None: - success = True - elif num_waves < 
best_num_waves: - success = True - elif num_waves == best_num_waves: - # Check last wave utilization - util = get_last_wave_util(block_m, block_n) - best_util = get_last_wave_util(best_block_m, best_block_n) - success = util > best_util - if util == best_util: - # Case 1: same `block_m`, smaller `block_n` (wasted) - success |= block_m == best_block_m and block_n < best_block_n - # Case 2: same `block_n`, smaller `block_m` (wasted) - success |= block_n == best_block_n and block_m < best_block_m - # Case 3: different for both `block_m` and `block_n`, larger `block_n` is better - success |= block_m != best_block_m and block_n > best_block_n - success &= is_tmem_size_legal(block_m, block_n, ab_dtype) - best_block_m, best_block_n = (block_m, block_n) if success else (best_block_m, best_block_n) - assert best_block_m is not None and best_block_n is not None - - # Decide the number of TMA multicasts and whether broadcast on A - best_multicast_config = MulticastConfig(1, True) - - # Try to multicast on the larger block side first - is_legal = { - # TODO: support other `tcgen05` layouts - 'A': False, - 'B': is_multicast_legal(m, best_block_m, 2, num_sms, True) and gemm_type == GemmType.Normal, - } - for i in ('A', 'B') if best_block_m > best_block_n else ('B', 'A'): - if m >= 512 and is_legal[i]: - best_multicast_config = MulticastConfig(2, i == 'A') - break - - # Always pick the longest one - # NOTES: for double B scales, the best number of stages may be reduced - # TODO: move stage search into `common.py` - best_num_stages, best_smem_config, sm100_capacity = None, None, 232448 - stage_candidates = tuple(filter(lambda s: s <= max(k // 128, 1), (8, 7, 6, 5, 4, 3, 2, 1))) - for num_stages in stage_candidates: - best_smem_config = get_smem_config(best_block_m, best_block_n, block_k, - major_a, major_b, major_d, - ab_dtype, cd_dtype, - num_stages, best_multicast_config) - if best_smem_config.smem_size <= sm100_capacity: - best_num_stages = num_stages - break - assert 
best_smem_config is not None - assert best_num_stages is not None - - # Recompute the minimal number of SMs required - # NOTES: less L2 cache usage and less GPU frequency drop - # TODO: move min SM fix into `common.py` - num_waves = get_num_waves(best_block_m, best_block_n) - num_min_sms = ceil_div(ceil_div(m, best_block_m) * ceil_div(n, best_block_n) * num_groups, num_waves) - num_min_sms = ceil_div(num_min_sms, best_multicast_config.num_multicast) * best_multicast_config.num_multicast - assert num_min_sms <= num_sms - - return num_min_sms, best_block_m, best_block_n, block_k, best_num_stages, best_multicast_config, best_smem_config diff --git a/deep_gemm/jit_kernels/heuristics/sm90_heuristics.py b/deep_gemm/jit_kernels/heuristics/sm90_heuristics.py deleted file mode 100644 index e69de29b..00000000 diff --git a/deep_gemm/jit_kernels/impls/__init__.py b/deep_gemm/jit_kernels/impls/__init__.py deleted file mode 100644 index 203e3565..00000000 --- a/deep_gemm/jit_kernels/impls/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from . 
import ( - sm90_bf16_gemm, - sm100_bf16_gemm, - sm90_fp8_gemm_1d1d, - sm90_fp8_gemm_1d2d, - sm100_fp8_gemm_1d1d, -) diff --git a/deep_gemm/jit_kernels/impls/sm100_bf16_gemm.py b/deep_gemm/jit_kernels/impls/sm100_bf16_gemm.py deleted file mode 100644 index e69de29b..00000000 diff --git a/deep_gemm/jit_kernels/impls/sm100_fp8_gemm_1d1d.py b/deep_gemm/jit_kernels/impls/sm100_fp8_gemm_1d1d.py deleted file mode 100644 index ca59fac6..00000000 --- a/deep_gemm/jit_kernels/impls/sm100_fp8_gemm_1d1d.py +++ /dev/null @@ -1,339 +0,0 @@ -import ctypes -import os -import torch -import cuda.bindings.driver as cbd -from typing import Any, Dict, Optional - -from ..runtime import ( - make_tma_a_desc, make_tma_b_desc, - make_tma_cd_desc, make_tma_sf_desc -) -from ..heuristics.sm100_heuristics import get_best_configs -from ...config import get_num_sms -from ...jit import Runtime, build, pytypes_to_ctypes -from ...utils.math import align, ceil_div -from ...utils.layout import GemmType, MajorTypeAB, MajorTypeCD - - -class SM100FP8GemmRuntime(Runtime): - def __init__(self, path: str) -> None: - super().__init__(path) - - @staticmethod - def generate(kwargs: Dict[str, Any]) -> str: - assert kwargs['CD_DTYPE_T'] in (torch.bfloat16, torch.float) - code = f''' -#ifdef __CUDACC_RTC__ -#include -#else -#include -#include -#endif - -#include - -using namespace deep_gemm; - -static void __instantiate_kernel() {{ - auto ptr = reinterpret_cast(&sm100_fp8_gemm_1d1d_impl< - {kwargs['MAJOR_A']}, - {kwargs['MAJOR_B']}, - {kwargs['M'] if 'm' in kwargs['COMPILED_DIMS'] else 0}, - {kwargs['N'] if 'n' in kwargs['COMPILED_DIMS'] else 0}, - {kwargs['K'] if 'k' in kwargs['COMPILED_DIMS'] else 0}, - {kwargs['BLOCK_M']}, - {kwargs['BLOCK_N']}, - {kwargs['BLOCK_K']}, - {kwargs['NUM_GROUPS']}, - {kwargs['SWIZZLE_A_MODE']}, - {kwargs['SWIZZLE_B_MODE']}, - {kwargs['SWIZZLE_CD_MODE']}, - {kwargs['NUM_STAGES']}, - {kwargs['NUM_LAST_STAGES']}, - {kwargs['NUM_NON_EPILOGUE_THREADS']}, - 
{kwargs['NUM_EPILOGUE_THREADS']}, - {kwargs['NUM_MULTICAST']}, - {pytypes_to_ctypes[kwargs['IS_MULTICAST_ON_A']]}, - {kwargs['GEMM_TYPE']}, - {pytypes_to_ctypes[kwargs['WITH_ACCUMULATION']]}, - {pytypes_to_ctypes[kwargs['CD_DTYPE_T']]} - >); -}}; -''' - if int(os.getenv('DG_JIT_DEBUG', 0)): - print(f'Generated FP8 GEMM code:\n{code}') - return code - - # noinspection PyMethodOverriding - @staticmethod - def launch(kernel: cbd.CUkernel, kwargs: Dict[str, Any]) -> cbd.CUresult: - result = cbd.cuKernelSetAttribute(cbd.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - kwargs['SMEM_SIZE'], kernel, cbd.CUdevice(kwargs['DEVICE_INDEX']))[0] - assert result == cbd.CUresult.CUDA_SUCCESS, f'Failed to set max dynamic shared memory size: {result}' - - attr_val = cbd.CUlaunchAttributeValue() - attr_val.clusterDim.x = kwargs['NUM_MULTICAST'] - attr_val.clusterDim.y = 1 - attr_val.clusterDim.z = 1 - attr = cbd.CUlaunchAttribute() - attr.id = cbd.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION - attr.value = attr_val - - config = cbd.CUlaunchConfig() - config.numAttrs = 1 - config.attrs = [attr] - config.gridDimX = kwargs['NUM_SMS'] - config.gridDimY = 1 - config.gridDimZ = 1 - config.blockDimX = kwargs['NUM_NON_EPILOGUE_THREADS'] + kwargs['NUM_EPILOGUE_THREADS'] - config.blockDimY = 1 - config.blockDimZ = 1 - config.sharedMemBytes = kwargs['SMEM_SIZE'] - config.hStream = kwargs['STREAM'] - - arg_values = ( - kwargs['GROUPED_LAYOUT'].data_ptr(), - kwargs['M'], - kwargs['N'], - kwargs['K'], - kwargs['TENSOR_MAP_A'], - kwargs['TENSOR_MAP_B'], - kwargs['TENSOR_MAP_SFA'], - kwargs['TENSOR_MAP_SFB'], - kwargs['TENSOR_MAP_C'], - kwargs['TENSOR_MAP_D'], - ) - arg_types = ( - ctypes.c_void_p, - ctypes.c_uint32, - ctypes.c_uint32, - ctypes.c_uint32, - None, None, None, None, None, None - ) - return cbd.cuLaunchKernelEx(config, kernel, (arg_values, arg_types), 0) - - -def fp8_gemm_nt(a: torch.Tensor, sfa: torch.Tensor, - b: torch.Tensor, sfb: torch.Tensor, - 
c: Optional[torch.Tensor], d: torch.Tensor, - major_a: MajorTypeAB, major_b: MajorTypeAB, - major_cd: MajorTypeCD, - compiled_dims: str) -> None: - m, k = a.shape - n, _ = b.shape - assert major_cd == MajorTypeCD.NMajor - - # K must be aligned to 128 - aligned_k = align(k, 128) - - num_sms = get_num_sms() - num_sms, block_m, block_n, block_k, num_stages, multicast_config, smem_config = get_best_configs( - GemmType.Normal, m, n, k, 1, major_a, major_b, major_cd, torch.float8_e4m3fn, d.dtype, num_sms) - - num_groups = 1 - tensor_map_a = make_tma_a_desc(major_a, a, m, k, - multicast_config.get_ab_load_block_m(block_m), block_k, - a.stride(major_a.non_contiguous_dim()), num_groups, - smem_config.swizzle_a_mode) - tensor_map_b = make_tma_b_desc(major_b, b, n, k, - multicast_config.get_ab_load_block_n(block_n), block_k, - b.stride(major_b.non_contiguous_dim()), num_groups, - smem_config.swizzle_b_mode) - tensor_map_d = make_tma_cd_desc(major_cd, d, m, n, - block_m, block_n, - d.stride(major_cd.non_contiguous_dim()), num_groups, - smem_config.swizzle_cd_mode) - tensor_map_c = make_tma_cd_desc(major_cd, c, m, n, - block_m, block_n, - c.stride(major_cd.non_contiguous_dim()), num_groups, - smem_config.swizzle_cd_mode) if c is not None else tensor_map_d - tensor_map_sfa = make_tma_sf_desc(MajorTypeAB.MNMajor, sfa, m, k, block_m, block_k, num_groups, smem_config.swizzle_sf_mode) - tensor_map_sfb = make_tma_sf_desc(MajorTypeAB.MNMajor, sfb, n, k, block_n, block_k, num_groups, smem_config.swizzle_sf_mode) - - kwargs = { - # Templated or runtime arguments according to the `COMPILED_DIMS` - 'COMPILED_DIMS': compiled_dims, - 'M': m, 'N': n, 'K': aligned_k, - # Templated arguments - 'GEMM_TYPE': GemmType.Normal, - 'NUM_NON_EPILOGUE_THREADS': 128, - 'NUM_EPILOGUE_THREADS': 128, - 'MAJOR_A': major_a, - 'MAJOR_B': major_b, - 'NUM_GROUPS': 1, - 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, - 'NUM_STAGES': num_stages, 'NUM_LAST_STAGES': ceil_div(k, block_k) % num_stages, - 
'SWIZZLE_A_MODE': smem_config.swizzle_a_mode, - 'SWIZZLE_B_MODE': smem_config.swizzle_b_mode, - 'SWIZZLE_CD_MODE': smem_config.swizzle_cd_mode, - 'NUM_MULTICAST': multicast_config.num_multicast, - 'IS_MULTICAST_ON_A': multicast_config.is_multicast_on_a, - 'WITH_ACCUMULATION': c is not None, - 'CD_DTYPE_T': d.dtype, - # Runtime arguments - 'GROUPED_LAYOUT': torch.empty(0, dtype=torch.int32, device=d.device), - 'NUM_SMS': num_sms, - 'SMEM_SIZE': smem_config.smem_size, - 'TENSOR_MAP_A': tensor_map_a, - 'TENSOR_MAP_B': tensor_map_b, - 'TENSOR_MAP_SFA': tensor_map_sfa, - 'TENSOR_MAP_SFB': tensor_map_sfb, - 'TENSOR_MAP_C': tensor_map_c, - 'TENSOR_MAP_D': tensor_map_d, - 'STREAM': torch.cuda.current_stream().cuda_stream, - 'DEVICE_INDEX': d.device.index - } - - # Generate, build and run the kernel - code = SM100FP8GemmRuntime.generate(kwargs) - runtime = build('fp8_gemm', code, SM100FP8GemmRuntime, kwargs) - runtime(**kwargs) - - -def m_grouped_fp8_gemm_nt_contiguous(a: torch.Tensor, sfa: torch.Tensor, - b: torch.Tensor, sfb: torch.Tensor, - d: torch.Tensor, - m_indices: torch.Tensor, - major_a: MajorTypeAB, major_b: MajorTypeAB, - compiled_dims: str) -> None: - m, k = a.shape - num_groups, n, _ = b.shape - major_d = MajorTypeCD.NMajor - - # K must be aligned to 128 - aligned_k = align(k, 128) - - # Auto-tuning with compilation - num_sms = get_num_sms() - num_sms, block_m, block_n, block_k, num_stages, multicast_config, smem_config = get_best_configs( - GemmType.GroupedContiguous, m, n, k, num_groups, major_a, major_b, major_d, torch.float8_e4m3fn, d.dtype, num_sms) - - # NOTES: you cannot distinguish groups for A, SFA, and D - tensor_map_a = make_tma_a_desc(major_a, a, m, k, - multicast_config.get_ab_load_block_m(block_m), block_k, - a.stride(major_a.non_contiguous_dim()), num_groups=1, - swizzle_mode=smem_config.swizzle_a_mode) - tensor_map_b = make_tma_b_desc(major_b, b, n, k, - multicast_config.get_ab_load_block_n(block_n), block_k, - 
b.stride(major_b.non_contiguous_dim()), num_groups=num_groups, - swizzle_mode=smem_config.swizzle_b_mode) - tensor_map_d = make_tma_cd_desc(major_d, d, m, n, - block_m, block_n, - d.stride(major_d.non_contiguous_dim()), num_groups=1, - swizzle_mode=smem_config.swizzle_cd_mode) - tensor_map_sfa = make_tma_sf_desc(MajorTypeAB.MNMajor, sfa, m, k, block_m, block_k, num_groups=1, swizzle_mode=smem_config.swizzle_sf_mode) - tensor_map_sfb = make_tma_sf_desc(MajorTypeAB.MNMajor, sfb, n, k, block_n, block_k, num_groups=num_groups, swizzle_mode=smem_config.swizzle_sf_mode) - - kwargs = { - # Templated or runtime arguments according to the `COMPILED_DIMS` - 'COMPILED_DIMS': compiled_dims, - 'M': m, 'N': n, 'K': aligned_k, - # Templated arguments - 'GEMM_TYPE': GemmType.GroupedContiguous, - 'NUM_NON_EPILOGUE_THREADS': 128, - 'NUM_EPILOGUE_THREADS': 128, - 'MAJOR_A': major_a, - 'MAJOR_B': major_b, - 'NUM_GROUPS': num_groups, - 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, - 'NUM_STAGES': num_stages, 'NUM_LAST_STAGES': ceil_div(k, block_k) % num_stages, - 'SWIZZLE_A_MODE': smem_config.swizzle_a_mode, - 'SWIZZLE_B_MODE': smem_config.swizzle_b_mode, - 'SWIZZLE_CD_MODE': smem_config.swizzle_cd_mode, - 'NUM_MULTICAST': multicast_config.num_multicast, - 'IS_MULTICAST_ON_A': multicast_config.is_multicast_on_a, - 'WITH_ACCUMULATION': False, - 'CD_DTYPE_T': d.dtype, - # Runtime arguments - 'GROUPED_LAYOUT': m_indices, - 'NUM_SMS': num_sms, - 'SMEM_SIZE': smem_config.smem_size, - 'TENSOR_MAP_A': tensor_map_a, - 'TENSOR_MAP_B': tensor_map_b, - 'TENSOR_MAP_SFA': tensor_map_sfa, - 'TENSOR_MAP_SFB': tensor_map_sfb, - 'TENSOR_MAP_C': tensor_map_d, - 'TENSOR_MAP_D': tensor_map_d, - 'STREAM': torch.cuda.current_stream().cuda_stream, - 'DEVICE_INDEX': d.device.index - } - - # Generate, build and run the kernel - code = SM100FP8GemmRuntime.generate(kwargs) - runtime = build('fp8_m_grouped_gemm', code, SM100FP8GemmRuntime, kwargs) - runtime(**kwargs) - - -def 
fp8_m_grouped_gemm_nt_masked(a: torch.Tensor, sfa: torch.Tensor, - b: torch.Tensor, sfb: torch.Tensor, - d: torch.Tensor, - masked_m: torch.Tensor, - expected_m: int, - major_a: MajorTypeAB, major_b: MajorTypeAB, - compiled_dims: str) -> None: - num_groups, m, k = a.shape - _, n, _ = b.shape - major_d = MajorTypeCD.NMajor - - # K must be aligned to 128 - aligned_k = align(k, 128) - - num_sms = get_num_sms() - num_sms, block_m, block_n, block_k, num_stages, multicast_config, smem_config = get_best_configs( - GemmType.GroupedMasked, expected_m, n, k, num_groups, major_a, major_b, major_d, torch.float8_e4m3fn, d.dtype, num_sms) - if num_groups > 1: - assert m % block_m == 0 - - tensor_map_a = make_tma_a_desc(major_a, a, m, k, - multicast_config.get_ab_load_block_m(block_m), block_k, - a.stride(major_a.non_contiguous_dim()), num_groups, - smem_config.swizzle_a_mode) - tensor_map_b = make_tma_b_desc(major_b, b, n, k, - multicast_config.get_ab_load_block_n(block_n), block_k, - b.stride(major_b.non_contiguous_dim()), num_groups, - smem_config.swizzle_b_mode) - tensor_map_d = make_tma_cd_desc(major_d, d, m, n, - block_m, block_n, - d.stride(major_d.non_contiguous_dim()), num_groups, - smem_config.swizzle_cd_mode) - tensor_map_sfa = make_tma_sf_desc(MajorTypeAB.MNMajor, sfa, m, k, block_m, block_k, num_groups, smem_config.swizzle_sf_mode) - tensor_map_sfb = make_tma_sf_desc(MajorTypeAB.MNMajor, sfb, n, k, block_n, block_k, num_groups, smem_config.swizzle_sf_mode) - - kwargs = { - # Templated or runtime arguments according to the `COMPILED_DIMS` - 'COMPILED_DIMS': compiled_dims, - 'M': m, 'N': n, 'K': aligned_k, - # Templated arguments - 'GEMM_TYPE': GemmType.GroupedMasked, - 'NUM_NON_EPILOGUE_THREADS': 128, - 'NUM_EPILOGUE_THREADS': 128, - 'MAJOR_A': major_a, - 'MAJOR_B': major_b, - 'NUM_GROUPS': num_groups, - 'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, - 'NUM_STAGES': num_stages, 'NUM_LAST_STAGES': ceil_div(k, block_k) % num_stages, - 'SWIZZLE_A_MODE': 
smem_config.swizzle_a_mode, - 'SWIZZLE_B_MODE': smem_config.swizzle_b_mode, - 'SWIZZLE_CD_MODE': smem_config.swizzle_cd_mode, - 'NUM_MULTICAST': multicast_config.num_multicast, - 'IS_MULTICAST_ON_A': multicast_config.is_multicast_on_a, - 'WITH_ACCUMULATION': False, - 'CD_DTYPE_T': d.dtype, - # Runtime arguments - 'GROUPED_LAYOUT': masked_m, - 'NUM_SMS': num_sms, - 'SMEM_SIZE': smem_config.smem_size, - 'TENSOR_MAP_A': tensor_map_a, - 'TENSOR_MAP_B': tensor_map_b, - 'TENSOR_MAP_SFA': tensor_map_sfa, - 'TENSOR_MAP_SFB': tensor_map_sfb, - 'TENSOR_MAP_C': tensor_map_d, - 'TENSOR_MAP_D': tensor_map_d, - 'STREAM': torch.cuda.current_stream().cuda_stream, - 'DEVICE_INDEX': d.device.index - } - - # Generate, build and run the kernel - code = SM100FP8GemmRuntime.generate(kwargs) - runtime = build('fp8_m_grouped_gemm', code, SM100FP8GemmRuntime, kwargs) - runtime(**kwargs) diff --git a/deep_gemm/jit_kernels/impls/sm90_bf16_gemm.py b/deep_gemm/jit_kernels/impls/sm90_bf16_gemm.py deleted file mode 100644 index e69de29b..00000000 diff --git a/deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d1d.py b/deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d1d.py deleted file mode 100644 index e69de29b..00000000 diff --git a/deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d2d.py b/deep_gemm/jit_kernels/impls/sm90_fp8_gemm_1d2d.py deleted file mode 100644 index e69de29b..00000000 diff --git a/deep_gemm/jit_kernels/runtime.py b/deep_gemm/jit_kernels/runtime.py deleted file mode 100644 index 2254ef26..00000000 --- a/deep_gemm/jit_kernels/runtime.py +++ /dev/null @@ -1,149 +0,0 @@ -import torch -import cuda.bindings.driver as cbd -from typing import Any, Dict, Tuple - -from ..utils.math import ceil_div -from ..utils.layout import get_tma_aligned_size, GemmType, MajorTypeAB, MajorTypeCD - - -tmap_type_map: Dict[Any, str] = { - torch.int8: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT8, - torch.int16: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT16, - torch.int32: 
cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_INT32, - torch.int64: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_INT64, - torch.uint8: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT8, - torch.uint16: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT16, - torch.uint32: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT32, - torch.uint64: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT64, - torch.float32: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_FLOAT32, - torch.float16: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_FLOAT16, - torch.bfloat16: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, - torch.float8_e4m3fn: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT8, - torch.float8_e4m3fnuz: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT8, - torch.float8_e5m2: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT8, - torch.float8_e5m2fnuz: cbd.CUtensorMapDataType.CU_TENSOR_MAP_DATA_TYPE_UINT8, -} - -swizzle_type_map = { - 0: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE, - 16: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_NONE, - 32: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_32B, - 64: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_64B, - 128: cbd.CUtensorMapSwizzle.CU_TENSOR_MAP_SWIZZLE_128B, -} - -def make_tma_xd_desc(t: torch.Tensor, - gmem_dims: Tuple[cbd.cuuint64_t, ...], gmem_strides: Tuple[cbd.cuuint64_t, ...], - smem_dims: Tuple[cbd.cuuint32_t, ...], - swizzle_type: cbd.CUtensorMapSwizzle) -> cbd.CUtensorMap: - num_dims = len(gmem_dims) - assert len(gmem_strides) == num_dims - 1 - assert len(smem_dims) == num_dims - - tensor_dtype = tmap_type_map[t.dtype] - res, tensor_map = cbd.cuTensorMapEncodeTiled( - tensor_dtype, - num_dims, - t.data_ptr(), - gmem_dims, - gmem_strides, - smem_dims, - (cbd.cuuint32_t(1), ) * num_dims, - cbd.CUtensorMapInterleave.CU_TENSOR_MAP_INTERLEAVE_NONE, - swizzle_type, - cbd.CUtensorMapL2promotion.CU_TENSOR_MAP_L2_PROMOTION_L2_256B, - 
cbd.CUtensorMapFloatOOBfill.CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE, - ) - - if res != cbd.CUresult.CUDA_SUCCESS: - raise Exception(f'Failed to encode tensor map: {res}') - return tensor_map - - -def make_tma_2d_desc(t: torch.Tensor, - gmem_inner_dim: int, gmem_outer_dim: int, - smem_inner_dim: int, smem_outer_dim: int, - gmem_outer_stride: int, - swizzle_mode: int) -> cbd.CUtensorMap: - # For swizzling pattern, multiple TMAs are required - if swizzle_mode != 0: - assert swizzle_mode % t.element_size() == 0 - smem_inner_dim = swizzle_mode // t.element_size() - - gmem_dims = (cbd.cuuint64_t(gmem_inner_dim), cbd.cuuint64_t(gmem_outer_dim)) - gmem_strides = (cbd.cuuint64_t(gmem_outer_stride * t.element_size()), ) - smem_dims = (cbd.cuuint32_t(smem_inner_dim), cbd.cuuint32_t(smem_outer_dim)) - return make_tma_xd_desc(t, gmem_dims, gmem_strides, smem_dims, swizzle_type_map[swizzle_mode]) - - -def make_tma_a_desc(major_type: MajorTypeAB, - t: torch.Tensor, - shape_m: int, shape_k: int, - block_m: int, block_k: int, - outer_stride: int, - num_groups: int, - swizzle_mode: int) -> cbd.CUtensorMap: - if num_groups > 1: - assert major_type == MajorTypeAB.KMajor - return make_tma_2d_desc(t, - *(shape_k, shape_m * num_groups)[::major_type.shape_direction()], - *(block_k, block_m)[::major_type.shape_direction()], - outer_stride, - swizzle_mode) - - -def make_tma_b_desc(major_type: MajorTypeAB, - t: torch.Tensor, - shape_n: int, shape_k: int, - block_n: int, block_k: int, - outer_stride: int, - num_groups: int, - swizzle_mode: int) -> cbd.CUtensorMap: - # `num_groups` is always applied into the outer dimensions - io_shapes = (shape_k, shape_n)[::major_type.shape_direction()] - io_shapes = (io_shapes[0], io_shapes[1] * num_groups) - - return make_tma_2d_desc(t, - *io_shapes, - *(block_k, block_n)[::major_type.shape_direction()], - outer_stride, - swizzle_mode) - - -def make_tma_cd_desc(major_type: MajorTypeCD, - t: torch.Tensor, - shape_m: int, shape_n: int, - block_m: int, block_n: 
int, - outer_stride: int, - num_groups: int, - swizzle_mode: int) -> cbd.CUtensorMap: - assert major_type == MajorTypeCD.NMajor - - # Swizzling requires the inner box dim to be less or equal than `kSwizzleCDMode` - # bytes, so `BLOCK_N * sizeof(T) / kSwizzleCDMode` TMA stores are required - layout_ad_m = 128 - return make_tma_2d_desc(t, - shape_n, shape_m * num_groups, - block_n, min(block_m, layout_ad_m), - outer_stride, - swizzle_mode) - - -def make_tma_sf_desc(major_type: MajorTypeAB, - t: torch.Tensor, - shape_mn: int, shape_k: int, - block_mn: int, block_k: int, - num_groups: int, - swizzle_mode: int) -> cbd.CUtensorMap: - assert major_type == MajorTypeAB.MNMajor - - # TODO: maybe swizzle SF as well - assert swizzle_mode == 0 - - # Make TMA aligned to 16 bytes - shape_mn = get_tma_aligned_size(shape_mn, t.element_size()) - return make_tma_2d_desc(t, - shape_mn, ceil_div(shape_k, block_k * 4) * num_groups, - block_mn, 1, - shape_mn, - swizzle_mode) diff --git a/deep_gemm/testing/__init__.py b/deep_gemm/testing/__init__.py index 8abc1d91..2537dbf1 100644 --- a/deep_gemm/testing/__init__.py +++ b/deep_gemm/testing/__init__.py @@ -1 +1,3 @@ from . 
import bench, numeric +from .bench import * +from .numeric import * diff --git a/deep_gemm/testing/bench.py b/deep_gemm/testing/bench.py index 5be63f9d..7e77866d 100644 --- a/deep_gemm/testing/bench.py +++ b/deep_gemm/testing/bench.py @@ -29,7 +29,7 @@ def bench(fn, num_warmups: int = 5, num_tests: int = 10, end_event.record() torch.cuda.synchronize() - return start_event.elapsed_time(end_event) / num_tests + return start_event.elapsed_time(end_event) / num_tests / 1e3 class empty_suppress: diff --git a/deep_gemm/testing/numeric.py b/deep_gemm/testing/numeric.py index b7a026ab..d06a03b9 100644 --- a/deep_gemm/testing/numeric.py +++ b/deep_gemm/testing/numeric.py @@ -9,11 +9,11 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor): return 1 - sim -def count_bytes(tensors: Iterable[torch.Tensor]): +def count_bytes(*tensors): total = 0 for t in tensors: - if isinstance(t, tuple) or isinstance(t, list): - total += count_bytes(t) + if isinstance(t, (tuple, list)): + total += count_bytes(*t) elif t is not None: total += t.numel() * t.element_size() return total diff --git a/deep_gemm/utils/__init__.py b/deep_gemm/utils/__init__.py index e0654f06..e8f859a2 100644 --- a/deep_gemm/utils/__init__.py +++ b/deep_gemm/utils/__init__.py @@ -1 +1,3 @@ -from . import layout, math +from . 
import math, layout +from .layout import * +from .math import * diff --git a/deep_gemm/utils/layout.py b/deep_gemm/utils/layout.py index 5536da1b..ac8c070b 100644 --- a/deep_gemm/utils/layout.py +++ b/deep_gemm/utils/layout.py @@ -1,175 +1,11 @@ -import enum -import torch -from typing import Tuple, Optional - -from .math import align, ceil_div -from ..jit.compiler import get_device_arch - - -class GemmType(enum.Enum): - Normal = 0 - GroupedContiguous = 1 - GroupedMasked = 2 - - def __str__(self) -> str: - return { - 0: 'GemmType::Normal', - 1: 'GemmType::GroupedContiguous', - 2: 'GemmType::GroupedMasked', - }[self.value] - - -class MajorTypeAB(enum.Enum): - KMajor = 0 - MNMajor = 1 - - def shape_direction(self): - return 1 if self.value == 0 else -1 - - def non_contiguous_dim(self): - return -2 if self.value == 0 else -1 - - def __str__(self) -> str: - return { - 0: 'cute::UMMA::Major::K', - 1: 'cute::UMMA::Major::MN' - }[self.value] - - -class MajorTypeCD(enum.Enum): - NMajor = 0 - MMajor = 1 - - def non_contiguous_dim(self): - return -2 if self.value == 0 else -1 - - -def major_check(t: torch.Tensor): - assert t.dim() in (2, 3) - if t.dim() == 3: - assert t.stride(0) == t.size(-2) * t.size(-1), 'Grouped dimension cannot have abnormal stride' - assert t.stride(-2) == 1 or t.stride(-1) == 1 - - -def get_major_type_ab(t: torch.Tensor): - major_check(t) - return MajorTypeAB.KMajor if t.stride(-1) == 1 else MajorTypeAB.MNMajor - - -def get_major_type_cd(t: torch.Tensor): - major_check(t) - return MajorTypeCD.NMajor if t.stride(-1) == 1 else MajorTypeCD.MMajor - - -def get_element_size(dtype: torch.dtype): - return { - torch.float8_e4m3fn: 1, - torch.bfloat16: 2, - torch.float: 4, - }[dtype] - - -def get_m_alignment_for_contiguous_layout(): - return 128 - - -def get_tma_aligned_size(x: int, element_size: int) -> int: - tma_alignment_bytes = 16 - assert tma_alignment_bytes % element_size == 0 - alignment = tma_alignment_bytes // element_size - return align(x, alignment) 
- - -def get_col_major_tma_aligned_packed_tensor(x: torch.Tensor) -> torch.Tensor: - # NOTES: for the extreme performance, you may rewrite/fuse this function in CUDA - assert x.dtype == torch.float and x.dim() in (2, 3) - - # First, convert into UE8M0 `uint8_t` - ue8m0_tensor = (x.view(torch.int) >> 23).to(torch.uint8) - - # Second, make padded packed tensors - mn, k = x.shape[-2], x.shape[-1] - remove_dim = False - if x.dim() == 2: - x, remove_dim = x.unsqueeze(0), True - b = x.shape[0] - aligned_mn = get_tma_aligned_size(mn, 4) - aligned_k = align(k, 4) - padded = torch.zeros((b, aligned_mn, aligned_k), device=x.device, dtype=torch.uint8) - padded[:, :mn, :k] = ue8m0_tensor - padded = padded.view(-1).view(dtype=torch.int).view(b, aligned_mn, aligned_k // 4) - - # Finally, transpose - transposed = torch.transpose(torch.empty((b, aligned_k // 4, aligned_mn), device=x.device, dtype=torch.int), 1, 2) - transposed[:, :, :] = padded - aligned_x = transposed[:, :mn, :] - return aligned_x.squeeze(0) if remove_dim else aligned_x - - -def check_sf_layout(sf: torch.Tensor, - mn: int, k: int, gran: Tuple[int, int], - num_groups: Optional[int], - tma_stride_check: bool = False, - type_check: Optional[torch.dtype] = None) -> torch.Tensor: - # Type check - if type_check is not None: - assert sf.dtype == type_check - - # Always do shape checks - assert sf.dtype in (torch.float, torch.int) - assert sf.dim() == int(num_groups is not None) + 2 - if num_groups is not None: - assert sf.size(-3) == num_groups - assert sf.size(-2) == ceil_div(mn, gran[0]) - assert sf.size(-1) == ceil_div(k, gran[1] * (1 if sf.dtype == torch.float else 4)) - - # TMA stride checks: TMA aligned and MN-major - if tma_stride_check: - if num_groups is not None: - assert sf.stride(-3) == sf.stride(-1) * sf.size(-1) - assert sf.stride(-2) == 1 - assert sf.stride(-1) == get_tma_aligned_size(mn, sf.element_size()) - - return sf - - -def transform_sf_into_required_layout(sf: torch.Tensor, - mn: int, k: int, - 
recipe: Tuple[int, int, int], - num_groups: Optional[int] = None, - is_sfa: bool = False): - gran = (recipe[0 if is_sfa else 1], recipe[2]) - - should_skip_transform = ( - (sf.dtype == torch.int and gran == (1, 128) and get_device_arch() == '100a') - or (sf.dtype == torch.int and gran == (128, 128) and get_device_arch() == '100a') - ) - - if not should_skip_transform: - # Pre-transform checks - check_sf_layout(sf, mn=mn, k=k, gran=gran, num_groups=num_groups) - - # (FP32, 1, 128) on Hopper: transform to TMA-aligned and MN-major - if sf.dtype == torch.float and gran == (1, 128) and get_device_arch() == '90a': - raise NotImplemented - - # (FP32, 1, 128) on SM100: transform to (INT, 1, 128), TMA-aligned and MN-major - if sf.dtype == torch.float and gran == (1, 128) and get_device_arch() == '100a': - sf = get_col_major_tma_aligned_packed_tensor(sf) - return check_sf_layout(sf, mn=mn, k=k, gran=(1, 128), num_groups=num_groups, tma_stride_check=True, type_check=torch.int) - - # (FP32, 128, 128) on Hopper: no need to transform, check shape and whatever-major - if sf.dtype == torch.float and gran == (128, 128) and get_device_arch() == '90a': - raise NotImplemented - - # (FP32, 128, 128) on SM100: transform to (INT, 1, 128), TMA-aligned and MN-major - if sf.dtype == torch.float and gran == (128, 128) and get_device_arch() == '100a': - sf = sf.index_select(-2, torch.arange(mn, device=sf.device) // 128) - sf = get_col_major_tma_aligned_packed_tensor(sf) - return check_sf_layout(sf, mn=mn, k=k, gran=(1, 128), num_groups=num_groups, tma_stride_check=True, type_check=torch.int) - - if should_skip_transform: - # TODO: add transpose kernel if SF layout is not satisfied - return check_sf_layout(sf, mn=mn, k=k, gran=(1, 128), num_groups=num_groups, tma_stride_check=True, type_check=torch.int) - - assert False, f'Unknown cases: {sf.dtype=}, {gran=}, arch={get_device_arch()}' +from deep_gemm_cpp import ( + get_tma_aligned_size, + get_mk_alignment_for_contiguous_layout, + 
get_mn_major_tma_aligned_tensor, + get_mn_major_tma_aligned_packed_ue8m0_tensor, + get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor +) + +# Some alias +get_m_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout +get_k_alignment_for_contiguous_layout = get_mk_alignment_for_contiguous_layout diff --git a/deep_gemm/utils/math.py b/deep_gemm/utils/math.py index 02684f81..884a7112 100644 --- a/deep_gemm/utils/math.py +++ b/deep_gemm/utils/math.py @@ -3,16 +3,6 @@ def ceil_div(x: int, y: int) -> int: - """ - Perform ceiling division of two integers. - - Args: - x: the dividend. - y: the divisor. - - Returns: - The result of the ceiling division. - """ return (x + y - 1) // y @@ -25,22 +15,34 @@ def ceil_to_ue8m0(x: torch.Tensor): return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) -def per_token_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: +def per_token_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 and x.size(1) % 128 == 0 m, n = x.shape x_view = x.view(m, -1, 128) x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4) - sf = ceil_to_ue8m0(x_amax / 448.0) + sf = x_amax / 448.0 + sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf -def per_block_cast_to_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: +def per_channel_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: + assert x.dim() == 2 and x.size(0) % 128 == 0 + m, n = x.shape + x_view = x.view(-1, 128, n) + x_amax = x_view.abs().float().amax(dim=1).view(-1, n).clamp(1e-4) + sf = x_amax / 448.0 + sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf + return (x_view * (1.0 / sf.unsqueeze(1))).to(torch.float8_e4m3fn).view(m, n), sf + + +def per_block_cast_to_fp8(x: torch.Tensor, use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: assert x.dim() == 2 m, n = x.shape x_padded = 
torch.zeros((align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device) x_padded[:m, :n] = x x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128) x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4) - sf = ceil_to_ue8m0(x_amax / 448.0) + sf = x_amax / 448.0 + sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn) return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(x_view.size(0), x_view.size(2)) diff --git a/develop.sh b/develop.sh new file mode 100755 index 00000000..58798613 --- /dev/null +++ b/develop.sh @@ -0,0 +1,25 @@ +# Change current directory into project root +original_dir=$(pwd) +script_dir=$(realpath "$(dirname "$0")") +cd "$script_dir" + +# Link CUTLASS includes +ln -sf $script_dir/third-party/cutlass/include/cutlass deep_gemm/include +ln -sf $script_dir/third-party/cutlass/include/cute deep_gemm/include + +# Remove old dist file, build, and build +rm -rf build dist +rm -rf *.egg-info +python setup.py build + +# Find the .so file in build directory and create symlink in current directory +so_file=$(find build -name "*.so" -type f | head -n 1) +if [ -n "$so_file" ]; then + ln -sf "$so_file" . 
+else + echo "Error: No SO file found in build directory" >&2 + exit 1 +fi + +# Open users' original directory +cd "$original_dir" diff --git a/figures/design.png b/figures/design.png deleted file mode 100644 index b3761d60e586c262906719daa4d8d6803884f13d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 571421 zcmeFZd010t*FM_5ty-l@ts){~+KQkeA~MR9I-nu~A_7W)sDMmjMCK{GO4X_i$|z%0 z1Y}BN2s7+D5CnwC5F%rQ0D(k^A%sjjd!KFJ@B4n&>Gz)NI-Wnz)#s0pKsI}?^{jQT zd#z_Z&z)2D$JVadvgWI=zFKQ@{K)CAzWP4xtFOLU{%$4u8>K}p)>mKs^p(w#gJ&Xh zKaNG@?r_dS5OK*mgE7mmA6~z%w)oS5f^RRZuvSfb^lgNX`^F8=@ZtJE8R(Kbhn{S}pm~0YrhHK>Ld=q!+e0_*2H26=yqMGxm?`v4b`kH|+fy^MwOA zt#|@$x(BAzz1J;koja{p^7Dr7KhAZ$2EN)c{`HqWoFtX29(cHS_rD6%I5q1$~tf4j6YNbPLz%A%PMy-%^c&EUnnC6=!^9#!#ZB{ zZeav9m-wFT@XoM#QFlJuZ42ei(M6rxUpSk|*gHf~T*_5-SNA=fPO<8-bmI+|u@|zw zSX)3a_?>D7ux`h*KkyIJFMFIk7v!Y-JglS0bMP`j-3n51UiYPn+Yi;KFa|PU;b>Zct4}!qJ8|4eFx_|O@b14Wezp&%fYYs|L{cSgTtG!XR39a z6Vl85acwS2i=C$P?O!^8uc4Q()n%!>bl9!IDdjVpHg`yD^w#`v-#|gf4Xm^_o zhPzjGc&q5$;TZCHt_#+U!M4(Qok{i=4rE&KQbmn?2DVq*P~lUmI^MW&vvNFpJdj^l zV*KZqib=8nzq?mMOw3LUsvecuR+NI#eH(KpwIYN1C%;rifp)J+?YXACJ#HNjXX++Z z+=|WzD=ytg+&}V#BC^@wRpJEwDzEX-jhEg$mCK@6M^~&2FVY{B?@}iuez60owN}N{v>V$a_8+Y)hs*dp^8HWh!dxdV%_UP0|AW)8Se&ox#2c<^SsuKC{eUv8pH_ZHKG zcD>aKIjQJ(pFRm zA1co8I0w3kSV#23*4K{j%P$vKySJ^p_cG|ymx`**)JsVTl2&;dmHu9Gp6zNbVWjn5 z_R_wx)#?ie{8U17Q{Su=t>Q4K5|Glsuq~506nhW)>8X9GodNqHuF?+wuh}o)6NBFZ zTMH|0ZP_ZWNv#HJZ@#ZqW)}$OAC%JdHe*eW!RRwYN(x9x= zirqY-@1`C6{k$)RK=2cB{o6f2B>z>Eah6j{xQgxW&2jzKMMI{Zs$cv(r=cgF9Y^lo z!Jj3V=a98@at5@^sjpJkOnfP5Ib;*cMP-$;)UcaFRr|F(FYd^SuJ9wCdMV8QVuK=) z@K0F*HkZzT72wm|f1c1HX9$SYxXd=4e>m5Fn&E#q*FT)=e}VQtoa>7q>>tkc|1vgW z|KVK!aIXIpg)o?ZIM;vH%m4kiz(1Vpi&B<<>Rex8yl8Gowq;CsL+e~AQC+kY%KKn~DZh=0HchW!#4B4o3aTA1t=!$T@2Db9 z(FiSe8CjTR*8qNu=T^{!a$vU+^I2U8B3`!djTskivs|7nu#ubB7VyYT_PEQH*5?lZ z=gRL3eBmEc-0mn;v5P#z(P`sg=c`0G45mEu2CHQgb> zQ3KNLv5L?NEFr!~@7^LZp6{t~^~2YVLb7?PKzpP&PHg#TEMmlRL|b7{#B0aGmA7u+ 
zP8DDc4WDgruyEv+S0~jG8o5?c9COy=7=Dw98sg>U)wm#L&ZRef{4q(?>FG6dQn9G& z`mKUbW8GEJS-P$qdn)j4Zq^cKpd?QA?zYvF+uNd_s~u!W)YUY43z4{z@b?$w-sP&+ zbu$DrJSY_r(#+vZU0l%$rKERhyPSA%1e*c^zmnjkV(Qh24Pi)AkDA)HK>+4NP(>~{ zrh@-*Vd2WWtkiWbo29UCkO<%Q)49KWqx`*$6=WVX9GVwJ%Zgqe^4;ig`0y(;Ub$Xi zznw{ioRtWU?`jz)fQrypYfUu0#=^}S#p5v|CL!|hk3fQc_JNRFtp3L4C4$pb&j-ASF&YF;*gZoi zC3(bFUK0RweLiM0vjs*}3VO%A;dxhIu8v-M(7v%9nQ(s9_~EvyUvXZ<8m`;sg7N%q zEsN=tzUfSljHNr8x=}O!x@VvNbO%e@-2G7Bqc}g@ZDV|=z{J$_SqK=OR688d^%Gpb zSs`PBp04;PpC19UJpHi>~f3`w`+)%qd#0(X61RPiOjC>rI^VhGmPR~3eyA-^7 z)wtMM7WQY(c-oSUYeb!K78Z|}4>NDT5pZ82ojaB4-~s?Z3Q}a3Q{IDFRblqx>w*w!&Y#@D_pKM7@_UX3O>AY1prG)7Z zCpzaRmw}ecZZc3A`on?b;ljMbvvw^Pwu^Ndt1sBNG9(Ms_M`( zV<#BI!x8;`g|?!bbl{*u@r@0fV;q;A1T}3BV~#~=hiPov(x(DSZjOJtrAV=wn_FPi zj$A1V0R^H*h(=3f??r_W3z(ti9_lY!nlqEl7NAJMwt>~RZXK}3wj6Kbx5oS!SiEH4 zd!=c<6ytgI`XuCQu3;(paM%o%uu~;Yv`1NH3-!wWSEuh!LMHVW3tN$ReqdKn?TJM^ zYoH{0J`i7wh*?P9QpfE3hwxQqp4rSpuM~;X=7OPW-?<3wc$ z`TBJjb(E9p?=IZN6k(wkbZZ}+YsHpcXIIaXZ3(DI7Gql9H$=jahRKD^q9|=6XN6}HqczCS+wNA zF*)wHy8rG4296IhR;aQl=g7=NKooo^RUj1Eiy(7ck&)F zd+IOC522Hm9xKP0b_3RrjhJq>0P1f)EP*l9YgY^RN?xqUZi1T)5xcxnCclY7ZeA=! zRgjdx9UQsR@5m>*rXY98d%JVY9gxsIY+?MPPJlxKT34==r9w<5v-q2JrYN=&WA+XD z6i!TB@Q4_rnoa2vY!>kj4s|nh*2!XELdgcIOv#U%IhPl=JY)j}OZo3!t?S>OZ1p39 z5#Y_(jpCbYz-O1C+eF7uyO&4iMnmW*@JHp@A5&7bBKKR#hcw}O;Y1x6cYWpUl?GBm z`rf9G`{$o4mQfSz8!2#ISUjPS97CMNV9wEF?Mr_u#5}fk?)`4Sz_({0h80sYDe*d! 
zAL9`!upT}iFRGRj=&t+{9di9IA2--+M?p2pS1o=LVltfldUC_UAVK>o1s`%g953rl z2A*DrucP>uWo9jJU7$9KGH@GFPl)q%|0@Tzs>g%9sMzd&JCBqZO@kEh(j`35U{=o2_U_7*yc#dS}b5?`@j1 z#hY!vgGR0arin8lN{BNwP4m=CSf|V@)u9X|nJ?sa0G7k3e=M~q5r3$0NHNnWJGL$A zh&CzqH0t90h5r33mzL{5%NQgGWg(e(SSjfF#7dj;`MHH&9el$*Vz6|&a~h1{w>SX6 z$4VPm=kDT`>GxiUXE5~O!O`2tJ|oyOy++TgOn^cZi?sKCXc(`%Zk953z$Ohb9*^|O z3!cfcfvX3(S-$zbAW4C-fp(pIypVsTWrD)`!NVjYNJG6^rU+yDbMhkWWlv`p`et$n zWIj>cGz5uXr#f)coa{Shz}6C{u-OZ@FY&=}GY!Qux2&djrMjk+Qns^ZtafCMgGHiJ zkaFs0*IAlS22691OK;B}K88`WK0IOv_2OcqA4f=n!z-fbU}lOo3A>Xw0i^;F3B z7D``=3QS<*A#bNR zVGe8HY;kG=*mh5qhet$sMt^@!0{EHgK{N427W=m{s7sWl1u=sf@E`%ytsQbx^$2}n zmRIQwSuVdXFoV4?P6PsBKV0NMB8!{lV{S&xyUk%@$TYz)rrU_2G7Y}mJYU03l8q67 z;!1mMH9*V)z;WTYIN$_7?J3r`JG+9Nj6}vIw285LjgnvNNLH42sI-m4adl&%Opl1J za9WBW0(~?0?(jNC0zb_g>drH9ASpij8rAn{FuNrpAj8?uWi0ilS&M3(=)QLa2JCGe2`C!`xK9PXGp=0NpdV>GOQLK zB+t$jXL>KP{vd`6=rs$YqQ00QnjcEf8FnV4&nkN>8 zKL~Y!=#`}OCs5r*>6$#GLP$j~zR_rgP&}*a7MjH4yEA_0;Oj=i%a&OHAQDR(s>}IB zIai+O0n)C|?6zf72Ad$5Y%>!k7}P98^DZp_%iKS|Q66}uY=x4e4z7%ac*cBeJr?DD zjDyErX(((J&GY@FRX#QZtL67zvVmgwTp*<-9OWJKhve<7;HTS&?}1Is8WFdQ+~M>dm>;SRLqGJhldUoa!o0nYWO^oMo|%z!*X{XxoD1l}$y#R?g@cuzNUfk|id(_hoi zCu{>C{m-f4X9PzBU!2SkWzW9?P(LQU3jh`K3`R)hl9Yu8LJ9`{bla^p);a~*7Wv^| z+($1=ewX7HA^{B!f%om;?u5}2HV(5s3B*a1v@sj~r zK~fYVHwqjq*F?~C!j%NQUL8F2GE*_c!)~7& z4kR8QdkjRc7*gkWKzt)kx5UfFW#xxK6y=tEuO{uFo)Wk8>KLxl3~W+ZCJ zCo?eFGMtWHg>v)IiJRCC)K)n8F`=Wu?f!XTGw8&%H1OsySXpu-XFMjQk$kq*9Sip+ zqcoM$nVWV=n>=(T&}(UOVmVJ2w`>^2@bSxkbhEh|6Sv`P6L zHyvgZ4CwDNa6ws9pjv;?-2KDr`l(F!95jDYz_-+ESVXxm2?=TuxtMWsLzYhdfF@Al zH`|kE)10gd>^KB8I8z7RZ9iLTn9$CoP>j8OTU3Vke9UZF4*QFVqcg@7w~FJ|u3fi7 z5ygrV&#^JsgCS)W{ABO|ucx;Xxq5jS=Ac=!C{+n~VksUqkV`YsbkI7}eX;tZ5Ntvu z$37#>+Zk}J@maifzQkq>;K$i`kfLVP;y5F$J~AcR`A48=X&I$C_%f8`??EkW*?i%h zd6LC`IYj8Uo8h6Mh=Hb;8B?IPs$~fpdCNXW`IJhlp!2`3(ullI6z1Ty1?`C~hu0m@ zL^*Vt{wxcg?{cg1zB37l-?NkFgd};hXQfo9k9X5(@ZE^&o_ACxY;^~_{Z&CDx3p}D z_KouK$z=VTwd2u&pV@B}9oZK9>H6-Kg;K&il{u*2zO?{HXmLhewBcIe-j=D9Pg&(A 
za4n0t%H!&b6+;|AJQ_mztHiQgPR(g8r?HwFgiE~uI=x$v?_Q2QWagmFy4u1EP&V z&Al{~LmI_ojK>W)HVSU3qulOsdc3S{8hoeDlF&YK@U_FyfpI*Du*(F^L|(~WYumpY z4y5Ew{o6X)x0RXmo2${ZDO1;@R0-i7ny(fIqPcQ^zmt`!d?Fbo5&d!Ev0|@eg`9if z>O<6Vu?{4Qi4D=uWhYaxmO>N`Q6m{7H!Wl=*#LOZvyR^?8!F61i!-y>amBHgrfB2! z9B{44=&!}O*Y0l8i{9M|eVOQF=bx4QZiU>W2cpr~W|8FS3unkK*wbdvoO>B36n3sr3r^di2d3% z!rw)-%EvU}<9)A`vE|9cB?;;uVKe0g$+1h@8xdIkxe{U36q-f+q`-J%QwK&^#VyOj zwpeM$O_WuMv7dgk`gC1cQ_#M-pn+FWoN8}n<6tR0bDCL9BD%38GA0$J2ij7?@Snde zvj~7kxOCbEQu`~-Wj&7ADoy&Rphtx;nH(!IWs7^m{tf`82!kl)%xZyi^y3=WVWF#v zg*sRCUoc9Y8<&RzL!Ze{o?e0oXmM%aoNu1W=hVng@Jn7e*FU>@A@^x(AX(puZ^qp@ zcC9N2%~jnGJnhm0-Q_L82r9LB$&Xi2J7@w%d76Fay@b{TW?wrHPxr4}iH38aFgy`CNTM&rg5E{E(uBLOA@bSUOhX>cGAIdbGK(;h7}PlAT5?? z7c{z6oNYbHf|nzx{YmCpaeZjM73V27^N8*!3`5KbmqSkivVM$cLEMa1254#Grw#|E z4Ny0=&D_fvLWs4Vl|zsrA`+uz%5{BcZu&Vrx`h(_{zMZl0bULof=qmSw#Lob`tD4^ zPU|n1w_cU08v6F+Y7VaZ{v?_KC%v;X8QZvg=c)7dSF&ybu^@h(#lAb(u`!R~P&hU( zr02jv#2onS_x%VohfXzGMxC1#x44SOVkl^WZc%drr5CK>7*bpc#Oxodm~Wx4TG8TP zxWhJoP=H91Q2=K6I{rn-I<)i4E6fJht(#kgt^c*@W3y8V5*-swW#-3lpWNF^mV-pk zfy(IF$G4YYF^ENx!GWajp-l?CHe=!FW@P$|5Ubz~e^klQj@yscWS2((K*2Qu*9~ji zc;zL1u4XMo0ODm$cw>=mV7a~%Nj}O+k)+z|<byE!e?J#u2QYz!GM)%V1j5%d@~(K#0?P(`5t~))2q*eMCO#!1K;!b%HwX z%IWxSUJYW`pGib;b@tSV0uK@&06ULr&FS?o0q?Zej{11S^a*_9f88jXepf9@QljA* za6K}%oA&V0d{~31`6R;Rxf#hBSl+dH6GvPNOi%-gxV8YMy;Xn0#YY%aP3)2cCSr?9 zF0^13eRdXFEsYJ!?QBYa4TIwAp-Lb^UBh?Sq!DCJQf;^MZh+gim5#Zg?9+ql)sG3> z<#v^lVS++|M#LKA6};2o*vrG%QOw3A$m~MvGtovxf1sL8bV#0lwNk1g*P5U z!sPDqc!m-d?q(CnRc~Y?jlwmgjW?>M47LFFkFaQl3 zx&f<-Tnp}t+6!-CGhY}nl9AI_m^qEkYt_#jFoE4I-foK@#HDorx`Otw#5hkWJ;5A) zT8asg(g$`-3GrRhOdS&v(s?u@B~NV=Y2K9&8E62tMo798{5nOl-5N0HCArHJT_vxj zdY1vYB(NI@t4|Q(`{B#5LFeml^_FKAN$E|$s23{&l7VqpJd#zzieU(#SY#v`+a{pr z0jhadK`r7>#zDaxJRFNf3R9WHk&h@VEcXv!TEQEa!Ss_yVLH<}*b&DrY26J4 zX$IS$jnM94?+#-);3Gwrz?B3@ryGf(l{=gGWpMJZWlIv<%5zgv)BFFnk)1WJf*_C4r44cD6T;`u2d_Ws^#yH=o7RW)gEgzFmeRh#}vj zDf8KzVuvT_T()T0LD>@c@KUQ=|H_f0g7!2y 
z=s9B8GD3tsFh762IT;+4x?Y2f?lnWMB3mt3iffZPxX^hr%JU0|T>ivhTv{ipyFn311r5wIl|RLV^b=* z$yhtc0j0r+KvKpBE=}U|?%gP9K!&0ft}WYAwKfBD(#)sJJgm*Mh_E2j!vK(&Y!{q$ zlp(@jfvmkcTXu~E=EN^GB!K}5Qw!M0$65x?YQKDUWiJxt)kP@|j2VO7;d_^S+wsnv z`xR^UWyxe3!vn3mnmc_MA6C=w`kqk8JfREl@wAh%S_HI%`{27;h?X2Xo3;cOmzVTl zvk&pO^|n z;~uaWPG#nfN;3li%Ow{BBwch(!B1>6g%cy7o92&lank@I8zB_Oz?tSq_N>2ntutPw#gDFC@y#se+1=o8SsspqGaCx(kvJ+vTIESIOV_dIP3sC`Y1og{F4RkPp zG4I@hyo$rZirbLLA*%Tj2|T-E9%F)7JNsK9DaqIkW59QlpsN#;sJ>+LP17=GGI^v$ z5SQK<=+*<`*bWL>9&k&b&C_zd z1X~_<0~K_nna7HRsG~qBGoTx|WJDw3eoVyjg}(l9S39W)!C4`vslcd^puRUk?uF{5 zE+SXRkM4?uR>WTu@v-k`pqdF_gzN3z*@sPs)WKECBx)m@`aZYq<9W zHo+lreI2hq?Bz>r>A)}_TQXcEf>RM=6pAa$XQ8I!T{_H8G3eZn3`tzuKcL0rB}gxC zl85yXlDd$2DW#@g;#H;~Rp$|g`M?Pg_541#beXeA0ubGCyA76zZ9evhXe%FTGD0@8 zlfh4r!4_a#pn8=tSRrPW$sKO85b_&F0+z0T${@WMWEa;7D65is{_t zLa7!lVpr}~Mzq*sXVVcCei=+1d+=5Fvg5%EfQl5S&TfU4_xC8TVYb z!D|ku<~2a?wY zJn>^u!lIJN3nRA=s=YRGX9xu3Hk)q$1JDOY1fDg4_X|wast#u8v=^@4w&p0CZp>4s z=^FwFfS9wI_*sInJ5grv$&gdE?U;TmxYT39 zZv0OC9b2V#eDh5npst}bzW3Qk>b=-7d5ilLYjXf9$o_uFbr}>>9?{s80N{rg1p;60 zn;&MdI3WJhcHH_7D+cyK10_4Y!D`!eh=v#NC5XC4a>fC zU^k^d1sR@94g{=ViBu1xpD0?kJ5w%XOW4e@-?613Nu>%A`3jAsW`Z~i+DYdU!A8Z4 zy#!g`G!ch&7t=`oSY$V9u{3vsIR2wFGiVaUlQi!Q!k2=~WUQxGyy-wVbh|qci;!J~ z!<=OB&|bJr$G8=sO_C5p1xg2~c1k0Y$=)}V-^!)*OV{k@>k_eLgF^mQd}PCMJA^|A zb0nz=6{{EV*WwwU2?FBY140UZ#h#J|_)Q+LYF@mm^a+&N0tY66?ywRan}e#+nJvoW z&I5Yz&VIO7z|2OTSZ-=HxULr^r5nrjW*t<4_71>$;h=$s2P@hS985>pM;4bKf>Su`DzK+h_^G7$>u}@ zDgFK8#?eR}okj%ZZFbN;k+JB<0`AX$s5eYVLX3JmkcQba+W;6g2^$$+#2b8kA}P>< zr^pD+9I361E_!zj_E<;B5NJnn@E7tbvRiWRVY}nifq)()`%``rI3v^E0t{MFcvvSY zIPp@zAp>QW85+RpXoadfvQo!{!;(VG^89<)yZ~sL4wO>kz1|Z7@67aoAs&;lf!V$e zHgH)x31jK1hh!z05BD_=;%X8=E%=x~n|)hAH-~?rTyYy~z6z}nu*!n3LWSL*X!r^K zi-7zf3?LF@Vt`UMFrv7aZ zWNaPZt(~MMh7TL~nDwYD+ree;KPnuorL3Tj(hZ{kO508^%jgj}I+#t}Snixkgaj7@ zLkoUtfKyAD^tj%A^?xq+fWu_uvf&>J0W%kZF&LukH8Dd@GvE*I zkD}*gSDNJ{tqs7_MJdUMO#-OnNco6H+Ne8H>yAk09oWfF6s!bP`??90J_0*=HA;qC zR0&PPVm4?3kBabe#VRnWR}$EO_AefbiM|&FW&(? 
z$*YB=KteVG@sf)s$R%tRKtxd>KR6d*HzKKS<8hi<9FlB7LG%g9@d6SWoiu6C8B6+NE!_%Y>R=HTX^YMo#O|I)Vkhdj*0tzxr z2YC>4Kb*p@-F4tSLKi_Jad6>09Nz@nEGCy79ub1Ia#AUc1hV2lBK(6xtxJyAJ`Es< zPP$i12hWH}*}gkasJejUl)9c29voPb;;X$DIjk&6L2y18*&eeO@gId;=h+!TLWxD@ z<@d$No)ji+E?-{*FYj zDg~t>X2hfiQ&|gf3qrjxd5Dyb2#$nE<#$znq#_4Gg#&svZv0RCl5$ihQv`dB#Z?8?m1YDEUN0oiL` ze#1xx8+ce%l!X8@Sl|@D+mDIn7bKuaN-3QVX-4LTS@)%oKFq=2C@}dnVukcYBOSYE zUz;K-0W`cDd;jLCUvMGGf2OPE0dm&;yX_=%dtTf4y1*!SvPdR$J?K*wH*O3+=}5+s z?=N%cWkPA@KG^WKdx{Y^MZu=*DN_R$BnIQP8`?1pd}+rtSai-G+Lf6T%|g?a7n?jQ z4?-H+2+CYz<|px60rwu=Yf`#lJ3bfTvgyeO%GLuZ5@sJP=mFhEFs>kcha^H54?vk&l`HSGBy&MM5i)pFLiK}*}LHscsCjKeyT&^$jV>{sLYWE#Gm;93oy zNxpumDd1jdtau`0S-{zQdV~g~ey{}L6A@RS%z45#?9SN*vf0nS=NM~yXZ^Y9HJX$v zjbM_oPTJ2$CDk<1pQda*;-6DneC|C*S~|A`wmfm0OlBcObDA2kh}LspZ)c+20So;- z_nyI?2SZlm$V80h98N`_f5x|Lx{UaZi;+@zI+*kzTWRt#lvsO5*5-|ZWJqah^Qa^Q zX+3KOce3?{v8{am83MFIj)jB9f;vtjP3*m@0CYNKw^U$)CiLUQL2P>BHw-yd5MS_i zL_b(QOhe~sBNe+RN;H{#LxM>LpT`Ha=q1!4uTt}NM;tw@EedJBmP~%z9 zN>@tvZ;)(gM;_r75NY%`!7ytJaQWTfPw?+5-!I#R7-iFeFyI)Q{$9os3onhBeZ%CYy6PJaQs^$QPgb0}3rWA(eJ7W?d+xrJ(xE-&!&~FM$==?iU0S+!zlu zg`eJb|BxJ_aK!++D3t@M2-yuU>c3+C6l?QjO z>+(Z0NJ*JElan(8=_FuFnO%56Jve$$~sY53Ta8+j>C-Q zjOgiza<4?459wII`^8v0YMddQ{X8=nm=JK7)-MDm3yRy3kaK*en>Z|gXOBZ9@*-yv z&CZ3ZDEUQh+A#`ON*V@Wk0F;A&Y={S8jtdW{Tx;|G5b8kh!RUjkuzHGv%NYUK!-%= zD;KjA!p(F-6XMwhs2`w>)LOxZYohWjk*bbelo4_>@S#1>A+-I2sT3`FNxI9jPC^ef z`voD9rtqk=uDcDxB-hH72N?!ji^Y;Z>y3S_hg%=XC32N+^(wcuEV7#k0fP#*eD;7N9Yg(D<7z{VnhXz3_I zQLqz!f4(IWYT)qg@NzbtQ9^#iT-tA=>VoE+Z&Mc`{nI+YADaqZ{G6=b1r~f;>J^ZAGooszanqb3F7UFDcua$L7C>yK#2^?d z^Q{^sO! 
zIV9PDp%f({jx*$%8R_9%#2qndyH%T>FO=4S;~}lc)Uf`zG}CoFL;?9O3-z40zG(wx zQhXqMuha1%@{&y{xfjgLL8Cze0LD_XHpIdP>cCVSe4@7~%*#MfY-L|I-5~KjR41@Y zPwo`G6fiA-on1oDoNKfrVoVyUpDO{34CI=in1}|ECug5$)A8I!hQ@ld%5=LHHklV= z#}vhWxvoa3<#OnOYt1kn2sqy?cW{K%wruSK6(gBcc9MBOyGF#jg{FCO2wma!9v&62 zW;nyFJ_HGD1@;Z0C-KPG7Q+@#vM>1GAkE{3%>FQ>X#MW_uU4u{G8V4Cb?lbUnKzru z!k9T_I>JQ?>2yBG@rl zwE(YZDG3M@8wt3Xc-^6y78{~V;5p|%DmZPx$}W{FmYl-b&;9so36U+vLy;STssu;T zrQHg&jbKbz<&XWMLkwC|ufj2&N)eYP-U~ogcXkyZ7r$wBH zo4aiXB7jY_TetswDithU8$rHgkB>xe9e6)opQuru;D)Sgwvhg;fP&LXU0b$nA(*T0O>&L3kRn9@PFwQJXM?`V@g zcTAPXEHoWc9!DN+i-R0>J!{U=Efh5wdvQf7I<9_xSC5z7x@}YykbabQ)X3PZsIpQ& zF)>VY#POI*_}k9TjG$A+1DBmt#r*Dpbn!UN{PkIaB6^H#T~qO(H0Pq@gy(_(Id%#0 zVcWXUHiv1d(B5p`?zCA=O~v|Wr;WQ499>->KbduLjd;93z7iG@R#Mi4^JuZm?i(u@ zANLe3F9n=nZM4hlD@HdIi0tP3|NIqPS>({aYf$W&fImAUftz|q4#m;!_2<~$0&Dx=8_4#`C zy5U%%u3;Q4ROM!`tw8q8e_!)|ej;u2nfRU3km9c{R|Jx0fUG&|SzI91ulJY4grv^n z`)PiI!_@=*41-<7=ABW0JKPGX5*%x%)qFx%{@PSrXD9FJ4W^@w+S;4;p3$z?omvw? znG}yl(_{uOJ17N-Gd@QDqi-m)n2=!THS_&^y<#1@ip}J%CA@bvU#L$lvQs~$SbCp6 zZ#S`r?{R62V<`=K`M-C(0Ufe#)f44sfN9OIa-tfdlkt?*xXs=F>x+o3L<1E^@QZ?* z2@XhEvJABU2k+l@H~j<>G))W4d!ho#hC+iy~602V5|I1DW6dr=@Z(uyQxzQgQu?r;?4zf!SWP* zLCc~4g@Hg3p)fr@zEQD^3mcbmcrTI0Q_0xnvuAFV*;v%&vihC;y45V*2;s?t=c-++ znIC)I45AG~j#4hIJN`|aWw!pi!q$?TMayCSkLu9PLPz5)J+k|7&zf-Dxd)AnN}^B| z?(TB*O_<7258AVxPoK;_+vl-6A^qgfUS_*bIu3SiRtt27b>3M!k6lvEKu%RZDEGd1 zsHlIn`2K^0xij*j`(_vPX~(&*Eidi}v)Pim#%XVRJ8I`guT>sSDW886zUEw_f4SGd z+}U%uX9mWljyj2npSuERM++b5m-RZ9O|184ym{!d7PmI}?Ao37n#cb_g zOxMuG=@;$jIx1SKx}ZzEbjHEbvizn);%tZSEhAG?H^y^ISc0L>&#S*%DU9f?rC1$N zzak9~+0SQ|r&CW~NGMM?^EiE>FzM-D+@=xzln2i3$8UDV4ed%;10YSR48lmIb5`vr zvY=;j&WK+o`U~-p2>A|2^e8?xov@D&%QZ*lj&q;&SMIMZu=hQQcE#saI)>3`-t!NS z9=mNcsFtpwHuz@tsx-zs{nqVOM*FKym)&v+2=yy;S-ZFY!-xG9Zg7pu-k78DI-3Hg ztM3^$2b>x-ZZMaY+N5c%T3IxgP8qdU+O+50^;O%R9S9k8?saaiCu9G^aWRNkF2e7O zN^a0-BjNjq1rd9CcTSD2!zsF`cL}ac}w!=ZMcRCYKt_GuPi7_I^7B!93Gq6&} z{!(m#YpY52AgfTbZu2{1$}Dq}y4W9Xw(9mMc2zhvo@hJM?KCxVGDoQd^21Ai 
zUOt}{ObRgRHajhC+8pHBWUWDe-QJGi=aa+DqPcTuo8+5Eu;jy|g?hrNx>2fo-B|S@ zB~|70vh1Q~RpcG+8$Gfs2Q7kH+Ho2WVGIGgl>T<5zJ-E4K-_4nxoU&KSK-qZklzENF7*|pkad%ZtQJ!8Z5A#0|^FMDO zFqdGa5%`Kgjl3Uo&Y9f2B3n27;`^Kl*GF*X_5U>)q*T^i=+Z!{!(6ed$5XUbbk7(H zqA8tv57JCxFE=zK6xq-9&8RQWCB}63;raSb=0W-%fjzUkyU5rhS9}szW4F78KDA#T z`>r`ZK@;~T+kUB`G3j!zZb8yOWmS#504(H(Nr?HH~*XXti`M;5}csWXFNu$_`( z(brsYte@e`X(ok~1~lfEM)A(DE-G}EO5PK~{iCqPBa>g>gu4w{vBGYXzr7$IT3sD* zICJ5)*!v@(bZtJ7n{~=?7Y1BBaAC533;ZEPi`;O{kdtei8*Bg4@9kXwcN^b)?f6hR zc(Q&K=jF5R)8jv>+}B*cPTBKiRn=2r{m=HN66F-ZU$=q=RDa7}OL!Dkhij->;xAiWk7F!aQRb$~=sEY=3fv)w2yKPuzY2~fO}@fF zKLA8Uab&x>ePPv;C*^M{!(p#hDcaclw->mQt*W%X3-3-_obLMSXHu8GBBB2Pd&wZ_ z4m#ym>{->fe^=yZz4G}HqR`5zA&z923HG3y zod~-&6`cC@(9hb3#{RHS&Gn>{nt|{6R^S8KdTHo^-x}-nPV8E3(d2GOoc!&+#p9ne zANY2*SDYaEeVByX@61eW$#kgV|8*;Tzw7mdZS+N|Try#)JKAzy)f^Y|ruEx{uY32& zA}@JFX%zP~-gWnBO?#gsvH5k?FOPz+UA?BKl)vP;rQ_h!Y~$Cvf_Ln&SrcxRRCpoW z>`ChD{{PYj@SWn>*{5O5m4jB|uL3iA3JQD-9M5RVul4Br_i0)wN*n|N2VzR47i}^{ z{fNBz$LaG1GF^w6wa-JWx^T^^QNU`$-G{+i|s*SEhA# z2DHkkl>NakgO9xin&kiCei%3G-Cv9e<8RTf&+V`Xhnxts)%ohiqn$l3W{Y+c`I8S~ zxytQVdzRK8ueG%KXtjFcrwfiLkgkpS4r26gDOQd@MFo8Q)M3-k6$#j;k5vhO*$T2U zR+L}S6x|?%^G1Wk>jtkR9P`UO<^0>-&MjKGvFoZ?V%LOhNz(miWm{BJB_(#1q%(cb z-xmCldjCV&U7CmUYT>m~$rKN3zaaW$$KjXVhIbrm-~N{mDnDj5Q1UV}v2Shf@h>oE zoR745zQjblDl2WwR_ei$)UVWEw`Ga;1_T7Yy3u-j>h0Sl&QDf|{@>j9uaC(4CFQNA ze`bQvj#g-z^3^FPTvtWi=u6buLytgpO-OvNY9hw*7s^eo>H}xBe~M8(>0ZC_*v}1# zh4dejtiiIf)U4`DRoA9>pL$%|nC+Wk@v4Mx8gzW=Kimv^r1g-mtEz)V8J<(WqQnjdCv-fClODrJ4kuR7}eV~55M z*PYI+=sk&XcOT>&w47@G?3VD)Lg(`lI6Z7$Qx0Lg;btPPsgIBS_Q>z+*Rmgm4QZRQ zpC~^FNsm;`&-lgupzZ3$HJTj_Lt;by_wBW(h~IyD(W#}-y7S8PkdLy>PyY=OTZx?O zapgep*LoJqO5ucaHkGH2ypy!5To?^zOZu@=Of0vbbn)fycm8hC6dNle#U+{nG9{cZiX$=hEies41sd{tHZ&b`Am4#xF= zDf7Z7mG_Nhe?}0gHvu{2|KjVdqoQ29_i+>zL6H_Er5jX4nvpIg6hv|aqy(g-BnIhj zq#0=>1*Bm{x`eeb=meeJT{nbKjsxT>dV z*sE+|(9#Y$aJ^|qgHN;Il1093tu!04ylnj3#oM z6*Jz)dFpWU|77Z_$WxTj05yxk_@2$B9?-ORs^nHzf9mCLiXU8fFDPiy|GBV&+dnc@ 
zcybW(k7g2dhQUuy_XOJ3R-v#XD$302yg>bTP04_DHd8tQjBcr?ac!6Gc(l#^fE|{2 zHh$Gco@tn z#p;V69hOKXC@J`dB7i)lE>kGz90Y8uzn+JrbrZ~wrQKCh=6cWgy1$ade*YXB*>4SP z=TNRayneJrCGRlJK{lyu2~(QYx=NNtkRQ5wMUxnRI4tmppSQLBc=3lijSOi4)zJVEDH7KJU$K|oMsAJ~3EVBchazKM=+1N6VH3^1)E z^w3o?J(FPC=NLvC7|RRcl62zO2#$N*G7%PSRIIkok638sV)+^3MO-kofwf;zK?b=0 zK?DnE;Vo*98@#&p{;5DeBlZ-Gi^$kxpd=+=q2;MND|~=P7v&#?oJVBW54Tg#UP-&r z|11M9dfkjCn$?`~%2_hjBk{{8g0e+AW(K+(lE$&xv|pE!?C4lD$+^eg`+#HwF}{)i z;GLFsFt^Mqi_1H`0kk;oqqaImnEP_!=ak4J2G~MXpEFU`V*2vKh0#zS9W7QE)3wI8 zQxSr}{y4P*emV|0nWz6xbg*NR0ozK(X!62G|C`D_?=EXXzt zdK|lVrL!1Hli66nyfyN=)AJt%xWqGOI45!RGaJ2&L+kIs&KY7kk!dURaf@j?W<9tq;bof`fx(20l755%Sg1tS*!AXKYApNo4VVYisv%k{cNPH2%!#9{6HXga7&~8 zhFq42z#^4ECUNZhHa}TXR@-`JrZOF>C}su&8!NdxRz{{Xza3H>*ibUuO0aaX2Ly|- z5}a3+KRrOa=CPTr4Jm`1TBAomIKfn5JLLiU9wR#(GU|^*{o`Bx*ALl#S5eD%G~J_L zUbNyU)U&>nJ9M2uG*Cv`B+H3K__ssvvcWveJ~Pt@+*3qG^G#_}=-jjeg zIy!o=Q#t6k^(CKy46gOYvh;t|8$^Q{D8=V}g}9EpfM5hZ@#@A_f2rs0OdHv*I~|F0 z+Ymj?NuGk)krH-KNo5mp(q!CPyoleuASa0p;W?Br5L2nR{_8vYb?C(NiF+X6p!oqf z##698D~if^GT1y1{9dSX!9$XtTEX_5d?Ek2?~3qMGCA44ibP*mrZ3d}ycCzf^SE!` zbUftkRtDq0#g`3-*xeA2bRZc7p};%algSJ&si~>8>nTwO=fY^oIQC~mooiD4KM6=; zjs^bLYW;GxjB?tii<+&zZyewzW`YkHuLbK!6^%xJ#An2KXTDlIvzs3%O>ZF6bVL$i z&nls2eMt{wvv7Yq0^m>V_BmDN@%Kv%-an1s)vvJ|eK^e3comN_rfS@1iJ3j4?i6%( z)Q;7@Miw0=i|~KWdRMH?{lI-WGlJj(&{JE=NZdLGEwo>A&qx z`n(m%c>MUW<0%2Cs~?J!;CPyRvzEyVYxX}^?@uu#N(C2A_{q_J-xSY35l#QlufR}T z_rDT?A!SGQ_x4PdS^{PPmY+8Q`gHk93N1Hskt!R}avFDMQXRb~zBAnz%E^t_9j7AI6Cpg`br{FaY?VqL?+qRK@+Qk} z=#o)pSM!ZCu*tuO2A&R6mH9+Xw2!&F`9ftm6d!-4RFL1hfKAPk#onR_5Id_u2+emD>{J>TJ5D7IYpV6aV1>|Bw^dXgnck)FwY&_Jo7~>Mkjsv(!yl=FGug zPoo$H)?TS~biv0tY%4#dGkoUA<%yd&8? 
zhv&6zyRO|{g2r`W`Ch|Plv}5 z%W)VO?aR1;GV}eM#Oap4kx~uR~=CVdQtvGQVR>zdejpJo}$ z@w28O-@JinFtJ+52FJf=KV7Nd_sh%1iH}l|?aR;|77%5Tb2La>wFD*oDW}YZxn1|y z4zv>xD`Ii|4@0%+u0xLFQWEb-!Vz;O1{o7Ocm06q>*3e4LPLExuo>*0F%%i!$wLDD zmu#l`RGXHsC!REgm{x&xa;P6lzqw|~Uv*-IelL+#8YCI9M=|}JCk+*?n!1EWv zS*7ne=@*@g|0U1`ydVC=Al6ENFZBu+{H(z0QUt^j>SK0>mp;^|{>{$zpG1?nRK#7v z*HE_)1EP5nZ=bASKh%=wyvLBzJItuu*7mUvX~$)*c9LBx`$Hwx!W@<}aSew>Po<|} zkNV3L@HWU8V@u>)FAr5P1k~Re75!zIv&YLyN%=wW7~C=LS?)VGsWzQluTHUydYt%< z-Asr`$V|RN@qK#HOH>lZZR|q_UIN{ghoa5m8Zs57)P$$3qnDP4eoE69aI%KLeJl$s z{xTv#GlOeRPm><{IV~JpgwwfRUZ%TT?1eOg9WySN1CwP=C_l)iBNXYlyb|b3*tm34 zijtvzr|90lO#!_@O!+K#q&{qmo}Siw#LkCU~RR^G~rYqY4)5b)kFE zB?EK65Z>I|y!1Km;h?yMkkheq=YzcRiq#^2e=F( zIlnP!+ix3LmErw`Rj``-6XPPKDI;CfrBE3u4j)o^d6#z|VmPoD3tYE9H2ZzH6mt?b zZFPrm9*Jm}<;v|0^pgvHLG0YP>}hZKkl2kHmG**A@fsT|_5*Rhn(tq{2iEw^(2R;g zJ*_*u7o=u*>`-uBlk%|TSm2<<4nw;=(G>mx@)Y{dED6XBy*I!QL*xoPbR1Puh@|Uo z3!y-E05e`J#oiucaW^bc^md%tbKe?T+PSMblxwE(lhW6b_S^LUB=rGnD;ePUISgGSZP!2J?&KwHBjpX&wJW7*$uB+usFxkUEsvYEC}Q*CWK zdj1SG#~!gzT)O%(V~_uF=?CAnFz>76(?Q%aO0cn2Rgi(CA~TneLTkaHv6kDD<-OoG zTgo5z9VM>E#KUm(4(>70h@!WD-899vvf{fPt1tMM!*5`Njh$I8{oL7kbGR&z-pL1< zb`BWk5G=geB2C9B;Yv8Dv2I?H8YLC`X&F{j-_^An(S#EGt{h z5CZ+XG#JKk@7nQKx77rVbJU+c0~M)!d3ueyM=EwVD?U+)BT5_GYkvKfWR^Fw?T7yV zVo}f}8lON+_Zz~&`=C`^gIILr8F516hHnhI`$8e8WxjLi%4sR(H1dFv^pJ^6oZ$;7 znZ&n1AhUY}Ph+$l^Au!l@Z_FKY}qeE5+wxe46@s`b+2r))(K;7D5i1JMN#e#Efy`yt90JeY}BZaVh>IIeSZ|c z+uafUIAV??%kE5n$`2)Yy_+J0J_rK6Ug?NfGV0f0@}8(r-z*nRzcaf5{tJ8jSIDzY zA_W#Q;!hc;9QyWi>8#iHX7ed1nh~p|^HRAd>_Y#^YT7L8G7&m@z@^Fg1E;Zh3r<>0E5Bn6XWAYI*`)|n*~@C7`L#4 z0><=g#JU>EI6rfq9`H1y%I*wbE(LlkGcpDys<0&MMIQib&?&l+CaW#~1vfS()OZ}L z`l`;c{XseI@6;|}uT1O&cwQDSQiE`8e0(L>HMH5%gqtGOA(p(_ROLKO(t6sm%ir)5 z3?#%L5^Y}-$@fqQ{)~`7;j&D)vSX6#;uXTHnwniP>n^{H6QdJMMv6dC?QTZ|&XjzB z@UvXNJD0{I=Aw)y#vsy=dzTrA{qRGU4Q<*(>Nme#VR?2dlEO94Du0Ro9yJGHkhjVm zAt`0vNYj)*PXrrt&`!wGscwb}&=G9(ru6+WH_}=`odL{u*vqYQ7Phzf7cUM{^3v4G zkqdp(O6Lt_f?){`46XE(6)U;yUNjb;=%O&;s_lH74V`QETq`WAg*pqMI}7BGFBLR0 
zF;Zm}D3!nZ{EC-co-DvbPK)u$-k3vminmo?S-B%Rq>py^< zFz^&Gr-B5kD|=_UDNLq(6qk{~km2!;koIw-Os2-q3EcCXX81oGnP-{GMTVpVFt(`= zGq;b|wMI7*$vB`mFPRd@q4V*0k#K@yG~mV)hhgVC&d_K}J|8DQ4`!CR=^o!&ON-%LV#(@?_2K(()?! zS3-Q=BCv^WC+7%{1-4zaus|~-)_4bUP^P?cq_i@fq6D~A>@SaxW?9l1yJ^u9A}F$S zqFPY;g@Ej_-Bgbkye=_;M`vhe#$)tf9Vn*aPB~+s-Y+`^i^rrVlF->uO^He37ROOQVo>#U2Z%?L|UuGv&kdFO` zDpT>4tnbq~wV7{B2CmH~e1cnzYp0CJ%g5`IPpqx2-C4D-aM?!Ie8ps&!~{`G#7ACF zJz@N4QvN+ViZ`b!x2h4RIEDRq))-bbMztcJbequBmSZ%7Ei zY4^=!grL~PJ1N^dEska1c4xbe^7pvTC08j8Vh?U8N$h%BKAo*U`CvsV>uNCjn(;zv z*TWMdq_#uFdPNCZ@HFl;JRL1-a5pgKs7AOOE&cR`+a}6#rUi1e8jD&iDQ;>Hzb* zezUv0X4G6a>FVDm$ffOPPpoXx!ed!bG{Cg{%M}HMgJUT2)aeJHhF)}dp~H1$0jIr; z#_GL!_fg^>Z$#-QlCs`n^%)EpT5&Rxma%MJYK%4bmS`W{+-_@6mg(idY{%;R*#Qwv z$NM|>W+VeUiYi1(evGp)>i$}9j;H5R^Yvgz(S0+Q3Yc)BfgSEszti_`=kX3+QJ4}1 zLC4LW&s@~*!xkKY;c16UBoIr0fV+=EddDQ3C?q`kU><0RV6x)L8Pvz!$vO;wKIH8X zCCX^euQ;#a^AhP zyqxw{`VoO#@bw%P1pcxn9eE-ZweX-HD}^XwHUoz&wRp%1dx^?4zk^BjFP@)KOzQr- zH*AcxIUx<}z%A%cFOX5}z5P-Gqi+sAO}w|%Kr{9{JZ(9BF*-VJ`-(HELioge;sTwt z8@mU$(mGhA3777W)fjqmRf{ogMjrpxGXUP+o$t$u8#Em$4mEe3sIbf1s8cTz!&t3w>c5T3*<$|)ZK`jr= zHvA4%lJLH?-ptIf$!4o7>(emy>|3z9T`=!$-LJ1Nm6SAhZ5;I_iTaIMzZ@y&D6Wyw zdb8vkkS1<@d!-{vpz4rw3itQa5F;I|Fu%!*L^Z=B66mU8@Cj8B@!Gmy3M(dbBraVhrIWlc`u+>FkVu0AC^)o_ z^BM1S^5N=l$pg(<9eLX754_ICMuozOpRq**}B_waKRuMmz= zwP!EIEOj@x_yyCfr&{;%SyDO$Py2c;=U^FLyW*C%wkyEUL6hBSq42wc zp*=l?h{-Q>#Q&nB0HsEIZpfuoUE1b^f>$_b{D3{{cTu<)9l3O2jAc5~mNo_VK0eDl zx!{d^uaoyG*v%d*F+DR=X$-E3(y6wc>xkX>Q~_D=j-5BV;9)MXv}{j(g>xH3oo?%v zn_M+aPxpx~>pH}e0QBoXgU@sdjsL1L*9}W6+uae!wcWG7XCNU5$fkhH)-TIU_GSq* zv<=kmjA*p(2yTfwZP`D%`~1#w(IavWH#Rbv$KS*Nkejl`iiPr6`*4!R7~Be8VZS>@ zbgQjFL|OmWNe1Nza6g*Rv7f3VPLud@`-)a(wk<6%P>6Kz1tGQmA-}jDMKlW?ewzPR zaw&1`;Oi{CVOed*c#LpyMil#XBmdQqaUiu*DAM+q;f_iI3l?WM#0|b)b$9`Q;!9uM zdv7x&oxc{lPphmSr?dWw=WwI%IqG)&Q!Dzfbe(<3|1k`IdSAu}Z$wf0Cv!`5<<0S( zhFE{Xa)R$d{3$IdR{c*f4{#a?-@t~TM=zA6AI<*^>PkFer^?JLyw)8sEX93<$d0Ns zy>MaPU$Do8wKJ0pK-E>Z_cTTcg{rDn+stGF7DAtQ30Gso+JhBu1O8ufmXkS4OJ~A< 
zzHY-|EkWn`!enq(!FjPkL*SV8_>Z|U!*N`5J?8JU2%`b4aCmI2yCPV({G*LPy1UY7=I+aZG3k zNsYH1HeNRpa{sDgADdrV;BOjI@S8<^0CB^-Vuw0j5^ANzn`|ly;GhOIpV5m4XCa0- z?*ANw|6PlJGuOf!4Y25&tg2^*%lC7U&#wg{W7FA&^PeDdkUipdS3rr^IIB z_q$nK=i5*M3i1p_wLar>OM&Exy-wr077tuzMfj7vE=3R%vs(93hw>VH^#T0JQ*na* z|5X5cWP3qZhtA{jpNH}%w~2zEV7^LNKp%$d=0|cS46;AJ5wEZ{$g;dL$Z$v#rNp}! z>ci|X{9Z)c)r$G?p<>q4**+DrKgp%&b4B5oj%)73;dgZdE*nW+OuKw_9~&`#?@Evr zs!Hynt&x9Eubx^#g#ce5Gt$*RFvQ=wa`TXF)JeJGYlq?c%Y}y+fy*&Te+4pR$39P3ZVx{HMrLY+6kD8vvQYR8V1AKXkf~AYMs+Y#hY$&J$=1kW(eY< ztY_ZK!`*xHxwq4Tm&Cz${9>bY_vyaWL+#D2EtB0|0YS*T=U&sf5rg{)**L86)4;&M zPK3ABTulX*>gVR3`CPqo9~1<=@V?N`@J3lx?@vl+YPg+Z885>J+cYcxD7+ zAGduqm>nf`nF!fv+Jm6sPZqr#)aF3mo*&)85>enb9D_)|;UF39=N62qeY+5YHRQ@^ z!^I6nN#^JT{%P!L1=N!6=lPxwSVhi!-r1k*uTNxPsZ(MUB_ym-M%312WqO{WijIxz zWS8G8YOo*1nFbIQOw=leDoS5D3{~0rzjMy4e)niaO)Rx1Q?n>jCTkD+DPg2GyjBSWKS z@t7U-=PLw!vx4JYzGRfXn|I#KE!M|9B)CQQU6E{1;+S_^*w$otwR?3XRnVvi2dIVL z@%qR#4M?<&FfJ05!Nrjw_ssd33t#8NAkc;gw#HD0k&y^^CVi?NCT6bl%s3tdK?6Ik z$t?nRVhOnc&CkufdYa+9E&SE7T_sN~T?NqKqEei2Jti#R@-f4QU`0lE|Hvi}GJk`C z2Ttoxy>aK>QR^`*?vgX`nwmr$stj8Ge)E{}gNNolux2w=qTxV$qQO%`!#N*s7fjI; zbyt#wi+NYwrVqr0p}lKv0602Ogdh~d>4n!n!?<@A;8FP*o}mkKIPvv?^xo;yZLYJE zH{+TG+Am(T7ds6s)q|QM&oS8jk;z$f{W+X7^wrTU`5yE_++}w*!YOhkl6FvN!ZV&i zEy=%d(W^~xP@)yrgmGc0#d)9Y&84D{p+vxUE?Z`_yI>ws7bo`F%V6+6T46kHsaQjB zW@RxCQt22p%TpUF85&9!{urC;W{Kim<#3(v#wU?_qHk@Da=n8`*7(O+pjU8j$@Fx?0@R~Or8}b+JGxvZyJkyLe+cWRNzPRESIza)+Nyuc1*rkxA6`DTY);c zKW2JckJla6Pe$%Ye(2lYf|UR0Z6%SE1L|e*ke*k8Ijme`V`G$F4{Uu4+6;Lx>y7iL z;BX>|iq}lGCC&B&4lS$7X>QAug+)cdq4NB-uW!MD`aDZ3tE=&x+!e2vF0c0`%kCal zmDOVxexB(B?9Nmlfk78{%WP(Y@s=I?(uCqF2Q$bPG@iq-7pTG?u>AD>1qXsK1`j$$ zyYo=bt25Q_mR7AeI)&pKQK$({pz3K14QIq8SwKB*lRB!2tc|+f-%is7bfOv>#XVDF zw~-IJ;BxiE-5PH~KMXJ_U&zS*Lhwhe%Gu8qiQ|(I31l~Dsbg@ii;yGj$9``aPW52E z()^Uh8x;qwFyGZna{``6xv%bkSxvhvO84P=v(XU@~Ft_gw1?DUh*v!@Q8CsYD z8)Dtc08&0-AF}U(J|FGa-W4sPXzG zM^WRQF75$I+ba>RY%in;_(cBnfrMtYXd$fFCCOleh*M!L`pF@&k zIQp&muM~uL%oca&Xt#EsSV=}ccPAcSnzWs)JjKp}dHKn*lOglLt>u^#>E3B>qFRlYtnaeVPMwwv 
zx+zTD@uI{O-TLygR%MMh{=A9;#{uR22?p-o_?86@ukCnJbtFSjoUmXuPh0MHPiJl9 zG+4dPmsF&r5_w;Y-slcs74WUCjZ=O?7A&(Nkz1>l5KZ~3Y7MZ~iXDuPvyu-v(xlCH z?XOr)HKlSu-9oF+TzaD5e%t?&h05-WDuV=wELk-^j7Nd4d+E@muCcm{lS*UzAkpz5!mx__t#7Rh;Gi%Flda#>vWTjN?t3@YdiWB)O6T= z1Hca=Wc+aS!xS3YVOZI#a$jxhm%}4-&!(*LKu7ZHgQ-9_1W(~&n&~vxNRms>BQ2_Z zL1AuIjP(4EvnbED)*H8f0C`34FiJ>};12OY-!LWK1igb_%s9HCf zI$C?rFbN)LYR3%L(Q#xv3RsDPJ~1;hD=7OhGt+xRfY(&LGoyk^>c^RY0-0(eZ>EL6ze&mu6-igHYq8zFThO}n0jZ=9aBhO zNex=S$bF4!)Ot!xDWxD5xfMH%egl9r`9CJs1G}kaba6-Z6IHm7PiLT0`^E2<073z= z0l{+Yp}W-q{z9?<+Pfy=KK&?VYx`lK;=}EnnwqZJI;T3ID-?+(fvJ1wi%$W~u-R++ zQttAaZ53t$_Pr&-$D=RQJGM6#w;>C#TkmikyS#cQcLHSOpzW(^rmF>-Pt!zV>d9ET zIEd9e1MmsXoc8BjkC(%GxfC?>us_h)|E#S+*UX%2w(X7IHu~To>LD&hN>*63xVlo5 z?9lU7{uB!0_toj0c_2j)Q4)Il6%P%Rq^?S^w4;-A{=pXF^E> z{~Q2=dOV>Kl#X*jS~!Bt{TvC;h4nVU9M8J=+d=NJFXO$@3+JpKeb^wIl{D(_9Il^KZLW9{m`2eobPc(T5O8diqUjRp)1Xz`X1a zx-+E4a)$I&NYAN6GJo2S059Bf3WWigxVoS!YookuQ-PxK;&l!XmJo_Sblw$x`)2ym zC4j|0W=t9E(JtboBq}S%S?tw@aRAfY>$WH0>!ski_R_hA7z9{W7M~IOLul6fLKr#7 zh~WSu7Ql7~2UT{co$30@xO!}IfM<*%+#k{Kg{m(a}@3xiRRBMP8E zzx}LaMi=IJcL6mHcUY3Aey;yQzgi05ae(pA8^At@H?1AS1co@txhC|)AY=g0h|Qmq zsilbq{GE3#GMa1P7~5)_xt;-f@yS{%3cy7#$8<8R2!M_g!|w_|j(A~c*a|grr3aWy z>B+~9fSZ`a{O!_DoVN$t&eR;X+~~d3C2EZRNn_qwx$S% z^UDQ36pq~8elIi_O9kJm4JmcEL-3RX+X&CwmA*ju!z%)mXh=w649-CtWd#Z`PngWxXhZxDmMF%IEIvz{ zLK(_{{Gdmp6AnY|)W=8l&O26i9NY?hP+{nx=eoch;deXI7yvs1e9a$lkJGzC#!Q|q z(a_=Uvxt7mEat)`qB^CYb_lkii6#sSV>!B=TDQg+ze%zQ;MBO2BrROr#FX=JNQ1~J z>b^YpHx#swixoAA+Al}(8Ypdpcg=@$R!HiBv^abkPyRcFniSsr|E z)iH8JO)?=STX88ZXIn|m($Z!XXkdQuvdpM|I&osT%>me~bu4zQQ?a7gB*4!I1m} z;F|>qqif~$a^3wrVSgQ1{j!{<3shn1N>5$eJ5C@Z-8V|sseljyH3bHz^XCV3gL7@7 z8-1+MG9}drdhtMDw+Ao}fN>pIuCW3Hs>=HM)+#4wCr3qKOlX(@Ea`A1GP}OMZ~-Mi z>EVw0VlfFb^$;t|%8KPFZS)oJK2%L}A5YHRB^lMy5%d_K1GGq@wCPL_hCMASw_nOO zaO{@%T+L=&118R|MC?G4wbRegueWiQ7k|@?5dlQ7p}1@Lhg0q=namvFwxzF8AxvC1 z2$lsOcgK)hZV6|TtQDZUE4x7FgwcrI$ht`wZ;KzdA23Fd9t(h)-rUbu`* zYqn)>%HEbBm!c>ga(i@>C+jkK)T@Xm4?)LM6pq_P+)ioETlcYOeA5^3SClWChP$J@ 
zEB(eL+oQqz{zK0a8qa;N@uz5>2VRsJS?zNk9SG9*8IInb&eMnqa(#NYBOhzYTQycU z(S+5G#}F{z1}!uXZdB!B%1XN&#=BdQqXddUIbGSF7pb_%s6v1scbwm3d39jJbMNH6 z%U6$HZjE;3BMF{o{9A!lxY6raVV#V zrq&gK=#hg9f!_pD%(3;`Z7T*3a@r#3=X?(kJw1J~QDq>|U&D0UZ~xz*RWFIKvZZcB z<|wZD=f=n9Y&pDlrl915>{kdYduj7FBW_qNx#=xOJFp0~f7Tz-C*pCfNTH!hk7p5 z<=#*5C)TR8TSDGD?zmk>Wbj&VVtKxodOhYCDPb`vlA4#7_x^&z+f4qd4rX2(4Qx2e z)!;lDit_f~F2g;dV1L?H*T=YZaRM)_h}|x|5EN@@sKuPwRXA_o*7(VHyB3FCIQ-+? z-Er#Q>nbG6zTw=5cNsT&KGO(4{mcw*4l2do!(H6Keg9_lLQsEw41$Z9y(K6{^ zTDO^(ikz&P7@i!ZG_kb8aO|_BrXo9Mu)y#FF&M6Czua_#KXw7vbaD33br1g!A#tdu zs?_D|%rr#t>DK_n)7_x=;q>Au@LKE#HINrd8iI2|N>mCY^{2aH!*f-EA%nGtPt)!{ z7^$qmUHn(#Jg5T9c~arD!TOg8$~3&me_bmaKs@leE6c3#WT;v9Hfq%53H}7pa|GE37PBU*&xc z(LVcx#QW?U9d468B^;;xkxVGEDLuQDn5}#t;JZuTc|UNaPIWoUiJ{%*o~*3-E-$zR zq+znV%HLU2FJRXmGkE9mofaLpGz{1-Q|!~q+V*dQaq~Q>t|v#?3wz5J=lf4Dz%fC` zgT))mPJs?RIrZ7zzx42WgtPlVni#Jnu&s{nvsdS}ERz@me#XQxdtw1tbbj~#ox_5| ztj|EdhqTV}il!22q^vj>z2YN-Aad5GyF7h26h0sbliU9aydaa99%B|u1}7Pme=O$W zywZD4?9BQ7qNGT^!^-kXd54tjIr_iE`aAuD7G6&*_U}`WI=cW$D7?SS5LuGH65Lgp zc}X-3arnX#>i_9e6^vC`x5;$Kb%`Y7P~iv5nra1Wb=*+Ut(dasdU^h-jChW95>DoW z#w~4xtB#^iIDyo<$a(X@Xy8Xr?X#WD*F#yIQzPXhOsqX%hcyq&x9=3ZYHLfftQg6` zr>Azf`FV!hQmE@CCA){*lv~@|$?j)2w-iQCx8c2Ob@yRYS7ee*%DY+uZdh5_dYzu) zuAPI(3hXQwzm4Pxnz$L#2#1F98lq&@*-22gI1|e9H{DL(_NVZCaRff9H82Y3Z}E-;(AP>eq%i`B zmi#HsQ!r)4#j9R$oKz%KUkySTE6{$jG)S=?|gonQeNn1vCT(&ss%GWf8WJBvWl zCc7MU+by1h5TPt}PU?=LYN_5dz!DS4fyIO&^*i>vuQB=k6q&Hn4BB@HkJ8#BB7mA3 zIR*dZ`f53}Vl!3GwyU!YICIn~?a!Z+e&ksDxL#2;kwaF6sJpTQYetxJ@aOux6kwgF zp!@KuhW8K8|IRM_IAviRnui5FqZM9eiMk$M&@JfjlsLkf%0I`7x==|%3+DP+rRy3? 
z_~PR7_=QNL+~J0&QeI?l;i`-mW7M#KG9KxEO9WD?kK#V$$Aw z8RawR`I>n3D$0s+G1M+QduxXl=Zq{f3+NhPPb(#Rc!|kVC+$hF_U7Kax<_x`4|c){ zyGJ{HZXs}7HsW0MQJ+(rM}1(bw!~)FzRaDwQln!U3p_C+we;lYpht7EN=JW1a%h%S z)l%Q-dh@q|Lb1JTngKx8r_#OG{6l>M!Q8*$Md&_oL`e?=R0W#iRTrG^?l-QcKh2P^ z`c<;R-*BIhZA#&d+0}mk=y-lcVh-Jxh0x62g&rDL8zijpU4f^Y*EcOD!r0kxhhP+m z3JQ^SzmJUk0;m)*@J3QS5_;^*3DjaA_yz%#t8^8dQ68X^B>MK!8#GTjdk&0S-6q-- ze@sr|#gF`-@ZYPcZuAf5$Sn* zWW?|kwy)i^B|Xvj;dEJ5;L;+qVtLi~VO5&pMa!$P2THqwz8p^%Yth-UV7a#ZK^caA z9{J5Gl?byD0yZ2OB*^1J+L?C0=sj|{{-rvN>DStWSPDF@y* zF4;PE^?}_HRv1g$YiAo$=Venv#2F2q*>9YJ7Aq5*PH*b zPUOke(`H=y%Zb6&8ueFfN7PI5l$L9&3n*gq)4jzt;W zKZ8c0lTzy4clDU6j;>Vj`quf6UY8aW9A+%A4p1_ZSMyD>MF+~@19Ow17WS|4d}=jb z0|9b6rNd(cw|@XF&3las>&wdn8JHBY*ub0KDMu?J=0LSHdP)bb0$zAPnOg?&*(Cw> zQRx_UH@Y8hA=Xp^fDPSi;z@0qeSE-R0w9s+>qMqZZeGnsPpfj4af{UJ6kEnh2$a5N zf_4c279c6rt}*7!#JO3i1hecP#$TT_#-nq%BRF_pOpJ)L6BSjyQcho7`}V4Bi7m%B zTrvtw7jTsg0Kar@n{1E9JkCq!ZW3!#$B&-D%UC5&M+NCbJ^dZoq|?3vEXQp=qrT~k zDw~<=h>$}T))0&Hfw`MEiOb(#^NwZxBMWkJdcM@8=))QLb8|B zG^^uLpTgE=elMw+?}0*22i<^GoG(Dcpm91W$M~W0^7mxVK9spcnpniFujkD+do8;I zDX2*}1Z4fvb%O7wyWkuy-0nXP8GR#ppDIux(g|@OEq9OH(l(;*B65)zR8WCET0shO_rLY!KUHTJ?Y3Shc)dtdHRgktwRYic zi6yqeX6F8Q@zaUa2*&mYI8>*%O$up$K>7{XsEGgg=uNe}vN`NmlK>kI`j|q=dmoeI*jsZHC4v|KJ! 
z+dBn&vn*EZ>>7Er_9i$Ujg*vCEWy47r=dpH_5OaR+ND5ROFQe0H)WiWb%*PzwRXK; z;gOM9PoCcrFSKbiHHF<@OhAl)IjkYn@&;9C?mw0KT)IXlR+#We zxMuVe(th#vxa;*Rdve7GKuncYG}+xV#$#n|z0noJ!m8X-IVjm1BO6puY@nQ}QLw@h zVlg=1bd&hz+Ac-=gE>j-%>rEwN0(0M{j!pTKp9!RS3_+h6BC_ntE1u0ey@_(NtQZe zt{UDzED}g{4YR2Ze8#XLp6~UHMScmX6J@(dzkY+7cM(Nf3jAA1I&qhq8syJ@f?UEw z4i62Ih-$ma-G<0R(+%awV=R3ycXmb|_rg(E4d{-S7_r8B8&$B)J;?8~{$={G0eur~ zf?^N*VptS4uX40BehLn2plPm?aN}vTh%wOJ{o3i9b7^#o^@^N0iP5l+b>iF1v3pwp z`(n?>mw%aA>WZ1T9r`#e%CEFq;oVlGHDc5u{wsdh`K@PfqAKNX)r*=|o$!+}9WE`ByhUfA<)_H30JXARU$xe^RCg zej6zS8QEI1diYI`e}D_UqvYYpFl&Z)Cu|8wOlIUkkW7C#b+ z?@O(U2ZiqGIkEQplIM=aaDgsr<7PDf{LwhAW$FY?50X-sVoHDu#B1)pc6P4ZVj3mY z(Mxbkqg3lThPf2(jSpOlHc{8-7b~30d9-#pmDz&Sddl23UY0yfiXN?M_btkCHv%qi zkC)~)HPvgdadGU-*QG8sFHxXC%G^8CResHx(s%}x<#`RMrO5O@^ziJgx>2ppz}#Z@ z9fuHF<@x$y=lR3#c_oW$rkO;#yKC4Dkfea<}|dyxXbrr+^u`LP{Byl^WYv7yPA(U$DJ)4FXEFt@7aRg z-2?qTxIb_7)q(S^yR?k?Wo0CI95U9bAFrkhiAs7B9dBNaLk|X6#5|f-t@vE_KJ$`4 zD~X6&+R&lUSwrF2(o{!g>>bFb1KEfE+~9jVJ&QuMOIAm0nrUJN{V*mOy*8epNHquC zK<0}I(4l=!Szones}^}lkIGEPT4%mFJoMp1J>!LAQ@S1EhLq}>$M(Ibm+v5|lkP7i z=_DU_SI@B+?^%61haV2|!Rw{~r%UAf{L9maQo{tCQX@h!noPfI0hn%zUFp|qbw4_! 
zW=G5Bxk|S@zpE|7?$+LL(%WWz{npE7WV9v&0khQ)JE_FN-wzL%?t;viPwwe*4)vek zybT2=m-ehZL~cvCV;}y3hW^71{@<^-Y2*?|b)0g-J+80~P-fCN_baEW>d)6lO2@fO z(3M>i-x|E0xjO`>Nx}+W(vPSTCC;{pL{1?FBHq8hJtEjLIeCXf!XDx*jUTE`4lL3( z3cS`T6UZ4_hdFb72)Fj;2OHex%=|)OT_4Rd_hnUk8Xl`P*|_F?ZFo-k9&cmq=6WjU z?XWUSxKkvMh;|$cEds)9nddVE*9ee%*-;N4NqD}z0EM!Q+r45uQ2~dPeYHL>^&C=S zm!%3+RG5VCD=S1sUDgO@sc&fD0ydtCK$uj4aXNu{>`xA)j5kW@g?2U8)Ya4u+hCZR zQQDo?|8XPQEBx$w^-8_{q3dVyy#-X1?bII^qI3!Z zG9uEYbV!3VD2ND1Hw=t)gLDXtbPGu5&>`K@-OW$~%*=Q5?DyIKyPxmdziYA9EEd9= zxvx0RBhJ%mI}eQ7d~iu(QIFvdAT(dW=MNqAO-M?q_c_`%u6giB$~wa%EhScA#X6(?ZHNfA-%?ME1Oz(M z&AUHA&{XKTH`F& zOK{J;#;dx|g?=L7**jI%G%=!`uEZlvCQA%ufq$1uuQSw-_q@O$ zKS{rW+M`kx_PzP0AfHxm)hNeE7R7{LACE~W@d(_`iuLMZu<6{sX2r@D#BuD+)CpS$ zvBOqK)927wNIzsg(IXiyz+z7;Bovyr2PynLe1wBOx>#HWh8CLua%^}uzGh+NFiKZ= z^7%6HugN0Mfu7PcSm9LG&M%F;3Dh@)Qq(%bG~j%Lk3gCKebymwH7|(PxMLkeeCNUBc89Z-dnyb%1lJ9AEJ&oHmN?PhYj9_GRswS^> zkjDp6Pl@{g38Wk1oPVnrm9c&ucMYp*X0%gXfC8Y zqy^ZvfVH1`j31qq#haTW{}`$n=YKMel@qc`mZGoG9C*hVR0lA{ASudbM-hn_bz@tf?u!y(H6;>e<_H6+7^rKywUn zry7@HzE^+>ML!(+-}ZvI;^lEjXCWo-*qbp$c@I?$zPu?tnKJ>MHxq=bP~K)u(7RrS zBGn&_e9e=!kG_rzWR6pJaBV~i^EoVT3;_68U7m#2eeEG3wMYvf(Wz-2z|a_hLH8v^ z?E#Kxz1XNJB_UyhGUc6C;mA*q^|gKIC0-QxqJdh->6=2#nzLdK2@F09OG_lrVK2H` z&MsvxD0ANBi#(-W4&^A7I}^u8f>b#l)t#qU{i!*`m646Y{C&K4`gZu zg@-%-Sx+>q>~!xWV|V1e8Reaw@a$v_bQ~oWFWLRH4l+ewsNhG+wQad#Jzuit>HUfxgB&-0 zO!_Wsh4I7tvWgD6AJz?;z0lm6@l42Akw)`im75+3 z#^ffX?TuqUMMgP2*(~K_XeyR}w)!U%uz%o&yARV7@%7)8%Hg(f6@E zJB3>d?mtW>R5ykBuklAK$YC+@E}Hxs)ezqFkbnM?qDUsq9FnKW#Vztmg5_BrzQW6T z|5f}W#L$1xH-Cfi#Y^)+P-34Z`}&c*KU*YQM*w~o3LbG-$Y>RyR8%~%TeV|$6^%Ps zI;s24wU{-OA2q#}7bbE7$C+r9ER8s1#&058ACEoR*^gQYu7R!6a3;a%4V@U@Dk=Rk zgChaTLroqyz?-9deBO{TL^ARNIL1rvt52wPM5VR_U5)hL*XZsl#IJ*X8Au= zj{L~&K@K(Qv9WncSx@0R_S7TuW56p}Wx1F##opKTIQM*j@*!Q+6PPK1)kcK#eBk-* zwmjE-)V~K2lXtsZ-oFjx_YZj=Y`SeEn;(hwQS0s}3Oa7sAXHlA#H}r%P2kNtpvuSfh3Mfi~B7iAQ_rm(Qc4TLXQ&^ zh0o7HWwDDo=qfO0mk!ZyE;i714`1a7_CQ3CyO$q!_Iv^DLv)l3MRBT#pPfFL24;Hj zxm7IwgOuS8G_beaIK+Nfha{l{Kcx@|Iq@Qn_GisEDhO$4)% 
zAfM*O2-5UHgr>43BnBtQvrnW9E=$g)~K;@;w&pLt#8uD56YNg8{=U9wH`o`YR^ z^$#XsW*TqGot&*Q8|l}hNHwXWWN)_oz*?i6NV>{|_hFhaXDKr1wuH{zRl{$8gL2ev zZ=!?`utx-N(cPSh9O2>Nx0PLFHe~ry0+>zm|HkjACa;W>oTOti=uKc!LQte#$j$;+t79pF$;C(Ad z?^N?SFY2lkof5sg?ASLo^E)}~BcycRTT7PL3VOfU@*qgp>tRM#iJ!hrbI4@%yVvqD z{+@i+nXpnQQ0SFk9CtsVqr;BNmB6H0tco_#Hm8~5*p?`wWYvE2%K0R=3dX+L-Tmik z*zZAZ7|YI=QY(Lh-5vK5iz=ZD+-mLLqy z0Tt0R=B)0q*HHp1_3<3rdwUWcY2)Y^hXBvRrbHF*6VfluhqSp*St~wa#{v5RXjtU+ zwRFLm^|tXSk@(FE$Yl_uB?^5RRbjKQH!~|tym``~K8e!XZ?3=4LuXB*<-EWt=rp5i z4&|}$HU#1iSkgvNdLPEF7q9(+E9q%h7o4gp_=W9)0VNe)sSwIwO{aXt5@_qYxVkJ` z4Hq`PvA%r|8LB<>?iOEN<-JXAGs^ajac;YM$b8tKdw-wr=(AM{11ZTd zY6G$iymbfQ!%zdWAZb=a-2Pjwi~1f5y32slw)(AqvrE+*W$tTsmh-cEgOyA|6*kVH z<5{e0K9x!U9d#@m!>9-q*Wx7$e6*Y!Wt29)I?=Yl(V33R)xNUI0!5xJ#7rZ=S2)y4m$iPKXKXqo; zNuD^O`Z<49sMp^z7bAG(-kSeH39WXc`Rw^VKx#jReljG%F0rQJtY2eHp`^scSEv5a zkiN6@X0%#q{n-nxloeowiII|$7ToyDR*Uv$NMhHWob=hNIr$>ZS0K@0wO2bux=y{F zdy8l+`5Xy=mM=%+MgkTkZIu}2y+2i9tv+RA!C{O)n?3H~KjLDJ%rZy_W2*P8uiI?C zeoXb99$~f?T%WoSH*Yv^ccKhKYwhLxSxKgzP38rn!^=SapgpZlxas*YJ#YQIdsM70myJfK5& zsMwx)mD)F!X5o;`et03tLhS&)d*=gj+X{METGR< zaL+wK)O#gIEs+dl3=U_kC_E0mlPPfTq4eq6c@aBiH|eD;~3oByoL{;iVT@T_GmJ9%&! 
zB;t3Y;NK4Edj|0`Q`h2M^@(39=^#1ucNuw4EC(^F-Yi$7LEo~(qf46y~nsZI%qFccWLjs{(A zvj3=w_>?Bw=uHSm*}DatU#!0bOyapP2z{)Cq`^`Jz}n~^%(g|Q2~U?>Q3$&pyx)vW z16sByyBy5*-hLOMbCvG3P#!B=E#3wwXbKssNCtN&lT>Scx=-&v+ubJ^oI~T;`&8{t zPVTF}5NS&;!yW8y~ z1Q||EgbzL!Esp|jPJ?c?$nEWej6b@mt+`7OjaZGFW1!sF^IX9NlL$$s4Ff^tL)5|H z2{z;iyI8*=AuwdFQ)S8m^ZP4%fKMO^O&v6BfLRWW}gPNMDl%k^tO$*0Pj~^87I%=AE#Cd3S$*#$> zZ>3}iy)UVFbz|(vh5s|MP=*TNN5|T#;NqG7;iKEf7RMO}(*p2|y60_$>(325?D1(I zQ$L4aR&)!xRRSkc^NOI8Q%sOcK$^*S{7AE~@4Aiy2EXHpSAsuz$OVUk@7>-Y+Z^T2 z7jnbBxHS9RqS`et@)$UW9oX>0xOYn}I^~Ito;?H6q<@@>T?I^55fB>$oBiJBG}0Vz z4(5~>T7?HP`1Bzwzi>TEKhT6@C(=mv+&_$$8=L052ThQ*%xx9<9Q4v0iD-~#DDL1`sD_)=KQ6HU`9Aq$-ZqJMX``e%IPZeVN`o3eW z&fA>^!aw^pi^{UxoRMtW?BKH@RO-O@LsJK*2N)^(DYv}o@WKc;lkrKc9IIsA8ZPVugMu(v9JsBfPT%UQGS;%0tpAR88aR|lmBcs{ z^*k3FTw0=UUYSOBcn1asj=4N~>E&@u=dJKz&7eO_P_KDs+OA~Wb`Q~|Y zd>HHR7er_ec6vpzoEhB5)sR|!UQK$L$Aijb`?Ov-aRbOz3@~IT$YDZD`(C^TO4A|~ z(7}(J4k759+JBz!%*Fuu^;Ri>hFt1LFRo)sn=&rr4@NoKegimh99`g@=j0N@+IXr; zATyr{F->*tk|Hj0=H!-FR1o%-%L;#J^RI&T{tI3eQLlP%RPa!a?>9H;reB}m_~=hn z`S`wz^Xpa+DKo7d^o|)xVbCzXy2fK$~KnA6_F78|MW?H_)76?(4SXjfa) zQ9aMV5JFm;-xN*2u{lu*8&)OeYO0McN(`(E4{eJ^kpOD0_rVN00#I+E8qN8{vXO#H z6XhEm_bR!ANSJNyxOAf9gd!NZ z$smr5S5jfz#8Mh_kkC>p(U92vJwdyR``f>Ly03Im;LX@a_svL7xu@kyJ4F%kOp)s`gkkkykuv1jY{}#vkXk`dK#J;dt{Z$zbWUeHB)(B z%A8(8WV@-aeZb|{&kM}wHx)jN&KZ}Sl)^n zq#W8mbDN%Xeq;lRiG%reV0FjlJi86M%S1)pCGLJWFDmNfG=8A0zK8~jx5Z|!2ZRJ0 zw?+BH{Cp#FAJFLjxLclEO(setT)b*1;I;&J9deci?kk#$!yR&0-iRhFQ=m$sIcD^PSFq?J|d*dB07rA(cNoB$>q zZjqM`Msxoti1DMNBZflXebsXK@N5C_CG2Ojb~d_W-a|}L&|H5)uBhtZb5~s{s=lzN$=-LI|w!hgT@~|vy)X!B#*2b z=Y%``gk^X3ra~RcYC15Z0y=1w{iX3K>#Pl_XmLJ7_)yD#qm9Yv{{kpl0_qn-!)Enj>ni?Z|7ga07s-5eSoZ|y$j#Z~G5Flkae{hY_z}>*VB`QeT z<)C181~(YH2w-cU=H-skD6}b_%q%y_y!UW+5&y^b|C=eILib++2%lfLDCVhf-HSbO zI>d*c<;tm(!O<2kdC{qdzR^`3rp*l^USo#t{d4PdT=mXwg;QlbPq5@8SsU`F1akNspN)6qQn7C>1qSxsK z(Z)WVEO*Tu7(a~{UqZPXB@nu$+mH|j(vw>=7^H(HtpQykOy0kL|LJ;{P`L!z`DW^d z>z#oY?{mk^uMz-V@3CVMct6Mud}tIHBzuR!;bGCcpP7Gj`zbEQPYey-@6Q~7Ye<#Y 
z|8m%zpP!tLsf>sZtHdbZ$f~^s8cK9J@h9tDJ4P>zEl$Z=6iQ8^?a9W*#+1q{+&s>$ zw%Ts^@?u<$dXXSbrh?xZ@uUKIv#69SjdbU*LaS|yNLWF^Ga&EQ;C?V|Ui8tYD^TX0 zL6xtG>^1I9sz$p&{PEPE;SM6NEoiw(XDB$_6FTf#oEUOk4qy2Pb!q$!ti`_WKe~oR ze0O(M?z81vcdqr?y$X`_<$E!lm%0*PD+5i>e*U%-yTJgQ0}^@3R~2yj;H$#G<^=Pb z=@xByS2B5XElD*Nk>PY)ancwXj?B>D;nBwxH>02484D6gKlz@2w06<$wh=Hghl&Zp zlzr;%HuQq=MQNw~zL{g))k(q+UpV4393ephQNINjVtk#|h96Lp#50?ZbgCMYe@5p2 zM5f^wrV-4Q)d%cpJ!7R75b2CWn*I;Ie!b?u@k#c4eu-2i=Vg4Pp!Ddq7wz14bX)ZO zo!sOVQOdQ#73spoH<=7?RB~M?6L(oATJjcorlVN#bU%6gU8x&`nqz}&RplTM^>Nm? zr>w{FzXxF1pr!w6AAmj-k3uW08k0+(>?;h4!jyaN{Xy{@x2u>r$7eUo5hzBdGPtuU zk8uezy>0k;RzE>v&JAOi+I2Y*pa4IVH`C&jSlY4kHG zXA(_JaFNh9j|Mnr>&mL?W;}5_-}%=3*edkV5;2f$H+OSmqaf+z24>!n?srD=RKs%W zl6MzR-U)^7`$WYsoAKG!K~c@ku``gTC<<9_Aox?tg>{?`&>Vm^&;#VS9#c|EsAdY7 zC=KZe%~(;eYa;BaAGJAE<_UM;yj70BUDws(jcLmOLY@$a+%_V0VH4Cvv$6w#HfUaL z_C@05OlBi(_ig5fF(^%tL}yCJ zEs?7k#q3R(EY;%PsmHzkJ-8I0&ubf1)>=mMs<`#^2ibpNhEl)*`2p1qa4ZuY`r4@m z&CtKS09EP3C<&dGUn`@J9zFkjmtIX^&Uhokkm;}G`V%2R{VXEDj!3r6H58;0eQ)RP zvdqCq80*9z0yUY`nnPMES1lb4)j{PEkwB)D*6T7upXspG1*|6Z!B&Go?omMuAG zmD6~C^@4EAOLbAYbl5*c=)WsK^u`*0)vJ?&A#fnj=CH!^!sb>Z5<4NKbDzIFI{`JG zLqiS=yA{$0a*gs%_0B3%k8JhWCw|yO^vS>b^|^$v;v0yOQ?D}EOfk`6`d1F8VSQYn zM7!W-k>)-T{P%VgwCa};rm;rQm>z>})sq=O6+QS<8*Nga^c!a?lWild1{Q|E-@ukO zU(4I%K{9`oCh5_Zo0u`?(sr}k@q{{~rJ74p+`2d=cqTGAlBa~($O&`qi0gXb6R9aIpwYReocRsZ8tl$28m{g8T zb2Mn)d-(`tBqb?{K?YU_GV6u_9H(lRWTIcn4CeS*>)RQ|m7&U(T8U00oe>iH{N;1M z7GTiEX!)Lp)>K=|r$YDH_Y~-!JP^b%$HK_mcq=Xm0l4~58EBE*Hdds1qWsdLV#4yJx zPD_AW>w>!k2zg#``90OF*!BBNIj|?;_27XALjCyHEo3CKxb28A@XNq6*_V( z6WkLEmlZ*!5=l(Bo4+i`d&iWZPsOF?^QPMRqjEbt1s^$)yUnJ&OQB?J)F!-8E_WVc z3%NQ%=!94|1mcE?15hKS*1CNz?$B<*M>FU{C;;u0;(Vg<8laSB?`x@LqDdc-;WwH& z?}qOmrk0icY0-Zdu=zo2iz;a;x~V%kb#!_P-6fs}#u1yEBR)@gckfJIE<970G3wQ| z=T+0{%O;*ie=9M=n}ArSptx~_)tbO4oYKkRYv@wr55@^BX@P6{%)49x z%!O~`^Nx6hle5=O4U5p0`#EtMj?IPDC!f6kf7tgl0j}mP#>ryc_7q+#pz|*|K&Umo zzeOJ95|=*8AN2@W^hLfZkf)FrzAI)2>7MDmJ!2V*VlnHN@Yc{E{my4ySfpO!W^+mG 
zV$*Z(raF-Du?M%rK?bZfirA{u*>LypMNapHtOD|@Zi5TWB4k+_sMzlWX9Y*NHga$d z3p^9A0y%<}{3uT{&(2xp0|qYU`(C48+dvp1YY_?hx7M0oZ|&eZ@AmtT`i)xvZ(@)~ z)E$65#-;;*<+YkDvs-Q>-7EDs|Axa@0=xit|@T4(G` zl>bL9isw3V8j2c5obn_9CDV#xy>JA^`2tx@kt}54@17nU$O$W?{}7F$10wc57Sa>uRalP=qfk+IR4_>C04pUmmrj6QjMo&IFA&}@KO zK+j|o_0Ud2*2?FA+_A2NNrlK6;?bq0DT$dh%to`bpd^&5`#ZYa$< zt2d(Hn}PlqcRmA8O`)k%WxobUDf@~eC-g}DkU8o=cyK@z{z>cBv;#K{VS6>c_TeO5 z!Hj*K<8=>;D^zxr-k|M5`f9_lK2Vyhlw`67l>as;*&FO5^4|da;$=kuowrPEfj5mQ{y+2;@I{P9ua!4+|)CbXBEd^Z$7+*>gHiDNrqDdKF_ss2T-hepr z3h2_+uFbW)*pdjUdJ9O5uV;*p4lbG#K{q=fIB>aS>LC#_ES2j(JGL$r@0+|!82Vxt zux;E&vfvzWx*TahFQ2_D(R05mUZCmr@tSGQdCCxc7;p$!?>nc^SG(XXKe`oH?@dK& z!5Pn(30~m(L@F0~ECq1VJ3PH6Ed@L}D;eEEC#_m)TQqFPQM1^$PJDEw*H?&# z+gAwq<5wjjV9?yTShYBJ|M0Q8SV~jdvZ)#E46n%U6;QP%fbK+(>f|q6XvH%>+w}D( zPTjr7SW2I5+5LZX4kKu8%|j@TRwwJ-G~ zb34RxgmjG(y?4}}FwU!-LvN&Z$_vMwS)X#`A4|#P>0y(4NOv`it=sg8mozq3;GxjW ztZwDU%P}M5&^DCSx4Pz-0j<=2Zk3*MnJp24_w$^dpWh8t ziR%WMUIn41#aBfSz4>eAJ*dJGIX`{*5)LFk;c2jI;dJoMv)s|nr^$cnSDmW#6SSP4 zdg~~$o?Yc=NuTz2K_z&V?9OB;eOs1V6hX%U#z5kB`@$PaKs5=7wD<7{9T;5!z^7Sr9=q1B`1#hY(~eo6K1KXCbp3T0!dl30p+!jTmOKPO z!Eb8BFTHFUkS;b2>+gVkqXM8qDMBE0DhH5IK$Ebc^*ZGSD1SH~I~n6-$zn^x;xc^dmEG z+5WSs{+tTYTuA;cnW*ndSaGmZ_hnx6E*=440_W$YuBwu4Ddv1Uv3YWdB20WRS(j=A z(uf*UlEp<-u&bx%w0|mPhx#r<#pa>jb-TX6kAdRPS+4@f_Oee+Wx| zPtC$6C{sy8^i`}mw#@;NEOu#(DN1>gA6{9FMhK=X8UigM-aCemFBI|cv zkha1cY#n3rym);jj7x|EN!R&scd{f;XD`1(+_au8H?JY+{5(?oRQ%xw)XhCk*bY1G z;)Q@|mT4;s85u(;Bj8z6(w+j3B14re#b0NXw*&z#6>Be=2Y|HGve#vfAvpluYOfC( z>VPk+8{3otJjcy})kPFwF>su+aoKYVAJu%;^ph7UiYdO27GY-*FP#WrGk=N7oqyx3 z$E{`d@uR=zVM7#PV6d&%szd4`6B0gopP16En$jWJ5)B;WJ*sJ^=jJ*9hEX11lm1j% zkeZsBx_cdx-j?awhga+nmnlOnuTNhz?UZZRR;0n_72--N7b&D(yApN(xTH5R>Es&o zVz}5af>Vhb@|B-Bi&48{*Ev@T)VJs*>Z#G`QE|?51 zui*^h!i6byU>>yajV^?v!($FEQDsE7u@8&5Yen^Qgk+Ec#=@$RO zD04!x1A|&pmD#nW#MJfS%4daNht~SZ9R^jGjzb z7#QG`QQ~a$Q$Huh4vo~`+T9hlI%kRuz~892DJ=S&Jf_Y%B-lNw2r@g4VR`lH(T?R{ zN7PDEm8dA$tdQk*64e#9+y-+3n$Ad#9dwVhf#-bH%}(hJ9>9;|6;2-WAXpc^F^1^W zxIBkkvEoJW4%=j0Vt9UXa~H*0xr 
zv$G2d25c`%Msl@>c0+JF2N}_eLnuzbKLJTl1Q~j$dy{no7fGZr%z+hlOa*HN`glO9H||?M5Ow2VBAV|Q_T%TS-2Ws~9e9y} zZTs=#t`hYum8d~F)#lGTBUheH6Dd&3 z-l{Q65L6Vp_1b&M>yrF8Jy;Mi%b(Jb)z$x~V5|b5ymHnFeF5^X{bMs{<=$8p1^c6nE*jo0P z)U#J)K%*Vu+Us3cIAX{v9bj@U$u1?V^f{ncYk9T6bQ8qrA(EErCoH`yH;~k3PTiP5 zYKzn5I9bIz1$GVOg-PjZc2ZMZH(;lf5^G@c`0zBBySv5&PL(NTXv8#3Aom6-9lwAB3|+(r&gUqvi=fw zN_}xU><8wJGb99qRi(l2J#gNZE`tb8v{(#f<0DlDHiY~Ljj&2ntcywRdR7=}Yqsem zl;IP7m6XH!H`l1$yfiF#vRFPLsV=X2#*1G3evTg6F4aZ-cfZMMqJmj7vcwi1D}H7# zhB}6>@jwx;fD{AT?k|_)>Nb6H2xrydOJtdM>>XT4-SgMHMXB#NCa7=mGk(B*v1>Z9 zJ|JKGMj=PNc7uA#$dRBkGIFZl_Os>ERP&J)fDXmw>kRHb)JyqZ`(=C7BiXzcB|!$4 zq`M-X0yG-Z)m1iP!ESPGF7t)PI`<~uFt^PHzLQNzby9f<_i)SJ&d$p3b?sK&FKCIm z7VHJ|n*?KbmaHsOM2>_crq!eqX{VZD{FDhqGNUC5Swy#fwOeX#@}+pDk!uAYc61s) z%CjNaQ52orQ^pWc$Vncg|7cjQb_?U-ibOxaD*+uCfZS&Vkv;WsHPvF*6bv;5^lpu? zo~8RVbRY_NOYHAs-?y-^`EVT5S!_@hdVyRbI;jRCfX)kZl(UowSMT zy4;OAgKe~8A8gv9&(Y%il{Te( zx2aF&0KrlvGL{=_(U1Q#YK%?Z>Z#J1*e+;~BN4~q-a{)ahuJ38RyXr{N7E+wB$QYi z!+yytf0Y?4RGRz}%<${xc8 ziehIL8$bRL?Mvj-Onld2Y{W0ItUV;?8-bYqb^J4W2Fymeq-@t@BjwlRhyHT+lv+Z3 zNu_w>9RC(LVUb`T9NBtuJY3um6^H2GR6vW<|8=DQua`Ictgx6k1bVswF3=EpHFym* zPMa+`m~5W;W#+zE#$A0JX#Y4NiRE=$0@h0(|Bp$mvGE@trzd}Ne;WJX1rJt?$Ph;< z_jDjJ&f^$dT!9Zt;nH`zI{2!~KQ{LKNqDXsR$@5srGiWsFC27e(CD=zOqWF>gG9*m%wl51R6N{B>l6TUp1Gu@pSPft`tT9620MZfwdW;OonsY$!N7 z4b`$LG0!-A<)JXze9}!svoe!t+~;+%5QNFZ!~}TyjF(a?fPQk#_e_z`c6A}YJ#xuj zb_%{+xrz_H5DM&uI?0-qQv(36P_FX#{cQv`ZhoeA664?sSh&x zl~nZ8qcyy3_?0c_qEeEvB0`lrz z#!ItTW4pUMRcemBI#ZeL4maydv`55i_unACw*s9&7U4v;;sOiXE)8)951teol>1IK z9h}|t>>7!~&mnx?LoW7!jiz=bl%sa#gd_cY5z=sJzI)M!(ryMLPS)w@7EM1MHaCF} zJ&EjxL@xkjcTr*1_o!MNCD7m^+IrCBDNtrhcTGh79t=+H4UvD^K>AiWp+iSKfn|io zgt>l|@Ze#4u$$vtb$ViZb9MEd1$c$7>08gqh02?;+-JCJz-NLk0JoRlkqlTLfjoCT za<%JAE}U~0@WLLtcB~2R9G7pAl(WBl$VN|44(&|3Kr@sMxyNXcgJ8B`D@cky6vqDY z=*bn~tAN;D;?d)pb&3}GqM*8s(VUy%f?he>Q{TxHJP49zaH zu?3Ry;*IE&r~;x$a@BDaHlIwtlj^ql2%?JT^N;{lgQ%$ z;hZ-w+E0Fw;ivw7?l*#hCxDD^KU(wlCq`&&??PjmgL=|@g4C`>_$tpltPGdO*vEssargd| 
z$97lSbjmg5J#8=>@|+pkPXs|wmkG^M^1Q9{dd8s-B(DGZz0Eh~8&PyZ9hg9{`*z;6 zHq3*Rihp4s;d#_A^F^cIA@U-0T15teq028~xyKZ;aL-&)(?7WnDOUUmXJ@P4ZxEbd zqW6Sr@}|{SPC?%dv{y?1^`=f}y8B-!V(7^f#^}NDFo_G@h^{Q6lW z-iL()i*sY^SE2Y{UP+K^u zUcYfDpbc^Q{0Cir<#v_rb$Jo?%TG+3_Dj>1N3Uire!@&En&#YZLSpm}#ab3H=cVrJ zG&m%2KpQSCY$1-156Du%ZFjjuTp4N>nkZcVpgNcr>pkf_iJWvR%XkhKA&BljLvJdi zcZV3HSQE?&6aA8Nf)dDx=N!Z-6zO&whg1wT9}HzRPZ_>WxF^_cL=*N-Nk;vBattb z6%s~U{vNC1DwRLZ!t7|=XZSg!nMUI6*()B2gQd^uMjEE-p6ZmNlG=qH>u21rV6q1P z+GJ2eIzM6@L^5znMRC&^!;ksR-m^m=WYKbypD0hO5AO8Ei~ao!Ap$T?MOv^O58cghQ0+}wpathg=ykM^U!rl7iQYALKCRX~>N@Q4XcH@ClB^X6nyhZojYTS9JMOza07uoGS#>L4lM=3Mx9|KczB!N{7~P86pMXoCL$o9n$ty=R93O*mwdi0C^d2ZzN$g?KqF5l6##~P|`;WGR}v9ZcE;ct~? zo<2>Lj2+Af;aXiC^pPAy`LSFR3SOVPPB*~%NIhBI9b9`OSy}&_OC?$hcTbx>!56c}8)0;4rQZiwt%|;RY5^JjGA&1^%|ZyTBWh%PH3a!? z>JTDl>zTTYJ^q4{U#WcNUtoq`x>lR@F{xbMrMR}v$w-{H1$ zOurh2YSFA4S7h6%DewJvxc3F~`E`U){#$O>mt#Wwtl2N~XdxnN){OsiDP%%J%MEbt zqB=?2KJbK^w%IIy2;zQke|=z86jyM5eRz(DZ5MZ(Ogkoi!iG2C|TM?-d4=0@QQ6u?AmLT z-`x4U=$HD#JHjq1?={ZVA}b(^ga^%tin(i$@yAYibB9)~($!SOpJvM|S{9Y=Os<6G zb9m{4xz{@r;)bp);irgApqGTD zfRn^hhgQ1?+E-e3jRNakDfP;4{KS)aY@+N>Qf^;11&kZK_)D8Vb&5dH>vWEEDfC98 zq6<1Ssp%%@Ll&~fv?mB5{wQUizTI-sbsB zF^XL4*5YRxN$p3zAJQ-8HVNKdZ)Pqg>-~-^t|JZAIFs34hAYYJ4(M!JCz?4w2kS9i zO%M${>h3@J>f$%%jX2r#JB(iF#b6|uT?BC8?eg&3O)S54dag0iA$U-GOmxf3vb3Le zJ|YS~d?{XXH-EmOHO0AUrp>w8R?(#+@`!%z&ap?w7WKn)*%zhh9`7H!k-8~Y3c1q| zCnYxc=)T>K7$>awxJDlu$C`UrVl{6`4|tOoezq+=a_^DyS;gY^{uIX5SiZ{-&uO+! 
zfN8EL@!z3KaCBv?{^3G*>v8a^g)7e6gw-#eTQ=3=4n#GJm$+QDJ5{NWPAlEUy^b|O zEEUheU{jlpI}Ab_#QpY)7zP2tBV)~iNj`!PnRI?OJYg7n*3##&#Er#xPkI6R(zY6S z&IkJMzx3B$rgD3)Xjx%wIW0Ag$q1fhkC^Xa;}?s5NTTyXSklgNx2m$*O%`Nv8orzZ z+eDQi@q>g#E>Z(-){0b^6S{_97=RZT6T$<)S0uXiW$k7|hW#_7fp>JOYuf<`^ZU=| z&7Nuq_GAg|O1E)&$}jv#(^lOE2MoooPz3t+Zu5S@nQa=4sY#OxH9@Oj5fxP69q1Gq z9we=}_2IIR=(fe3N z3e%Qh)z-S*ZnssUpDWHF{muaM2(_f>%u@v%Wh9v|yXjW0gGtc170*4Sdk@A6;@Lf; zukI--HiU$gBKr5xg4tVNSR-hy1yyh{fW<-d!#0l7h^+RcF06K^t+R30I?rxV@)|L& z?Q%4QE{i>!Yaoi6a8wL`tmq=zoD8X7CaS&u&VMv#wb0|p1>wyKOOP!4B);=Zr>|_h zdCUDtt!~A&QnaCt6K83Ah55E@aQZCt?k~o2BmTu2*iGH?>R`+WVJUbS&j``BJ|1;P zPy#mQgFp0m)eq+KbRe$pfk@Fg(Q}-}r4{MMrA+vVMu^Fna+p(Si@SOFiH~c#^5Gis z>O*S?esrg)I_7^>djE&6uMCT_>)Hm9kVaZ! z2$3$O!$A>HDQS=t1SABcdnjp1=@Jl-7LbksDQN)#$w9gshGFU((EEAs=ly(t=;3iN zUR<;GTIV{~+UH*TH!Zy^`NpBrDt%jC3+bk}dS}3cUz#*YdOvf&-bzX@1|Xlgo_C~} z%w`18gLVfnD+dH3-uK?1+l(UTT*TX(T&O>@`CZ|mSYUg@ZGP}`_y{PF%zwk9VRg;$ zfU!)^lIXf_Io(o$p1^XEsSBH6%S-Mknoy~OF_I4|4}7cI!^xfFvZ?jU*hES{1yiM3 z3xD+Xer~1R@e#Y8AUUO*)H5xQ$^6?iKTPBDm5bm+q1Q$v?QVd*)*fOe(q(=f^4MKv z2Q|MF^!4Lx&dw}@I+`T!%w1qHpw6I;&`Cj;>4AlwP1;rh-cSNbAwrLSHc?(OG|^u9 zPnX^!=i?S=G`T$K5>M$*c(~6DaYsj4By=CVdu1uV&E^99L!D_^fa)M=Gnx}??U6G3 zH@y+@s?_SRuiM9`39e2O>HmYYa0#nG-+=BnbL?uLNWa@8X%zJ-Zx^@zIBpRIi_n)L z2UJS=1+(B#$Ys&b1i zM=7C2dZ=&67Yh?smlnbA-gj^Ig%3-^l04cHb5NT(OKXmEDku6W#@Cml-7FCB9I3Yq z=nZq!5_0`8vNWaUmB{yyn*j!cvP%|(UCNEeO%A1&Fw;b8GQ~*Oa7l^pPdWmh(JI_N z4I(?OymTeBt8b@%HJ8(_jxy|Y?bXFY&Zf7q>#xMO!SMGS;!Ng1KqW-3PZ6&;%DozL2c@=5Pbf6$0sBh{Bx=d61 zoPa3FfaFP-n{Pi7G_iU6Q6)U5i$_A-L-UFHcjut>#d+I#4GGjACayHA-aC3`T#U6x z7vJdpw>}2OFiLA4uU5-l>WOSptj#9+OG4%dI!moS5}wMCc5m^y(>=U2UTl=(Ld{%h z*5UhB2T0xTC4shrJ@y??YmTT;2&^5yad}wUJAujy^KIXYgUv~R#}||mF7vMj`u&NA zs(-SQbmSZtlq=iVwuu zN>Ej>5^+>a#xIFA^7$K8GlDp}?`9}8Wt^YC&sH-Ju0dXw{9sdGoH>%m;^XtAV07i( zM1*<#+NAWf5bzkHaOo3!Sx zI?@TPdTE^h-9>=ycvuZf$-gMKm7Fw$xF@BDjNfrl=RYcORtH@|3+9ou3SpSmWJ+m8 zL!`qrhrZeTgm8bMCgJ4#_;Wge?hDrsPqo{R6_lQ?<5>yg;)d$u z3qDIy{*=&v31%9HL`0d54(}o7^+gATDU~Zt>aUka(0jsXd&=xHDV8a_hGK8VP0<2| 
zfQ4^Kz;k!Sl#1)lA86G0SgtF>UDV;nLiLC-?acZw@ z3~IYjtDjkw7Z_0{km*K2K7=PLy%TQpNBB6|5MJXH@ig1)gLHd zDGaW1o6&4z-7gIy4JW_0qH$M?spt!rF0KLZ_Yq-nmPK7L{*tjCx&7OhQJ^#e;JG#; z>q4;)S|_OVJ0!e+5U+IJYv{hT=NW=*2CPdnpb}bVnU_ic7nq^ur;_CDGO_)JrgRdX zG4d7eV|#R7>bnzcor7D;fd(^p!A*Z`2cwZ6ANhEe84hqq17rYrK?R*}z2>nCZsVVY zQXMeJ+uqr8qfMha-&XQP#Q*+^v28h|!8Q?rUdQi08>O5?34i)4ege|=nWr0FXq(;l zzLGp7z;vE7sd;sWBQd4|(jv{Y?w+Y6^DK*%BU(NfcTCQ>KfG61_WGvw>!jbm<;vGF z-b40I{ot*jf1n34%;PX^#J6xyMKh$m&rb1tqCUG{$$y*Cae)!d_rT zF#en@E%TT%@r_>1cH=?%flQ!wQjNCaU=pD!y%nBig_F=_DCIB#^s7I2*6InYs{0=^ zqPD@(SrY@a&r}l=@qe~#=A8EzK}Oc39od#VOGHM;VkFa7&B`*1O|l#4O61m4rAj~2 zXyE#goSa=|4uscAk(4&)N@&I}h$GsRPWU6IP7>beb-|x;z-}lj*b2h>6=qBj3^EhP@^-va3?y zNx@9?>gs3wl3H#p-f~f}p3U}ct;Dm*FBP$;#gMF>qfsE5OWI?ni#T$a+|g&-(4f(F zRqjunc$x7~J8<~f{<0wYsPa4qy$r5-GcM~^8;VKjK+p~F*`!;`9>)OvEgokl6Y*9& zmLS9SYsD|wd#C8R%>`3vD7Tms0D)}a&=OXPQV?q;*PDBMQdD;?-LWKd%cUfkekirw0`I>WadLa(2`Xq`}A1EpW8IIQrN8{+jGQ%&qP< zL`vvSm38@Opii6h&UAA|tRG?-aps8D!`!lQUBcnBXW62vpkXHfQB(sPO{p~h#C1fF zq;oxc>Vfx~*Bg6klL<+D?jLrUcW$v>I}IOQSRtx1h+Wn^ZRFTLTHkhJeL2LD2hI$BXO2issa>7)xcV#C6Jo8l1b$6-#A(Vph`)|GXWf z*F#1=d8F2bTQz|jjY#0!{9{+hNAW*wN*}g3 z^h+{`@@Ecp2P>6JmJ*SI~&0>Z;mn{o~B%El>{cXe1~Y0Yq=yX0}awMl8t9k^P)B!a08XQH zF}`+o68jI;7d|36(hyrOj>*|w7DR&nmcKFO!1yjtKB@w*w{UeIayFs>{Blo^iKPTF zQL5vZYS_$d#y^>kaGUqGF08Yd!E8OUhh&SFK+FH#`@giF#eQ>@z`ZqP1`K9%xcX4*CYyWi#~eumpyU?%No6vZ7UU`uK<$Di`qWP8{B3>Nsvi)9SDH*xR=q5cDUutsjh=8c|I(%dbAGzyW35O z<8*F^Y;p{z|4uAIl)9_+*>fhG2Z;RolGY?uSy)|$;* zFZC#vSkhfq%crta86w4Pc&yM6k0NGasQ|;rmRJ0M@l0MwO}{FjzRijRy_=%Gj%gxE zs6}%$yWjts#ej;G1|Lq@knP)m94B6NyrsaAPNX6Z@q zVEn@0t9eDmV|!!LSui4Ngzq9uS*qPXz}4lu;SDmNPCY+^+uGAF4vx>&MZHV(lD^mbm-6*~p3IOS z|F=J>AojX$*DrYUf=+rtT~laSj+Q}(1S~`YwHsT$Y7J=!t%}bza*s@|NzGq%b_&RN z;U;AK>YI0+N?UvoY0Gf7u1=tzu;tPBmD4z)D;DpyVknf%zj@FpXfTXEb}6c{ofeXK zKmVl8ak}fZxPQ=VvbhQEp7BquHri%mD9AU~Zxh4w8S@B6$KMb;a{r4OSQd9Hc~aXd z6bT}ZV{zHJU2bv|P}@}3jcJgC%5BG`Ov{yR0I-W*5^Q>z=l>uCCG z#{7AnZ;_Aa@$nBfp7p@oBc;Z?a3lmiD+Np0`OAubm<8Pdf(F*YxUHY7x=7_07ecka 
z`i#}C`#rZDvj<%$5XhhtlpEL5?HVrsTx`(_-IWrv`=4FSD`Z>t;e#!7l3h(51EZTq zmIKnNyr+d6K|{9|4to;y@6Po3KZcpV!tJN;N(S(Xe{65HD(iQxU0i7oA;4QBrO~W@ zOn6>u4mHQr6xo*)YeXIrK&>3+iB#4*0)Wm$Xx>pvJagy#SqGcNWGl{Tun(tpJ`ztk z|2&!0%H}rmY~?#&I{eQ9OSZoC%_gzRR(z%2s%bTc#>avheSv09;uFu^(D9R5E#Ib1 z|Dx6ebL`;ae@(;=G`QLn$8b$cu!wS&%2qC$O-zyYG1BX=zE^cTWM*$4vdXLX*}R)j zm(R2#Xmr%zczkZr0iA|$t*k1NaHO`->OZ;gsAOj=GnjG+SI?@_we+qz%LkizVuMgs zu~jcRRqcO%4ip5zcr*QHifh&Y`@iGmieCcp+By@}_tvy9mvwGeqHoXd-KOI+)<3YA zs`zCMak|Ny9NNTp{PRQn#STQ?eG4MXDa2K}D6Nd`oxLU-TIj%92nsMz3G|HDHVeNB z;YwscmA65dc*{#%@Oknn73(4?Pu2^^V|*3n1Q1b9<7!} z2&OZQ$*Yjeks=qT;5aDkJQ|KRs-pUop%fYG$upJpYSIYPG}SJ#O=)aOwqus%EDiF^ zy_IJ28^UsQ0Gy{MgEW1W>sm2<+5;KWUOs;_XBz_hcwEm?Hmywj$$)d27@zf4$EZrL1Rvp02U>R zk0~2{r9t@S0cNS|2i7iJpDecLGy}4mi`rYU92A}dG86l zX`edkd%AXE=lO!MJ!*LBeDAfX%4%vsmiOa{2pvB@q>%cPHN&fp9VpK4-eVtEb<6aE z`@E9RXjALZoDZoR42SU5E3p(v$2d|bAH(uBQs{-A-qWNgPGno_s^k9MAsNYYl-t*$ zs~LH}EO<}-`1Ehp1g81F7G||jtW>jEu4OWXa2WSr({99*^)n(JPwbwp{Zrjt83m+CEDqHQ3O1`FM;O=F< zZ6)Tny+7UPc8{XyR<3v#W3$l&KK?_$vQ)>bs6zyvVjr*lkR77;+%gKE{(=P8mY5|^ zD%8BcqwoJPQp*@edo{#JAbX37#luW^F1pF)54{KA1711klJxwQHyq)QlBsqm9=|hq zuCF&x{=iUiWdBPI!#!0ow*|(3(_3KeetY)3&-t7+aDYU74oiZyeP)6kaXz4TxMinX z$Fm}7CJ@_|DJaCJB+;CBZ6in&P0X@H!y0;0$$~rkK!7Y zdBik#OsSlX1o^20i=uN&5{m99j6FW?XdmM`NXGFYQN|77-i>`G5yG?hTfH@q^BUl| zr<(>v{vSq|P$v_qTewboGDq`;Cx1f0GwaRPN_h@<1hi~xi2b*7T^5RMh@!Jg;ag$l z>&1P{?P{1oHTZZ$N>VLn___z7DHShY{nSL}$rB`L=HdLK1!(*sM=1084hz|sXh?j} zR$g#XofkoBcc$3n>4D617h90dGt#DDsukL}%jdV9485@Da~i@-PQON~#AWoZP||=X2MD zla$2gYJGX2Vj0GAX?wq2+P81Uwn%}lL(cvO)k&$6meZEKbwhI=+bMJq_)9{O!lPpI z3QZ)FjF98+22}8xxb&`7H9_jsgSy<1fC)I7oT=W`e1G&tTC`Wsq3eryoRFbcK_Cl( zL|1O|yuhNnPU;>9zmX7Oa8??_oeXwW%QD(XyV(32iXf(sdwa{`+h7mAplIL8;8d(5v?uL=;*~0}y zi4LE!#C3_)t_RE@Pj=`$io=OB{YA1Ov6!9SHQ)X>FWYLboE?8aou*vJN-izPAtJj1 zKW_>S;?pW`-rbCBZ;!U+`}BC&&{+PF=7$QsfoM9$x%3hxP3GTd0iyBi9Us9QSy#`J zoei7y|0`z#ogu2D=V~^Y9nPB>8h*KmAG{p_WyAMcJO|kDhB}pEkgkPdQ-IRib znk&D?S5+yWB7VKAhhEH;mw|qbqPw*xBqGw4z@gj#muC1cZl2R>n#>U^gAfw;KP{bu 
z&--A}(F>dSPCqp{{c1_Tjf8y>TqC!cLpt$!aaf%v|@ zT0F8i2+u^I$dS*hWSW3gDNl@wb&-_g`aPlEzJY*LQ+buw_xNzHg&CC?T&%0H@RRK= za<@;-)c)E2vhIK`eu<4_*2KSfpY45D^CqSu$q?}xwpK|Q*S?Vz*o6M0ue(B`D;|>q z48N~CkG|${L{d3@9P0sJPtR!+wIhM7!K}%4cJ+#Q?j*_L>`B^?_>_v>t1SW`5qlGk zc_jLO)Q^`B0UDw)gU>I?5(z_Fq%;-lXNOY)!8UVl+DzHxn>|=nCAq5&&8Y%kN*=Or z4gy2D^$1ovQe<%#-s)c1Mv}5lVtdfY6K)O6UXeWfZUAAR))fCDwV+YY1*M{pcz@rM zI-t8C6atGS&yo`uEW01_=-5?1b^Ui*dvG|Pr7tKnI6ht^)Xo&u7acdav1v+rov`(Mp*Y1Tr_y(pY7qcCYhyqb6MfHx_1?UjL?hR%MB! zbliyZePVO`np9A%{s~pf6_Vks#kwx$xzHS5q+|tTP`18AQyNY?L%^RMp~Fq7w%_5e zZugc8{@y?fUSvqleyLd?&Q&6ltz<7=^pv7}x>xB~J4tQ<%CPX4{C$jN73Ni>#dwt$})e(d}Jch}Dsw)1OVfOPiyoVPQSJFpSu}0|-KHOQ=yqYpK!w zDp5{xTI0^=J}(cDvZZJ7mYS~2CpuF3*d*|Q+3)U{5xsu%wW^BH_3S-|c<unNw10f%fN7&{cs!q; zDwq2{r#Swcq4-Z1P%;7bKu0^bYPWXEb%(`Km!!<_V0s#+`kfWQ8~tY5v0j&n8)Fjd zk;+J?Z_ws+QVyic->5R`%~82dT%cF;BK9jZg zHq=}4K-rs81u8DdPwC?RRaAgz+Bz5Jn_*|W^_T{-q7bGT@3iWY;&l2p%CK$9YfDc0 zmH84e0R@>b?zE(*5P?_L>f0(~yZop2R$kIw-bvj}xL37A9O@bVFz|JVo^dN-z}@7X z=~Ftgc@XbvmnY#8+s6PtWtauvH%!ClJv?SOFLG~X%`-*CyjWCB3wB_S%eGChjakJq zZl6CUI7$xk9NMbp*NNJA)=)4kjM3^Yns|iV(D?%Zi0P97*SYeT)?(fWjkSH?Y+}*% z^BXw`mPX~Q!yR>utG#W)Le>%P;&4rHm9u_ii*aU!a1lh zA?fVQ53@|x&}E3l0JQC0SUB!yf$L#LG*57r^a7nJ$e5xDM&BIyhBt(X>AMDC?j86>Db4(A)W^|s^@;VATNGZ5hGNmU(ryZf$~Q_y?dusQX%#H~&1Whs z@XHrw$i93&dJ^RPrKalhewX)cF7d}?5>+Jt<%<`p#wEw_IZ)i98lUE~%I4*&Mi^j- z>&Q(p5t~bPYL0+4iD8a4AdN!CnCDLes%CBZqlHC@e7jIhlfdbVs^T9l&42B%oS)0f zGgbJ(YK`DWc4=ZHav1#=U~In2J#TW$HmkNY4jZeUo2cFJnA>iDpRSeRgE|tIou~md zK6N~Vxo+8U-kHc0UR3$!VzUUt&3>ni%LN6b0S3M$D>45XmfkmCk71${l6olvoqH>@ zU)IfSjQpw8yDf#2JA1Ap!-O4W*-uH#r8PsR`F!9;aDjVwoJsHmEi<{xhXvBO;s;{5z>6b;Su7+%A zdmPO(64~7E(iIXsF=8OsB(Id*+??w8@MV%C{Ew&l_~*$aWpNS(Zi!9${&lMd!~P4S zg=#?q@3&TPCz}`2Cv4It=wCmdIcaW&3Rrtz_#Xx13N$P}6aXWfvq96AM$U&En5iYh z>5z|1gTI9+S9oQLjEH!%*Rn0p=GV&VcLD) zk1!h&jBKhGI}^K6D<7ItXUNCP)}x5no;|uu1u10uLdnr?66hbo0x}j95}wgJe&!6Z zz?OJ3?(aO_?ehm4eiQyMrB08*_D?}aG9Hg2ZTukhnC~IEg;OAH4E4?1<9l&|P)Wng2w2~126q;|V-R);4a1?$Yz&DSW>$^A9>}N{mrqa<&m}yVL6JG6H6uc1 
zv$C2gX3Y`M9eKvsu4IzRV{OG9KzbRsrv3w0`-sd-TTvFpi(hz)AYu#*&2~dq9i5@ z@VSt_Gu6u^ShKDc9;0yJJ4As~y!;~MYgPcokdS&W`O)30HNhMr_G9b1_>dNp`}Re# zu2(wv|1wG$foz&LVT~?!vQ-QBZT`LT=i@|$YIBG^)y7@Xf15>pv%h*k9s*~+oT`J_ zbO`KR8Vw>ExwX18-lN$l{PBbpUbPw~VAIggMhi#;nBSF0wPNv!1-j8_&6b$8@|+I` zZg2;%ToI}xLx0ZpC!EnvFx)6Q`X}m{8)_;-ZTnBskzmSs?s<3x@+ha=vB7P;Si#Tx zMXG_+>Bd!n=^?R3BP{z(jTx>JNoH8=aaT&mXyHQ@$7+KaMoVl9-GsjzJq$YpW)s!QY#$KsfkshjWShaRX!R=^ z_CMZm=@y8_HT=Sts;Q%xK`C;MtmArytzA{xgGV9nnreUcE-C*MUk-Uh+hoA}m_oWcavUNGh6$8o%o z_3MgSiaD1-)wrYd(7Rgu0?1E_n;M_Z`o7^+GLL>C_{HFOb=;%Qs?;&O0qiEa_{?G1 z)mSxpnW;`^xYh*wqOjnf0I(j1Cj7`PaApa?(p|w3IX&WeCtAE|+@JZ^nh^6FO~B(O zHl8K#M?S1Qa*$M2Q+Eiln6@o1J=862SuZGZ{+APxRYUljFXuhUAP(5Eh^Yg|F(pTt zmZGK&O(0gbl8Sr4HK)O*k1UdoaX-VEiK7%T<=yVlUEEbk$uog9u@zqmnMY-WgNZ;+ zWMB141sv)$UsQM4nBlI?wySO=lJ8V$r{{Ff54Prfgh6 zbT7t7+&#*IlOFs6&-GY?FFa4JJM!M*0=;){Vdf*0S^RqJ*9iME3*P^5Tr3D2hND+l zNtyW~*7&%jjC}Xl{C^E`9t8{PA4b=q1LZly?SFQ|(NK1owqQMiS4^*j?d^)?n>l6+^L= z`&BQGgUOCshcY7QvdATNqwh)V{?NKQP+H0ETXyR5{DF>_qOpES{^}Q9+P%28pEd5R zliC+TA-{9i6?_KoNDt=*<4+%L(74-!7;z#=Gu<(Jxs^;$-gc8ddq_Y>!S~MBQI4>X z9hbW6Bz>}gMJCFv6swm{WQs|v=%+1+SXB(>GS>U9?}^1!wm_^yPW$7)+uCl*HLZyS zcaI0Y6u0`?Kj<6gYunWxEHC6O9cT91FH0R8U?Nwy4(J2Ht`dfpx zo*nVkqYwQtf_PIfLC+R-XQs~#-2}kUKy4{mu~R(CGuq}O{JuRH!454aS>xs>!Wq4! zR%L*8neY6^us8;@V)+QkD`6d`idqi=dJ~?A-V4&oNyOIwG&~aBl;K#y!@5t~zZf!^ z^swtQX-d?5{n~UI@_BDF}#61pc~I8en1euv*q-!5u!R@k+8s zR|0#6Q=308kpp6-=6W_6% zu||xH{(%?kWCxC0V+ap7xfa|7%k)Y1VVzeg6 z=EQh>X8QHBcX00+Q$_39;Tvto?f%(6$1P#i2Rn7YSIYzR>v2b)9nELkX>v?e(LQ&h z>%GV=;al@JKWt#c#vmGhH}mf^qS6!t>PDVCfXqXOmjiWA_WT#Ip!Zz^!s=DYi6^s! 
z23^!lyN(6*zeN_vt5+f^nwY0yFH=;ZfS2R)_&*UF7(j5HtYBW+amzPyMzIY`c)c8U z0Ezc1uUG9-+BduwgvxjZ82jAOulv;5y)N)8;h44>)%D`x3zubN3Q!3t7ae~{cMN+> z*iJIZ8s|_OCbQM-GbxiLNIklxvD|w5-O1iF)5nj(b8o?9`C}(n!Wg-tDTxR=#!f?u zwj;=|h(#NwkvY#Ks2t91D3njfle)P#>HAlBMxVp28b37Q!Lok_e8DJNvWgD0xOI-C z;%mp&=XMFz_{;o*%eg;Jr23x@R1cW?A}r=w->_rl=jS5^M^N@#&+cOV**v^Nar7Xf zdUo}&+;F+^vy6~*N0g_W?6K8{o_wF&CJS2Z2MDZEZc)(s(jyc^cU!Se<2cqLkQOp;$n%a~=&iN~XPKFTWT$f&*)j zN9%E94fsuzCbTxJ&!})IIJMVR_Fy94#<(NbfvaX8_vKpl0yGnFeB=qw-?55PLYtyg z2ob95QftXUp&3~?p0>&Vl+XTMht(NU$z~yDu2NaqO)H6dYvX@~Czmeq{l2lM64IDf z9Htc`eWqQoCiuB@Vmv{oKwo;|eMsY-RSF1rN!b|HO{` zDslW?*ngFO`y3`-nYCo-Kjbfa-E%eNqg?VyW5xYdGnHh_`3`(o$~(9hENLKH7v3y< zv}l3ffLv&0Oy@g>n2x{c#y-wl1)iP>=GVA*q%?~IUJfiTrxttw5%sLR?DG__hK;~VLy-*C=jNSqHh0M(hI;$gb`x;R#g*~d4x z)JL=-fxRB@jw8UceU2tjtY#=?3Y;&Rdp~ht!+u(97=W6facHoSc0~S4JfFzXALzG~ zm)!+QP{1(S+CyISG4`I%hMb?b<}RqCgGAiZ*)fGd%zoXjUbcg`!lWDI-|Hz8f=% z6RTfeiaSG~qz6FM3G3eZ9`*t}s_0D*H#-gtmWZZmlbqjBNuIb76G*Ku{65&>$!gaL zOgUMD?tM1p%e;e=`ginuCXml$bf<|LM=99MT3Aj`NRLEI0UtGyH$t|eaQ;*>G`=21 zCkd|KT_1AMk!G#wOJNSf{q5Kpk^hHVXCKvsp6%xN_&+Wfen$7q&rffv+`o6m%yEXf zHOn;=qUoyzx)|aad351ehgiPsi2PUH8_rTc>Y{8OyTB?O&~7oFhNb!9eP1l80v={-)c87QJ9=axOKCcO9yc3}a_E|M(JDOSsMWRMZq#iy zs;j~de%^XRJ&02-HolB_Mp!!A%fQ=e9!)%^!Cgl}~di7*lY#+X1;}7#7b@tD5(?A>-dQxD@ zTa~Z9K%ZoQ57eJ|TF*2cDR`c8s*^iEOz7dk5wXzI4;aB{0m_<?{a~;fj`UHpNd_`tXG47 z89LElGw|}4w2Vn8`NgHcGT~rMV~XrAUl;kx0a#D zC{sv>v`KSlZX4Z$jY>rsr**7-&(6rXdRxgiV!s^GHqnCjeFdO!9tz0W*B7XxdUu!e zM8RJNMa+wi{vP7!vt5Rt)os_f-aKfq^8o_k$nCY>91_hPW`Yke@}7Y@@F%mQ^O`$C zAHU}9R^|pL z8g;EKKcQ-}{f-m?qVnOR26Fo6ak$D6pfR5*bP&cD|F^GUJdiA?;f6{xkcE#r-_(ntGRg)yqk}{kllTC9lr{ZtZo7Bi{-}27@|%DdcOfb;{#bkDTqyq?kV`FPNCj zzJ#Z39CP1lZ(R~xz95BnS?h+>J$ESE{LOA*cM+5-Mk2mt=pF+=wnXvZ&kYW^J99?A zUPEdV!gBP7K=AK3thR1|5%~=Cn}Q9g<>vgE8Ly64I=uJvo`z1fxQ0N`6XkrSCM?b4 z-wG!zP0;1+h!be#DinQm+Z1*|1%%bPwK(bJ>>u6tKlG@~bNN2r80`f+o2m$TTu&Pq z>DFuE{RUp7D{-*HxR1`UPEA$A3Xgs<5Xu8Sh;c(4Waw$}lsNkQT7~j2msdU?-bYN| 
zSb$bh-Y&i?nsJcCS$IB{ZDWG^8ptt;4gztTYCjPzm~b8_=DA6)l1o6y78%(PvlVsE z|9ccRc?Th#&Qv0`HX?u8UZJyic~16y`^99oL`HNDp>^~p6}SCH;h`$ z;JkmHvn2R5?{1~{V_M#&=`ET`c>${>!mT&oB}OE9=zs^sj{i zUe(&s8=%CD7CX~(X^?MxA${$?P2!`wNPPGC{o7{AaB-xhU z(N*0!Z(YKPs_daFIr-jw#{h}6RT&YBDtKC2JX`XH^xI0phxxl4B z#;+{q8ZL!cMe$)%O6+Iz24@XK6<2+%?s+8;{8q39EwF%UT8--3Z)1__eY)3tr*wRE z##CFX*OVadJTR8F9<1~0>&uK0piEy8KxVxzJdl5T05L{Zxo+%etiYY@wF`)-wp4wa z<30Pl4L+OPET;M%M`;@Y<5bU&5U4`YB9t$9em+XI`*_Yo$^P)=+=mBBrKN^Ej|gdo z)Nel$Rp03jpCv2XUe^EIqoYc#sqv#E<6Kr+SF7V=(0Ru_HgZc&?*E`&Slg?oV-C4n zc!Lb=R7AK^9%MsXDHG0BO78q@E~!wLVq^6GkyYD#v`cmTk@Ndj?pC4G7kt*e!-rAe z^Qe(Dv~Mry-i4JgpP}hmc{%Lb7CHkn9W6#2CF^yQ=}|qNwyiZhOMX5Lk{N zIN3of1loE+Mta{Gb<701%JP4ZwQ@m;zvf=e;P7C>6v>a0P}`ga;5h>h!Jd0;){(Iv zZZt#09_s5pMErw!`}y@2?RoX(D4|J?ZF+Jy$ME5(Z{-Y#g+t_$m#Gf9##}V+)LN;a zGC6q%TNmz&<#e7|acC|VWleIcl9i;>$2WcF)RyK=-3y&0*|M@qR5!MDlYX`pM@JZC zX;DO5b)Hr-f}{gGdl`17fkYTKP-bed62mXb!_nCz%25}iE64g|@mqvc>`o~Zbw-AI zcZ$oi8XkxWnjJw@o3q;RzAHR)M|6PQOltMxSGqjy zY)-+8rek_I5w$-Ivn)I*PV5oWH zOGJ5{^M+@ZtWS}K&eG4%Zm-j5?YHwspBllqc+A%3H(wr}zh>y+oY|3;8JIp+z0O7R zcGQid&QU-MBO@&3*;&xU3DKQ(K3`#s-N`VxISV`&AEu??!R?gGIXoS0X{f8M079KG zenASZTQGnETDk#POWeUIa)>6_&Qw#TTiEyR2!Hs{g z>Nl6yJS%_O*eK<4DutyO^0e<6Kdlj4;l!LauBKFk8%=w|N&_GMc*7LqB2B*)doL&c zCxQgdamzHh!~wH(fMQ>=o&+Qp9K8DIbTqYUK84@uF-CJO7} z8o)Dmci0<1rcJ+B_dVp>_k})aL!8>2JYP=Xi{ZP|{Zi1R7xdH!9!#ohrdAYX)pHb5 z=}9MPcpxllmRS$GBhIA=KV@JuhhUNz8V}skAbs$G5ap*$VRTe!#mR(6^+5B+pT$H= zo{o!f*oe(5@E&?_ai<`eo*H@0uiu}}Kb{@(K=S|NAphpNzZZ7Nf9I(aE#@|cmBFCM z$EfTNEA&r{?Y=2&b4_#fA&k2#8UBlH3tB?PQtiH{X5e5`j`?_G2;WD4n6E(R^^f} zYzCweofRE=I?tBz^9eqg&vN79)H{Cq46nQ8lbxQspyGM)jZvMk#ARgH?|4aFN5Vzm zSD&?&q)(dyD~;mR`vw;uV0@taQi2r4gQ`bP-RHA67$p~0MvLalPT!P8SMb%?*-eDq zCjLQO|C+1D$}2)^S=GMek~Qc16zSKPH(1w$4jjuIsUqKow^gfG9Ff&NWv?2XtS!PY z7Ps?$L|8ziwC%H#ynXb;vbnm(9%!2_2l%wZx{6Ep*AIQnEk6fu?w_T(wCqj{c)>mR z*R!*NADIhoD=Gf0uIS1D`$^`&XC&`f-PC^JEw+k=kI|tKKW|t1j*ZK!g*_;$sg~oo1{RF?O;nyHwC@h9G=G~c%z2dj%o(r5>Ho+O; 
zZ}YRET-3z7RHRv{ziP;0g$*2oPuvkx%PGpC^BH(veM|!zyuB=6&)?oL#f%S?=d^w@ zYIxVb_Vx4KP{2#RM{iVWb=hMO9`0)MPybKuj7cwoYrRI=neIpdqJC|JshbA1X^iW_ zwg9JTC8c@M-D6wY|Mo;1NXghMkyJYoxp~N$a=}jy#}Y%0^35^}zQvmg3Viy9ZyBK3 z8F@D~W*-<=FzKhnu^jvBbzN9$y{RcQd~?IwHVDGo7GQ4twYsUzx}6hlk}udsnir5~ z`?#E?${F562;HSK8_^p7`1;rPLrG>Y+jJlIlc6YZ)<;H}=j*>(rx(dy%!cwncgxbY zC~-D>edEO}NKMd%Q9lShU?I>*_R4Ve=d=_UzTSJj9GkPk;X!ifxKNjjDbB*3NBOp* z-nO-+PJw+>r`L4-S6jnUQ=b^=@MY%MA6AXT90I{kF~%9TEyq5($CFch))S9?H_x>^ zPS~fvlLroe>*lQmcbxCDX&|InZh5$KT67eO%=G`TpRG;#dScYyd*3HC7P(Acahh8) zD9XRQu4P(TFm_dll_lIxE;c}CKYVE(xypmW>QAZ&06bc~Xp~DH4anMA0ZFDD@MZ1s1d|6fp>VA&wZ&3JQ{%qH=8w3& zBJtVN`u`6D|C11-;CSCqiW$7~QLgYvS7LC2d$|&n_iGR|$xEf4?!Ue71>mi2>66ll ze$x^vcEWDDXaybPrA+LkZydb_IXB}t$A2yCpS=T73?;71A&7_Dh}px{f9PrCsH~TA zVCu3JxQA+TT3%4(3i9x|?;%{Vtr>HXSUkbDUQ(l-PZFvvRkwsuTC{f=->P_O-0M($ z*6Bb$vG;6*CGSMf!ri{O%0&y8@Aei(Dj3;nk9cxzgwgI`qs>6C;IV#H*|#%Q>d*@! zFE}j;##xqP%0*4_R8&N#JsO4&tE~1NFukj1rjH^lay8HPBjD=P&tDImsEmzcI#*J@ z)hqWiD!I2zYWen**z67TR@XtVWWi_43PUYPcJ!3BZcy#6N|3uy5o zfoU}H*XjqIvnH}%_mNwe``0GPT-WL}bY^znE0L)kg3)IWOpbzM5BAV+qOH|MY2QJ} zv`$|Cf^7S07}X&;w#o_??+oiIJYK5@D}I^n{|G>gF`<23av@0VHGZpTO}_rIEl|h* zz>BNt~01@eUk}=%(#8=b? 
ziva6S-}=vFu5P1)u=+ye!g1fvzUM8*+XNc~X!nRn-}-y#sbbeH!C`=mgxkBXUXA%D z2`x!$?XVKt_k6;KVCDQr1DKsFWJ6AmP@Pb~W8jLz|MPY`8Oq(w;@Ss#W)vkRD2|vy z?;t!y&zZ()Hwz48R_D%vIhP;)4Z4F<1MJguD^x4yCVR{)Q=_r6CdM6%>E~=XX=2J{ z`oG!bR;)n-6|wC6xv)W92NYPEAeZOeN5fpsUuK_^eGGwc21+e|v1H*WH&K#Z*x0-Q zTcaf0sovwX1~m3cisO2p=EED8M(@6pJX_V+&9(_`Jnlb!TEG=<|Ljq?02n?YLs}(#gskJg1ADS@MuZ982psi zL`{iY#J^zV#jUkIxcz6!U+g_k9aKOz8u-{|HsGv+Fs|6jcM#*mf3vb4)}f9bmT%m3 zCRIjvxh% zIM9iLGlPmd8t{&0p9g-q@Ys7!8TWdYniavoAU>kkEtSMj{w=XMy^nRrdkQb%v`A8` za~HkR+a+EB-|*6C6(fo+1tl5PtlX75zDi~lFTbg~x1l?}45;K+@f3c(l5gBRb$#P0 zeWjSLm$|@q8_)ijkXZZ0iivKzJO;UA2BYJ4nSQm~9a7Yw)T(?Kr1tPJS%%Bu*z0s; zD%P9+fIA~$p+Sb_H){ka-qc72{dvLP3p8Gp)~`xkqq@3(uH%7Akcy#I_O}QCpef#t zee`NXj)CxamQOUr&9`7A$U=)w!qmp0{QgMQ6SV4a2xrKN>c&GtZDL(V=OG}yR2x*i zrPydN`pu=JXUss2ZML<=WBb)NYqDkUfIw;2Ep($lk?DV?=>JR!KJA2(LWUU7Foyf; z9BDoa9cu##U2uhrcNp%9!He*OqvwH7)7g95l1LxHvx@DH!ZSX@y}KK6&(3wyQdrfd zotoaP4C*&{at+8X(BwUf|F?8wVf}lfp=wsu%Z)Kk$}r8!6UPu`|Au*#xiMcbLTp>< zGt;;E2|n9_ss5ui_bC&%Ahz5^E|`^t$?wJSDKblbF+8w>zlOHzT2<8-#d*J)X*Cnb zd08*FP|ZMCq~mw0ljX%BY6dYi{yNOk6q~6#B~v#w>4Kjh{_I=)O(XArqwH%YVKHsB z|K_fFlwwr*W%`5Rq&cjlJVz9CHp}lZsoEZ75QdMjxZeSO+Xa^iMBya%V=Hx4ARcKc z8BQfp|AI*iOpJz6z36Jg5Ug(l2Gk=-1{~fHIO%EWJXIYF%S`XUpZ$)xI+p@8)a&rs zEkQ*br8H-SePak$W(O0!?S(+|2T&wH7Y+l`Y^q85q$^y0*uX`$-Fcan4**b`wg+5e z&ZrgZ`DzaCXJA6k@Mz?Ew(i*~M5ZJ#jv^5Z&epeb9D`V9Y~axY&KUv9H4V=&TicPgkeVy z(nJQb-Uqh3!D^o1*kdrH>MH7E^gyKGSd7z`De`^H9otw__REQLw`2tt`mNtJ$gK); z>*V~p``P>z--K8p+ANOfzD8v`wh3*N(bR&vfDweGz>T+}q9j2&`G_s%uRZ&>INzX9 z?{^^^xG_8oe6JWdYW=hrFnDLnGM7h6XtGDEp)hJq0H!xaX0&is=TsOPMSS7gyGAh+ zYx2pgYK`(3C{HWPrAZGxL>GeU8LpQXMP>a)ET zd96t4g?qVkVT3?hcilbyw=n%*Y!-$E$?j`mx0%e@fDJL=ebX#+xjf1&qCl(e4JA|k zZ2H1osoPz!B`0Q47k${domg#WNd;!BU4!pFB{4;nL$}HT{O*^gS4f;U)zrb`O_Z7b zy=o9r^PW>xkwTcljd`BsB{i|#EAu$D%W?9wIJMwDmc=KVZah`(yI(%O(b}~k@L8RM z5qL(nVq?3GWJK;WpF|ZD^gyWqGVe_21rC!A$Ulw?-jrXjtsg-Bsa_}5yS0wz8rfBv zXl!hGr_5|}<%y{8A}xD|->5X?(@6*oEs%~sXCfw{TI{Oo4HsnD+4*(tSo9Icf!mZ0 z*il9&YPy?LV)G_dk@y7)r`fr@-mcJF}7{Llcf`zIHxp 
z)p=AkYUQy1f|KvdQLzp zmj>TkyO^8%&!wCgz>%7qL>1vNK3G;u^^H|-L-T~&vYm>z70QXTmHS;#USYgj@BR#01rTWwgdb9u?(<;C|B8Z9t=F^ZQ6;`S z71h;55)*e2we+jeFFn+Gf^SblATPR~${-cs7db#`i@wWgDS16T?~PLd5Ne5LxA>`L z2rXy|QIYmlyoI>d&s{p~_iLf)t=`Ys?$n0*bJ+zbp55<@LpFN16_BU8b&@~P-HYAp zqaAyUZ?KDf4hMM3idMgVpR+@_X3G{C7dY2~*y7=c`hd&w8sN3p8B4|zhCKW~7X9B} zcF<8_PvDA|A_N;N9(U2BC-$HyCpH#17d^oVg60cy6+~ zO1~lARk2Y1JMBA($LPq)9HngB@Q#w{E#;H`!Aa_2YirLZMBLZESX}scG(J?evnI7# z)o+Qv2>d9cTTywkvA~-sn^#*xjx;@!dowqM9osO+@&bdJApB?DT;Fi(-ZO?fW^=4` zxZk5pZgxBM4pF$8N-Rv@w0SDw`t8GaTXt$DCjekOFZV&B=cSOIbAAK$KFg*Wxrldf zT7oPix%@&}n}t0HR0aVhTqC|K8gvwlPfiF%;|O=yagse`$lmGsbh2u&-2ybD03sNg8+;t8$QJr&9Yx$adomb6dDLM1 zcXbI0Y1U(WqS>Z$w?yf~Eg>}dFOUAEiQ~~FkV%BJ|O%3 zzQt~`|1=x7`Z2?xKISsLPt{Y%y{H#(R$b@xoEXcFYE$VDR4RUYHe|6u?q7voz2S3R zUDEd&w>}NC46(o@=g-a>pwcrJlK4MAvEjw5!Oy2{ALf1igU}C&$_$-dlA8i^4n+Dj zKRlJJ(eKOJ-u9PRgg*gVhMHTRy)X9+^G<0*TLtXj_z!e34w8te^GZ%*Oi_x>ZWeWa zrNzLLqvCQtR$3`*eB#45C0eo~b&duH?@&$S&Z7UwCZc!X4Pa^RIfV0L=@U1NQ|jm@ zEk&=1A8{%p@;KXLt=p^RqsWSh>Yvr*)LbpQ{@WW7#v9jhr@L*+wW$9_QadvO=8_8u zjy;3#$-i>N4h#O4oXo1zZrg5Ro+}vHV-=N*5|H}ygXPz@WqN2L%lx(1EW%Gpzen}K zT32!~nRW<-zZ=bxZTkjC`xVE&;B6KS2!F@_`@!hxtr8vQvb^$2OGAQMplyz>x#$*N zYUfe){fG3R&AJCOU&yIa9cjlfzbD167o}J^DvyQ{4Ek(8?gy2-S$B@Cy=_oYX}XUo zm_PHk492@sY2_0X=kgT39OqW}EWH%9JzOT1a17d+s4)&#OmM`cSo(nH(V?gl9UI{Z?;nU*7NsBlX|aee#S$^#T{&EgyfC_)&cT}mpA(~OkwEyfMPeZVLsQQ^tt?g1?6IM_28<}?P7Nw zNQCd&*VW;7d-otMpglYR_=j?HX5vHQ)LY~reo=6u3G1+pFGc;R1R8?^0C0XjbbG

2ptM@bW>vF-h8_WsFv-|X7c;Z60} z;0%9s_XRz_rVJ%hT=nAAjh7H-Io1CA-=CHEAIyEt&E|Ai)@NxNZ6i{>e3rF2JRa_Trv;J7c;WJ3h@@gAAAh!sZ|7XcKS=e*M}KmVc>A7I zFGAj5Mqg5MeT_aKcS@QnOKJ4n*q0tgG{@&)tYy)6knE=us+-KW`ZJzKwum{F__KBO z2dd&jf~h5`7 zo)oU%b7(Q5&yxS8kNW>d<~uH^&BD)nURA$eRUw>(AgDPNah_5%@LA=mbXp~i-X&A( zjhwrFhh2ldMAzPJRfh9B19RAaKLu?UP!Zk4Ugj~Q^fXL_JeDj&+u0%S^yNGvbHb#n z7S}=dEzHMjKKGMV>;V0>QNud%edD?Va;VC}+2RXC*XD~zeQei?4f0^EU-p+eE3yW? z#kY)iUc1zUf0ZOxj!?WH>w+U`&0T1>OAzbB*-U!_;q23Ja-NmNjN8Sy4+7!_kB?Kq zipb;U?R89b2Ix!Jc!=bz$neaPfcP~8JYMizw6LKX8E=Iq%X?Q}fqx(h`Gy(r&?JZCmxQ3aVmS$3gB^@#J^-sO`uL zPNFs*wr%tAVRc3{vZ0 zR-G0*GmJcqW3H2Mb;&|s^_7$f&t(5qFce?dSaVI$WqUCE;h>Ds!402Jj^>f2s+(+F zSE)Dwsh#NzOprrKQ{OdMkm%uY2J3jtQ6PV;9MQXY z8WMY4bMW)%MnPx~tak$uUZ!5Le7*{>TI@*_mu$V*aBeynHx*IlboDwe*X67<=t|eD zLy(xDnZ(g~X!AAG#g-1v`YVh$>)-CM|6S1okLk%Y*c+5`+OrP;Q?6mMijPx@ZcASI zXRX5e4L0t!>)tF>x-A)$9oq8TKEWC7+=rzr{<}P{p65+g@$dd3hSF98)$hYRW-t1g zpW|W*zu5A|1#eQnP@$jRLD4VO`um<{uw{6fU7MAuDGn!lw>L1UsAZ>x_mCuQt!PO*=hIEFN8ib74e( z>F^ZCU++ZS$}y}|i4WY1xWgdS&+4B)%0=V*nJ2#@S~M=dOmqV5(R!Ee5HZe0uSq0SSZT zommmN_SE{f{y5l{tK_iQUG_&d?0R1@NW3kwM4X3EBfO-gM#rG^t!#J9f`5w%FF+!P zC(nbw;fCW9t_Uq{zc|;yEE}ghD^}IoM(n?2xQ=i=S)7|f@$5}4gV!1*5T}&TJCx1! z83jf4%8?(A%xr7(>v4OalG%XbIOhrN$C)e+g%5KYVMJPvQpk}MjDGC&P`WLt!|U8+ z_HMp?3I)DK`bfcfC^}D!(qR_P23_YjfK%!#jp73Wyab^)IAko<+D{DI?pw^ULWAWA z=UHyB^!LZtoaJ~2S*Wrr1G^olzh~b!?Kdf$I_cfC&ep3WW16e(jcyjbK4&q{VMMvc(fUo5 z{fdC3JG&q@%}KQ;j9jt@VHwk(8*QwEd?(Axx5a-LB@H0Y7zS1y132GR(s1*x01Ci7 z#UIx(Io8Z>cC2c%qUNpY>YYKDqZC_@Gac0|Td65Vkxofm5&pi?P`3d5G^yOhseYi$ zo#ds>(#Z)ufqI)Qh91R}pdZay_h!pc4m2ieZN_)u)xZhA zps=x#-V#YKj0*REyWhV#_aK#<6s{HaUdpZH0)_9xiR106~ z_4VTLcO9trvRaGbImO~n2^TpR=pkgzE^Wbr(H5&}z%maA00(X%-9 zxp)+mZEmpoKG0ev!#hg3xKDcO;A22o<&}-gf<7{g!#6Qx-jI! 
zM0Ys|Hb6ZsdN$mER=_Fnd?UZ3NkHy_%0 z8ur^|Pd`@;4abo>6)q9+9b~yTNLFcJx)nrC{QNP7RtEZfj)tIq@@iI$j~DsxjBQHP z1GAw1u-DFamu9A)hwe3HnIZjqLrB$56@Y$OHIlrUdd=XcuFHdj7@2VU1~sl-c!$&B z?iziFM5h;_dRA2`-c`sxLsH04AMRaqvtni6&l zsKxNnH`h&Gj$3F%?gJH7R<_qBYS;bLLyPcP0cY<4Les{Mgk#z!pvh9*Mr7iaObYs|38an%5LeT&blfHgeiUmP zVasA@2mz|ODs2}xsOXkB(8Y9+g%rp~YAX%U4@Ckewy`?gv|jjZIAeksrxc;?%8#m} z{J>&CocffLvFptl)n}rwM-Gz6)6B8)!xZ)TwXIR3gi|>=6L?0(TSi5i>@Gu};>24> z>RmnJ-jC#lc;)hbd()M4*8hiHs>3EHs_fY4sMU6&e2^=+!q8`B8nCk<*2hz>PI@!&J`9fmNv@QPIJVIj;*)UBzI6NdVC@T_ zs%$Ft;5U@VA6$p37pw_Ce=fws2qK~DXI=70wi+g`TN99CoTTSC1cb^@Wg`;%!#57yf5C~3nKS&cBlT1Zr3b7SNosGl9pw}swj?j>)9kO-%IP?)E8SbrlH+-NE zxo4!CkEKhZfsGM&TkNjpx>xr8zJAPbH_I-CEYr$;x$lp?o8TtXijJ2FghdEHqB7r$ zmQIhvXhl1()A}%u&6l9cMJGP9PKuGx%i++zsC>$PR(>17iz|ijDnasUrwg_z-p0KC z%>WO_Y;tbSJX3}SgvN)8x`NX@Tf==)e{#NNQglp%*wYDvQ6nI7oLt_|4v!o3{@ zf@0JgF##H_uhUKu_q|jYv5MT%tSW>lvV(b6SdwD!%M;~4Y9mFc?=LGQdcCz=;Y|@d z33SP#I^vK`FLa!%sws~Iv9GP`z1}#9Pi|7iIPu%=`@fKfZ^&R?dXGG)I)Wh%26>`_ zW3U-_2a|<}cTUrG1GTmuX)kOJ^!@4}dh9M!#}6cG-9xF~%4K|!hUS28MQ2CJcvhr_ zM|;RYK8RNlAI5*l!x5ZektAak4#p-d@!Zc6%`xcpZ6?-gLR)%~erzO}u*f8SlUx7l zGOo|Z-PaZ2%H2-Yg#*>n+R9{Ct3tyE`Q)I$y;pc@<3HaY3pbv>#+k#**RMlkqIo#U9h*B1W}#5oc=z#XCz**?ctmEB`=y{?MwuYS(7me(JnrjDO%{(#vom6_I|uk?r{YVDtpuGGZ_ z*^!Z5q27Ak#{2}td1IbZT!{F!)Lfzdi34y#?~(R-@iLlwcWOeF-}MsT*5ct8!>U2d zAdDw8H2pO0%Kx$J1tTT^w}JppxoSL@i9SB)^x2(YZ5{Ch<*Zi9U$5kyE_LGkZp`O$Hwu>X^slafe~e&Vmh9zt(Wfg z^J4t8!O!oydgq z#@*^$%nZ&-i^H2UIhb-zy_?8R9E7iz*0h@m0B9%N7b|CqCSI23L@ud0Y)K=tHNqnZ zpI4t;d-p0gg71yt5o3lByRF?S8!hq~=28Q7;RF6MG`HH+1wu?|5LLjExo$&Cxk*kQ@DZ|Q`2fM3c1)wBi4<6UcYw$rPScB&xX2FRHPF1VSx?z-Vt3?N(gk{5v3 z;Wsaut12|&zpqeZ!$RDh+EGx#JL7;XSW3hv;lq2xT(o}&7-`$j#cYCQZ@ti+=bLA@ z5GxE>^gn~<9@0DEfdo{jfatX34KuL3I{bwElNZcs`#LSDnjz!qn5?Phze_2$0&tqx zg)W>q-W5cC=3{>6O5i+?c-JQ9AE#dDRE(66nJaiH?-KPn8tC+@d{9zP_=Ujy&oEYf>i50JwDv0<-xH=J1@5!C_OMv4 zwlv5(?A#WVU71LV@!Ad5|Fq+bNNDl-jIM;BLf^AnkF^uFBwSwmV$^<{!?muC<3!CD zu15|OO`|GZ6EjouJ~zK#Eeq@V{nKxF%k 
ziSqyh(*8n=iEKkLKs1<*k}2Of{$miJ;$7zr%iEQfiUi=z0VTuD))T3#CiwVxdxu-8 zbl&i@G|m3i4qQYSgYgtCpe~jzUPe5S&dAfb>dX&ZpUD6mdw+MNW#PX88WeSH{xh~u zS&UUjCR9~(n&NCSmC$L+(uabvY$lL&e)Nk&@9MSG!+FY)SELLU)kQv;gftKO9>|%Z zqsanyZs%R0)lwhgnrJGmZWFW3mRS}}+fj0%N%?%zmpgNvJ3(f5d(I!=3JlUCnK0|! z(+>^7#v5CwACnfk9iB*EbRBw+-RfN&n{MbZ$mX-|bFga2z-rke?C%HhaqlqIlQsB0 zhk!U*cNof3cv+6bRBwc)&6}Orsf|TkW%TC(h1vqMMuVVG&N5j_1CD{;tV_4`lv&5A zn-(OK9|XwA%C{qR#-b}}UYRlD8O#GPy$BC-GVP{6-H4T_JG0F+un({eP%@3;0Bs&Z zq{^CRv+tFmk##Cl#W`XNg}H|6!k@nB3z#srr!6~M-e&r0Mf&%c|VP z4SxhAt!Cef@=jYieW<|{LK~{|A2CEsGzWc_vg>FDI)qNx+jSzVw}~%?u3u{6*qfO* z4~)30{t~d-{+O9I7TupTvJ5unJ6icY)-G`42MD>5rpizvkXZ6e%cBCeAoL}cQQpbTcg;tuo6EJbKKvQUI^&$0; zrR4Uker{W{Rcp;~w3VFQHqHLQi^U5Dx#=UMQES^Jcno2ec5^OblK`E-OMONAbSHkb zV;_1r^=WrzA4s%0)r?mCf69>y0*`85dUDNPcwXjx*2||Q2YZ*qUcXZGiVJ|jpFaP$ z`v7p(ZLU)%rQ8=8OctvFPWH&mC{t^#^$9ahtM^q3)^~+&NBQ8wyrf)PQWZ^5l``sj z)i+m_!Es+GV?WE1U@~5(?M}b;6s4K6WkCef0mMB(TLZmfj1Oj!EFU>M;1$B7l-CMg~qAz#FbvCR?WceerA2nu>_G zKSmld;fAIU_!s+>K=ULTk=atS&AQf(JNvEhU6MK*O}&ikYWICz;>X_072!T(Uau^U zWP9@h_~}GNhsx@NvUcG3^k>(#= zAQckv{v1zXLpwgpg?hldEUf*zuB^A%gXM;!E&;$p3nnvzACRN3ZG_I;q54CcYHAHo z#Ss2Y$8syPp<<4!lOv16PXjF#X2;Q3=o6Fg$X^LgE(09z>MAQHxD&gYgqO@_HF;Y5ugd>7Ot{Dt`b?M{F7@hW(1Z` z+}G|M>gsyW1#q?o1W3CcnIF{juQXXDMvXisrOzr9&6r%i+hsM9aB~Sx3L;0%MUE^! 
zCOrnN|LIDJooZA>(z9t;eKTAi@FIr)S{>V~pPdeQ1ijMRU0J?bGo6iTo_A7e zezkmtJ18Wc@B2f8zM2D%N=RxXlhrrtnDTcy@nEqLO#eYsQf`_?bbW zwNGEuoMM^Q^8QYCFsS!zD%6FpdWm0TOP5b}wKN$#{(a(vikCX@I}z2nK%vSHkRMjd z0XifuaKRAb3o1q94lhMHd@z{G(o7P#yQx>gN+WtM(vV`|eL+cxGhc1em$uxj!JV-u z@z@5P4U2R3-=u{gFvDU0{Ux#irj-fuAFBG=%BS|t2fzY;=_phZ5f@$FJS9%^uRljB zC)BL*3-R`Jclj+g*$b;dm&g8mR}gVUVp~{PsC-c6Zf2+(z5;jUMIJmyT{@-F^jHxun$Q z?SlX@f7tWmZOgZkX1vLn9jYA?~#G##?~mvmCoTQEs?l_{b7ipIUgO;**5x`Dda(45zemBY<(ROwgL zG`XcS4!(I(nwx)C`H|;`=_#**&V=vW$MAYYBb2T3P&^)_KUcvys04^YI{R&cZrfE7 zyQ!N8H2D_`bN`mg2N;9T&)33kVt7Mg*Mm$mhH9rTVz(z zjz$u1t!2QoJcx>rap2d812fCU$&3@)n5(ptZW$p{S9g5^hEuG`h(?MD5Kr#`lzX`H z2{ZnI>=;13xAW`p6c*1%cT}&3QfQ^%>qj{UA>Zy>`fu`xN?nK01p3C5$LAzT zkDhKKyt!0TPq982-l$CowE}msi_AwS^e*sBGF9EK+kMEjC6^N@s(H8Dx<-!tOtO2HIpeq5s2R z0&>Ny8f1Gc`|QG_#JH_4!bIaa_xk_R0^k*w>14x4fyH^S*?fa};tza!Q{Er1uvrHy zFoWf0!GAyTF0Vg{PR>Tq3V;JJ@2^6yonibjrP_V$trQpUcn}B7tI@<#dB+C22<+2|GEyq*w=*aa|X{KZ874|&J;grceY7~7r&cB=zgYc z*2K#ahF*i5dIBCkoS*TJ$X$3mVpbg`V%u1Px(G7Ig8&v~4Me21GX4p}+NB61&-E>iK z8^+45_Z1WQOFP01v62xx-riT*hXfo{VXF4;jXeyCwp9Z1uR+CmaeXY+i%WebqyNuo-_ zT+LlmN_{32KjSj>J(6%O2>x`9rJxiWZ{r;m=0F~ez>_p6+=dr~{qhS%;Xcn-fU|G$ zZkB*X^6-cLX&sk*wYm|BySfyeyBlS^#FZY?sVelAamXrOahs<*Pnhi2AL)f==CTK& z@}5GdCc!rBH=FTfX69e=gwYCv_Q(m7h~;aZ94++zBCguAoSh}EfX`ID4>@cL&Qs~w zR z)OG6W1Me*0<7TnRpo-)Q~?dN zo@9Qih5sb(8ylnS&{~QZgg}T6uf?gs``IcvVxuE()qa*v1pPQjtMJ~hYN@XTgR3Zr zERj9`cXa#55@XUK4njWX(+Yoo?koOEJ*@6XxA!_~Wj+T)`r|d`dQQrILZx}P&D^T) z5EWPXRsA@IE-`NH3;Zi$-X*Uu9I8I^-IEDKLGHxFtCxFz{&aKlosL0V0MNqz2MkgZ zC#xM#Kr|(pg8M3gQeYhCk#Af*f4BY`*f9+uS@~k@+8kT_B_C@7KnbL))uWyWr_B0TH|2mbMS=asq-t!{dgOvG24 zXHp$<+bl+3m_?8$J*RkXYU{{b9ZkN(9*LdG@yP}Mp?SF9+VshL*Wl>zM!`yX9{i0B zC~1udN&LW|@Llrukd25j^&@p2vnwL>5n_UP$GX9V2OY+w&Nh5i1x&|u$dJgh(js2b zBnr5G&PYbw+H~KMUrdSJYUDo0RzNdY-r212i|W~YE(n^UR43%l*ZdF?)j52|x~pu2 zgEM>E+WJNu-SIyDPY)p~4eNvKoiN{2QXPjHeQtizz7M7hqC1KPAu1ojkGbw{uDww>i=p@j%||)zMHS8TVa!P1uk8 zV!*6%{C8ybUwG|~G#pT~aNW-;)w8^IDYSN}?7W!^^~M8TpWARhK~XbFZ#R}co!aFs 
zgg4|4XSTT8?LEtwyhrc48&fB6-Q=4BMCDz%?42^+iymMC18<=XfvY#lvcSK#o$*k5 zOu{eEKN$^60&iF_+={3{Aqi8H&9{mQ5;$>sOK*qi{Zc-4GKEqb*jP@eSrC>)$cdQ- zimB=T;~D@7{{aHU^7R6Xu7sA6M^vW-!K5GD*7B2|OI;{G!wZyw6!s>(K#7kl+ z0RQvP=g|$^p?FAN0bBG_3qNK0d{tvUHdn)}X5jz1o5gvG(;MToFl#c-=#1Ln&KRx$ z*dNwl%=9c}#uz&4T%ZPv^2SQWN{Yc8Vi52P`5L6W!JOzV(laDeY12pDyLr3`O_{I3 zCqd^L=iGhxRhJ`oRcLm2Uf; zzAkriYtD;8_H)l7k)eSMU~6oG!PnjL=Q3@@jc4HG;>u@7y&kawA26+O7w61Ph7`!_ zYkqgtOS;^hESS`2#VGTKJ&u>e=ah;wY{PQ1(lYEU1Y_+)g-Z5fq)LQEnv6#`)A_)* z#9F&r&exl~LNi-1REo-!75v6|a!@bmbyKGay4UxG%;U7M3uJDLG`~mi3%_T@V&YL) z6}nYh+cn-eC{#!l9lj)C#yle8f3)Oo7>KspZW&!Zv70;%%!kcPW^2SE$1*L$E)O&q zB{T@+mMWCCuME3(=s)FN@v5~J^aRG$WMvtHm|Y8_5{qRID0lHgIv?gxkZZhS7CDRx zj;Wv@l+c_;V8sV345hk!u*WvaO?b$@_1$4zl7qY73(Hh|W7h|gB=pVoP0Ph9=IfXB zC;7hYJ@>k%H$uh*ic|AmD~LGDbcFAdb>6G7PnJI&-h409rp@y;DVygn*Rg6o$Sf1g z%!R2;TB}S5hzK5M($u*7zuH8UiM_B8lpms<%?|87a4AM<;83y>NIu|P?A+9G0PhbO zG&zYY<^QZFc}5a)`mEC)L~$JHJ@BGur~Q+(jyMfHa_`k9PIBunjxWBBnTIz=0$0Fo z9!w1o9e?vGxvY4} z;Jfi9bbuJ!j=3b=WDv$@m|(iGI{kkjtOI;^_9! 
zyls1fj%!~}>2sX=%8{+lAqt+Vu%DY+E1z{B$6&9U8n={U`r(8O0sur&&+?m zpQX3`;;{81-AKRL!Qd4-5tPyUpM4&$`_Pgte-lCe3m615qIh4p34KDay-!fp9~?{G z_ny-=OyuFu%*e(tDLnHLm{C-}A9)9ahw2}E!K2iDl^5W5brwV#7?}`o^ny;xjhR@{ zqqyYS{#9jkv$;=_z0j4*X4}f>L&qF(xj0 z-MHYrADvX2UeHMz?3RVNA1wj4>sW1k5K*?Cl9GMr zpC4drFN+6xgLp+fQx&=|g(U(ZCHh^icnh7#lHZLet5m=AFeIs(HlT4g2tXkFMTH(xFN#RG z?A*4%$8b*R^X{zJTZ|X_sunIKHgB&M_dt=gzNN$E+j8%2m#6!Krk<1C_=RQH9B;FT z`||9efQ-o5p71#qj4Z$D0@^4Y(}_ueR^U2xQ6i6QMRTLxFz%awys*}FZcp!13&{4> z!bFr;UMu`$FH>KU{TJ%1gXw%kCvgDb0~X)@T&WZ}i)OcgV;i+)knRh2&L}>D0x2k1 zI@TFUoyBkxw)csTLirgqiD^#AD61Do!-m$XmvC8IYJj7mQ3{7 z252n#cPE@Jh$zXp)5bIg(v)#jVKKcr*WzG~bWb8RC?1~_8c8MEuu+{lH%=+$7!b*< z{ffV8B9}A&DepUyb0Cyc_!9H>HwMzi`Pwe_?R?wsg`tdKgG_PwchVL0$85hB-)bEbYz--pVOd)?O4KN*Q3|UU< z5Ch~i?4S*tciNbb$qAWud`%5EdHydN?jNw1K?s;*qz^RaXuNsJgYy^KN&3w;FwZll zGr_lyDa+_jW8a>oc?wXU^d`s+C0vMwk~-LOsd*6W{Y_7SO0a+(3_ZsRJcryxsv&FC z_{4nAhzZUzbOBzY%AV(N3m%5`Ibp8N(KxF2a<=x27$%K%+9&7X?vy_>r`!!PyNh7IdKenyEAaId zYfs)FRa`ESsw1^tfOgzc`QWNzVUt~FqG0jR_W=B)v&$$tRr zpm5;i&pn67H}|!JRFY&z*UMEMx^JlH)^1;~aSjd9uH5DlJX_@-ZqobA5R z3E*y?M(qUT7zU*^?xPY$;+vj}I-_3%XL$Pm+`YKMKo)fH*(QGm10jC!P{>SeYcI~( zjP@RvXmY2!HX~f!-ulxTneS7u=KIAut0yg{fC)sLsm+gN$3dSI z;7Bz(%9w|&K_H*6nPa6kzCB3!oy;8jjTZp2U$fI%uC4FBmBB~Bnjc#2{yB2^CXI!h z6@D@2eg(9zV8ekf-E_R$=P2~~+{xkIgxeHeu*kmv+_gA3F!VR9KoNMG=ubgxkq6`0 zLze*&DG{K;#mf7bOWfXP)gBm{-@_<&h}NY4k0}Uzj5>;-i8(UBJR|asC<&Ao!Js)3 z(t@R=fS~En`Y}UcPgUDUv9E4Da#my^A)t1ki6K1y=5y;6dU;Z{3DS3%NuUc1DEuLL zdBgqBqxx*jE3I7_Ey!WQwetfROu%0fPu|AmeOMVqxUN33^mMx;C#RYJzQ{ZOv7mQ` zA(MHv^}hN=T%}%F@>RK7g?-cge~&Nfrw+}(!$R*B2i{Gqjv=bNoYuwFRNye4ymfv6 ztraw1w9_uK=Gz4Bi(7_G&WOBq*sat=Q;jcCwpVACTbnTP(r!49)4&$v2gH}!UlxtC z@AXN@e<&*K+b*q;LVIdu#1o$vEIKT@2oJ?=TNza8EDEqJMLM#whjsENQ6#7IzmOa%E zcd-@q_y$*ZXjLE7cL~NUn$fZh-mD2G0^{VtAq<#LNHoFfERCa`*Vu>i)l>w${;k9K zN82Yo>;sai{?8PkscM(&P*QzK^^FVR8tdh7r(c%cT$)tHyHDxFq=xP`8|Xw0`tI|z z*(Gw$UnLHX&@<(&(0Lp{-Rizih)x*dD4^k6=us;c)7#{K#Uycv`*ePZ2S~`NZ3r3|=Bbvi>e~K8eoWq*on30sslo=v28d-HG#%zGD 
zZ>Aa&iQ$bnzX~1U@G}6f`bLv*)mBzqmzi*8ry8CNne|TA-$x7DbfmmYVAeq@(&WaIh_8&!)U$A}r$FEH*M84X zgzo&C-!Q8thpSpvdgnQtHD5hmzG!XWQ!p?Te!Tx5!3f|@ppEnv^2-z{I&2;TN3=u% z<tO|{UIx^k z+ux`yipGqeM9yTcy;+#bq~F{l^!bpUD=JMgRTEE1rEix_&OcNH2lU0apTJTh(f=}v zl~|=5&m)v7kKT2dT+#6AOvJ27gPKl-PYhxHPXK@;wyRT(JkMdt^a03@mYUrPQk7FZ z(6c{z5v(*Ir+xv(v8Qx_H)J2^7AGhmcT7At`GAkdJG6|d!9XKQMlX4p(Ap&n2bd@$ z*|1y36FEfz;iBO3=BG#TwS1O}%HK2RB;FQi#z>?8#oe&Ob!1p<$FRJ0IzOF~y5ppi zI$IsaFSsDrojUd8hqC-P{}6Ov?@_jxE;N*0{8KqX$_-q(oNMQT!S<;E3QHX+=RAO+ z%ZyGkme7mSoF?>lR-WsRUvAd#<&fq2$6?Zi$thN;Pe`6no8J$fdnXZX{C|ACby$>L z*FKD*Qqph}(jn3cNH>xqp@if}2vSngFeBYvl0!(B5|T2~-3?Mhcf$+}Gv5e4zu)_P z?{k0u%;5ot%ysRx*IMT~*SXhDe#vg5lTegkSQ1_}+b#TDyTJIuow&Ug?(bvN1if4{ z{GKhQHjQ!Lx+bEFQ1xWE{MmVAzoxx;EM`fv%gCW#IT2Kj=03ve(vn0qE*vL(IQ7wq zDjZT;@av`9!%81BUaRf9((lS+k5-r2>3)l}ESOb|uk%$t#q%ZiF%n3Hw_}BlQTWkb z1A775(63EzOG(H`?rU4Ef!gq@o<~o`C_n>LMq-8Q-vQ&K-E>DiB{0m5i>AmcT0V+t zKk!6jrBs=6>sJIgTT|OqjRXUS!~4(9X*1Eyq8Hvvg!rE{ODy;JVzMH_J=ZQRQ=S^w z#lV@YpbmqY9PP>~wDGZn+Nk^Ti468!;lC2_cu*AEaBv3A+S0zgBlM6iHS(KSPZ0hi z_m$|y&M)2?}zp(tMz!fp&x6P)kb3g1bHC2q>2L%P)vO;zRp6l64 zbdp~|o^hkd%wP!$A4wU}RlQWic{@A49u3NfTyChgwtZw-R`tU#Bgs#nEdFv&j`S0a*M4w9$bIf0t&UiAnJ#usS zgvuSKrb07A1By~^emfPr*@pT}H?J>*G+i#S5a3@MVY7rSu|-u+4sPxn46fS%I(`j{ zIuRajR$Ql7qfX>-!ihrz+y_aUf^W99E~Dt;n6=Ovm;k>~zz_peSkR4ckUXOOVQ9N- zB5}n`oBD%8do)5tWUYbQ#}JYiExSETiR}K6$m?wrAc2+t;YmPBEA1ubwozTyXUx#^ z?u>j;xn|IhOMri**vV!;%1O61>XlE^<}pyEtcRrc`-rHUxNz%WL4lPgC7PpiFZ`wBhj7}2M!sYtS*OAX~{+jih5C+MASJ4NZl)S$BL(PgLlJgfRc8muiPFK zTVE89QTp~-yN&U+b$PReSYCPgkJ9C@XV&|r<0m_A<5H8TKw;!| zz_2l_vb%?mTRiSJC)`9W2sXSpLS60Hk{*EpI_^5y_~ncFX}=;tcsVRmqG_c-_aqts z@zcia5M-wRbu8U%`oVG=9%nuBjHN#+p^ivjQn)C9^^#oED*RxbJ$PNzCb;1i1jHiB zOE7s+};+<^G-f=5tCVUyieEgEFasVV})czF@ugBipTTsV99gCHL zW)B>G@#9R3%=-`ro9-^7d%nqgz{2F3ET!E7QkmN zTqR#kgks#s)6i#ngDzw$_FuOihLLoP*BoCD_fWfe?@)RX3~BVWvV7s+1}j5qkk2j#Z2TO!(?l=UjE$(vDtgH`0ycOxxrO75X6SBfYjgJ2l zcm2UAECJIKUR>d9t*!U-YrLjo=gHqMyZ{9$?CQJVwH3GY)sCstZ!U6H#5=c7g_l;r 
z6rnBvM@Wz&+NRVPFt8>1rW_`?Ye(iGG@7iW4kJ`^nA^|=He>Ww1(<=c)To-?*J4+wzOF>>y7c)6Zsp%n>32*1F;^2w20R-Z|h6LWq;%3 z>i4-@n+N%`ZFFl56+yznPZV>mNuW*7eaFtCJ*anI+foD>$V54>qU99mDRa`PFJ`EF z$C#a;b??^yM+>kkrI)vbd1(t@yd*iZA5f2Z$IF|KU`$YyT$IJOS{39+LCL9Ky?4r| zay<`0EMz9w?KFy@m2V1@;=EF?0<B$~@>TlQ-XC5WTRxH3LVR?OmP4w6+c-$;1G{EikI3&fxq8Oj=OxlmgwYuK+6``VnjezB& zjWQHQ4$tpsqLHW!`G00lf;CFioDYkzX0GNdo(YNhV%SWUt^Jq-`t8ltzEQ$^X^psT zKmIo<5E2eA=dWA`_cp2bh{h;pQ8_V!&(+UdWGVLgc!j3}9whP<<)Y79~$`q3W%zbC&2n5ZQ+wFc=;c$^9k$<1uV(Y&=agnwR5K zBFUxasLphKgQacL!U(;W>vfwfA@_5cmiJNjB;-H?37-6mfdVu|&Y#HN2#B~L(FA%+ zX1Q?U9qY4*H}TI~ZR4Cbq)xm7W4~NG9IYLYY1K#w)J=M@W-=ZY;JQnwz&v7%&8;p^#upLRarrowEw-yiEa(09H=$YlYT+X zkf`cgW)*~s0^TNyn_m^XBJ3`j|0>r2Pk?2{E6nt8PT-y{DAFFdlU6u{;h;8M{R2In zac2Orh5DJ8Y&iEy-=Wa?bC#&o^`vwj9zUZdKBg|wgW`7IZ~}&uAa#m(2T9>@VIq8? z7aLY&P|;hQ3SSO?I-GpyTmcvaAO^`1y#|vbQwAYS?9Nqhb}uKT4r?G+m{Ge|YD7#p zoUt4&dcV-a98#1hm$0D6*)7EKWG_~5^Okbr4PNq(ycGIdV#OZQ9vOzv3VvUf974){ zpYBNQE4S<_vzOHvZ-c$?=lw0#2>@D{k8J-9iYXO2yPlrt*%xwHBA%5&Ss~bc!L<%! 
zgpkOuUplKXH2K)FvbigkUptgXsWmLxUqRMZ)73pg8*%>ZK3pmBp(Bx~AxQMb#!5lx z%CXDO-{)Z_<@KfLi>FO6`nC97>C(-ITC7qXnH0hgWYb>u>|uW`$}Q;&JlGfpWfgRD zPd%)(GZG3Es7%kwv7CSuWm-SabyiT$#=e%ov8sM=*@selnL7UhMzSs5~J}_#nnxiD2E0ar_~d;IUSLU67x;DilZ2M|mqv#vZ@;9Y^(V+!uPkf=s-p zQhxkErtif}z6u4c8p`x+G$<*0(oSKZfATjnil2~MSmf7VU)XO50at`(eqMzh<`6yk z8Z|Y~*RF^ha|OZ+CM&)5*dNNm37T>D5+!T)7@$6sTypxDx@9)?q>7_D`ggqFu2p0x zAXqZvZ|XfHUaPuM@AI@i#6GG%cF}9-pYny#mmdQj4NzavqaG~6uB4WV4uJSp7Qz$5 zOc!A%jQGi5xwrVEoS`b;`c)`l7ow15f@<+N-sQiMP`+yl1A8%dEXg70n!@q)w8He{ zd4*})kF-05mYHmX{fDvY^cyyWuXxq|vZvX{66pDZ8)!WPrJLl_e*%a%w*UhZwF{kL zHM%**vIdf_JMu2Ez7qI;zb)8ac$|!HHye) z;~%Q^p}720>Mf=s6fF_O^7n$A@(1^?-;7&R!5?OqmnZD-9IQmv5>(|>1dyh59k3VtVFHZZ z#yC9t?z1GJrp^5o&s80)i=!;6w*kz3Uy3Z;wWYVX#eO`z_?A;Ey=`uM{gPz4(q$ge zUlQywQPa1zuVQA$yg997lZjk;)&)4#CX{o$iIz}&ihqT z@d$dxtJk6dD92hwqnxanR!^iwfr1BgnYZfTyI0}REC1Hi<>!c1b>zL;1t-q=M3tAQ zD*ABq|G$)91ue)jVJ(dMydin)P&T|=dVN;yV3(mb$CbD8S1^o%udA!69Rh%;3)nE& zCnPpD*1;&NG5v68u{sLivqLRdwilLbpr@oO?TrPH5szsHT| z8^Qt2d1KVa$a2Bb1G$OgNW7tg9d0zW@U~~o<|QH-BHm7W5t(D$Q~6jC3{0>G?yL7~ zKmTEjlB*NAbBA{+=0~JXN5=!7$ByJ)kwC}0#(8bAkEK4z+^}+J^bC=fWGw6?<-O~E zOr9x*o*eYG)DXm~3pp*IlsFd{BV`_cl!@i0c8Es3<^y4U=<~fzb*Hz$Ah27PGNb$? 
zR80Mtjb7^E6YQ@YxQ{BD5@~s_*<*&K{5kvrR<(&W9sG)y_=dg7;W4D@vqk&X%S$Wpymsy6YdOFHmJIcyq=gm|Z*I<{5T`_^S zjRUE{qOAv>1i+vBA|l#mg%G9G&!p4U4Xr~>p4=A);vK##+F|!iu6y}(OZO~R9E1|f z0|5mq4BV~u(rx$p$75b%7-{zxY=rR=?UgQcJT zJiqA7yR~-|Mmz~UL5Zvjh_D**x)ZW11q-D&U7h6QXm_v}OU0dE`?k^3gj_7FeX-1w zkEqHo(CO>0{wb$muiiUha9JpnXjV`xpk=TIxajqb6``A!80@Q{`f7;lp~JyqSXCeV zT#nV1HXXVd%8}xk#JgkKm-5{qilfv()1r6O7e+UNH0yT>abb=u73?$9jMWg?n92=JZd-%<_% z$^_dMGA<6Me}aZ~$CWWS5%fvE9{=*J?L`^Xx0-P*?(vh^rM*WN{ZUW z0N%8QeWfx{w!BqWQ6kEDy7~a)U12Q)rN*(x--p0o^=+V45i3}3M+3Y-?1MDh#v{cT z7M$$6!2Vj4Y_^64x&`LA zVzbcu6AZT(sCrPPJ)}!7-f@8R=siGgTj=2Dg)98U!XLa{o{GggcFmI_Iin+g>gn@+yzYO56P6)KnwpOZe6kn!`Ob-|ZHly(&={Z@Gw6^pC3FMw1JyofP^M z-O&Q%lRUiNT|GiwnP@{@y!$G~w=)kUBsyp>=*>PggceG%fxLgTm|f*h^ZkC68ek+5 z_W~{Q?(dwiNdZ;E!?H4e5RhEz`P+%UL48kofSyQaI<}3wCg6ZuWVX|Q#t}OoNwsF2 z-Gk<_EGfZ7Rn90DgQ@djbR_(7#@YB5;xs_@x@HU;)p1`$rYC&Kv`^~fsZk%p2LdKE z^liFqShF>rA7FzxCNSq#eRyNACM7brgv?mcQi*z0u~}fKD$VE;m57NxRQAwueT3(# zpEFo(@Mxe*{h3i;cYw#4J^I;6uL@?+&;Ffg_ID^=JP3WEU~TVK1k&x^LEG0%gubwi zc8temwAz%%YLPN}iuuFSPc^5!U!x*RIBef)qjOV618llBzJPX6Hiau-qe8R7@vsNW z12zaTJsvmFN@Q^{1YMss?HC=Ho~-xH!pGwzH;0-^PIlgv@5%lvfj$y>k8Z8cOPdV$XQfm)6T|ZWvP&Gko)55cr2Uy<{+viei4qlT@@1T4`*7s z7p-9F`*e8G6DuU{r795{kc;aUzAVBvHo{@9!UEllo@!UTYERBit30}F>b56^0-qGC zstUn(r__(d=%J})9*PWUCFsvG?_|Ci*B(dWwD(>n{HpXlRmefAM)I z>ay1)Q<6_EX^ZNt!g$DiyZhndyjY9j&l$~+Wxs8=zyOKo^Go4`bQ=fzlAW+m@ndG` zuTa~D@%9HPOJf)5OPf9VlF>({a4^_B<#>EqulaDmY*&m=ksnV;j<^#?nfLHy%2q(6 z>0(aYy$kGUYR)Uk;R;XcgRcRAY(l>sFgr|-{J;>)Uew^&OFKlzEwsL^jQ&UENYSZF z=v?I-5eYW6wT<}x{mnS~7L`XJW2WSHNN^72(}~|FRY@7*7soq{uf?{T-qN}BiWQyP{%cf&O3JfYMJIkZq0th4yHLRZ2L6X8f3^7-im^Pf^SQITX=zO!TN0{ z6IhR7x+pjfn)YAuF_{^DV0=0YFuh(ABBrrx=^&sC4$wXt`6eo!CF?ce7W-jwMPzwQ zu4DkTyoVBRLT_@bV{J0+JZH%s^ah%Il*!Dt;eLF`g+6Bh{T0e_#?|Ig{|<@X>~50o zah`1_Id<1*9>Q`>D3vPu0KCpg7|q8g+LiQt-krBhz&Uq)ZTFJ?QOR*pWAt$)tTMP}a$LfT)Qs_~*mdPKqX2bAbX zfoe~2Z&WVh2DaC991Jg(IGL6X8bznCHmbkko9kD?#`~CFhKzhQRll>W;mX@w30_di zX11KYJ~;F2EN>h0-dm2Rk<@%YR_v5MjTzFeu`vnJw?Ce4c6@AMjTnyPtoQy?PV++G 
z*9YjsKEdGh)y#wRc5<4wKF-oJy108yaUOn%Lq7?S05#3b1#L5YhR|tuiRVL5&vW!e z&P&M)NLORj<woye?dA%B<$M6%i5dKr?1#jQ0sk1P$!PUX zfl64h@>n7wvjxbk3pR5+WKYN^)#Q^H=r^(HNww~h@G8#1iI9Vn{-47<9 zeQ+=N*ZLVb@B(1BiqHEf>upRDX7Go*qYs;@Q{+f~HGZiKH^~tS>7$jPDIQ^eVI>k= zO<+Z$bHLpFCA!Vye&wbw+wD^=!3ijhjaPMnuFsiT=|Kph+!dEf3;$Tm#g&l?6!)4F zy^SD%{xGLdWnTTE$Ie<7$ z)|cF-Vl@aon5XJZfpFyou~uO0$OHPT-_NuH7u`2JA9#)4FZ-*zz;2eU$vo<~dh9-R zu)n;d6dChnc)r%p27t6EZ`kV6zkWg>x?8zK^Pzffo1uY$^wQR}B0OMX(P+G-!?kgJ z8z6lprH($J2+@LLs7GtTe-)g^WVO)1D24k1>ODJE%Vf%m3j^jC(+Pj9nZgS4 zx4n@I#LZ{Z!f8`ll*S7k(s`dLz7%;zWu%*Jv{0$3>9hr4yhDZRNuJJJe+`xEDb!A^ z*LPI|=09~R?nJ|o04%qt@2>AdC$3pJ*6tNVh0Z((RxCk>(}B<52;}&+#Q5p8w-8h!%Y>b#Kz>{XSf=y%eJUcCsX(PMeA@n}4Nd^VuZ1 zeb1@DQ_f03Pp-?5P_&ql;%hT33%$fE0j==kHy{4OCfrY5s(k)_@?j$7CVST1s^>%9 zn_P{A{m=OHI)unbgoX9jcn_x4b)koMCyM!=&;4TD7?*PXW^bMp?G0zxNlu<=C{J6v zX6=<%T<1p*6l+6A6zDIq0`&Cph4_3NKpl$G4OD=kRy37B`bIm|81QX_ckDs%1u)G^ z>I4`iacxvtdgoVfBBw{60Whhy`3!w^t0dV%-jOy3+5wn4FVHhOYsSWW=zGWxTOc;5TH21Sq? zYwx%dl<3Y`z=RSj&n$&qcC@JV_@Ya2g5DkCmX119K(pTHx0 z0trW{yakyCjy_7tg$P;7{^+}2_yl()W9zJacydOqMXU!6zub1Wn~#QCv9xBbIF~xW zQ%F39Viv2|=gCH015%f2`|wkGN5qnv_*1?~^yEhog`bnFLZJ7|B|)QsCAXdfS@Qmp z(Ffa|QiM+|dWD&>*)T@nu4OWo1L0{NM}#PTBC8pI`ZTvjqU+UM4Lmf!I294IiP*`Z zVFZ3?-e}?Zdi^I{0g5L0qtg2aS4m+^8K;6PSS|LRL*Ku1bafSPtSRoiK$Ceem}&^k zKti(Ry6#S>uu{5vllv8)4yI#|qkXj5_K}(I&3Zcis^$V6G5S=p`>+!** zo%`b`Pl=Tx<}q;>^UK{!xA7Fub*hX)N7-#(aNqG=3DAH=qQAQMFCXyrn(FYs*N6>^Et z9k#3}{VVhvdQ;dV!(TNmCt{*iBoE(ODH&wQwRgp;FUHAbV1Z9gmbEApiWr1F>f9GO zg(==U2E;zIQeH?@@H0Z?WQQFc^&?Pstb`!2lra6xAJqfu*QVF-pKag#L39t7(iMS3`I*3G86>6S8SZ9o?QDiN4O^~}9MUnFqhG+*dsA9QUjX!};t;O*xQk$=@1boA)5a0!x< z#=6QT2cX@5k#4pR^MkP6eTi4tq3^cHycQJ2B3)^GEY^}3AleGJVV%KCqc@c`60c8J zyHh>V@oKNu`9k6%<~`MuQyr3plcC9zl?v*t*S!Et4?~%zy99i#1z1`{;0Mjp&3qpy zzvFU-L1o|e+&^ru7H1)$m4N=Nb9&-|ysdng*1}0A%Q;Sl>DQSbP{*_0`{jRgC1gO?B};M|=CDk)2f^k>YcjHj zwM|xj7`S)X9ayYgmgsM(stLh^D>9Y<$aA`V&kx=*f4dL7U#`!3vvak98$$%gnR8%M zQ&Hi;y9t*;b?4+#=z1T<6DJ^2a^2ytVz@=?V5~1lqwOF9Fi>me!3@rFeDAqIPL96~ 
zx3s+NeW2}wk~!LVtl$CL0kut`mVF}Vyl@ZNygq$!2mv9S?bjAjSD>!-S83R!T)sL| zWZ@tGm92c~zqn9#IXVu?J~YCrWiJ20>#Fj=WLWJz^3}+>&;&n}VKfD*H)iC7;Qw5Z zS2KQtbWwgMHZ`L3*NeOJ9_cK)E}QVh`q&h0J(~lsl8HEd?wYSj0s0I$R5%e_FLygX zqg6(qTfS_#9P$x*iT~*8v|snhlo&esh9y;DX3o5O$3zG79|p#!vlHAko{JZG4K_o) z5p^5aj`lAs&T?uRClXzbPqxbYLmpjtscUK3dhADf?JqxCynM`@%Z%~K^)aH~fZ{mb zKD{d)>7*GgN*^aM?mq^T*M7N^o)!0;Z9UgdlyF;?BzQ_g5U5NZ^>ozM=~<+OX(;-{ zw>|I?jJ3ef(}a!QbI$u?qlxR=*NNF?SktF4q3NolzMFa)Y)EmjMr)#Smz{c8U*1?l=HyW>cO>|CYtot;vCZ5i z6N|EJkztM&E=nUs@@9=H3c>m8(Nag07YkmzVs5(_fhs*m4LcTx%|b(#aHvxi6Bg7+ zb(BcUfdA9>+y7_*ny1y_f_!EYPP*_rl^u7hvi;N~^{@5h@tH=gBD^bCWOK_SPD3^#{A zQS*p3Ih`Ms7=`jaZL4+VZtJo_fJjQqq!OD_H7?j4BIWVk8t6M0yUz|Cl)6@aoEN1& zdl2cFk)6$`Q`Ue;hgW?1vssaxJB0WIM-*VBfVw17m_eA$@Y}a|fcphgd3_Tni@mC- ztaSIva11PV;}a~Y;iun;d~2{fMVzQ<;~@cBW8T??n=U{_za#i(!`aJ0X1D52T&iG2 za~piQwfC?i6(a3+=-$oE+OIsv@3Z+m)rEJ5*H5=&#Odu}w(3HWIY;}AyGhOa6oeJ> zJ|q+lG}VgUm_~WwfZGbMKI4XRKC|CKUkX$PX|1VkN@gj_%5e{5?%F}L=@J~jfgrg+ zUW+o;E1WJWGz#s)?a@2%|{k=*ouj7I^rNqzRs9*+>&2R^n3U$J1h zO8DoqW7oAio^6-FR_TFp(ZJK({uHrbhMKMm`u}UR|JT2eDQEMcG3hEvlVFk(E45_h zcVBqQ;$Nsec?vJPHM!+rg}UltAg_03a5D`;T* zxzujP_T0eFyo}Hl%V>9eKxWbBmuY8ph_`EDFNmOumGNdb>tK%uFi}uXsr?D$u?No6 zyddk~7j;Uo8rgn7)eOGx`b_*;L8Dg?}MJ@cdBD& zf6Q46sbJcbNI3RFnPJ%@;5FXMNdEzG!>r#U9br2kSGehsX8!;h*5B`edZ3%f1TV;w z*&dc;sfBxfj)A2GGn-HyJiU!yh^iH@_ud~qv(dr0P{+yimh=ALTtQcH(cy`ijtsZo zJq1X_kjIqQf;_+;AQ^7q|87*V2PKArp#J`W{}ly?nhwGOPd{a43KZ3Fdy;g2t&eBx z`I@Ev#QIOvw@3D!{=C>-s7V7|Zh%bZCd)9idZWZXrnk`w4VbF|M5K5$%#GYr6(3k> zd*89z-718#D=@k6a4q@XgA(OBj>Np3N4O5uyuC60-VU3Z-uLQwv=s;{8k*?IDit!W z=O!!7+T_L+gP(XtU#))>AIcqrwo1zwX?`mH>k1h@STOId@@#fB*2n&iz1I-j!l`MN zj$8&3(gIBG!I266cx|C(e=)RJq)%r7gJfGPFgrGZSHdC0?Ru8qd-dYS;c9-YRXy@1 zj-C_J55Yi4NBWAZ88%p(`oX(*-!yk$$_F&-;nl;rA-`x3uBA3JzGHOtEuz+iw)oi? 
zmxTy<1%)aa-`fQs1?c|f5h+e2Y?|m19h~ptx`X#<$p_Rwf;o36iHYBtb3Yum27tdu zPY475Pie7DK8Kb-tXW43bs zOu-s|`%Kf--a8<$uiO+qQPCwM{*3T!qm&4&>X(&DxFuc#z40h;d9!uU3A|ZcD`P{u zgkzzn=L&?gadg)`@Scoztnlma%sfj0PWAq*oaF_eL6S`Pa7tNeJGXFOF3ON{MKc_71&qN0WI@8liT00GB}ByF8HxJ}={fKdU@_{)6@6j=i5eG=g6W@q zvbfM(IJ~$6`#yDk)ZPh1FBY-DfN+yNX`_%-Z|hpg)^>*W!@&p@tuO)qe6jntAs^f< zX-*)pzm$PHh*!wOB&>O;x`4tXQ?{0+Je7)1f)Bh9&|>gKiiH13-S1|4bx*o$kx_f< zAPRahMvLNI`og1HRAn+RVvg~n{`}CqeMZ-w&|nT^s0M!d$MSO>6%&CqX^OBkUOH&= z6`}Le2kvSW*+kqBVT(bd&Q$uww7v=uUf%1J;rpx!j^B?gtx-Gj%IOSr#{^3}m@f?C z0-ybjX8h}xR=vNn_iTaCN47vj+C!R!(4pfNMQ9&RUlR998$I%=)Csatug-y(gs|%S zBy?j7z4kvxBMO9(8>BH-SJ&ZxjrQm$s{o^SyF-%{R_)DJPQN|~2xJpd&Y&?fYD z$*B9_u;sPg5xIjTONf&3dI0RXOb0VvYADlKmn^9bG7nJq3vdZu+@l~QSI+a7r!eZ;ffmMnM$JU(3$ zE!Y4(Bkod@IyPRdJ55B|Z=_EB4Bn?YIYx31%olM;M$v&DqX)kS@Nn09eyxL;{as+* zDcy<)d^(e^Agme0G+Lu{hC^NRER}-FgEU~qdHt{)j8K+5cBVI(DAv~MC9nQKZ+G!8 zZnd`}G%9-274+I<#86q|6M~#ud~Al-j4>fkMtEz^O+|=#)|G8Y^%+Au(p*$6Wb_Sa zO2E~3Z!tY+2HjK?bNjf_f*tH3%r z22XG$@9;T{em%@KT=zRjQ=}u0X)=mqGHz{Rde-qLIf6Xl^BFj`k zz2D`;rXeTFN##vZ))x=cb;kpLHmQ`hZ72qaV+gVSHKK8|;zaHd=;QQ#fc+jB8_KJ=e3oR~5X6l}gY%f+wWUmaM&C#c z!B>fUUR`kei``XTBx>HUp*mO$AQV6M@k>VyXJg6;z}o*NJgUn>CN%^PD3=%M=+OxEkIrA!FwVIQx)B==3y87c$V zkHdR)n|s#>45V8`Ogiq*|E|mjdsOwBm|zwAg<$cvB+*d)HXgG25DAMe%0u6l8!yDv$j>er)&=jg=`p=*vuM9*IL>3rfWl%d~*6 z3HirQzkh)t^FqT9OswFLS^O6g3IXJMUdXi7H?QL}Q4Y#W<<_HyetzlW4virf3KHNd zivfIczq27{8%lCK@}ls3aJl%=VoRPb#FzFFGQ)+?J4#{>{L7MRxtLMwP1>P&jYmj< zAmI@h?M=`}{efffoq`mnRd;{8#;%#_ zs!sknMZHFo1aveOcDH5P6=K~V!^Wx0W@H!~k+Q)l`RL85+k9zvI%xd)Ng-Ci)5}IY zyUB+n_gs#EIG#xtkXulZhHAYYw-itarW9|8HdAzUvPB6J?9F#C|4gHl1X2!Ar|++& zn;k(rnocqfR<930QYAV@GKz}#-|2h*cTfZLx*UM=aj!^$5u>3s>EAx=2IQ>%)O>t& z^teRT_}kkbH9YM4*=+YmqM9#yt9GWUkTYi(S|!?@Y@G$KZmKYmQ_Uz^sXYJg55w)lFaXxG|`hdJ-ZR z5&Q?}F-}!;UGTx&E)gg34iOU_KdS`ai~9$^JmksJpKvhxa%*;QVpEeWy*R>FGo6!nz}*fQ+z0A3Z@cms`*13r ze}vA^149SY=Z7|zvWLr)%%Y0<1mcbk$Na?+rwnBkgJEV0fRy`HVtP+GO(rAwsxCSE zce%%=RyOz5HcK9BiP0L-;uvM%(2{4r_H`0z;$_p|5m5=vUFBoZRN-NCT#6&|g5GOq 
zmXu+6d`JwGf3tJ(hlYWGQP-SHZ*ZzD3PqbZim9x&58X!}j;bE&sW1l$oCr|2zF^MP z3HF?Slcq#f(gLfq{F=lkki^HgEg~4l=g>6x>s{}%Zvv*`4qC}+)}(U}zukv-yZZyyT`ZPJ3@Git>g~Cq;_Fe|+9O-a|o)H7P{^=MHz@>}a3ohI zDen7%{58Hh)eBAS*V^rt=8|%n-s>#|=8|oe?0PziV+mY#G5Ic|)e5=BY&PRcYziq~ zD4EM5H6PLE3ktTwx*az!>GkkxNK>Pk>76Y7f56xgR@j z*9A#QM5JxX*kE&5v_0#vY6a|1N#VRQHHIEnd5AGr`?15T1BdI2?t0SmYp&Ees^eFt zqoEBkStJTLE+6jZEc+A+JuXo4PGbWc{7GOnGP+0Wg5NS-l-h#%>Er{ZlWh zt3zG~^(DJGqmKZ4U&Y>ep!jCgGqUYUkk8>A-bqq<7?op~bGZj`7FtdCv)#OS{5GBkWfpV zBBqX8yN7e;6*(p?pIU%otRgrHYxnrTzx=TEIv9=L1N(IN4_Z0vmj}U?Q|rNcs5aj3 zUMI}vq$?dRIPk_g>Ndh`oeLUaz7}-+Y#Dp{GfzYF2MmJ@$AM%4{1#MI-V?8`$2K5~ zjy1q5c)=WM6~G{1&1~zne&>hOwX{L!4c}ppJUuhm*l-D2q5u54JfR+z6Yg;(2J~}7 zwu{vr7jqVAi9Ysm{9XDYhx8k1hARx&_P;9Go9gZ)O-Jp=@4&_pWFpWpNPWFoT}4~Ok2Ictmuh|c*aLwKa8KUVXJk?Y>V{U>=&t=tu< zJ;#AYaJ07WEd*-xR=-UEN3QM;kE>t!m^Y39-pt2^jt*Ay9uKG_H=gn*Nu0jAFJIYk zpgy+Qx5d#p&@$L*pqEYuFxz%PCl#gsh-S7bw9V_sQ_vQ<`IBa0O~O5G9A>kib+tX% zfK*j;+^dFfgNo+_Q}pG3t)<9+=yw4JdQ8T?u=cmky?GxYP{ZZj z)3c$a?UIW1v)YATyW08Ai77)lH(lK-($IC(E6Bh3l{%ot168)gZwHlHM7-YVpN>GZ z^;G()0C)Hk*dAmn?S32Ot5Axt$#xlyR*tmcT?Bt?>V$mWBKbpJf_S{a?`>x)6D>16 zQnQzdF^b9S&KiI`Z{L2rY+ie}bni&ETQdC2S~cp8>2Z4Wv-UOd!V{&;FUirXrIOxT zc^EiZAzuSnB)C=oybWS&H~Hl^PkABTirtlLwy2foW~VR{g1x!+%|p!wPs`&h2hkyIVvI-BG>M0& z#CHn5_r%r8VE5RPh=m0dBni`Vs@^+VE@a5b9t$NA{aRON`zb^Bm@RF!nUnQ&(C}BW zE+mE1`12Hu^`0H)VumHm}I!M=McIs?CcB%(H3*mXu$(yv?a4=NTw_#CgMi7f1s zNT{7$_?kKw@b!JipI>f0=O&&48)49ZOlzU=WF8Cnad^G$nu?B5eOQSuV4(M7J!JgZ}-aP<=@{99NS3Q&I}ez@c|Q{yF6_*x}kBKh2DD^OchEdFUu zW&Cs+KT8WaJpV|V3!2)MK?-Q@7*dqbZN(lmUxJ1VLc{2C8B?OyLoRPiso!on{oA~b zXJI@%hNB%n=FL9nY;+?BXclTy@N^f5l1LRLkH2D$;~Bs2?XpMnh#7)V7x^cv+A?OX zaDk9NCIU!l6>dwn*4!aTcNS*-^dLN!^5rgqxSXYWt6{k3#pizZ@->wfW~E)0oF_TU z2a9k-Lx3^`jj@~}DX~Z-AuIbBzs;TGe&b)PB_IJhZ!)s&NNO}1en>kvG`u`d^Juzj zbC}VZxr>UPKIfUFf<8T)oodW`{K{(I3#x7YCeNk|;j_=spPfIfB4A`kwTLhlKw38Y zva_V$$=AwfbOTesQqAMbw$=_($Z>M|(*LldRGBw%%IQ$(XC+}bx2Sdz!`Dp8G|KEP ztT=10Q=E!k{qH7AQ~Fc* zlRZv0zB?fAURw5NX>mA0Iink4pJT8DGX8}Ogh=>os2Dnp$Tux3g2*&PMMd>~*v^=5 
za$TaC@r_*2u~FYxQ+1x+z{1ml53j41lzBDriLDmuD3Ga6&7Cf4G#*OingFfI^Bb6K zxu{zhseX7M1#>&!o5z!`=W04nrQ$Lgo+?8|&g8+3=;xma3-qTa z-E7?){GOit#bIicUqBsd;*+8xy>7+BX2=CJw9hSSDZYJ2Z6o>dZar zs}k4FNF&3*Rsdej+~ozS$Jk)ZP-xb%che*n;xq> zN}}3$nW&P2+N%Yo`TFKEXsGmZ8P*-9!v~ngoUhSU#V7O)vonXgo-4 zTO%nooE%8xY=F;RM8B@^dTiZBBUT;1yn?m0yW7IZ!ts?zA^t)(pL&+BV?_w{@N(qT zy;}JDFpTB16%jh(gA1B5w#g67g2`~0XZU zML-z{I(8W&a#Y{ygy`Z@N7LV9K5(%3OHXX{V~ty`@}ehd{$_7Z@5U2a%&fRr{I5;G z+w_Kx8gui!JKsW^Ol5g$PbdxeJZ3E9Rujn8eWchE-4BcoBs9|n|88*m zuk7mLa5A*W?OwubsQW~iWD?irFIJM~EeZ`X_MlX)L!3gzbm@N{L80D+!qO6ja`MD3 z*#sDIrpC>xlzTIQ_Zqp#a9l#+8`%}c!tP&NFr(=$uQK(26FGm$QHv~>GwrSHvp=0YJ6qJ+OOD@l z4dB2ku>!=wB0_hv!WEa3x6r9)^y8#GtwL2o3v(ltcx75tUJ20#1I`*h*#YB;M_WD+ zu@0tMT#~)z83*2cbLUs` z5k~cfC^wTRMh_|9NnR8tVWjt!)Ox}qyBqPOaP#kdDJH;XN=xma4pzr?c96Ld5rgPwUt1dahZ*e2!{ z0Wr@g{78G6IdVi8^%ZgE zqX^r%Iv-V%z}8A%nJBsXB30EIh}Xw!eI2dtE_rpvj7InK52N!i64FN zU&HG4N0}dU4^zcz2K(`v8_iXCE1<3*JPY@huh()t0$EG1y6-^Z!@;zswzJx6*AAIW zSpuBox{at3qt7twN69>;9qg;RuE1QzhugjV1>J{OwC2Wr$R|8T3nv*8KTmQViCw8V zT+4@xY?r@E;ypBL-|tsA$^LX#N)g;Gt|lSYq1)yj*^@=P`dczHj)VtR6tP#aZ8@~D zXz1ujQFEhIaSh<~YzC`EcWmO}N-@*0Xoga+%p2@H80+(=X3+}YwJ>78?3WS~dpJIz zl1ofZxmPv^^w@D#9gM<7zBok1S{3y${tHUbVTih{57m*7hl5ms7H;yZLJyUo{|}(4 z@=V#rIxx7Hu!q^d?-t7%?j-~o`ZA5pR**F{UDtdXCrDL zwZA%ph6E}ku1=@JvJH$F0@YQ|{1KfTx2j$a=JyQfV=I&9610}p1 zsJGJkFxP~k=C{^nZC2r%oHFl|yzlRU&a!B4_iPvV`;V>Occxin+1-A!ofGZW`=0%2 zQsn>+LsS%j6&1M*CfYIa#NN=4N5sfcM^=iD* z!bq6Wm3X6L0^0Y;sLcaS^tO8}{s}F}&^rTqedB9!nb_z$WQ$t!CU@AoxI0N9T)j%Q zgs;4UqSU;v0g?3n*V3BK`OlfZj-a-p$y^5_2R9X#S@`qa2wrG&@@<wBYz$1}_~DoyKHq!1P6`b0gd&tcSkT`v7MT+(SWZ#YB-1BXB4W|c_s%oY>-muecc z?S|nQbH>ZS9a@tYbz@wp6MC|b% zZvBEa_#KJAB+Q05@7_Vb^$*{fB6mQCD|fIuKqTIMLzS9MiuU+`m<#B+j)B&OjR02Q zU#8Y)EumA{fJGnx|M2$mjl{J#HR4X~X!E$riwM>k9ZekbY-OT9 z4TGI)@6=^}xrLg2jjmlmv^Nd#o9LHe8I6K0#b^cB-r7Xb_jT zm8Em?6sykl%f4Z~BMRc-Cc*b`S;Gc36?X3`HG|(FIXaFVG7DcuS8e^4*nx?h9EI!*UN z`oOvG4jcxv)V~#St}IIim^gyD50Nzc|qAAivJV+)*v7eFI#?Y6MHKz900| 
zbPc`!BKqZ)r-Dzd9++YZ%cD#Y)y<4KG8~*tcfrOR(hN&NY}N4MI^PboY)s7#IY)~N zkKKN80Zzp*=46`sshS;)>0UX1OFb!hyAn2DT&~?n1Ihc{yk{!b)X2Uy*nZqp+6Gy{Ml|84(I&b;J4%RH2KyCC{#Ns!+u<7S3G zE?-JR5w|&(J!&w-YmUkH4BSRA^ugVOw}?c5B8nWOLfcFb`3Hmg&7PEmh7XiBU5_;z zMz*%#b${U{YmI7&3S^uHu!bO-d1=>6321Ye`l;a6NF&j zWqCs^DImQ5=@s_CVJXvwvmp;m*Vy;!BhO|l=+>7i=r(5@0(3ttjodDY$B+$_3|)_U zM3ZJ|<$CV38D&}Cj=-84~gtFDZFcQQva=-oL#QAe7Xu~Jyoqb?9x8Az!En&SqcfB>f zL7h|*dozK`@n90sXZ`VkvZemBZnzb8j6IFd)hR2A6}YX-ds=s4IKa>{+~W$~@Ra2a5TZa^gws zo9^!DwG8PvjRhCXY+U_K`t?^0vdB#s1ddnaLE~B7+KhRdad}}n?e}GZm955$ z6S+>kbrMFvol{XkQ(qC+Jsd3Pc~IvAxf=BZf+B#+?X@`OlQ7K(yR<#Qf^KwKUv&u7 zjJNj*fnmp|$jcDkHHEy*t-fy8Y(q zl5iT&-@~j#13Mp&_;I?*BY{Y6Mzqhp`Bc0WHk1cB$hi-&ky0bW_@}>P=W4HpT3(L4 zcn@#vvf>6GRf{fh7__*=gH@y=r!lst$0i6V6ZWNulR<m)~5S@W20vc7w4Pv>vJQBOcX}-im*gtB>|C7x9`(4X+-H`CFjylTJfN?lG zr0t=dP7<&CobERKL6<#G%lLSrO}C3l*O1!5j@rZ?vfCc@%Dwf7=dfvy$9MA~of*8% zXKV0Z-jR{=qeXUnj_2PzsyU(SZFd|S? zN+1)L-cvQA>lYX65QcaL!fsXR1Qt_BD&_Gl6guiFf9kRxn{iJh>GAg@D{smVX=yx@J7AbwOOl01g&+zq?{z5E8b3Zs{RqK)qnW8rPN z0uu~dXQ+n3!aN4n@90mK9YfRSPSdn!z@OnA!7pj{nAd`XSJO{anh!>hTgNB)Pa}n^ z9ejEHe*`iqV2{9w1gx{rV)yRz8e?2L6v{=I8NKO1==zSXbRbg9#%v?IYXrmQn>!t3 zOsFxn{tc@IM9IXsR}1uBm=c?Azr6S&$thYDSikSS&M@ z&Iv13wRb>Oku3x!-acXndgFxmEBZUH*{tpUBtuNhgzxa7)FjqZI)6lAh*Y7dIu=I} zE(dXvSo;WR_29VtFO;yRx$U4cBp($Ax!-d2qFMwT^tR-2jAyW1O)@zQm*H9L>c?>> z^7U-*yV8iB6py6*&d~<7%e~aC*#lF^*=+!gsE!;)(<_qCt>xKQt=3P%#C(t+CIzO; zcQ@;GM;?65>0dF$cRTp51_Pimev2EWT&!kQ_Wp~8Kc{QG)>F4*9{U(YN`)n%A!I>d zI%qXeS;_zdKT}tx0}gt?GeMu~ANaYYsQb6en0e5@oNn26ptGm?KH}F86yknQPU0(` zU5o7$u&TtaLZgmYc+2AZ>)PxqZlV8uMfrCSf|s}5V|dTS&EZ&CxnjFI;D24#{E3t7r@J3U7rqZIfmTkrVcwu8ZN|csUk+TN0YzgscFmU8tpR`gM+P z&==+6s;_MrbaY7(uOoEAi}dr@-k&f>+6l-MK#iVS!L25fQM3GFU+# z{4#?v{ht0{VF7-Co2q+B`wUoPMGM^UdjziK74d3T9NbpR&54{AdMqR%Tr6l#(Q zfz$S?p81sit3kGCK5LqwBz4aBcQ3Tv=rWdP4}RTa8m=<)#j0p*O76mYl_iMMP|$k? 
zi`S-b?Xh5&Uc-MvpsM`q4?++nQfZ!_NQ|?j74l_@7yRVbvddRx)28p`dgRQ;FuP5i z5*r+@xxV#1;{vfp&;?#gs2y@2R(8p(h-kF_8z7uuAH|nM6{%!#8d7!dnUC&ejQn10 z{e4Q$MGAtg{x3OG&1z{kg1<+4G|4cw=18r?jH(>1iJ;du8Iuxdl#eKoH}>S_eKXWR zzGK$hC{b!^^iUwfrWMPAxJ7a|%c)UAlQ_JeM4t@S^G6d{Y7zI44S4G}Rgy$i6$N&0 zR*8d=z-0ZL%9`!GSoFODVna7|ih(4TcW%*uv!cdKo#Md+koMa@HQPI^x>4ZIiUP~*?2%gx z%S&P$CB~|cjJDg}*g-e5r*yw&0VkUEGnhj;HFTc!vg)xc zv#R-dN)!1e%kUk8ybbUOj0Q0g?N)MfVG}`1sZOK-k!oro9^#M-b-~gWSwNZ zL~#Xu_x@zNB-%0K@d!8q!^~r_G?d#&c8>>E*&l=_hCq>gHJbG)8or@5ef(rHb=2S! z9MDxQXDOOW{+PEO2H>L#9-73`UM^_ZsO>_9H2$`p)G?{d$f#N$Nd$mBGC zRXkECK`!grBmB8*nf|Kwg!d0$ak!z(+wNt({qU6T$`L0=OOZ{x#CZNxlDM2YJjw`1 zmw>oQTc{9k*sptIBB*nTFuwDEk84ZaFtXhHmU%QgXGopSZ zg>WiXwg=nqhvw$|CT;xBYFaie_-yw*l5b)XcmN;(P{R#Ag-VW=<*Lt>nP^0;nqbcK zzW_U}dWXGDzd@K9yPRhKtPT4kih#u0N-SMGQoLG?xW?xbf2&9LMS2EIXo{P62pbSB zg{(hNPs>x1k}vs%^^(q0)DI$KX~?zqn!f4<)1ZPO>t;g?fw4ZuJ7%UxnLXK?H?~<1 zIkLQdjjI#ufTsrHa~wH)5t4@B!WfwPRoSBW@RW}bAzO0bK(e-eTP0(K-z<<$>AeSBjX{1Zuk@uo zy~Y84oz`Wj*veG|2|^z2LN2MX5UcJ4jtBj|66cqmfVR^(n5q8C z6{+7A-Vw{tg5P&r0!Xwy8>d3WX`sE%`$HE;+Qlknk7I--A5QlI6^NW(vul1zQnp>) zLQSu(lX`H6|A|+3ub!f1BV^bYU2%SCiQ4h0h<^J^(C|!lQiDgcL`%B5oXtquzg#V! zviQS2^)8aP5xUH2RC7=7WI8-m)!mg*u2r#5SelmZWj z^AFab#l9k)Rs&Kw59LQ2O~bp7Y7)Hc%JjZzscs+aM6kXe6VD0Xa@;!mr?>xK6}0~< zr~wyBKV)Vi$9V4;KzHLuVZI$mgfn1;1tf~N7XPFNUM4l+m<2@wl!c#usB(nVzF@S! 
zR`9FKj=@$U=l%AQKN_nE*qq$IVQ72%Pcpyd9kddwPIWkpqMSQ59`Mo- z&}pKiED?5kBj`MSE<+=PMDZli6=N}b_}wa z_NCWDP|vn*^EA8rL!7Upz~5O~5Uw?P0=S`Dr{O!%wsmWN7N7$N*cLG z5bZR+E8%-mUeL0$nd%UJP+{n1*jya~x(>qA_pzpcp7CPM)|!&OS!>fPo?OOKqg{)& ze1wRHF7%ARoQcpYD$d&n+-l(bK@xT+D-jq;C(Uo(ILNG?{)gkN%n|uqsXW3#nM1& z3*^G(2@HcS_IE}7bs548NgAD9J1k?zbS>Ga^{oQNOA{b;RoMN%K$i=#s#p2(J{~!j zNV}z1C-m7(LHwsFl0gzGdZHu?PStBiC)!-c4rk113*Wk$czehhbyc)Q!8r5Z*2{gw z@>5#coUExg^p1*kY46yX*F7BqO-}fh>0?u@C^((Rl=CC7BA1n7DJcY~PXk5YytrST zLb=-B<4LSZYodQP|BE6!M}ocHZ42`ak+`wdIK`n&mQ|@^5_a*3gX$0DswzOo=|ERI zcdFOJZ62H2J?f0HVz)4FArh4NSC8=HD!UQNdx_PRo7K;Bxb}c!bdOTq=^usm4F-0O zjdx~$%NlBCooVq-m;*NG!`?fVH5K(^;=PKv04pq?ZhQcQjtExaCu^U$(=vQpOunW8g9heaGMOvD@oq7MR3bBF596!N*$iQT0zsz|)^sJC0;UVFk& zt;O3V>4(qn;nM#uVkNq}Y4zUIgf-nBfG1o1vm=6huFf6c<=>GFp!#4qVjKsxJi;pK z({^}u5nfVRnQ)wVG%VmqZ|qbutacYA)EUbY0Sf0zpC&_KWgCsCi8rQ=__~IpuZo>p zoUDdH;ac~t>bc-zEkDsHG|kpJf(nH0bA5c(rH|ZQqo_j(2EA|uH}2lp$41vXeGm69 z$~B37;Tt644`~|mWDkE_;Ggp)=3U+MOjg^!IQ&oS4x$5gll2RPO)5daRdKlf9{9Lo zyOgc2$^3KM&J@)23!7Tb?}(E0$s)J_MnfCIlbIHO$v^E<9^H)_Y9iAs;LlCSKJ|lW zLuJ)Ob8SlBw5}CLzl##b_gFX5p7ari=(A=^V6Mum`D*qjpyExE40LwA!dgn%suSK6 zb#2>aX&qXHCHh?SfLmfepcm;+;9t~kY6;hL^{$$?4``vB*6Icsu7xg=Bb*Bzep_{p(9jZ|6R*JHfc3j`?%y98s0BGk8(e zVI~>huCIk)u@dp`&MpET#{1RLm?p)mu^K6Uh7i-@c?whL3)J54wxntUrYFyehZK}>LU zRnfaahfD|Z751aJnGf0tNp{a2mCN6~y{BznHD}V6^tHNrl>lXMK|;vv^WNW^BfhKX zppxe(!j*R8aB*}1bkit+9*T)No6zMsK9Yb&>ptQA!t^;yXW(>q)Dv}b#Ey>#I?J;u zr-U+98oNfL4@qpU0iVtBhW*G%GZYeC0TYp~n>sCI>2EB4h2E_pVZuS4LL0OE6eFqj zqH&m#Gf*@8$))A!qIq=0{FRN4;>$^XTRP-+bK9nBRX#BXg8s#3SlR7^fNf> zn`wFN-6Ud1VE?$0aT>M^?$Zr=FP#dOH^h11D52Iiq2wm}{P6URI#=2yze_nLk#2P9 zw~w;?I)-E?{Gn0|DHF-(&I|HqY~%}93RI5A0Wc}#%n+l)hiQ1wOr>#cXYeR7$m56) zOO^Y$j-pURw93V%lYrEzW6HG^a=gO-xN<_?<1We*b=p;IPF}|)PFsOLv}t^;`S&nd9W}!it!($LtnRX)P0K58IwT{x1BM5B`=3iMd~8&3gaMI6Zee9djouWqzH@SxME$wM#jcYtBC1p9INi znT~pr>o-1igb4o$w2LTPlHrqQbK^bruCf?8qgq)YzMdSCNiLMd_~~3Mvk|CDrpCGt z7)-o8ueOs`tM{2@d)-H*#Z29Zu0F|oLDjDmvMK7-6#RUh@z-ku@7Whd@25~fl!r@W 
zzk*K~?p=J_DEM_o^Ln<$`T)KI58Hz)UPa+LmaXIZ@|K6&)e(|tvL&@sk`@Vb)!^Y= z1p9*T8pKe`Fj;UJ=%(<3NU~DjVz~~XZaY(7-Z=tw;s)OgwIac=+JH=IrE&~ECiUoS zqjgSr)<)ghq;M42X zhf+b!9C~mreDlmJmQWsa^arXbFTehd24nOGrQ}G^Q^>I?nyhrVM2uTfPXnh@uTBY2 z+&iU|pH$4vi zr5srGR_7(hhuRHrKAU-tE@MHvu3e12z?U!Fs|c&yl|b1yKObdzd?Y1ly*r8U@05Zv zkIcm~*t`udd&)BgPlpxEp+EV0{mQMm(?*s%1(iXCf4Qqn8~H^Ef1bWpct4R$E>9&S zoQb`m5exLwY?m(UJ*gLe`f|LvPFX-Kxv97^&I_My@Zr8Q<^IGfx)sg~A0r=C90 zMem2fit%l7wMvNcguY~>0<`xN+ zZq3Oehlme5sgkv}_JZ(H{byoo8l*X|KkZyxGjzMY?R$$N|H&*P7i;{>BX)7V|6G^B zHkam=_QuZ>s)&d~cJZj-Q($dZbcW%*=g~n}?%Fd=IorFD77G0OcUnN)8RrO5<#N1^ zqN?-qf1s{9Q&TsXRM%yGKqnWxXrO3oWYA0AwXPL#*F&C~W>>cSCme1>_(}2P5;m^M zKd61LQietJr0mw77u^9AYdMrF_wwW4_5A-_6!9Y;ZL7Ae4vS!5e1E}&os{2 z)gc3*Ha9wnpM+LcmGSP&JVr=`NvXm1dRD#bLvNgYpp3oYA1B>w$*#OFCrHQ|K1Vc- zIEHnH!6(#Rw{)g-XRkfK$FU$jd42iAzM&QO#;RgDm`XC5^|oq8GB(t*L=o{$s+O@$DO9UQ8^)It3J+sMe%2=Y$1Go176C?j$ z6uO7n^8~FsRlA@_!OFMJI}aAO#t^Z+%p@)_{C{2qZYEG{RqtWpen+T|+nB=UCI&fC za6Z&Cu9gu&Rxk}5*tXTSWrn$|Zb|>_4%I_9yDBhRoBU;s3B^QwzPyw<6vb6<&e%KF zHuU@S!T@b2muoM9b?c@()0PwYU?})>)ekc|tK?xngxeUM!%@hRCKFrfX&3g|K=C{o zzg$cq@PZ%rb9!a%ozj~9%atza2pWk4;}0G zc7P-M_wZ;W^ue(MpljiI-PMdQ5%^yzwI^8g>PRDACxo~D^+j)E6%wDS-`w$sh0jiK zpocAzj<>T3t?Hm`ipztY4bW|l`pXLEThoZ@O0(1~rE)Qpd?5asr|)?yW6M_T8~4%D z%6D~!8TN5Zhb>*9zcr(u7Rk)=OZk0T6lgH3m_yZ*@z?xbMs~d=kL_wGbnBrr)_ZOE z@vQ0DPggck@3wwnaK_s9&W&9y05{H?Pl*WTf!SgUH+(`eyaX?-1?hKMk{Ad3+e|#M z)A@Z(yEto8;Ayn%^OL**YC>)0G`_Oc;qMU9-#)(PT|+>Gs9*w@ja5}O+dRgj*xH5M zCE`KuHrU&4#+bk!pZ(Ft=TA`^TkvH%RC)k%m_pS7X;^1N_gu0d!}O@$cgcC=c%BNT z7b?@crJ^Gm;QYA&fE!#}Vwc6au!+;+?$-C#ABt z1i!NTCy7L-LA0%2Qq|g~N!o{=@!V(j)(5T#csrfxZ&Db&3XB$(*Q zGa&*eQLV)ta}mA~Ghn*r{H0cC4YN(NwyqeVa0M5H34FeoDl3DidVX{d`o1>+u|(5E zeLEfV(|?jpWxSO{d`rOXbgh-x`My6W6zfV0xPO3g?OScaPix*BYjhEQB21N}OTYsp zh-8El9EHQvh-CB%4R9{`aj4RP!ni4E!K_x>$r#P|&dKI9HeNcT_3b{Ym?Z;!zdsn9 z#9GC6itW?0F(|4d?SDw>eVdvo5mM*i_^ofkHh@;+z32%Rnj4|>^~|6DHJq;``GSnu z8aMPpO|ZGzKJxdp4dt^?Q8!#Lmb_NA-UMYFix%)x$f;S(mkB(IiQhr*SYGz3S<^0@K@?+ad>}+F?RIbsY;P0b%UZq6#u> 
zehz`@sZ0$<{v!+mJNDh$jc;JS@hlReCNYZ=REx z`qLp3*LrBfu6C5}9mv}ro%6r5(Qk8yM2~A~|EW&g7lo90rg`!;!hd)xfeMFpXQ56g zyRdBeQ2cf0w#V8NMf?~ z89sQjUnS_ne0_9;6E4GJ@0KoDy#K%~8#PvSR|qekxP|`b&{&}b8?`!a1XeWh`l9ge zt)F^IAae*#*5lQdh)&?)z8NY^E^AIZ>24eOdA*l3;U#84&FoD!nX3aFr2nF8H*n6V zt%uZ+$2YQtViQZkDP!l4z zD_7UlOtDdlCz%s@-oy@c!#}nAttruns~3zqkm!-|48Oca z3TnEvVf5{0aL1h|vT2S3x3E5ZH2L&b)yhKnd*C15C}Gmu70jl3b*vc2RKx-tVrENt zL>S0a1uRAsb6 z;YIS&IZhM#?;Q~%th-`J?T-a1cPMF%h%kxXki}s~h97OM4*Qj1?`Y2d!Tg573$aID zdnyo{#GN1(C}P1=RapyhoLBmEXXHN5j<3ttpVHpJA`pu(3qenV)gJ6F!^cIu2K>3@ z#-EEXVJ+F0BaDKzK6eedQ?R3oqZ|1~(}?`p)&2lIZ>f&(}I3@wDvpa*YkH=%XFd^ z!}peL9rNUuY?IlmBkFq*O1VnSSos&VUsv}wNfPge{;WlM&&QgveqjkOs!*Z={dMCM z0AqmMw|%*9wHum#_{xX$T|kjSgfx>%=%WdZP{wBfzNao;XQY;mf6~wLw##Z2>)!{p zbnb*bax|N1EJ)6z9w2q7ULAXdgMxUZ)QQ>OSEHjyDnSC{;I(&o#m)@EdkP z#B`?K;a)(7ki#6ES#o2xl~@Dtud@$hk|9Q;eD#G|Jo(4%9qL@mkCbH03W3fPY0An% zZ-&CdK-cy|y4g>P7<{4YmxU~`Jg`K)=&a&XIVgXhB}`nFv>mTv=wzJk!>>YLv&fY; z!_893@85tT=LP!GnFL7w9zKr_9Q9+^>kilZXi?sm?duZ^N3D$y@-ovvOv7LhH!{#I ziOp&8=K7e0$SD~Wk@0DXAEWNEnV2iGlgZN{ zdu9F^inA+FV^USDc##vzB!XAm*$;Y&wdL^{t?Z4?WTCh{6BOc(l=*M~j$}N9Yi~Y0 zqFk=a!zxJKi2}Q<`H_yQ)HH<*&8(?%F1{67|@T{r11AS6s zKnYHStxW-?|I&1)lVg25wPlI#9SQD+(aoc7{;44@1_@DdZaj$bkNBsqpvQ+e60=ta zG9R@nb7kHVre~qCS?)6}iY<>UF7UnuRG!4O=4k(J5I+54IT=}3kzZ6_R^w zCbCPM4hzky%!>GH-+9(o9Bp{ZZI!d5*vPvdh1-Is2TJ^Cd^S>~=uL8_R`<1{oa2Rt z4U3$O+kg{{Yizk%e39o{+k0Whb*$vljg&PO`N~al8irg$-!@iFGV}XRFJqotKF;%z zt^Z!i_bMe1mQkojnzYM3tUM*^ozq=h{EA|058kk$wrhC>!ng|T-!nkH4f0dW%G~3$ zHT;mP5f0SgJ6$Z6g(Vq{@75NAsA?jHsEyF8y_KK7cGkPTj>IS5=9MGGBT~8u6x96l zvpKp|4^J2#1W%(OW_ryfo#iU6WzX2uIi7y13)IvUQq2Odo||Phej>5K|DFUp>?V8O zuzl&y?ORv%v#VIRdd;+;A^Bjw{`N_;+1_a)cxIT>j|fop8e#^Ln)<8n*Jg^$VxE<48GDzLCY5@5&p&_Ay`a@>$sRE{T-3MN z>V5#d%l&QD?rxN6IVm}~67NcE!%oU@c>u83Tj8|I(D`Vsc4p2-!oc3dRxV%Zy7Cqm z2S3xoUF)tzi_L+K>=ZZ1GHC4bd^JhB)Z7!X%n+hB<#E2$M)s$Yi}#5N&Fux{U_`Qg z;{Nikye!xKqfpUD^>16$85w2#u!&raXx+EScBqo?P4g$O*n>=Yzj|10K60bM0&&Ny z`t<>KX^Wd}Ea12>MQ+k%th~OxX@u&6?#NOyRIfK9Y9Je8g33cL-?U>tCLr}VE`R1` 
zf0(mW30d3FM;-GH3pW3L5a_IlkLse!Tq)*NJB{|n^4Pk&i+r+CG|4?er?)-;Z~VEk zrc?x4VJCZhWxKal)f({Ogu#LCWUAaK?JKIBFV!*Bt&bImn4A~UX z*(GP)oy4CoO#|DUlqD=v(#ji>gI4Dz+!|nSX8CkBSwSsneLF0$t416dQkz6g5+&pu zOrr0|$JhV1{-WB_CKaU^Jx(HI^t!qiNDq{UUi4B+(>z6;?~Nk|<$4Z(T<~5G_{i#) z8K|n&r=&TmUGWoZ?mo>I5D8Y1JJOZ4yWNZ-qd(n390tD4$S$gA{1_=1$w5g-6nRra z3>_M6lmD`R!@#L3blnbC_tlZmXC}Nq+WSEe78WB_m*{}BO*FNnm=Rb?pM=5UEU$u% z+T2|(!+mW7{jweq9PDoS9RxhSEzFR-C4RQNG<5g1|6HS^!Kn4;v{6QG>hCrLr&Iw= zmHpOpI^Uk%!#${BGbwKHZNr5#x*k4cznD!sy6Du}4p7 zdo6t)ef2@Fl&c)xN!7JCvi@J7!NmSj@s-~HzY^B2_FXe9Yb_lM3IruexI!l`ZP4N9rH-(qi`Zyglxz(aSq;s%!bZB%4 z!`%HU#6b!Jb2dh3Q$RMZAB?|!tKZPnQg(M`d+5wRktW^@4gOU~VfDhU`PnrnalwBc zo{p%W{OITvSD=qxS7SWKe_ZLhN1hZ@GFjE-QjTtz`QIr%9x{5_vYQpmp zDpGP{bn24Ft6J^vC0={HDwgvRq`%Ku2?+(;j(yUP zl!S_!91R=2jJ)`vs6VZy|9d|L?`!wBH||~8FSxN{c%8P*qAWPyV0bCg1gv6X}`&%;^sslF~Ld)#zkqIQllS7&uC-{DbM?(KuBduJnDryi5(3<6r=(AC-u zbB13tpN2{$xiXKMJ)EOIkv?e?kkpejEvYla4NONam&iz<$8i4`PEMaO%_f*8sq#E8 zc~HpZ`DwfRlQULuX8x4JZJ$n@!}{D)9^5EzjN3twkZ$L#;5E=`zKt=fDxqztnup0$oi-p`nf^ z)PC)L;spCU8(G|BMiS!5lPn4eVa2bucqZmIR?XW*d2Q;CsS+%JKVH@W%nZRQ$fsPuUvg#R^=VBe^Ir*M!iAX=9 zHM8P#xBL!m2XC^5IiO4e9@=wxbido($+I`(R_8e?bl=O*EvLfnzA@XYeQ48DcfXtY z1ozN83WL}XP7(AXZ^bkH2ARx|P?zip>ms=Gu?jNk2B}VDl*zwLn zSz;nR-zHyEGKR8s&G3yS=Z(di`WDY_LhkZnY7L&714JI>0yLgj!s zvdz8ev0naW31*-0qf~m3`5-3!^5j;|W2;q~)R`rnfWgXo}UPVXcq+V6(*yL396)O|pN~HaHX9Eh#^=35Is1fcdY%Fnt&oyTDB%NUX8D z)>@cbw!+IiuJkFalDRw4fkLo;n*}sk^L`y|9uAuI1b+C>Q~Uy)m}F->zWx|9Ua)gD zp0F(U(?aLBU&24uG4c+~0-o@4!dK&VD|sGer&sK}>3$HaW~4%(_jYar6KZ~~L%0S~ zgUKxx!KZg3RN^=B*&iOr6xJdksvZvRyz?&;+Orvk2Zi;E-sjFZ97P+s*|HIJ(YCpQ z1i_Q6kTmm?>GY0>KgFn1AAs)_l=#(AS*59SYSjvjX@I9r3@Igb_)}^gzLyE6wroAU z4ujWve(nnf#`E1KI%gInMiRBXqa;y@f4JM(C$M)}4u$Eyl=4@djH9x-gOHE3%{{A^ zL5zhZfK+Ypw`_9b3r;)GybI5FK{V=loFC$%&nX&CY@krA=K6C(r_E9Qh)ZflK8g3s zg}xp_pexNiz1IGyW!ebf(0UdcRXBYzkKW_S@~}z~dAYbHNIMNG;+$sw-raH(YwWwC zxp(c?`Y<^PmJ#EDT)FRi$U0r^?5s9@HK+bkGDC0yRuh9hII>+$mS_#rekkgtbhxAS zw(|qDU`rv{8f|Dh(@+=!y{<{WE`Zg=o{UP#Nn7vYXl==h-2pJ6A|eIL 
z9si`5p!{f8`PAYH97);z2p4kPuDt+rJ+2<0BMN>L4q2xyMf_~5Dj#N=c0w~Jlem39 zSXm@^3>NY&l~NP!VRR=rZ4a5tR9Gsziy{hdEE-UwfPepS*P05`%G}&P(RKJ`-2U_D z^Q(i_5&Q20YtWiY*xV`16j%oTpY!65yUTSaxo#*Y!trIIi%PuDHp-(Uzg4}-1AZ&p zWI4Ebz@=1OM)QHZQ}IHjxr-=!#)Y-(?;3X#p}^Tk%q;snJkonkdbgC00nl=O`&N8A z&;wV}sn?u2lMz=d5*u7bOYY9~fGRg6r_zu{)6H>Ix3 zZ)~&Huj1cdFf_aRCIE)=@JEacksk@<`R$v0UMndEEpMwpUN&ZV(1^ z`a@z@mn5<~xO)(F03ncVtYD9kHVGq;ct=bMHkUyN|M0O5vFnPT%fxU@Ma1%)d-R zR1ri2yCr{P&kH9OPjT<*Y9I-Ag0&yYMrY~U z-s*)r zkKyw|r`Kt(Kw3mOt)RArQ(*uXp3mUT;gI%h#_LX-74ZKN);=ZDln&mx%( zdWA2221WtSuQ2No=A6~7LVdq(UpOrLhm^(PEaW!x<}%BuAK9w5h73$EyUh=p^M!?Y z-l%r^3t3_09=5JQoHDK(tWBaTtQ0r^E!}Qo3Dsd4(a9j zp)D@y>vgOu|6!raZ2_b+z}su)@&IUJ@R}9ADSdRd(C~e5SKE_9bvL4kM49Ct3dJ|B(bSZ)$BE1Miq=|s^UIIu*iqgA)(jn42 zkuEhzuL-@kP!mFuKkEL@)qlQyE?^8UU<}^O_gQn!wVpL$sgZHqN!K!7>pkinl@J{{ z+PdS~#5Qab8`i@GV$Rtdjbin5$juDqD;VG)oU?g2hB?+0YL`&Lxv1? zJyc&P@A0kAIN@$NRL!j{^{X5}jjV@D**1lyd^VC}WPsk`PrE!qyt27M(W)ChXIl?g z0^P}8O*e7Bj|$Gi;9oc%t_roAsZ@-;H8Gcd)MwAV`RK+}Y!U*|GOfcL^(4>0sR|bv zttIQ0vbR1DK))xZDm~+hfDF0qnF=xV`8G<9FKwfBe>;B~6)C$%?@j$$ef{H7~?lA1j@^|N`Z-c<^5*xwUA8-y^^?JzcijS*yQv#u0ye<~%L(daE7V$Z+hEtqw z&NkR5Ky@maz$A2Z8UDzoXF0OY9g{@UPd=dOici;&9ezYU({1eCpRDZjc2TUaojUdp z`7mQ+w~FFh6>1K0rjjA+vyFD_eh2ORnOM)Zf9+F(v4MiAtHDI)@NF%i6bcRXJ8ZLD z0n?(>G;H*-=5yd$rtPbBS3ptem#^78bB{bU_NNCKkb1okwB ztt3@0{rQ5=yZXzSzb#WFvY>~r=LzxeYYZoaLo zvp)9QI^2G*o?cr)o9!jMsvg%&Nl66&*0yldTZdF2ukh-8> zZ#yvTF|Gd2pRzMG8Uk<*TIRI;tsa?jGyOOFr4xOqUu4Kf;mg5n>}a26^X*E+jTJh3 zo;zfjQ^`1Dfv*E&iF~h!iizc=dyOi@pO;9QBdVmuHN&y!v&?6m@Ns=yI~AYjZldMu zw^>yJ7kRz1H4}(kf~o_V0p$!cAr4G<>@%(Jde7MC?#xCWsMDt#4L|kuy8eUT=zA|W z6}Oy&c6lS!+Ha$Itp(ahl?nwWb}O(=AH2E22)I6 zdL*gB6`0M`6RG;?yJz`Od-nCO7xLviwuSFx&Q6VS^~~fBkOl#*B79e;>wmhqT29Z? zcpQ`m#rJaBTWv%KBGDIHlWrGAc` zJ+FNi`k-q(#P73G2+w;x8)w)vsce+KGznEzyXseuDDP{(ok*0nUFA($aNL{9c`w~d zT+YUnrP<5DE1nP!dy<8kw$xxb2^~i{>hptbYt-e~F`_C^_u;Hp<@Y!Qjdj&KpY
ru(gqW}v@v~|>M{;FQ6PlfBDpth-Hlqa4BH*bo=;T*Tk#wyNA6)a<-N$+gVV7W5 zFr!-`YJM=mDS5{}1;)*L_h2Ji&Dmc@)gP#8O?t17;>T!I%*$i`H1Gg;?fDNL5EI1- zRbhW-WpWAYg-?I?95nx^f01I`(Oyt z;Ei7MZ+3o|zvfD{{$_#}eDmY!#!Q1-5D)d(s#aUh(Pfm+IZJNK&Lyn$RiVwU9z*H^ z?AI$dRLkdF4zfaSE$4&ovPyCi%d$S4#FGzFrpgAn?>FY408SVG>Zh_?Nq59=r%7bC z)Fg}?6L6q~1v-dW-_nO;Ti)BUeja8syl;@<^t{^-hJqd-Q+*^Lz?RiN06p>7{!T1M zbxrR!jmJ>29Pvn`##fOQ=BV|+ZM6(HA_l6OE1tx5hX^N4KXHcZ#CPTMYwkJ`0M`-G zRr~tF(z7qy-os1?tKKubRH<=X``pcoJ38kG*;I*d)}p@?F5 zDJE?>@NN$yqF9jbyflL!p{4)Kq!Mq%sL{_aeRx;h)TOIM5w-a|r$szM~bvOz^y7G1UNL2O3Hh$Btvr;Z$78QeY56%x1s@HXob zF`DTpf=EKws*d(aM6j>Oa;HPvt~z^MDD+L7uk3a-%*@_Sd?%jdF0F!-I%+y2g;XT= z-0`OV3eK+Ueej_E{aa*=jn{)KG=)kxy1pDu=o>@D(5r4dU)*_Wnn*`JVpfNKQX)*;Ow zDQlNTDXn2K48*>%7C z90zO>kBghGaQ{~?kO=z%zB&l;P4AQ{{ify>AYC?*<2JHX<R-y$DjJzL;N1y#aUnq0^OPTB5rmS zMHNDw#07+y$}hPi8#SvMZr9jrQ0zX9o?gT>Wvy2V$U2-esS~^vO-`kNxFM8>J$Rai zQ*lJ1QYZ?Z1Z&|h#@Y5_a(E9$9<_Q(7p%5Ox*WZl$RrkGHec)cKq^z;Fukb*&H2-M z#DU@H)o>^~cbN=&8yfR?uxiqHn_1*HdZGj$R#@O)n?rydsp@}vRT3A4^r|GDPHt3d-6iNFvf zYjdVdt6(RGS*qPvOiK=Fq5QlLU2gAJR|FP2_*ad)1mg|#r-dbdevzZe!8DF)lIHie zvnbsc8dH=C!OgS&?h*mOpsCM=_B{7Dgr$AUX8re-rB(EQevImrR7$qjBz2LVi9rr-6-Ov9jD^JkjX%pC+t<-^yGojvJNut1L# zZf54!(}gSh48TAC4s0WEoUace=6xAvTxdgold{hTr}tj_kGH+Q@68?=63wt}7DSLc zV{h6C5{r8>&P@}kb*2CCJ|CuWX?KSvR^atmB2XWQVgDj1Z~S}YxA+h^e%j4Kxj;ix zIURo-4t?{HN&<9^L$S;bBoKH#EX$%xEsBYO&w)`uSlsk$kvFo+PcLakLC~BTeE%*% zK_CTZf}9y!Ky~+7EjaRJB-tvvif1|KyN_miRi5G$e0eO@h?AOCytm=knGiN^P&Pr7 zoG$bJ{y7~h3vqz#esiQ^LEg}iIA|8s@PKZEfr+eqR#&;w8oB8jepWWHX9OF>b!I+N zr1Id#dRRHjSfU>(G7{ezUV*4~DsaAU2vexAXp)?6tv>@0Y6Cy2ckP*C8s^Tu?t$wc z4)x`qUAfixB8$QBCnE@WJ&bIdet2t8mxEn8PEY?}yY4yb;R- zQP+6c3MneHa{C`uJ*lq{f_(?@+>GDf-<(P3;~o*iE56On5%R7^rHOKJ>bF%geJH7W zvHsO8K~%cFI83oB^;ezrUHbWlEiTZUhGa=Y-k)3}FZ` zU+~(#iKtXyrT#r`w#(o$FAvrGzLLA*Bil|z@+ogxp7pxRD}P;qvqF#T8q36rkYOT+&Eg}wE}&wmpH zHp#PlPVA4deM9?6?I)kUepPLoT06pyo#ai+Pc62nqUPNDi8Dj-r0;|2G>gH2k z5x)Kwc?(Yh9R1{0<(&|Q^m`rFd&v70WhJFrVc&5@G%)X){AKsU-(NEth6lLX61=~w 
z;lHuj@Hv*Raz|JP`sT`o5uOupz10?`APzj&75H>lJU$g65A+mC8-_`M(`e+*yC1Ry zz|{OWpo55pU0lyv^L#SaH7o95H>cP#U?uzh%ICJ{f{+Z&i-+w#LUbd{T+fzHyV+p= z7AamZfW_eD*#5nLjhp|6U`zT%a0mfG8xOtoZ4{htrepstqZB zwn%m*r^l(tC23yKJ!`aDD8@A|!ixg_^We#)--^TZlC4vpWghhTTNB)^ip4YVyZZQN znGZ7CSHlGmb}A1MMbmB1P#OfuZ{i|6&rW`tvEMSA>sPtohW&Dn zMyw!Ed-{%@ZsWM=6Yc434o4t=ni@|b)1zx;iXz$@Wy8Bic^1h%J)yYUY&>ghc{(Ek zxCyP$V4a~ZzZ*X87f04ocy)(hqp+t1D(s)i0~W>~-%hGT6#RTSpkih%W-@T~X3&Wm zr!2-%lJB$61M6Qb;^r)ii8^lcddRnIle zmvAesdEVa#S??a zp1vm5Jbq~)@pRC=dt+p5oRu@+P1emXD@6IvMUF%V%_);T&om94zZRqv!eYi=y#CLx zCg!${DTU%2P0#jEKlwO1E-^5((c-;2S~K(_=j+tV_VWeNc&M($ZgalIc{AA`Rl;&L zKM9h+F*Lzr7O+25laDGn?@@2VVL=g4l3!8#`+JFfVRy5rsOC?w@ilLIf`XmIs&DKB zhqxmlO7CT^QI4648a?o`&F)y*=Z>JSfEM3=34j>?@8l6tOzHnYaa^~AP;Si9{ z!hXw;M5f9qG(ehY+c!$bp3j!6j8R4m!mwGI)%LQ>#M`;=8jH^~eQI(1>&dXj2Z29| z5pfRhO#qTp&XebK1-H)c;Cho(P?za1pceVu+iyh$Y=x%0(cSoc5CIp@(idG^3k^;2 zTCHAYg&zCYove@zwj=%>bzm32m(==fz!iI%OiljA_d(jK3r;%>4Q1lmo54?Nu-1cs zP~02>H?vU>4l*Jgn*5?B#5fn}&q21E3>*@YSG~^6w^t7u~F}d*4u?mCW0CWiIzAklE2OQktLB zSSFhVqzq2xxEgW73epKa$D{0fEO%C-B`hQC`p-Bn*Cm^ghM~pN>H>mDJWY|kt3mpS zpx0(&S+qayXCSPs4jFz@6Mwv*`f-U2;EU|R9&G9p(llNCGZ!xbUle@hFX*=U3xAdT z&bGOH>fodHv%o1Qzl7Nm-CFAP39Anvf3FnP!xHD|^D)P}|C~NKf#aRO#y#2*u`yY4 zfa{?=zo|aMt3w|rPr3KkQX@&9(!RU^Qc29Uu%3_!a(Osaqrs7%BQ=C21W6_kwjd*w zZdng9my;$_6z+On7^?SPHZkJ;iP&@7HAyFcc-9MKC{fWrFU?JC>7Uu;Y7Tk$~SZb;`t6f zKHq0t`z5R7_#_PZu9GyP(8m)ix@r5aGC=Q1=$i-o(cj>tm=d3b(c}0nL0$B*do0Kp zRTgY8C=7749w(vf3s-;KZo_h`X=e0QbdBq0f*p_fx-0%~X%U`eh!KuJFlfMk^7tz6 zUd|%$(@BlTg-d><%l>LgM2s4(gYR*r6}*9Za7ag*+*#`xZ|bp;Zov2%;EJ`lZD9h` z6;0cARu3izi%SeIr-D0A*P}txLXa&X|ww=@C_5wvw z$Rd?Ke*CyV^>-LFD$w_u{xCJUX-1S^JT2vJk%nz`q}c1{(w40I$!~_S5NLMaGuI>{ zC|359zFB}KN7mvKeesFe#(95k9hQMk_*;unWiyvu>MiNOmBeO5cy}1iPlqqN?Ojl zqpp^|Mob}lafn-7`(#n|{vO@;QlKm^J-(+3@zZYw!@=c=?eFmKR2o(;Z<&ps!`@ zQOs{kpXHa2uLeb1}v@0^{Q*5WVqn&eN5@(z0sL!8f5{q$3r( z6!hq#z^loy-P;w%y{A^;AjM&ROI^D`RbKASftsXupuPFY zyAhd_p(&4Y{X&_?LQlbxlqT&D=ZhUJMFCG%S+Yf5 
zHOZDeG(lu$6KKp%FHrc&U3J&{5B$c4&_|Y)8=c@O&nm*mVAf*AJGYhWAtGzt@Y75OM@mGRo%ZXv^x07w;TqV!nYfIHT(Us@wBIGI-c*{%W$Lz&z7zG~yer(Zr0`NK}1(yE6hGFXyz zr&3|@MGk3x+ASW^G8A8*7ba$-=;`uD^39Gn{#X^D`&$+-;Gu8nxTQrq&4?U|5sHcP zS~3-%c&7Z~c=x}ir(hJ3CRt1p2{c=fHni)x*(1kc?9f?)e_pq|Fs#%{o>iT-ty4yU zw8?Btj^4bCE$n)$!(V(v##4x?^LuxO>9$?irbn9;w#U0-9N z_{a<;=`T~hy}PthWAICR0D8JCD<^8O*{<3`w_n<5f2&GCdsdIN?f_f?NOE0g{yuZ8=xW|x-RXig4+H@$f5n0gcH zsmHvd`v7Ck`y$d@#@zQ}tp%8|To1(-t-SL;6;0x9iHnX7+wC^zj?%sM0HkZYIOI14 zY8RGjp0lKPBfr1+#%_V$QD2tqLRftuI@gqgBa14|xuxN^=LWQP>_m0A3SaGd88Xg` zyNggB)*mZxVn5ziBBHpnje_vjZM%5Fy!fRL+X+|9I$uaTk~;bAzM6I4DVhEC6Dt_@ zFZjSm?7WQnpsIK^j3u}{sc8NfK(yS)Do`$owe01wJh90?I!URuE)eiDCefG@`ugpr zj%4Iu{+~eC5Tp-)=mNsrG?nH*YalA_ll+OgIlvaTBrVLi#nF8>J>Sb2_uiUWUueuq@ljCs zn(`ZFLeY=cC=~fBAlSIZcD0*V*+rP?J&g!2XKkV2@YMTWeS|>2(2_ z;CPOtU8EM{bUNFn;z#vcU5#qnzthcenUY_G%1dD}@xCY@=$lcE5o^ z>|o58TCyc>rzqJZu~#Lv)PN#Tt4aT9(z`_wGkduMq~QzdX8v!jx7GH5pDeS;4q`bkkb)Baa52g7j?l)q9-NobKoYls_7Ic zLVdWU?_X=4-TH53=V*2D;3T)$<_goWBsb`fg)H~+kY;md8$XWSC9ZII_lm35v2CSx zWZ-$4X>)t6W2MXYiFMuI+o6CK{;_0uF7}|MH}S(Vo+A-znGZWls5>P^mG3qxMz=4a zhCEnJ^2H3S)eNJ8i7kq4vvqh-r90L_3#>M!J-V4e73*s`m9?sq#eTb;=s|N!qgyOH>7E{Sf^(dgkPYm z$?WZr<-3?@U5i_ieUL(H>iZts8u-n0WpCWzsP%*>$Ov2J>i)Ik;!rkRh!IZk_O31OM`85d+*J z7DzcdHygXI(-}68LVTmkRc}gaP)AxmYR87(U|BtPx+0_LQayg2<9G1*YS|B}WN2t; z9vBM=BbYy5--{T@U`GOCb=6%g9bbVY9dVT8&-&8;-?!XbxaGzfP6F@?5Y2i-$$ATR z^LQ1K(z8~Z@Nq~@_hdQ)&6eg=<_I#GuZ!l{w_i)tn4GS^7owz943n$6I>%KQAv0K> zoV_fqvNj$S3;wc95NWViI;cI&_jnFa6O?sT*Go$<_5krhG041pVxq+WngwC{o2r=` zDs+#Tdg2WO>3EIzZ#{dq zB#|+2y``w>yi66@AzKivMQz|TGhhty;j|ZbU?te^U4mYb7~YzQW$*D5H9$tw*#!_$ z_K;QtJ~)}cB6~!_9IP&nG!=1gj1Q70Z`uf+>AqJ>8SrGw8AnseM`k7l2L;XgoUkhf z(92^@2k5BgZk}kHE$nsjt3dxGU$N}&ku^bAbrZ7nHo%`d7034#^A`&C^Bzu=!6U3-oVp!d$x=?TifOk`&7jGO@3YM zb@5Zf#+8*xj*+^3tPWdNQ%NeU(MCk5^{v>qulg15Fq@l_7#?WEG<6M z8L0e_h(vPVjMclB>9D4W$5sx|Uyjz8O5mu?23j-P=FAPg0<@>onKnutA`18_n0oe( z4efKxh>B*UC9`wmU%VbypaZ*3atSU#rwsy?w+{E0l2bvehLuucDFxPuMj^U+l+zol 
zgxo-^hRpeoi~jbytY#N>C%?h@MytL!|ItL}wJwngW0ynP4Q<4Mk_#@wujxB>er&wn z(6R4t{rX?lKFZr&RaU--VgXRd(ND^*E+s}u=OFAU5g6EcRj-_~jL97o!khi#BC_;7 z`t}1AlLpoPrx}A1j`O7nvkt0vVe7^Hgj3}N$YhTXX+2vIpUUXS*+Q_clRo)P;vV>WUo-di^FZZ z^D^$s#gF`re!)IN2vWSD`S~h}a%TO_^`)By_}o0a>iBA^L`B&r{!il%tXF1O3yYqX z7|6mJ9V6??q!C?{nKQL{ty_bCG$nWjT}XFMVSrqS{os$uBVD?QMx&i>_68dgU2D~`3@cw!#*%bGmYg76|GUQz^k^y(Rjgj3VW;ETm>G^ z&ZbwFh(xcPh~lNJ&m6?YfVg_qa*o??zkWykxD9C8rz8q5ba`J7D%G?1I466t>>>6- zf%39>uyw0bEGtl5=;H8RpWk{?UABYqbXNxLH)rmc$8YH_x`vL+BH$(imJx-GR|^>D z28!VJgHZn5Nl1zZttWF<;aZaBuJ4xG5}+LzSzt+a&Xmn{wqz>FxM)V43izHOz~I{i zyx{RToNX+O5!h|MXJlOHk(Y<254+0h1JKfILfk8!+1e80`ARWCqyjk4nm2=GX&B|i zvsr*1okvF5hftB98STQaENxlv3{KsTjt8gUt)x@VTU)miJ+y2eFV}ixmv0F+Fg*ti zbXbbDdmX+k2d^TDo*QQFF=X}n_24!mF66BtH$fKjys|g}{Lw~t_TgxM{h4g}0Y6vA6!Sk;PsH%ZLgym{x~^4#{n zb4RoDl3b1H^EJ8M=9LwxNAwVDT<*}Ha>3Fdgx}M=5FrI+&C_OG>wo-`_=&~Nj{^>Z!xpoXsW%;WS6dweei__QrFSr$AI@85C8wD5ehZP@d~ZyJ9__a|xg^P)1~~}< zhu#+C;f~j4DQYQx@;v8QR?inc)qN}!!-ck1)&1ck)Ra;#<@=zc{Foa*I`Wp^&BHJF z7-dXW!&w+NVhtUv_Dji+cUjoV_js$Si{jJc_bxSdWc?@%ksC}A_j2EM+Y3J`D;3p| zbT9Ai^5cH&r{T0fr2aT~c(-|`wZ3dWjS2)7)wT9-e&%GWsQS2;)yMYB;{iZfk<94~ zf&A>l0Z!XWhi_p^S>FpmAH?{jNuJtQr}IsQ_ev+fg>Jjxz`8Sx+baWPfMIvAUuSQ5 zb-4&G&!);nPy}v$ICVekMhhU3h7Yhq7XWii;@W@5tP{o+o^5kO5p`#|sQY_i?P$oa z^Nf{^PE(5X#{xK?>NlvgphAV4C#Q9^%G}fj?vD(OzcY4UBP4_4yI2iefkSWW``@jb zl~#*<;&nUZ$fU!+O7brj=I!GT9veJ-z zU>B1+TkrB;(IC2)c^=d9TTHbhVm@x$R;87*M=m3_bvrFsGklm_Xl7N(kcrV z&VJqnx^-EaVzaN~Fqbnrt@l_)bQp|T<55+8$2wd-m)><@{#e*_b+CbXMMUCo`Nxvz zEUg**i?A0`e|bhNn?>f~5JImYK=n=~QKW)bk>#oa;hD2{?LfnU)Igr)WS@}y7v{J) zT+@w${3PA!zyk7#H*%UK5VH!pr6t~USOqr@R3$3kE*p}KJhnXnz_5Y!jhOm;( z{LCji-s`?5t*ofCLMam&6YNL+XG!zRz+pC&X->{x==B|zjlaa&7ypu_yiDWgTYjvf zMxb+BmC@oZ9z~^|4cd;*8BPjQ^KcEl-g5XB}GRhVL`lZQZu%2%2UzzJQ2Fe%R1?j+u;c}Sk6E!7b4-d94>MOKl`l7_-n*lfH z!j3+fkv^KR%X(IfYPSz`!_Sf=wB}2_g}MieVx5i`Uv@kPUS^~q|8tM)n=)w#! 
z<$iB~WYDyRed2BRgJ>Vj%ElRT!iW4YcBOlAqQf2%`>O*?*Kw9(Jck{V^omM)6)!|q zIsysRig+w;f)nhWN8TUajkA-~g|F0*FP|g<%cs_02lrNve(?zb4}7OU+e2=G@c4^| z!rGg0L=ngazX-Lqv2nBfxg*KWMK`XZ@fkK*w?nY++-vNJbQ6+AzO#Gl_0LwGR}BuU zvWGqthGrhtZ^SNthht5(c^PaVY>V%o#uvi5@zn%0p-$3Yjr-kABf<_SL2 z|3oM^UR71OK#x;kOJx8w{WX3+4O0~NCVY-@1`vgE7{^xD>1D&{3DiTSaRMT{Q}QJROOo(D5Y91y+;P2Bi2IL{d;(p>L0(x z`Fw;ht0no?^>%qJcE`WJVu)i3p)7l>6)wZn%MfRTID3L((@t?{rBo0uyOaI;( z-4Ko#1;}@H6o@G}M_CSE?#rM`VM|w=U^x)5O})6(c5*iK%BG-QI?8z3J7y*%bJMI# zTq_^*+kDQw2TT6aQqX-VmVK*y>;AJ)o1MoprmIU;b~~B@CRg2@<$Ibl-wP8lP~nmY z-$fBgll7&S*)}`2j_e9rw&^(2&uGyw&(!s6iK|uZK(M9$yE^xd3vlG9wu^(wc+VV$ zgp(?xbWdJt7}}-Vw{NJ|tHGd#}vRxp8302#rhHa8cCavIKVm`^a4JoxOk~ z&-a)JZ12`+M}pw@K(ue*;RSS<1-*IkSu%YxOR(p^JAJpjTAuMb)C-SNmLw{|0a$v6_2%bDWb<6Y?Ipy&KCH@3Kibhh631nJ$(D+f7)7K?} z$=Y?>4u3&{^JjM=U;okVI8#P29zznxO;)hxM@jtTHEU)FU5^j3F-IXQNy(`8_F6e9 z=<95;OP>TL<&!1q$b8oAZ3g%f)iy!Ti#>>C*aqmE=>6>!0G_wwtKuq3gcIX(ygdA_ zYQ*mS`zuCuHX?NGN3mI6Nc;I6Kq|88;OR~FHkzjhi0TT_u@eEcIE5G9=%^MK3W4R= z1>)d{O@{~QrX6W=oyXObB|P;kA`&{TMUZLUqd#M0a$1UJx+QA!w(_`4*uY-crMP1R zoft{e&0j*wMA<3JbH@`~gkSd*7fDOACnbqYLT}ghx3ly5#ZB2HV@sHUGS8n&+$($W zefI|eQi-tNk3~*)6CiTN$c)3t+h%>t9)dLH{K{#4ks6Z+A4g_hY(9wihp6<@9s2?j z4AiO<5%P9}K|Be)oBoRr@?UfWd*kBG2__X2YGR6s`UmOi^*1|GO=|x6q~F8(ctCr1 zLdc0To6|zn5Fq-L$H%Xw4n^P7QNL*Rll1H3Z(Pf0sLlr}{jBFh#p@vlA$zM@a#g9a z5v4Ju|B#lR3z8O!2_zsF20>vq*%_HiA|`vPT$l6&{g)XU5h~!2grQR4vzs0sj3tZD zy!Z81WAzD?wO3>m(TvqJ16()F*LWwff2mU@)*U&yLm=ORw)bBZIkbIq>ioXq05JRt zc78|hD40T9uut5($@IU6n9_BF^zHq3=Ct5DhHc@7Z9Dp{2Ogl+69A%*Wi+=C_Yn??dSLe6`XJzpnc!4j(Rlyo0aEm>d2>+R#Ye&zo@bBuE}ENO0EU|2HrY zc-0#2G^l(KiNJ)+J&XZ*2u#pkwk(NEW#w*%GtH})ARkq)> zOUvJm1$au=(sU9GSNUU`!mfIjBw9CfBDj&G*p>(i?4srK=S5$ZOgl@11@^rAGiO}Z zGg*o8YDIPd@Qzc;=b&Zatz&)8kJK!$0Sjlxaiac=NwMqKuMQ3|f-J}*)vaRp(esygFqd@3p9fDbbN`DB6w;R0)MXAl z!mXlOoo-qVXFzO7w0E#evlxrxo_KpEaldKx?OaZ-@eCoy*J(mM#%v-6Z*xtf13i2* zVg#Eb@Cu1M^v!4?rM3#!!qNvo$in}$z1#qr>z0{ zY31U`4M+B?7Wu{gI>4gV+^w@J4fXQb@3FSv)BQbfc&3X^V92H6YFP4J$}C#dPH`xe 
zd@HGmQI6wg_3Py#)Bkzsx+h|qk1uC%J13j5o$>E6x9#R`mR;MOc3i?4(R$?s6|}hxs;CY=dhB9CJAx!*Z&2J|9~P0#dbp z%iF84Gs5GMF!CcoPI5RSv$>LWt}0zOvDTDeLVV!1UEM7mJ^#3Od)vh;{i-3Jf(}c; z1phqAa!{C!au+s%v!~#G2x64CTl!dtxFnI%Ifav@hM&z{uUYb97PO=S!p1u+V9FQX ziYCn;GOS-=I3H59 zccx-4T4n=ajujq30;%(kPs6`-k~wSu@10m zA2j@YFM23zw9`|{PplTvqH9_pJAF@%UrA&1f(@Brp{__W`1McYs}nW*)zFb}leI!Y z`fbk|@siAr|IKRMm)uJ$!cigtkERvtt0TMp5&oan3ODfniS1*_Jv_cWw+mD)9-~R7 zWMoq1^9*Z-t~2S}4iXl=_|Gf=Jo^3@MfwjTUkg6$Z?*>Un_m!Rx%`?`qkesu91u}< zwGKPnWI6;!f&y*ROq5dA{}7ziTu|vpLs)J#52CXd1zBnfT6(_k4r}owTDuIz#Mv5qnWqs6`e~#5)^sdX8`M(^?KQ2^i z#o8POz(21(51BlKDa{p%WfriCgmga{@1)j_1d`k%`DXXZspQhUpWF~pHOdY@m-(mB zh>xP)y#0i8V_1n9niXWlGz~ z$6xry_g%p2y65IMs_ro3iD%dfmpui| z54AOzYkBxG;_aLzIl&%vw^3*vX&6?l!?1Ht1*OV}ey!JFf4@)b>k?y05nj`X-CYXj zMR}GtR{75aBMVxbg4}Y@!Z{~nuzcIeYhPWKb&G+P;+B{nJ)+74bWSSCi&>m^ZmeyW z{r)Sz!3mTD`tIGmSn!nf3fw&`c@d`orB^i6k^lv+mmy|y13p%~s;5ruA0M0t*a;Xj z|4U=8Xda>ymQa{T`9$S~&^~S=1U7syDQy4Le`@7(T5HJA(8t?38@q!#Y8J#O)`4f^HL< z-U>)bQ_VL1-O$7R7c3AuvRoS4gu4Vc=;cJB73iriLig@d%QeRIQo!?=w4>5pD$bJ9 z!Z}VWpg&K{;nJq?vP0{o+8U)4{Ue+m;(<9IB=T82gmw90QSUtuvQ5QNzaI{OIdDf- z`oa*I2%d%TW6-i!@;ILHQ+cZ~|K>AZi~YhHAGi>jS;%+)aJkn0-w9Q(`E0zbpy`$zw27V|I(N=ujliMWo9E@SVTm#gHYs82gW;T4h=ll>Aax z`n8e~$%oXA?{C--;@m5{O>UMFVXgP46`JQy9Q>25!r+N4AH7ua&HI|&8fsln?tHp0 zN8YeUAWCNE9f7OxYA|B&*k44vl6JJTEK?e0&i?I6or3?-ZT_@c&_Ju)IhDD&8>Tne zNZW>JH3@V&w|eL$ChmUFyd@b^Sgb%@ZG3RBzQ6=}jqVj`I+4y@#jv1B+qG3_TQ6jq zLA|+VsJmF_aVn_u*|kGNkFKPbA9r==-|AP3E{upcf9{3@`AQ9dWv%Liu2uXId_jBr zVD)26<gp-p)^k>kCPH$Ij}`*t3It z0Y=q~f9G7%KnaW2N*vQu6M_u&Ow|TeSfa*5B8h}7%yU|6$1b|R$g$XT2N#@>^E0c- zct@x|04Gje8~?}btI{&c*;F(|jBXL=Bq*z$2;G2_bCDPqtP*BX*m0{P7I?py+SvDS_asUWbT`>8rYX-+MEM1u4Oimxw)4!?-<~hDU_I6?3Zl1Y)?8_pn41w zIM&nTNOiE`vU#C_kE*Whegb*w)BYasnDLE7@yGsex#ipT2W9-HHJ`_lkF>UiO7?Bf zH^7O2M-i?2`%d3#3Tj5>YP>6fe<|#LO2a=S{Xaiyj0t%qHC<-dH|}KEN2A$NG?8=# zfQfuX$9nr%n!W~f`Om*Gsh}EhH(*W4hZ%3L#Q+*}ws+hgnsrX?&!=ks zr6(tZoGeksQ3a;d_~Dky!EYJ!$5h9EWf)AmR=?nhGf*(k0^zgCaWOLRP<3wo(wXUv 
zAjtopX*4LEljcplu>?Y*5&rxoWo=?St0+EmCFv!O@F@LX62TNe&mUfOwR);_mZPFW$|qOLP9M@kK)mf2 zd!nFXGOQK(K&h3@tzk6|T=Ep&x+tuCOBB!S3nSyOid7EqvtyrKCNgs{l#*Zl5_xjzSaZfpBz;YaJp z?Gcyv$mN?7Q1Q-qmb=!?cXsuLIw(06=P5;)?drx_H>6Sy3Ac`47Z0E=qm_xz9OP^_H@p_*gdTI~uZGoP> zACcO9AbIa!UI|=~vf>rBi1?(WKn0zEYP8UXNmfY+8!I+UxK5+uAHCjpXkUKP^)@j` z>Lu$(-SjV?@8e$AbC(**G;U~Ex7bkP!%(iJm+<>8IYPmOtaBYLjZLAA3R)jE)s!gn zv^vEFZxCnvc?YqG_He0%GImlwgX(MV@bt$xI2a;WZcG|n(16skgh--8hz{RSzM?nQ zZVjdSXy#DV9pTx{N%AZT6FlB{H3=RhgaEHoU=cSC&`|kD?rYk5qUCq1yAXN<6<%a+ zyJlUHKYk35kIX^`Ic+aoG3jPmZB3aKnNqW(fWoomI~RYbrxpo-@SVr{QI%BuUfJ=F z13vkAu?|^^7tS#Dahx~*>HjhI)?rbuUHh<#N;ilgAxL*hGlbIJNNu{Mdtd}4plTUTlv2R9l>AF6}5;w>u>|v^%4|<>Cs$guup|>ut`{6TM7SSe>~C_VnAu0^QSZ zrn~<)LevDI?QSt9e`VS9xuA$rp~^r7A@iAQM!5=8C1kFtylL!@^F#G_Jr`k#LbFvx zP&!7LWM+Cr4dbO-S*!JjP}!*9r~^kl0$s~r`3W{rOa7XBNu0dA&y?Os@^SOrIR2r= zNMVOjAvQ*Lxsj&L4_btpm}on2i=XVhpk+a)Q*oNF6wNrU%5^!nUx(|1?UPCj%w{tvSt_$g=D%4~+%WaJ?;^SxGCkc@-^hZ`o zk2pkGq4#Sg$+O4opO1J1zSNCXrqNu06%D71%fQC_2kcWT2m6&GZDKTOF~#+%-=_*w zN8>#{wdZQ#W!Hi_OlpOTVe&INP63Dbg=QL`en|ZIXjk`+y?qhIGhY(DC><-x^+8eGqxt z)?OZoCKn$ec3n{~!2tst`^U%QPjYqVyZ>-ye~|j$aQ*-O1>g%_EqsLovSUguW!a7n zmryn_IO`{u{VSG?G)cc~kNzL7O`Y!fzh&2n?E6$mdMRhor;%f!;Yl+0k5QA?H~K-D!)C*K>rAe zTF6w+1mLrZmnoJ9>2q%?1z`fPWN80$(bvR}z%l33j&eBo@$*FXEDR!#$my34V|YjE z9G!<{jpwEoN3UYddeL(#zQIl{Xxna?Pf$JcA_4XkCF8tpAbLol$VW)NBbUvZ#U!$? 
zaXDt(rt1w7OUXu~o>pKla{j#9)C7?J?KeJugWQ@07$sJ-t0vIwAn8`V-Zg`Q9R6tT z)(|$oCJjOIx0tdDtz$-eIpbi|4fuRexFb(sLd0|8qP}}^W^Aaa4ZT?Se~fHMY^aK- ztC)gixN~Db-&j)VTueP}a@<9k(QZ8I4+aQ6C6*;}TP4Ff+=_{pz!C%b#ia#fdFi9I zUZF*~lkYS7Jf95bxKbmYO4G%Xk&Y6`6wHzz}Sz1V33(QZ#lbw)W9Jq|y5aQs41qulKv&2=l8qC0w{IB>!Ly1VZD z#FY=|MsXQHZchjv&65ix|0y-T+x$dc_1xd(6B%=(h+LGrrwjIfqcCLM#p|#%4g>c^ zv|{sP6?n6hCye-{SG)Cpevv$CfJ?S^-{ak>m4pY0jM&D_Z*7*ETv+<^lL7<{ zU*3YayDvn76`71B&6RMDhN@#JbFwlWon+D#3wi%RyZ!+8(EdltA1v~+GUtKq`UUk2 z?C<25TuL*mk5)dsqGpgHEJ)|ve@Yjczr_lt=MYe~MhZZNtY7mhCHaBh+i0P)9`lke zUkm@LN&`lur&I~KSTL&y(fF>N>)|Q#>fI<+vPwsGP)7X_Ad`V5TI+qeE`d~8ae^5zbLk>6zuiOPaR^W z&7`t~OzpRAe?`w(J8@teW#<|$dlroO@&A_*BbYYCSuU=w>TJ%RH01`1?d5qA?3ues zO$N->i` zWj~JuT`1(#S@hNY5z2h&QVeaG`v#NxLFAnHK*?lSrGzZHczufu_~5Y5RDC*=m?Fcj z__7MqyMncdyW~DSw1snt`NI`N_FW@9c7%e(Pph|ETB##8xcnGZI-4%)w$Od{?dwJod-<-_H^-Fz?q+WSDrVd_^ z#T}BxHORjKCzYvnxW2pg%P%_I0%<~sA0`wTWmo-l1f_k~w@vbK(Fl#2FbR7;MzHGy zin3O+Ee!owj{Vx)c=EsU5FUSXqF$S)GC8<3Ur2jIBR#Q|Hr*zY7@3k83w1qGQwTR0&U^7Sk_kH;r_x|@+MiJy&MCZ7Hq5h6JF!)p2;V*@!9Ic7XSxQe% zz{%C(`K#QD^Z=C^jc007*Nc-LsJrVI(sl;I-RBc)(KO2G!T;nx;Y95@uAf4$W3HvM z^fd+F1u%kSOcIM2q6`!n<44v$5RX^{l1GZja)1#%xds;4y z0JFXo`Ht-!Y1wBf_KHIB3Z2VsnzehyS9* z4tHR$X9Dgs(fpG6@jH6HOi{XYz?!GW)~d>e&A#VT%>Z^etpFP-q5edyiBh~M`yzJ1c{ z9=3jvc<&InAMq5uE~`IxW2*rkJ2jb%y;qP>esuO*;Eb-z-v=vyjJt_X6zPe^bx^9B zqtF75#mdQ=Q?U5o@z`g^d+j%qq_>M4*CPrGVsPNBP$Rfp&7Zsxc&-rzv798)TXPhm zY3bitvgWF_R#F6(y2~RIdp(^3J^}yTYY2w5zOl#uRrT%jamHIT{Q!x1W=He9?zrA= zm>^S`kM8*+{WnW{{!aOsM)5|T8L5z-Mw*H%x%PFWfa%{1-2I?|v}DbYs!%L^p4u~m zGp(~FAe}Ao`!v%`Op^DxjXN<4OeHTE_{2Vk`C+a`&Mk+)oEd|$$D^-*zk;Pn86r2M z#8v*^zL`hgocuEMM%Jnb+w9>b4qMdd&qqC_p|I1uCn1a=SCtZ>Z7y<>N9KO0SV!6g z6}*S~;HZc*_*1OUIG-y10YLe@=32l*Em_h-H%6rnx#6Ol|2d}rpGw|N?7ISZ%*Jp& zFDi>0WqG0EX|Z@U-PE`%Kt}W^?F#Qnxsdomcv zopFMmTlcIo^qPw5J`277526-whI&o!DC;C+KFJK5vBMJ z7y6_6L+ysMyG0OR?$J%sX<4qfi>t5AD5l8`IsRG}7_Ye#nr^c@mW~z>_O5f$p+NUk zvgifp2ShMy`y5x*IZX<_*kY^FXGE(RQ|g6%>)N`hAK+6QAvFs1H5NO;gFy+Zp7s?o 
zr5F=SbA6&q?M92)oOxTW$f8TV=XweAg4=x|B0a6zwBJ;IErjA8%d>}`uQNT|j|ruC zLx(W^1u2S=JAI&@Dj1CPQZ$j#q!_s$mbA&VZCn>*^$+00{k7*(`BILbL4<{|^x>1m zYo33y(Ene+|N9U5Qgk8l9+;hyl78VQC)^`DBXw-QU-@_CcE18!0OMqGe}-qxli@G= z>H8o2-zjsDxjNhoxM0app_}V2{`N;asXXeeX(Lgd75Dr;s-2qk1TuyN5JPb1K9+~A z+mb==>q&PMsrO>tpusE^9V+FW#(TPyCu#cMv*wBPc6mXyaHxTR z4J&-9+e;ZKXeovSV!_&sddf|n1=|@=9XsD;{Z6>sgN7n!N{w=t3pHEf(RZD2wgzZU zpj#D?FFqKqdlX7)b9{(n0Pq!Z4Z0M|1G&?21SB_|y%msh~Awj_kz0} zC0*~rFP#l@$BxSJ(T$93$f3RKvlVd#cXh*=`JVRRW3tuL=|)}UvzA-)ke;N+=3Vfo z`LHvKlNC|*>O9+V35M@WX>KQ1X2Ewqj^H~^k40<>to-vqyOT$;vh~;wA3OPElnIhD zza+SXOJB*m^R3UrPt7#Tp%W~H8;QE^BpUK`7=ERvNZ)lD5ildehAn%gABzZ>zP%Rq z4>0In@4`Bi{8VO5=pL`F9pk{02QmLI%rg9OLWN~(mV6WYS8o4HS)s-p$NbKA>dv66wXU-*&! zdq*$NjKND^%XHXXcCOWi-nW;-7aDJ2n_Q8ZnPY=h+)`wO)KdCe_2z*7`L3Wa^A1az zZdAIOiHEu{$!LOX{edX(^TfnZ8Bsy~j614W2QOi6@k^WC>K$Z2t6E6ZtEHz#Qit_R zQTbr;?d>N$$+Rx()w?zOUf&byok^+2NOX}*w)23V7CVl#l2K#S2 zOs5fm!l99?vc-_1gycc6p4GkDeiymae<*-mo(>54KuTFP0n>!2a_KMmw%CK)|BV>^ zbND=_fLM=g7Mk1_!@s~ySXrInV?X7$_;YK|rZ2?B#9#{wfD!L`)1ftUUmEnZfwej` z^~y2^D*I!wpVzdwH*(~#L(ImR1<}dPgLO$Fj8ZK!is_4ZpmoqgV9#~d5mxch?UNJB zt&N8{nD@0Fxm&-0SN(-#IHxHnsu&c|+R7?@s}8u#3-nckLMI`6?Y{U^s#{scu>L~n zIW$h{h{RAhM8M0hq0u0*{`J~`lVUfolEI;Fw?v**U}E&J;KY+ULNRT`1u5N@4%_^i zM^pmhm^K)QdR2vBsDJ?}*h4FcEUEJM5gKyv*SK8~rmW`>l%pJcq&eX%A2sy03;v$| zP;E^g*-J1o$|sqIrn(>dUn}@Payi#r)0=7U+?jCoV#<0`X@#cFLgqmE^DSE zeeZ4mD)qk0HR{a>2#t*;`*u&X+hH4b@Ij@J$=z}=x?4p2TJ8L(BXF2CUW{l<>~6rAhOgSRs$|@(=kUe zZvJUdU22+cW0cNhys-Yr%K1a#N(AN4!u zA9O8WRAAPFT{L}+l4L;+&@1$bc<9Q3(lS{&H((oDIi#BS^}`g8>@&5bz?J{px9-_4 zKV1PaBbHS5L`B(8CWm5G+c%^C>WRB1t;utx>8BCE1u2aHGd<#YQO;7#-Bd8%pDsPvo`esWAdf{<~qq}+^I};&gp8i z^E!p=(qLo8tgPf|$L$=S)Y+QL&NJW1(^`N;e@Y}3;@9oV^RoH)hs-zCO=sUza3z9D z?qfMTm^Tso2EH5PUiBUo;r2O$fbK--g4dX@q*})y2q=BmX*DTgTU_`oPgp~>uUk{F*q$;GcOANlrI zZ$gLn3%a{u7n2~ZPS(+?z2R%ujjjCP2mlOWVY~a2Qz)&w{qCW#srcB5OXF#EZ9G~l zW_P96(_A?$mGt=cr_T*NAK<&n_S2kcJ~vkILk%ihdr4mAu7UrGC0pYhy6VI zy<=~$zX$kFb;A`ox>0L}H{nT$={ 
zDkMIxRi~rPE_)cROVXmmvw?Y{KiBC_K3c8Ks2tUxvOu=xkKULaSm$-{#Fn%$4$yC8 z78I%zXcFg8=!O;9V~>W{>V_)N)B2j2K`f_Evna4fMX-naE4POVI;j6a&LB~sR>BQ| zk@#}qD>ugIVr{nipEds*jsC~?gSc*MjK63ib^s4vH|n<8Crn)nbXMV(2YwrwFZ$=i zg1heh@W^6R>SbVW_(>^sH1UN_punoPAuJ2FWoMy}%zXI7j+q-taW5f;u>L9#6Wd)L zbY>B?-ab6bM=d>WZWHXv^8*Ij2Q9L7UH4w7#=-}DSaA!rJyJ&qreO?fb+FV{QAhvy+<7q@L# z!Vk_Gw$P_<`$;%{Ss}WjB=Mw4_5Snvjez7mRYSgS1{$@KOH!4|^_MRH9XtZttSOmE zPd>o8SfORgcc)~F`EEIoMtBSt<}3=Ly9EEqnUHXJl;)>d#biNhTrw&(4Hg}@+F-Mu z9bXT(uzsEc=2xT%^so1({y>Xlp{~?L;yqCn6M1 z8M_P*#6t>0QXPfx=D&4#!2D8fuedy%%Ys(BM4nQ4;pv_&rAt>8FZ-h;Zu8y*2SXzV zP8Eqi*lQG{y%tl=Ogn`b<|a*1*CfO%^_JGu8iITcEQ|X3-L02!O>7`;9xqP_>0#7z za41vLV$D0dgv(<0bBqh-J&zg#)PtqPNB0{-x!$QLE54Ec?#4}*;`th@zME8poSs|r zA;UY3-e1sY!bbtpZX)1q3Vad*{uaZUQLM|@=>Dvz{{$Y+u1)%*??uGN@t=S90VMCzUs+z8ts+zWs^PMN&h*e0imlzYp#7v0aM+f_<#tMgShY7PnuEwjO~cj0-gGK|N(~K- zv1={$;{U3H_p<-}_RKvN64Kl)JN4e9j^J<}cN$E}jYSOhC=EHp1Naz%-fAqeE>{4Q z2B%lz4JIh;hGkN<^rB^dd+9hjeKu#o&0l`Ok2UDFQ3w=49ZLGP1GpC!XT|nxP-j}O zW6{|bZMD>UYsl3Z;CXw!msr>7BwO(C==@XFeH3laIrT9i)=zxO(@Kledo_~~>yfTI zd;(gR`-#lG;n_#%Jy2!39(HkSq85H%`eB;Ub)+5FIXSKZ0&6n!MkoQ)SXbq=J=?gt z8K%M;?i6M=zV{U5J;VX+=MO43`c2b-O@6Op8O<|7ZcM)KgJp zFY=}Tp#^ctZ%HpC+)x-2 zu=P9iM~+(32l)4aH4MEjWYN+ID9E#U_^Q-6L^k@x8{Ko+0CtW}ro5)*#$Dlbs^9XO ztqpXMzCgdqeSXE)ebFWZ91Hs`D9o6qsQkh1K-JaT>B}iBk?ZqhZj!W!W?hcKQ%g7R zVLHppy+}-o6Zks{H>$kYG(l!*yB^FKBX}+*gh;`Q4$asiLd6tmm+j9WE^naH09T#(F=FG>(^l$A*sUX)|z9&ad*_J zKaqWtfYkZU7Vs|5D0qLDP3j^K-^6$4`FkP%3D{Jg_~kZnh1@Im0D`-GAb?cb9ZJe0 zMZk|0T#hkG?Sc+zif*dfLnp`NO=|j+pYC#XnR>sb=JjYW_-W-3-qyco$jVUd9go`leCqzjuLCQdx39yq z|31GabrR#}8rD3w$fC{Dzg-ggJZ<(C8e0eB4B~W@sIv^>#{^sLAk>lN8FN6yQYEAqTGbV61Gw&_?aE>@^Z0hwu+VrrkA<|qzsr~v2XJf3QUzXcoJ)?K7e zD+rU*XTFrN$EwhLq}-6B_A^gqz!?~W`}gn>Q<3F7_YV~;_^FehNfw9c4h`fgi3(?Q z&X33sdTWJrJY%|GokF;kgjtaJ%LjD>bUhKv39dHoTJUXi8cW+k3>>2}i6&4q>JlJ1 z;EICYUk&I(qd^q&+g6_s0cip^KkBwRU9pPmS-@r!DSSs~z}R>78}BnUa+>hR(%DC#9`a-Ugxp^00bdXQ2`FlPU`sG{LvOevdEl~MH7j@0(C`dzeopIe zi|+NPS1Zn<^BXemeW`xrINW`a|;9 
z=hv?`X#%k7Cx^C-p7_4_9r4gsW1tm*sbbRR9i_ZjNXs0Nos;Qu{_A4;V*<(_{#ClM z|72X~Wt@Rlc=*nTtZ-XHq#h*zrXKlu@!hxd5vIqF9}m-MBl_C9kpG5ucM3*cJ2I&Q z_vZ`{RpSYZDJ#N)$R&f!D=ChG0b>$q+U1OtVW6XDo8Md!xMwCO&HKsUPFWlN9FKrN zn@vv>yB?Q^)6VPXyGbWj`e9LP5AVA~T$U793^P;sOI$k#in;Q7Fob18^PU;+)5NeW^fhW82l2A9U~(a+g7P zHUWZ4N?pQVQ0Kek@*zVUqnB3elg#hz+FNdV>2pRCVhTN*-{iTj3ioNCZi+h&`mWEq z?@neU-0dmJC53_yvV@G~f^=mO>iO+=rS@U#xS_GP&Z|g!#qTDL)1G*kx(9w5#XAlS zlGW+uQxo&l1r45cggsod#EbhK zYmqAZ;$esk5*(PH>YO(T>KuALlYerlU@uw@UWMrurZhUd%#d~eO{(JI;c14oDE~Xq z2}%k|%e+;nh#t9#K?lDBWkq!s6-fPOVAdLx(U1hmV;RDbRM#$z8g2ZwN5<+;GBA43 zJq`WMA@YPk!a@q~kL+lRyki~X3HW%77Zq*l9ucTzz$m-5bunnvT(j~*)l9$dE>tr$G0qj&X~Z!tpE&I_r44hB_s4?e-7rMoeQ% zK`GFUs9pYIH0Cg)$AS5V+iSp0HYD;Y1#uYZdm03$!^KEAhY1NLCER8?2IDaLA<%Z6 zKE$)q^sSgU0rv2fCfNQ-o=i*=u9+}$c~Uqz=BmN2*;#Xz4Wltkd7!%LH%z@#MK2LF zdPA_bq#o8_!o;QL?%%no3td&p#FxTg!BLl$Z5p(ZurFGdE8+Yl`p zNXXfD_}|OP)dmj$z=(ed$!gY8T+6Z8gM$XTM-UB$N`ZibN9~a6&%VvQ<@%hc28a)* zqWO1@C#E{VUGiN(nEPG--;O=C%gW0-RlAW&N+;9EdB)M9S(WZCagvVzItqL))*l8x z&5MArMn8&p^i7~U6}W4k&|fzz+CMZvf~A9~f>#YEB;=6cdy^nMZ=%mln%W1!D1^Ec zqDu_{rARm*o}{r@Lq4ig9Uju>voln*+e1nyO>f)-nX+>NR51gsMr!#=2Xgbg> z8R#yK6tQY9F-k)?R=>PoP=;2;YeZK3uZX`$u0F|6~&QH(AqMAlh zCO>M@VcH5EaZk^t)R1cb)=VcdTs+CSUR+PL$wRLiue(tDDdJx4n1JMrC2Fel#|(0E zBi3iK_!NI`_1NM7<;IRpB!?)oiaz&nFCovlA z&o_$?dO!4Xr&i)HxGM*(8M}svly*hEbNm2UUXcY7%c!2A{YDl~USdZ%H{RE%=*W@Q z+h}W(FP$SiLG~PgbE{|YcGvNtQDYdeZm*6$MjHVvr1Q`Ttg$!NBfff|; zu^DTIb7Es(SmeaMBit)t@Q2{q1hDvnc}8S<-Br<3F5+ODk2G4|&%*OS*qUGw?lqyY z8a3}LTElhpqSW22gB?9$&>N)|zuS8d|1H~Bf7K?r53N#O(HWZw9fMw)Te`y&#R9;^ z^NOgzsi0nd?)WIZAAZgxn*Mr&`S){uQg)|-QJP^Mr6UXbPe0ck{i~v&xkR@%_xTHm zKTo^w$%9lzrPhV&m)mO}p9z4I`}UEte|jyzgj%N5Ha z@q5@U>cHc`fgbIJsg9G3Hm6x0r*5$?Sj51U$F4~BOFjE0u`zBh`$mC>ZZ)r3waaxsg*dj{bCZ5@ zhqV{oQnCJ8Dd#!unU{aY{UXWX-j%JiSeqy;b-^)p@I|VIh+>v2k_w8R-piwPu!QOe zNT;Ga_IX-j%qGfiLvd2{{Tk~CUstE5>#xkhH_@P$y{8GZhJH3%8E(Yp?5(iTP|d{t z`;V(i?zXVoVM0*pV94~aZ-sS9ndWGcISJ(I?ZP?n)KYi|1 zR?E4fIag_3WdI)(?ow)BiQd88+Oj%g{^KhyO&L@_BxqIX8@+iW-{}FIt6#oIRmQdt 
zjZu*BA}y6X)P6{4rnTv`r{%rj{p~phq}NeOSH5w`;43`wITxItV^bL94Z|u5-3?N| zM}ZUE)rHaYmK{1u_Y>zU2sR@(Hr9Ab8=q&-SD$!)j~KK``BV`;1fgP4N=4mX6P|vx z!e#A6$VP4nxTHVJd$UN7)rf;POYOkm#Xk_3X+B~vi12IxSo`sNZ_Mh11Hq%S#B~gE z2z6@o=J^kVWz?c`B`Iu}@j?sNUhi86J|xNbdpm>94NCKE@)kvHeCR)a-ejG3(yf%+ z_*y{X8einDhTYCZVq50>oIxZmLkpTYBRQ++4rGhb85mmJ>N5>$qo6YjW2Ci1djT%^ zHi7?0`}afZW%N3~Nzjo|x^eD$ajHo#Ofec)n9u`$f?jxEqrmJ{%#U5i+*!P4(RH5^5}~rYSO3-S#+<(uNGQd8RUg-;L*6#+X($V~4Fq?us$T1}S)q zDi#r-4`(7%iT$&)m`UcLEK`ysRUqbux)4<$QxwNVL)qzGpL}n9O1CacLFLvM$eYl$s^Z4RRr~1KerZ zNHW0rNXQE{C}GCk82!U6=hG+)I^b?S^Mb1kcLx=XOyU8Z~{~T6qsWr zkXt_p(KC88VfeP6lq;3d`$IxW;n+S4>x;hlH(YqE2WaB1oWdUlwRxgGkLvXBXzBwY zX6)d1-C|?Wtv+jcL(U@O^Os36L;ZQE`@>o)mu%iUY^)rij@gB{!k=Ej1%zu`jYO zW0azeJBr+o zX!Q7y{yYrIEH!7^S3k&a5Qund!^WW>_g=H1ytlK>XYB~Jo?6GK(I#gQ_Rmxm_|1f9 z)7zC^O=`_)=m^p-dyKQ;zD$Go`>u*G*FZX3N&>XHx-Q7b%E85fY9!}@4g`Fh8b&|?IW4ak{UXL z(|hWN-T>k8lvgw+_cq6lvz1lj(uuqU-{egK{Xe4a0Cel+!N(TJ1F&T8{K?1C8L7_h zhE(W)>-gzPr)L3ra+`Zme?%>RX~~f|rp^l{`G{%Dm4ULodUMH%c}Q*#$7_VtxrBGc zKsT~K7@4m@vwjM-U5iG~+3qz%upOIo;|h1%hBMkhtXI-8s3DnF`{<==S_#p6waR!=Sn7Kt$ernC4{3|#@?(qRZVU9t9zg(#*Kba8| zMza(2#6GXwI3-<^>m55S?82E|>4O%iYf_{W-hw_Suzws7tJ*9=*A$(Ghm=%-5K&aj zo_?j>b$$F^))syj@S6B!IeTr9o%gcxi%Z?$)u$2v0Z6|VnAw>Y>;X-espeRjv=}gx-{AHRVEEqiB&%Mp3vR<@=k2(W^Kn{l9x)* zph}O`B|6NsN~xa@7WZ1Pvd&arS~%ofXeq6}x+Ury+WbFAWqff{@*XgDe0wp$!E&dkGtObgsXB#+1OHIjng37i4jG)x`0Hn5 z_EeGu(Kr4{oyc@FCD`l1JqAqBz?P$xT-V%_p<$rykRv6!Tw!#=9;Juc@CyfF@y)W z!O%hM8?&@nE5vCV8JbMM1|`oCw*ayJ3`kB{Zn&AL^7{13d*>CyqGB3aKVXOkXhY;0 zk}da03U%pl;^!An*JTq!h+HQFcRFYG$P})O_@O-?nqX0Y)prB5VLQ4l&U??7Ue4WY z@?_8mn+o*WtYun2QevNfcL;ooTIEEYp#nmES0qNB)lka6YB}$DEp?q{^(!cV*nBLx z8jU#lu-@tX><-&%%pibTxn^%3^o>d&W*G`N8~KrE97C|xu1BkPyV$EfGa?2ihBg)Y zn+GqrPC#DqlflP~<~I}AyAG_XO6|IjU2lP;@7 zN+Mq5zFtm0AB3+~xC4`YS`y(G=dH&W95I+pHyE_+-wRcE1VLw1614syueNB{O9H6{ zen;2H+;&*mbrF`u_t0;P=n*J+PEc6SBkooHuiN&NuN2Yt{<*iN8biX{8vY@8vnTB@ zkBY$^3OtX+D~qPL6`D#ZhzdNcT-7X7d+4_r+-2xD?7U%2z;5&%>!o&Y&A4|>+h1Xh 
zOcpKayrTNtXe~~;iRYA}i0uP=Ebf^P?}t7D_cl_fvmLG)K}un@8#o3hRJ&)n7|u(Y zVtjQ`-!Yf)gHR~MCIBM zkc``m$8R_az?(d7vJ2=Mu)?Hh4P3+1pfi#JBlol|EWeDAw?{72TP4ywGiXYf9!^E$ z)lSSOEn6E71k|SJhZ0C1S23Y;&2L9M&+L|T$@fZ#&|_bR;op(hQZHizN@u1jBDWG| z{DQtvU3b^RV#*r@Cx2bgODmc5?k`}q`yAth_sg+Y?f*N(*a+c(uXR7tC3?gXLvn&< zBFSi(Ar}v@ai|=L(+}1CxuACjatL(%_yK#<2JoUexZCie+p7#?HX_#Po(=jx<#Ke< z|K#T|3?>pP_rvqTI>jzt(C968qes{0gsSj|B2aF!nhi;89V`q2azc`|b2j@Yb>khe zRVv?NC+r|BxJt89*hejyC0y8>o#b z7rI}%l@@eM0b4-6mb!|>JP5>^<-i8GZ9P3!h2QvU@OST9{UD*{Za9d!0?%&Hby|fE z*bce|A3oQRtbd4;#hyT8xM`LDW>Xm8&Fega;3&9#aR0>5KHha%c*r|20?ld7nW6-` z*REM{acS~Ir!9R7F|Zu8%cLY^T>Kb*v9cE^r-xa(ujI1w73O#sj56U{%67Y9Dx*}QHjpqD|4BI0q-d6D1#f#|BB zm=SQC@Q2RJgn7!!3`4`W`J3gRzSzj637d%aq=`pX=#d7{b?$+b7(OKEwm@U}m&y)- zm#bFG&7dd4mWT&v4rVEWO4vkduAX_D0x^Nz_nt}qv2y3VS z^#9?j>2M3{$#a=slaM_OgIa7vjLOU1E(d-Hq{|*Co?JwcW_)qd$!#_f9v)c2Y(ECC z{6>wlhT+ju{236c-z3hatx@5fNsHVtI*d)U-Q||AIM(rcrtX9vEXVSG8EMFXbF zvPRmeDfEo*1{?UiH)eQGtu7l-2f%q$;p5xX7%vL#wrZm6xXqO=Wv9uTpF6kt`nu>t z{#}sryga(3qA>ZAy|H0&uTIqu_KB&?E^Om-P^|&=1DSSL!l> zX$9gWiKKWv{i0a{8<;$6x616M0&2NY4UR|nn*35S?KTSsK@*x!qs=>`jKOp-)NVB- zb<=iiBNtnJo~lmI08$=uuStH51bJ}?n+Sy|MtT_eEj?3!=c3@U$l-Qj0scgvTUY9p zpNH1yWZ%f}?KM6#aY!5~Q}E55G5X0tr5tf4WWZ}!kFeU=?vVm(#<@nPHcf*_0mby5JLu^JixgXS^H(QtqYC+VOAnmUf zYHNGbYdxB|ewXgM*C5utGm}jVC}#|^S7A4&Py33q6&dkyMbU{K$>B!y@G!m6{3nL! z=ataj*U^;i^zeZ<%};VF5`i5?IjHEb(gS=_CWq%tA+;MM&0V(1gq@z4;C8RnH%tf6_Jp)OP&8?i zOT}5!k+4S})7ERm$|A>h$0R^bCyYeU{R2U&|BTSq^3{atvK##A<;R>ExZP$L7pYzs zs!?azZ=fG{70EqVDhb_l&vm)Yp!v06xECbsx=jOK`>H7^>V)uDw31_5^}8g4blM0! 
z8)Pp(29aRybQ~PAy$zg4zd_5K9%T>pJrw#^sHVe1j>^%tT)X$7dz*xNQR-{pYDAF^5X+2o0}>B1OftCRVKv@I&k4#iP&7KKwA z`@_seWq5??73rk%kEe?~qDco8lua7E-*w(nWE#Cqf@yYd5|HegxUiXM&+hOWi0wUmUzu!k9AEz`15)OQQAX%vyP$cxqUX>IgS>M` z)R({xkK1m?!83&55TR5I(TO_0kkIf>hN(QnO33Vg9+&eU`B86>UZlNm)O(x_$Qn;Z zLjPIsS~vN@svcvP{zI}3=}arW2A#}T0xw_SFceDQgII>HbzguszB)9=1>XIryPKH- z2Yl=esQiJyEvR?M?KdJPC`kR$;AEK&{1R~F*LAVObN;Q55JMCE12&I3G$#+QzWh^I zSk*`&nK$#~dM5mKGkPfhIr`z-eF@C?hE|`1zD!V%XS=6Ok`U%UKP+za{a8^dSt!cnrb`yMmINO#uCtFF1|BOwJ!wL$=AxwQXjs z!}D*F;t>7qW4HoArpXFw<*mHZ&oV21@{ALF|!pr<{X>uBguA4Q~4|d(8l%qmtpYz&L_)oXP&$Q|W= zx*l3h{7Vw2^&XcWAXNfXvfw*fQ?TJX-I`8$rG9{zK ze6N(JTLZQG3jcy~=s5j0uhG$OwDoBGYrEg*0781Rr`@d(tzU8)*XZ1wrKp#*=hR7b zYYy|gATOPoFtoQGz|N6-`nlu-ergPuK(BBsZOc=QR&UOm*olANpU^!>m`2M2F!`uT50!2kLJiedW!=Igh@$ZSqDEN_}8x znXPr7v8`#n`pMAsBj8^t(b2K?wWZ8!{LQrzb8bLz!PG>C4(rLQP}#?fQr1Zh36&&g z{2huwf@79iVgXD2tWb)$_ez4>8N?hV{%%!Px8-)w_O#=JVjurrT^pU@J&SED=Uy59E zt^3sMM$ND2WY%uSAFUHXUq5Ih6L=ozMzF~_P#Dj_FfuBLB0s0$HPmRe=jB3lkh0rd z{a=i|cT`i`7Cmf3x^$2(O*%-G9%+Jf={*V}ReEnxK#(rIH|bqKIwVRJ5Rfi45=!Wy z1_B}Za=rI`!F%v z`O#VK0x-AHv9Yggh{!E;-(B|#{$_HQf43c*1x5HZ#)VvE1UNuNPyCiVxz<#DX4<1)GyA2 z9vb2JLgvTsKPN`Aud(nsU{v>|xLIk;1;;&`l>LAvmh2^!u>jv|9oe<7NnGHoUjm(v zXHc`Zr{oU%nBo(-Nk09Axssm53LXTn-zn_6jwDT`RAcX1*UPfXaay{%5CGraNh9HM z4%1Eb9ZcDPirunK(;1v{x}t#s`hZdq$nk~YDYc9?KPX9tDGcX(`)Oq1j(Ki+^aS-}r!u1N?e0}l;CO#|SYCZ9Dg})WV-qS*A zRM+Vrp}zw|nrzaF9V`Wf)h`9*z81SpVVI_4F%PQwwv?xqW#Mu%P|aOYa*w>q&TjBk znw7;}|I@F*P~==X{O82O^S!*)#=P+0lXoe>hz1-|#vBvHQ?=?U<_D0BRSTHY&Z#I# z==-+yWVlXS;vleZ;s`8^nI_EP=I=s&dfvC8T`nLg3O>JJ0^z2DHZ#IDI8W2s`|m+R z-LuQ@2_9|kKJ*`b&q1&XHnBcU#k4vhAz~NI`=6zPQkAa|+|CkDA_AMo4y-!cPGPZ~ zKX5u)|7$xP$O-)7mX?nPdxmb{HCSY-TO)oG zzFf8jNw{Di&Dz;HQ}0F&uZO?YG1Cg-|8?Ss2y5WFoST6;)?S(m2%{uHX4kItyG_?P zX}#`?G^e!3OWqtfb^V)#Q~Kaggn%xq>qk~~!!J4OETZ3GM|uxpxn#E!LcPgF%L#X@ zh+C9 zjw7^|TI=XUvnE~8Oi_gcT9M~0IzL)3diK>WE(+uan91@A=a0+dStE?O*M6T;>N1-p z?9&|x=?ig_7$?Ia*CF>)lM3LQ{ke759*;y#Ht4N6I~SgE2+&J(5Qe=KxUgnRy2uxH{rr}jsfKy= 
zZRWUZJo&whh69fln;HKX=TDR#2QdCEDMhB~=_$0fGKjN@&-&<}n~u}44JI_O5Gp$_ zI#LlGXZFi?e^UYZ|CC6GGffsIqrj9(IeN15UW*z*T^_^vK*90`j!?01 z%KJL(`*?+tP3I?_vGMbKUw+$$`ECPMzGrO~1N3OEcYEz?IHQ<`pSZ@%YYGEcg#OU; zdd~8`0<0uE7V{j>=ZVWM+zDj(ynIW#4 zk>c4jQdcGOzRrAegLN+U^$M;hiRdpVLceHbL5EKwPGqN5# zNd<|t#N*OR;9W(m==5Sy(A`oV{?t@B>xW$iqE+2Ja?b~M3_md;eAyl5#(U>XK3(fB zV-BYKz5ChA>y0N=I&9{Z=JL6Mtk1jl-Yhlm!bkY{LYe>Oho^!yndq$Aa`@}I!f?Rj z&qq^dCI|VuE2VdAeCb5yrI(}$Iav_LT2Ao=gW>7p83^KgG+eYlNgr<-uD!Dv2>nM~ zL$|tg3A@F9F5md!))?2cTG@T5C&k$+;UR;G0cSxANR?fV-5)=Udg!?Sxr!$MYZjEi zL!`h-5fuS_^nHG?H#v3#w!rS!Jk(XgyAJ9ofgqc>H$D6(pKDtE=K5K!m-9*Aes@Y zfmxLe8YHwb%o~T%ICg3lNC!7Ir&i7=*9FQ)qb*eK*OY3-rvcruC)V5EUvNgS`a2aS zmLN0%$62cOT3Q$J#L;jQH-p^8`jXe>YQ}T^I>2$J$u_2S(xgDd4@Qp9(oI#*k|8xP z@U;E7s(QZND`=v#GjR%t?V6nXpHu)KkD%=lLqpGyd+n+6K|&kJ@)b!EeG8 z|G@rg7)>zkL(S6#3+oz#1HKTHJyHJ+s(r_)gTmCq1d~I)SN-3!+FT|MNiuCz!w@vrkn6v!xYwWn%u)d;!g)^(%kW zwt}){hffbDqWj2wRVZ znML0o8SABcr@>of>?<;7{av(lYN#g)P7b>N7IvS7ugSKmXO>i=CC((1p;A=N^i!(u zZv2yMqO``|SY$sUHv*CED&EgOe&UAhlYY9x)#6gq_@;4qHk!eJ!!9yXpU*MbtAc7$ zyvSUGJRgeW#`toe?{T`2MYlRx&0`$-NXR%}y2nSl&GkpZFNSFMXT4fM__!v7*M&;) zV%M#WM%#+yTrVttv~62{qmOOgK4X;&ef1*=uevHH*6uLkmV zvA*AUI&hRCn4UfFb*XfrnwsHiPb+?B%@~UDlL3s#{Lo^B{Vw*qrLlt(E?F2oUk-Hk z!{1o?-;te%I zN^bN(&$++|f)x3O1ngnwebZ#oMUdk~z`m6<(3D1{V?3FA!-2(RE;WBn)z)v(L6z&O z*S5dYc}TcDwE%SpSo=P~@m7y7Ei*gU^zq;QwgMnuMjL9=qV2`GD@*7WTh}?0G|o}$ zJa>IKb_Z7>+-5ntmpf7pzl%^-Tq&vx_m8e4J39)KiVm3Ig`MzT;pO%fC~umXDnN+e zzU^DhQ5F2dc9|=O4$0w2<}e(MG}CPBErDbE@W7j(%q4FM1!I0rB7T_S2ZhzJK0C!p ze@l0U6n~0ftsT{y5h@R;NC|t{KRBr$j;4g(JbT#^Y=on9?O&qWvl?L(2kCDCYQQ2+ zrDlOt4V+EnJ_*AcQa~;jCfUm+v%+KaeXnJU*N1ppnn_Lowpa=~GD51I1_BWZrGPI| zf*&xRqL0}#qw|4Roym_R3^#U&quPVd$#VwzN*pn&^9G7BJ0JV}je1PBPv$Yve(Mmq zp*7T&}dty zwja%Oq+atl&|b^%S+Z;Rse~>@`92pF@Gj83q_S9lY*F$HsA)gT;vm-mJLu?u&A(Th zZDKuMeGQDhhF-&Xah~5v3=1{wxv-{omurc zzjYeU%VlyC<*V{?TFnN1=y1c$SU$ex38C^zmx~%e=Ai*gcio-Y)Phva;Pqlw>3;jX zziIBl;17>vB!>lCmwLL-mj~aK7nIE$2d_GF2V5~e{MFbHSEbDPT4xhHLi{78mhjmG 
z3*`*Acirq|W2JZ=tr?yN?vbnA}#14-gXYqrSIyK#d(Co6FxEs4;8 zNAO>W02b@la*{7x29Qnmk2;Tf2lgaiu#p$yu@MT^)^eSJjg98=vphhE#2`17SH?co zGy2ulRiJ?Bsvo8Y@D)Z1dJ1|-kejrVv3_16!ZI%HK+Y_tFRXms$aWmt=lG`)XI}=8 zAOv&;rCJ=w5Ix!4sgc{YGr6na(+z{K`reExIanhU`Ce3r1Agc#3ltBh6q&U-ucxSvRSQ>AM1EHISS~kvm8NAt zd6z@w!<$Dg12Fv~ZNZD=nt@GJV2nZKW^)rd|6_Rqzcz*E6X}LaKl&Q_osBBu43GYR z5wnE?kt>PwGy9{p+tj3F`FqIodnFDRw}n=Ci9HzmF6Xph>Vhx5lYjHq8joEVUPXiq zy)4TT6WQM09ePq&j-P%y3)?(c!U%=Wgp=k69eL(^t&oRbpM64my@7llt+2!r6RfY^ zQPh5)O%=FnC(l2uH5KKV`xR3#Ah9i_TFRm!n&meLY^~p?Rl6rQmJsBsg6Cv5P!^x# zKjb6)@9)`(8p_u8PYRULu-J1_lbLH<8W^Z)Y`$P-+aXsu)!Sw`^a=EQ}7 zWsHwXNnFy;N289Jk*Rj%rT)QW4fHY8)CoU5iZwoq#*agd$RBq~l~-e&^~&;SOQtFX zSYB&XLE33GY3Qp6`O*xvZW|8k$^E?(3bLVFj)x0#;B>d(Yv;XSw8sJquBVraSW)<) zYQb1zwnBiTg%2P9EwA)(dNuT{9{>B-sCQj`s=!hbnUn0X;NS8^f~C~mTy#@^ZfN`w zWckNDW{dhU)FZV!s{E$=nT-9L_PFL8Av`?99k@gdZ9uS`2|>vNf4bYD%KZMSd6B_` zVqxnsR6WnQkWUiHqYOz1gK)y+S?Vt2^|&6qLx@L=@(7ZXy&{n5V=o~q2o|x@Q4Fa! zwgd&FX1&fi8bS&HsqY3kTY4PmTl;P>q~D8pN<1^tC1*15aB)vgu7BmW0aKn+lZOw$ z1lT2KCn+Frf@&K35`)Tpa(z2Io?(1Jrtb`=B0tG>_9=9d|9tr3)mXl=!Jdrx(0BhF z((1r~2`>KF-JgdiYN)`*MPNt`)}NwAL_}3~2dF2?7Dq&h>5ctqT$*=9g$%j-#>jl1 zsiK%D>q+fq46DW@5>ah$={crveb(heBIseK9c7isWEWgDnt%JRON?%*I2T4Gq1RQoL~J<^n1Dl;l8I_x3&X<5{#Ro49Ey#BJj5qJgd55a*SbE zrPZ^tLMNq1%|svSFEmVrW9iRda`|`=>iQ$r?);Gg1T>*5%RJW-6LDW^iVwUbjO&(% zJ4Q2p0;D1?e=f>WhLu=-IL5n4L5SkKgZ&iqA&klpFN4ZrB+7!EtBj3TKBjOb)sYVI z#4}ocU@(Ufr>GhiN)VIy81>RqUAMU{OMwEYV!BY!5Li+E`Fq;Ywq(>`cHzS6`Kgr9 zg6MZ2lNuq0lp7T$_u^YFhB!Eunrw5sqsqQZ_+OvJtppDKF49A1vgK_=navfbx?nwa z0Ll*2%7@qd-JX9@OWx9^viwIG4oEE)gc$rH^QM6yQjN~zQ!RAM< zg=uU0_oq4EH&3Q2jK0gS2l^TLy?ua_9)+-)lg9{aRJkzgdZ}T40TX-`* ztz~`jU|Otb!kxPXTi)pOBURlL?2OG3#x1Jx95JT(6LPW@jJ6?LwVjL9u0UPTQSAKZ zm$wH)M-l()szT7{uaVCj&IWwb{1LD8K_toMPU26074?@Sx1{0&T}Gc@+!IGuWHo+z zsYtD8M4|LUgUHfDW4_^tR|rcr1Bz*seJUfPmb{HMrxXn$&1y~<2v}(T{I`05ZWMHC zYlGCypM%cIpY0Qy)M3(K`NhIH4%x4#nbll2DE|`Ne?H_B(PTe=A|{kCm;07jo;Q~7 z1IO8Vy^@FVx2Vl5m;oy9&wM2PMr-zP-r^u?R*-{9N|eg=)()wpYiDx&V%?pYPpyH5 
zN-vuEM^E*8;v=Nk=xn;`BPB)I3n|Aqg-Y1k!y+2=^#sG-RhQF5;o48I^XHzVplV@7 za*&O}y}Jw&Ej6q!raHiF;NEZg9JkwtsaQ<4>TrB07rtHrF%QxmMJ7Ur_dmZ?3 zx?wY$vsaO*iwu?x{>O;5qwtjVfDTX6?f0i^XXOqTBC68c`}GQ#C;IIjX`b&Aj&b9w z(eyYbx?(qXF{ITOrl8b)U^;0+Y@tVkke?C+;$&V%8|<`M=d8iWY078pWK>hqt*)%A z!CrmA1r=7^ zQcS_n8WjHKDH&pQHOS4z+n+TI2d@RpHVJL|)Prv4)pLWe#mpl{&;t5%&_6ieABFz^ z_6>#rEKz)hUjEk8Eru`&->JqD!hq<$NEEB@T$MB3Ge}oFu@BzLiPUr{AR78w+XZ! zdnsHshodi6o-c?R5v+Yx99p`j7~{~1~oemJ|`(KEf1`+Gu92q#VgJOx*LEHYHOwhfE%phuTj}Y|4p%#8US(|EgKrr+`=XlB6h&8; zy50%L97U9zI&`FGHB7A-tA{oBcV4dTJgty;-%v^0`i6D08n7__-PCLB(r-ez4v-cD z@>~Rv+7*e!CzS9jD8%iF$gerVHy*m2N~NFLbFULC>1uBwMk{7Y+_MFHzH@tA&ET6|e(Y1fr9Th5v8JMfngK2-ZGS+Dg+H>PYJM2g6p`?B1huV_in7|Kv%ukcq90QMXX43~OIr_LrG;A!=o)^|^bV z*o5uR=^S^XN?Fj+^5?{#8g$)ob&*ZAk<#0is>rX7_rI#z0x1CyL^yXsy`LT z_zh6tz)9ZXQ`EvJ&J29)W>NXX_$G0|;J};ln-5|YJH93TbPguk^PxFtk_t?KrqrFyN%SnWI?0$i`7dH^omL{}!c{3}RRDj6Wd2V-9aHJ)-WJkpyO4nUIWA40HbVAk@<7If zjc^}J+F(Rlca(kGS`o#=*XCXb(8j%=KG&?rR)T;h)oEf1X;DcEcW{ZhDWod#+tdB% za}az@bhV3vVWdVEj47yGgOt9xcMy_!<#sJPy6tg;A>KMaWZtf8JKl|NM`&hUsT=!m z11M;NZ8{E)l5-ke{?MI5_l7i_c>T7nw)QayyMkzNnHNnLv>3ZQ+eNJbFmC0289j6C z7{zE2JR%+Cl^9b3-YFdkqc>Q#uSks@lEUTGN>~NvEtA1c==TRAvgDU<{uBu z@?F0%HFVUsQaVB~dg@bxAKN#doty43Bqh|SdQ-EfDvGGCTpDZ)=| zH$fXBO|DZT=dgf)PPYcRoZIXIf8L2gx+(=7nWl=X5DGjtEiMuL<{7v6rB zr6RATB`GaFDHDnc`QdD7u$D;hGVS1)e?|Eq#N#@H@buL^_+}lBnF49G-qikDZEw>t zS~Rigu?y97u+6BG%i~i315o<~E$T~>VAH^KDDqL@m!V?(*D;o_#u|L>$mm>HzQ>fM^ZrF)>p;x(6TJQ)yWofRSoPhDE7q{GUH=dJeBwqX zW}y*TWsy%KJ!EHd8v`q%P9%i_VAnuqcBOPAJoBJS9@Ay-bowZ9<;ses`#fsZ(sXw< zj9B>hs9ONB^K9MghNXYSPcSLRG%zWGm-AoDvsqU9qF1Fo75m}u(?a)q2#AkR1$?V0 zXDbH54s2aZWU4Fcp`OFaT1^%6H5Y1$3o+##TpQ=p>l^%vnGYA^@c> z(*Sg|K9j_eI|s6eUcZiM(C*$BDGaQThv4@N^VJ0)ei52RNUXZ9cqhiC|0^RO{r->V zmbIoq54ZhOdb(L7qpXTy%)y{ylXHBBiD)J zbOS-g^s3<2Z+->f3 zzem8_71c#ZdA!#VOqLb-_Q0Ft!BKatpA4^1XInWzGsRt-XnNdKn^-ShZdoDr_>!13 z3egbE-P4G{i=v!;SF?~DzN^$jPQhe*2kGw2gmkG z#YrAx12^-#-+bP^)Q#~p9!ZRPoi8>&JQ7L8etIf($#^Ec3$WDTgAMQM(tNX167(9Z 
z!3I9eZE}4(GRDYP4|spUQJQ~>q$5*zD0#9V)FH@yPtjyCvMv6|#R zHmu>(lU%7t1$vFI9?|>mExbnl^qn*G0S8cQLW832@eaXw8nao^rZn4sW zEWM83>*o&Jbc}{>o>9uOL|7`y`;+ETINi(tIb@Zjk{dPPbMuU0_V(d~ZonSTB6}t1 z1m29$)4yFQ{`{wb7n%YIqLzyZRPFjyHIo}@58sxW{FrL&T`KTChPLP)UznfAbwVRi z-x^RnbySn*B?(aRd9Xx_sxB1imrPk9KrU)FFQ89bR}*(2;H%EXm3Gd%)vV+A%iD zPA;L&JvCdm>uo~ntbJXk#R{q4SaX*_pZsQq@}JJMfr@9QqH6L+sSwHX9z?x)3;J zDG0zLY^P%ywu9caZo{d6XF zx88E~r$ekPD~s(QrqT>d^d&n#+876&_w+KO;`a*v$DV zHkWpdcl3EEZd6=;YN|i&<&PjTi&^BT8J6gad46SOR~x1#`m>WG*`nm>cohhqTJ%mv zxILK5EpYzsm>Cpk4RX35M6_(|LESf4Z!}UaJgXVHwLAgoM#va$ut)VuD zlLvsSPiF!^{Q@J5`9l{A==eC;ojJpepRum6C@Fa=z#qM#jYM=66a>Skv=W%_zFT=3 zUR~D}0}Md~Ri2+xZ|GuyTj&1@U;#!FZ$dXtpof;Eq~oXJ4_TgaMLRnBzb@xp)DNUw z0|tAJ*!T=Uo)DO4O!MZ_n$4|ip7NX32b(jUxna7Hy`jRY?D`K+#hF3g%%CInpPpIb z`q2Xyl!m<`3@K+KwAwZzcX$Eif!+_&?wZ@7l+7zA;s`E&5pW5nB!KurWiYmz1rz(g zoU}&YVEle9BS>lwb~v8f3iP_1je$Egc1*)>0_E+(kvdFwou&6@#Qn>qS?xfYiS>}h z%IRz5!OBQ+qP#}+KC0Cs3e^cZu=Olm+o2ltdIG&0JdmkHhSOM<};XT4{-*jvWTVP$0=Wj2` znQC$N7WN5=Qx_bj!My6D9s3=5rkdEra(Pf9qpl<`h-F1gQQoV@YJTuZ@xa(^1Ff-j65ST{ zj54!_E~fJ%vBE&@?sSiFj^Z!xe3a!~7Nx%9imcB*AwZ91v+JWse{F`RV5#VRM4H^W zxP=CgIhX2g5c96_sn}1c;c2Y|WgLg2+J1Z$H~!`jA;EuXI=TC#(AY1x)sc7PF5?pF zJC7_<>yhZh>zPRzxEsSdY&5)vX@~cCN&lChe|CYKRL_NX^)Kv`%zQ^Nx%?fq?@Too z6C)!9-H}hVFSZjzJdI`roZVLL_JgPLApaj5^Tb zx}0C|DSBi)Jeis~g}Ci{sb=cDZ6At<=#>(f>PEA9WdIb!3_}=3KpneKG#-p~DdY=0gqeMcI@=Qd9}y{IW`OQh^OV#ns@xBW_*fBg|9 zFYtv-ERG0cV8Wi~^5=&*!^8BW29L5FY3zw+kd(7kxF2>&#+cNO?G0R+!P>@TQ2$8c zPPC(rNQv=I8ua~^gGWRY!*)Hf)q|@>qt~YS#>qH=Y>R#$J?M2=wRBaTl4@tPL4j-yTe5!6yIWD!CRF*6eACa+JEb# zxU>^*4jj>bmsYw>kITAx(ca%_h2M1$Fq)h-dKd{C{z}-buQRC|#wd#|`Jb~L_`l#J zA(o4(54gW-b6!M`BztVu^ywWrgR6`6;^3x)5pW zl>|R3r^d_}pTa52{b1l`%NwS``#!2K;TT@)xClPN>yJM2-!_Qb_(f-;OT)C3YPn9A~w#Q(9wYC;v#k&XE zN!zB~lNIj;_fFL$UPXEJT+WcL1TQ~b@weg$SNDmsJJXP^L3r86zfwwMj+%QbsrdU^IR)q3i-#%UL=p7FHc=aGK$( zgJ6uh&T8FqtRyR4Ymaps;=B={N~KNszh+3py<}$UH-B%VEDj9{rIPilE;J}&KOHp@ zG`v#_&cDyvR!PUl+8e|ddZcG@)ZdtPhbu1scTBJ^ZLK+tMlj}?>0`m1AE^j(297id 
zWfUI=0U8(F%e_$%8P)Ez<&V8VC`7$1)s6~#mL-;?nX z9}k`~z3hnfd${7$JUb3M7P17%5EkU|tw-c<;3S|(PS}{PxPz`YXr2yOpnvCc1n*dM zwuM|hTJf@kEF1f>WUh}zsX4Y>*X#ngD@CGdQ8|vAE+CgMx23?h%s~&stUpe5f)*x*X#M1*hRF`(_1XK$w92s; z1}_3_Qlk);w5=h#R2AcjFNdB0ghPj71F&vg1y6N#2P0)xj^?bej?Ji0?7xVCV|oq^ z0%uFgz6DTXwdDSaxlF&@(`-HS%uBzq0;$ z++c@0v#&n&u^qus1sXLRiu|Ds!KHE=J07p@3XaqdRjf0Ug7tLxC5wf+aGFNw!^%F^ z3QAUgfPAPAP&+o-TXO6t?1D@p>GHnLpmVdwvjuEyP4UFfUw$4u&tx#ki0U)JRUpym z;inH8TmXIA3vpb(`iH7Wt$@Q^D@@IM=5ld#uf?w46Xc9!Igvq0fRq4tE>f-Cbt{HPx75S=4=A**Fwe~LbS9nKYXf-z?j@&xXi z`1y~}Mati8jVC_3&FG24OVFz9Do=2n?5u73Ra(oeHJ`X$VZ4WBi+j?K`jL55OV|sk ztEloYVea3=aoS(nt}!pkS5-4=UKBpgnL6jekHT#cFvn6uNT$RHLvC|fYelhlbJUm+ zh;id^;^AYgxVVr%Ms=iOX1Qip7aoDJ9vtS9SHEjJ^(GYk`w@G-Gp@3D0emyItBXkE zP_sa9;ayOs?zh?q8pQ+J=T7HBm%x(3*Q5X<~jMyVz}Ak zM!&SLy?f4dm~3Bf^UMu*dAEA^b%zv-1KFP1^LSBlJ=^_Z@W;>i8FfGNewbm5H3MGE zNvUx!bgjo2SvI_Jtmw4C7Peu=XPV9rNua19HkNsbZ%yOCNLpePOl^BA03BddR-SPD zQx_Z08hj?~IIeX|sdxgXa%0_p1-ia=`Sp2~FcH!|3?WDSn*e(OD|leO-+_Z6gIwur zYot&2$Ja`M7~bFFcG7X?`kd-o7tQOcp9JgB%|0Vk7AkUk z?0Gn(ix-gzUoRK!i-{WtoNSeEE6hO1pK?}SRC;w7x>cfK${|&$b~FTJBx^yHhM~Wx zPi%2c5RmJq#TkB(-DS%w7##};6pxGK-+_8b7-_U(rj`~rK8yZBfvL1d# z;=d*L_B|+5!2F(TIm&j5dmPfFGQm z$;=8O?E3X|2?-2^3#WUvM#L1BU%QCf+cfrjF9IZ3E2LlU$FSr#+kGH)4m4{nQPf=) zng22G_bMCTFX80tn8!fbf>dE^QbVX}c$y}o|NEnD+Ms)*k0d8#%7ht){gK%ULJQ*G zJ7~H|NM|Wt;L!bdNshjIS+F8`Dw0IW*h^gJXH+lT{tfeb&CGR;^@4Kopon}*lbe4e zzHj`=*R?fVSyI7&dOB^|**LaSWNnx>w4E!j_WHi`La@dnr2ODlM_mCN*cH}1Vsow4 z+?L)dwx_j(z`E zWC#APp89B(Xm#FHLP8KqH{Q=jHm=WFc7spkw_NR$@5+7zzK!}{1J2BH4Vjgs{4N9v z`|N`~o;@4K+E}Y7-w3u_@YpSM&N}s%j0`E|Ri(3*lfjp_O69mQcsHC+k~3B6x`SQ|Kl!T9D1!n z8wvyIB}n!eSgNBDCA+r{pPu+64?43lA<1AX7&B`HkQqGox^nzNQF|dL2(FHc`>0Pr zh|M(&em#fXUcqb@w1-Z7R7sO#VP%kgNXn5GF{874#Ql~Sto~h-wtQSBKmQd2&d*9{Ce39m~FU#qrQ29t(GwD1vp zfUUJQ4Ol~yq`GW~F5kQNYCFs&S3V)xRF$|xI_TMH7;4lvxJW_cNk>UYy7~`a5j%ac zBX6Ll_6xJ-cDO8zA+%K^i$s|X%#K>Lt(v(0HVNIht8;#x_*Vx0UnE%n`a^7=Ne_C9 zf7Rn1g(z_D^TxW2k$_<(D@rayE z%5#SX7>XI{Ak*XbJoZwChmt3(c}?!<$mQXN=#clDKQ(#)yj9)>Dl$|M?D@FhazWvB 
zwlt4API{p?Kn_8JSPg@BQF_ob3v+d_U6|yRb#8yeOA{sDo?Lpihhe{@y!FnYq02y& z;;@e~#}qRh|DUG~*avTFG0FGI1JLZP!6$AGqgppV+9foNuk?{3y5B)BYoVX4t24&3 zg8w!DPC4NphkO~KBSpkebAq+!r`p<{3~t@Xx0xS?m9oN1pFg|h84TpRx$a?Oso+aO zH)j&9xUX*|6spzFb8LLczH#YD3ZL3@Fr2J;#j$x=dG(;;wh1rDRN+#nhMS84&HYPl z^9&c|dQSMS{4{ipju>R^vQ^W|Y*W znMRm~6hthu^hV$E{jHRP{eilnrly9f$DTos{_?8D(Y#m*0?Z~&)-r&t`M{v7r!B57 zeGi;eRb_0abMX-}w`yZEZtUnCS$Vu+@sZ$E`Baq5w(~8(&?>X8?ngjU%b)6T`#k5( z25tfhRg7&;aSO!~w3Cf=2-uDe#QGobuZ#wahe{{g^`8ehNgQ;+#CXlS`8V97tRnlwh9LBfodX&tD0Q5 zXZW->&X3sgr#?bHE48w2%IATJUM+{_3X9mRw0{mSu}_Eb&5*%G(Ye<)}($pcGSAax!){h1l9D*DFIIX+8*aH zg1Z&80U|Lz@yMR_ng?{$0C@pFuK#;C}jr=uHQUlkYMjOu|fZ6*SmbAnh| zGzpXKd}iN(BjA`H!VMdf86Q;*yxa-0-_7~oE}WqIR{ZKM`!}yNltfu6~8%IpzfkL6Bf*hI_^p zphM8aB~({*5rK|4=Y_EO=&9JOJX4?n%9L@B(Y@QuYp(F9B)?T$kr;rZTGY&z_u{K2 ztkI3Lavj{HJf>WU+K73J+EEY(o&!$kzSO6APo3Z7^JkspK>|rhwKu1Z)Ux5!aHg?R zyMp;kx*4a_uQf)AS+g~&o0-q^N*MSL?mPZxYh8_Qk`!D{m!paSXypSaxLe9Zird9} zanKFy%;6R?<77-a^L)bpxnK#!Et|FPW{QjbA8rXBby3Y_`- zJQ)8B9g`=Mlj+5wVL7@WA65?A{X0oBLBE2(5eHz-5UVv_bsru-JkqycG{?kGXz!mx@(QS#Ph)ZaUJ=upN<oM%;b9cR1-0d(>FexM2ND>hWgF~(8#6L z@;_iw(i~^Mu$ZjEDY$RufObMsC!!^R?H-wApx1P*%U-EA>iUHffbV$Y;Cz1I5wU

ly&KJo^WZ67#?l!0#=f|@Be2X ziKEzmI_Wf>PZW4ALeP-Cw)3kEnOh;(w=IMACC~>5^gR>c^{ru(v)siHOkR+FwkaBv z1pSwmzpzY!lr-3+5;}4NA`$8MUAlk-@PN4bNeR<)mDsu& z@w*Ykl(Lpo@n%LJKZUoD3$P?_@0~$poZhJbt4=z0jQMtNeAxPgeE<1Pf0_biORWlM z0)gEGID%|!!pK6lu&c_fmmxPUFRxJJtmp@<7hkcc^$;GGa4HBL^|!&u*N!C=T)rI@ zCmR>B(D(>@v; zY^rOGZsl77b7B}#w>&B4Ub!34-kjCz)Smw z54oQdNf!qn$>O)6#$JY^;BzPZZkFSlq}{6|JIo&kjZ?Rka0f0DhH#E3%U@-;%ER|5 zdEznpbP?* zZ5(VD>RiMBSIox+eo$Zt!t3chg@HWi)h$2lVXS!hV6U06aw1RlY|Umoj!Vmm{*C_^ z%U9s@73)9$^xtL@bd!X?eRO42+ETL=x&zWA!6*C%eR+B#`e`7B>$d^i$$ZQJ=V~IZ zfYlh`cuTh3clRN0rg?~q4Lh?Lo_KX1KN zu?$w#Wx_L!jeH4)fFBaFm6TN_&N;ps8_pt^?NLFXK-TkR5bKSQkPt0h-2#z6%|N<*dGT}RExi|J)ZqB_AZT2}MWs9SBH^JTsk!4zTQz}utBUvm7Y*P#8s+-cq}{dMMI z9Zwe9*UHJ_Pv)?n3xv@lq4(m%N1R>W*Ua{sE!O?6%7Dab(^+Y!&nR~g#f0G_Y{=bM z%%Z#dG|3fvKjNkmr(jnyiFA89Y2I`x_I*WUYmZx6f1a+o7LiLkLx8W%!WsK?ZFPJ@ow@*sI zppxAy?%LfDb$|pdVZ0lCPv1ItE%Er@DgXq{XQ1kdZuelZ_A8T&Z4{ z2yco8nVSLmB_^`6`ggC!d?wl(8$SGPIxGh?ZSnqaetlVgGCv*~X#%;N5od_EVXGg$ zE&K3dAB%ZV-?qw)p1>~Q%+{wa*!u*D!C&}!2SH$%G@D8X>ONCl#BcYW8#l=}pP35< zPiwHVE41jJv5_zTc|4rguRHq%{SuD~6iy%1qo4@bH~^AzyLWW5cLL%c-ASLR<4TQB`XimQ9`QP7}Ibr z2}l`nQvK&cT?j^aMUp9uC4w;?7)&qZ$2k^NTqF|Rw@BnsW7?40+I1!vj1()_l!u^M z7`jPmG5=cn>CVt;YerPUM(~tApEH3Yv8cAU!jm7skz7VJMtG#e7pKBP)#Qwl4%_8D z>ZgP^XN3l$OKhTf%!gk1m{E`x)2M4ama~{j)#Y`nbTEB>W?&@bTd?kIsv{nfu95BK z0-_zqxM7B_RJmh^$j{eF-9V(4o+zl;-VDb1))g?4&M0B?J`d!*j

ucL1c~N8mLEP;k9C7e4s(OQ6d;yGhQIq) zhts_e)MaH9_Gd8x@8!8HA(=!?dT+yeRYN-HkyTQ{8+0xk#BkO(#Hk|afT(-s*|^?R=8dj6SKdqyhF4KV4 z9>%O2n07j!Z0+x{AYWtcBVldc#IYT&LmC)J18Ke7>?0UPJ|2t9=&>;3Q>URtEWW+U zf~mtN!`vJRwsDyXomDRkPv}RL4NNSD@4G_2kQT~1l&^{VTf_pHvo{3>1`aQeikBh= z{i_s+PT`D5f+(1q-l$aAamH^QvC%-#MpFwS>nro&TfJ9lb`0j6VZMz&|GnFOo%_{d zFP{Gq^1+!BK32A@sM+-;f^icOJ)f(4@qFp(_>BdrF5$u*hX~dG2;b%SX$5p93zsDa zRt#oRmiAA+Hr4l9V6E)_iRR7PszY0H{m)X&SZ93Labst1Mv~ISs`Zk2;|JTMOP%ux zFE*ITzVd!q)xet===rhUhpM?)Q!#rw$UL;ZP~*pW^WAvOe|Lt~>ofZJMJ8(hKogfI zBF#TaB}feVV`q!VcfOv_F{C8r7<;u7!J|<)*RudU8vW#1z1RQ7j{|0mA}mIByP2A5 zT3!;++rfi33wVT zWXXShV@H2~!e4(`?d!Tm8_0O`@N9$8|Rf!@%!qU;O4Vi zRP3t@?MsUnt*E7Ju}oVT>fTUQmAjfPLtTs2{ibQ*I(b#?>V52~L)1HaeQkD1mdWQ- z?7F=*`|r0}$1e5Csr60fvLzWTezcuTt5zx0c%-}4uXxMDdy&23ZIm}6RVk~dprf#M zeBg_4`ZpcFYhGH@>Sofbm%n?)5$|Ht-fADsQ`!p3qZ&@%El@JHsoSEDTF(31DMY`%BPA3=@4Md)8f{y+KGXpNhj z587FOGka@mxogYqXjbxV`-o3yYdzt}oST(bJHgkGZ${wN0k`9iUR!vrNpxpyb`9aT zY5LT5gkkEk@bw+!hy8qHlX95$>XBbM+F?&)mOU?^G#M`lSy?6OL9DZWf4cEkG^ZWp zz1TyfwxLmavE)+Y($U_w^(hJy;#`};ONU9kUlyjiG#&@}C(@Uri_TN%A_l#d>0(x& ze;4eTdv}w0TFWOnra9tQ@^bCEWcsDRH|)g*+SIY}~i; z!~UL+g26;yXyci%P#4LAkK35L4Y<8cj(j;v{gK?pIu#Z1G`i()DFKnkPxxKZuivhv zO?9g2e@~RItI5~w8)_Ut298*g9X7n^7_1;4?a*Jc9EsB|o78j~OA5zbwS2I5R#>u2 zfHOC9@(ER1>lcyWR`zP!JsH{r{V2ow^IdV<-Cl`IoGdP;vDwv zjlUbPaN=%b<2$SS)rVi+J?Cy3kmwi?kIT*3bZ>JVoj18TY;Kr%#N(5qG)@@OKnazs z8gT%0<0n+N@2>Z~;Jy*9L$oUh{laxdL@>5i$(<&fX%_u@P(pV-cEy#txb3WW_WY?+ z#1Y$7CK0IBX$4=(ewfqnx0W!Da2JVSq*we?Pdh<3P(OS))o)4~_lJxD+KxonBP~7s zI<`79{YO;Hr|)mf8Z(9Y`edE*Y#${XTyl@&?-jj&+tT?%!PCtZ8a9Blbu7`Q_#?wM=;l+T}^yMNZs=ze&z@>VpV|J32N37uw9Gyp{i~>t*hwh?*2X%ZWE_(KLcj+O=1QW{(P4xFt7SkrNcpC9%~yR~#=&%R<))zFpi!|f75RU(0J^{)Z%kRP%aG*MNoVd4 zgH&T?nQ^-ZvMjcZZ_IOvI6O0Zy>2y1`b=d24rX*Nc`6MOGTuaoqyrCC$ zMnSH2a$4Y6Bfp%&xr?JI%8N|?gUEM+66J1@?NM3(nHFz~DhWMa?!OK8hcm9c(_6>< zEB61N@-6ri*ex23@#kl+7P15|eam9~ebc;STZD6t>~-y{aTm#*5b|9zdNoltG~D~~ z?U6vto*Ug+8^<#49xOik`A~t$YV?w4RNbyCG27%WMGd^WywNDiY7X&Gr8J*%{n1S(NatM*AMpB`f(J^l&VkPMi_)*dci*I}f@x 
zT=C;=fY%FU>^ih65<-wW*xU9x?#{swrO$6phLYP4m*!qnk}a(c#bz`~Eml#?s%bgU6kCz>i@q@Qn+RdaLgdK> zwUp+Gy?3vby?a}IN4M_&jrVpJAul3B;uOxQq?`ehq)wb@xDd2x@xYh7EO(fpF_hBw)7da=~Myy|G*UjCX z1A4^Zh9mh4*Y4i0^E$Hs!qJoBXFh$n9ZNhTWhL|Qwn~9Px22^?CZwc%?6vK7EP7lt^f;xPzQ0n-^1c|?%I=%J zwOzKs8Cxg6ob{6X`0h@UNk(-7TfxplMO?)7Uli&u0AejUUXlBJCSA-K_iAO++mwGO zpP~Fb6XLqJ=)R0!ZtvHA+nzahTXlDzS#^0aJ~WK=@nwrnFQj^Yh&~E)?KoxHJT`Vf zJ0$pSzMrzRVeZ4;W`k_wlDHzf{%;SR^L~>3%8~ymQT1;ox5n81+3=VcDU4fI41>Qm z{{wee&?0xU3`wgiRY@k@`19V4gN8_p@l0Qhqy7 z`Z4h-Q*tW3Q2zeCVdT((H+}p=#`>cZ8YiU$GB^uf$z_drW_p{fhw_#sviX=WUyfW@ z-9?e_)c%F^CERPJi`&MPPO7BEE{ds)&i$Ch-MJL=<}UQpB}iyWx<%NDGFVEj`Y%=f zA9d>g{|1QV$YVEOh_|zmaJY<47d`axb?D~@_UGldJk)M14|>=4QhnJ@?%}nk^;^VX+(G8Egq;327uD`)wvAp_>eo;B6 z9Jp7vC6U`Xl5_)95xVRZ>H@-VI`zzC^y3Q&RQ5@K5n(pbFh04xktFALaWH}RT}-a~ zLsxFxxf)t}`Bq>9 zPwv;R0!F)oF1u{Jv)Sb4-g=%`MJ6x4M%aoxZSz86wB_{*={N&bsf-6i__HzG_f@-3*DqM zR~w}G$Bu_0)bT2yL92m(B?jDN3UYA=^z7SsnX`CyqD_&SZUY*%+^1jOY^1 zWQ?8$^Z)P?(9|DNXN1w!zu%?vo%40-Q?t>JpsjCi^1lb6E%+^FigUo6+Pk%_1$%1u z#t#AGq3S!Of_n~FDGI)QC%xfuPmpa-XRncuiT$Uio0&(QP3B)w@{Vije|z=HnyO57 z5pUH#$vFmIPo>uCJ=(_(7hX=>)_;I%d z%XVzIY$>nc=b77Pgtyz$mAhHv>kBU7bwf9QQO?WB#-0oiQqSL%xMHN4v^?xHa!dHJ zc%I-Kjtl!DkmaL~tD2=<{P5}ZsdFNO8Bh$#@S+26Vqfsf>%w!`jHfKiNEMNQ81QqY zet9D>PNE95ix3_N7GHeIFuitUsO*Sym-Y5S>uJy>Q_bbd>2MVem&<9Xz{ zfel7x-j9lqco|%I#WcS!nszjM`oR%2Wk#)d`q>ZgmqF#GN^J^xw# z0goYvaIfxkojQbj|5F^8oxR9=S$KWaS9YiJNZhJVN6ci(jKZYhtf5@)$KDpdZRy7egy4haOYmx#nXs3ksf zybgn$E*|lCe`s`hnI{EQQpPD}q=j?`V-6B5zMMo+OV7g|dhR{fT4#-A-^(^B`<6%4 zjz3jnv*aatj$~i(3kl`9VjVFtof_TXlv{QzPW$Ivra_i-y>L8vuO8%M4kcRgym$Dv zt(sm$FkAqE+Iz^KV{~hS#k2eKefg}~l7b*cNdI_)xYJ2ad$X|gGDKqTqz^fPTI9yM z&vK5FzqkTe{SL2_8RY23<)4qOpv#*JtlDdgNi0JSwXNcnz!p()@w)hG&ZGJK#*_qD zKwt;IUZVzkJ}~@L`PW&nSVnP_fWuOGRlDS$E4xxAs+Rl}Q9Ef50>;H|QW{jM-h=dH z&*q7jJZaHUV}umgh(V1Gh@E+Qyd$cFF|rg8EmFO#8H9duHfkx1rRb!Ws6y;UoSFjp zC*UQpp?J_XtIMhF-7}G@>3S6Q1I!pigDk@dRAhS*mUp}Cyk58bJ7Uu@C3d&9H4U@2 zRKtz=j#g7;3_Pgg;{}R#eJ43^7A|Nmw<32Lk~Yl#u}%c)60BdB&^P7f`BWDRh1%c= 
zkpdP!p1lZdkgQ_ri0<7h#UUjryShxgl$<~%R?xd~OP(K7v`uwsZor!4za1$0v)nlK zX1?R^NenAPu;D{ysddeW*T|fi(E$GAmD5h6CP{wHQl*}qhumON*CTD0Hf4OBW5p7B z|EO-OttNy_d(JyR}icvesblwI{-K^k?N% zxZ1tjaN`r#%HsjKufT=v|V*#z9!9U9jeS41duaTFvgf_dlKmzC&Zr*wpHEXv5*d zhWx{%dF29^UnE2(03t``n_M(&jFW9#uU=PIo4ePJUc5Fgpf=T{*YMjKZ#sZCd=|vJ zRAZB#Y&d{SW-w@2F4R;T3Xs&ynBmBP;G2PFUXdUP{S!41Y` z$v9Tg3@e$fPN(LhLm#wN(w2i2D>q7i+J6K#E_ah^gS+DM$Wf}oh*LwvEeTeH^&J>j zVJnp-VD| zMpd)Dcgrk-a&~(1m)I+}VZr+W?|4(x^v;}l&b5x+AY$Tdp56EX%4#qg_auUSx=Q0O zP-GJqv@h%oR-*n;w4yn|B%|sTVf$}0Y5yoqn6<`g#{hQ;tK>T$c!7 z2>jK4H9l2)B3GLdIieOhzIz>EY9zV;NeF^pUpeiH)=hf%+$u%8Mc-s$(|x)3yFC~O zSAx?mQ5sN}j$NIp1yc7p)ZB{Yl=aH2zrNrex;&3$efJth5$0o~EtmtM)$Eb*<NFG1hF^Sm(Qeq z0pj}`HgZcm#hci&>uNwwt$RN%q+eovwiYPp1hy&;Z3Zi{z+@kEzuHf^+GDgoX}=qX z^z-NOo$6~h(JC4z{Tuk7nFeG=FM-g&8?BO=cepz^%{)ueG4QYIe*yaS6WmE7StE*3 z5fKw*?cO^V%h`HCi#aA`g4l|Q?GjIZ9y9e~;!vQJTLTz@ znxE4=JFy(+Y?GoBaoUx@7`+T>iYAN3_7ZKnL#VNmOHl;>1-l?1?(|F*cya4$OvTnB z@@HNsyZ#w_XHL@m9l=c$SZ1Tph zh>!n34at*eP*AZ`Hh>zg4HR=~(AYEz z?Hsa@;73Q6hF#lbCdybz!IyS=tDua>g#U7~KcEqyz)N^$_CFH+$9&+|XS0x{YW8BC zmc|pSzeaGzK`e!qO}wnCVaC9s-k(FkHtKsoPsTxFTFp+?#iz3VOl_%3D#R-?9NDd_ znllTmBHF%(o2`7(WimEGTLOH3Kl*()r^RR9QAR z?oROES5B8)mMxpKknc_xM)WS{^{4p$J?KK`H+TY9Fyr1+SK^5hI%n@plQ>Lqax4TJ z6*?GClfvFgJCAx(%m>$If`E{~KNM%RP4z=&5Ml}~6nW7L@R3$urPdy|^Tg86```H|)gRXiL}hd<0K%NWA&B zeg->GpB|;U`2Ct>mB3l6h&?Ptr$rM9XqbBUo7*)5aH?$bb~X`V{d4OmNh#&5NOwd< zi?IH#KY1z2|ESe337IwYU|uA(xFjV4C5yXRip3UNP+u;77lLWqf#jh9bQBNg8NZ07u;jChT;{sVnxlrX?v&P}%;oULNRHhHN?`t`|Y7rIZO!R-7~({t>^i~kVCqqwf|)WUw1%m zCFjXJswSdhi78C1KuwH?uV6JoBUmz5rZmAEdUxNAM_Gn{X%PxEC8KJUtX@KmI`1Bo zylcdp`D`opsTJDJKHN2Cj!8fq>RqSNq^J?Q(-7i^`V&d> z&RUda-IF^Pyjo3-Tub_p3ZdIyvbp86Mxq#}aHQY8-!vq?a`Naq1z+7x;8PoCJb@jB z9fy6F@UI27cs-z6B*8O!mf0xgA(VOO-WR|^@D;|#g>UOji%#KN)ua3X>rEiSo|F6+)_dIe1Yth z6Yaob#NCL~j-Gx&nXCx1go@w?5gy7Ico?G-=H*Iht;=t$7Azro7@I>I#?ao`0vq(I!HCbC4(bYo^2US3QAde1th!8Ks{}lUU98Qah6ckiV+uYx z*<&z&7SMlN<4ou=_y@hsZ-YR^B1?SPvk{T`)B4_sL6lT8*RF&YoEl9%M;>KyZ6ABa 
zy@#CSsu8~6F|gkhr6@{_?}5&=-tn@#iy#+i?e(hRHGBHIpqT=pU9V)?7-J!M=(D5r zuU3JXN(y= zS%L*Y2+!bFXfl?imIG-D`|LY*!2)<>&xb(3=o6213fHx(1JNvAPN?08lbP^fA z+yAaUj?6kgpW#qX* z(MF7?M1b!?QP5&!WaRj#B|mO7O#6qS_-Z)lB|!-DZrLY4#1l;sG|uUHas6q>;}}Gp zk^u=c9BA#OwsWIsXH?A+p0DaR3xaJD56%-w)eK~vh#SBJjg`*@groF*k2r(!-4LWO ze&o0e+N7f^&bTxV7EKFz@M>&*uP5*9%-5V6ejZPO=dHdewRvi-NcFD6 zsb^7Rfp@Bo1)`;(i)Gdu&wSbdvARc;$bpz2ZK?|>e=BpPpa_^QwHSfJ8U z<*IYXM(*A8bQ`hxqIEL{eG92=7CTK^f;`pD*30`X&?E-R5#x*7C2NaczCt@js~Kpn3}Wm z_*37-dru46)PyP)s*T41n&*f-%#YoK!kf#%RVdb zz~igt&hJOd!pY3@Zy5IKZuKI+l(Y!sI~U<{v09O2!D}5D039wzmJs$*U3{Cm2z!Bu zL!ShVemraJeu?Zw)?8l zGEh&TV7DiQJzTepM%sU`Otj0bX)z0MqcKUvLyqBq^L;O46%QhE0lRy|lN@nYW18iQ z+SX{Lw^wI4?gbuEYSrBKm;7%xp?9}9Pu|knL)W^Thw_dwOAK2yE;{G@))y0hu~&7J zxwD^tmOSKqKlg3k)JU6nDOYq`??8A}00{ zGsz^sxmHGFKXHF*YJkM|o|!+!_E9A%oO8oS2$ms|QxUYufp@rK+APVnO?iNA^98(D zwmXNc+-t855;5bUsowZ5d*W{ICKSfhqx`3&O<~1NHe&JU?r~&D-~1P-k;^xlAvxf@uMESiRQU@J*P6?qOc4_#aFF;`Ag@Q ze>fD^K>pLj)8|n1t3Z9H3kwadB^`_0$3Y(sQkU>w>qJp4z>q=ApKAHrXNJs#U=HdZ zcdE`mEoes`ru!?y6?4(hkBsybR5+uj!zB;hg;M!kI8;J?QF6emukUg61t;9ieLx~b zBSL+BSO~1vyJnH`o}(_@e=}N@`zL)Ws2rwO_j}!1R7wA2-ZDM1(HX$}V<CgucFQ2Z1 zobnKRG!il|Ww5bV-DmFXCubRsEB2cNqOqH3#13Aoh;pUW!PvLR|@tcbTOm&#I?sL$6}1HSBFs4mwxB^`DtiriTz4aKt(@(!BYr{nN)r}{OzilRQ z{&;(_4st8CIg?uqPg^hY>{M~a_>n6K6~$894L;cH-c^rTD(3}j4MLSWd4kgw-)U6U zteMv|tSWv-9jxJS#bbeQ$u~BfG+jBMv0mny$BywtG4$8oMdmabelru~n5+WD>PcxvvJ6 zBTlGYt1jm4)1YPO8RCJAS*0kTFM;6KuWzB02F+l*a9kS4`;5Cq@8m(2meS7SK_EPk zU*#(rC8p~&>Zg=*SFgO4H9#-D?{gH<8PuF zO~s-4GzqVcXABarYy=H6k2xK+g?FhzomV97kVxvuWH!MOXmd1sdywHzDlcCf#J~dW z{4AUuR16!lEf?^tLVI?>0%;Iq2lk)9p4@%MAp6exjsgc$vB6_Zho+teZ21(a1oDiD|T7##YLQY(>-C05HYWYft=AN!&0;x!upu;9o?4GB0!0k>yz7^}{BqaDTRsMEy8}>|07m&SIaD}6=Aey zJ`Z@v>c7VC4)a>?is7L*?yk(ocI;=9jqM*prtJ86!3l2c<%KLsjewHLk<9)B#8^a+OpJo53xq~wAjTbTS#XM2n8TJ_ zH3S!#M!Cw{ZkAyCz{k~X3+*8DH%5VuQ*pQX_5A=mSDgM3vPEQj?tI&33U`OW*8-aH zCpkE!Q!qXYY()6e$mTRXg>!+Kp!*>R-_?nfZZ1dKkG>wl@6OW$voL;m<=&-&M1-HG zflwHk0P{cU?!-E%Tx7hOI5u*u;DJTP4euZ5)n9N#gCfPwp9u=)Yw+Ao` 
zmAI2RCSb6m&0g2m&+Q%3yTOz#bieSE7TB$eWWv`Fy;hwZ8e||Y^!|~iqQ#BYdE!wm z?ZlUMjE0?4ERjiB7*7L1upEreoMo&mQK3`nGgN5V$7mHQUc3$$tgO?CHw<&YB|MNG z#Dz*s^1~rv@asD&rhpP<2Rp5&&Udl|e`-l&vaE-2YQh}ScQ&?(_v@IiHmL~vcKDc0 z6&ymR6SLN#8bt2{q^!^%Isl^U4<)Y<=5YOpo9j&$@ir0wqbI`)i5n3@2=*uPIq5Uq zXcoUVjnC>Fgz*J7kI*0GH55IjOP4z{WO9rtjS!}xV*&HvaZLS7sh^w4{Oapa@fBBg z;J}XT5p5N_fw!!ZbLqYkOa++tk>!Pau&@Lv$Sy3xxaR7q5LUYcky28{lC(0Oz73uL z3|W*GML$-6)wmfAS{5NyR=+2}J+?U*?gRyJKF*@@W#>^$&-t=Rf*dFsgyZ@ex3Gm+ zXsQ&tPlIxI=(Fd$)#`57o?1C)J^mPA_WHB*vp`+7rQo%dqtc+bfHQDWi7F5vIF1_kScdzlp$N# zxGFHG7BjsB{r)7!>Q_XbJ_M@wU(G`foU^RfFc7}6o8g)ZRU^KQ*a99YtuQ6zO^d}K zE*&8kT||bFyQ(i7ncDhji(LBT|vT^{onf1q3IdOrbN?YP4AbsC~x?o-{j=pd~8%-1G`P390by! zz(!KV%8PILbj*1}V18Lj6IHM$@T&W7GMI<;(x8KpiLm}>#Cr2yw$PL;8~4#96XD;` z(Zj}d?&14}MHMR6KzJlE;*Mr7IGUh3{*1x-XbV|f=i=awmTdUY^^np`Y3E7-d_D-? zvvpOCT|2)H(FaZ+^Fl<9xH~DgqGz_j-8*9;EG=Y#kp}0ygfEnWtXAyRm6Bd`&OkzT zAvUlPdui|sx`vnoYi#Za$YU03>bee@{{940SE7wLxLt#E>^C-6=dba-D8F_sHy6xx z6PG>iw-mxlqZCLi`-Tm4@oiD{*mckkII@2-8o>qQrHv0{KmV{C?J5%e*CJ(yclK5R>KBsvUDPMEA*0Ml5z*oC{T@H4~s-i&@=~?GC2eIuO8{$pp`?LTP}UbujL#DF2<^bV&(KzU*lnozpUn%fkk8NFS(@ zGP76)))J_&`VT}5$0M>DEx8fU&O|64;V%c;8DFTl%gmxosQQ7*xqGnd-ueTuw$uyn zJKK-oI?jTl>Y!OYhow|S)ln~Mk2l8*1P5;!mZ z!nbM#1%D`J2IpMh4?AJDk276M=EfJB=GLKn_z^AoF7QC~av?E7Sy>{J5nV(aQL`rF z<<5@kHn2%yz^=QcF@JIQKv4GjLJaP z-T1H2M^2GAIwH6Rd2zpFCK$1v0V|d6jE2RxaauPDqu>i>jtP<5Hr+T43Kl`)aPXl* zVl`8;?9`(}<(s=Q0zKc&Xm zjwbCs+e7eJ&acR;T@^NuBHTKarPd`4RezJZ?A&m2$BBpWe%oy)!^2K6@jcBrg_uv- zkNT6!X-<5}9)!(xiOTApG7Y@@+S_7^?$==i4Y^+>w#}ZI+?Rk=r zi4Y9IGueC&FX%P`S9DB@wL!lTxKNPv%V8YF){NCajsl2ZnJ-hQ7Ot7b)2DGbI3!f} z(GAm_G{C#k&%uZGV$lt|JcW_=10ddU$QEgTdtn4QxDNKgLQTHDBN~GTfKdr{!`S{) zzqh#*uh`NR69|=~ylkP%^D~e!M$poSnQE_&5C*}UVYh~{@q>3ozx1Eb*?7%s$%ziXL}#gPj>I z8{qCclBU(V&VQe?W@&v0BgfHz=LYe204lq#Oyj+($jyM=7W2<~v8&D*M&Ja!F;;s4 zX>Z0~3?@GS{y!Be!BR8klj6HEMD{>AtAC9IiXtD}1jcnu_kn7h<{OQ{1v7Tw+Jn+` zFw!$Sz*t0ikS~n48*tErwO^Yx-y1V4ybA;mfa>l#(SbepryJAs0L~(T 
zYXP8(xnn;h_5l1Cz;3wr>*g+DAUQ6T)T!CxRQXb|KJE)t?qkzW1Du6jMsYpzFK~H)e+OC8fwFdH<8@JBjMvzAf#13NdAlQx&RFIe3 z)(Do4VQlQR?XTugejovM%VGT!o7zfK3)iot^xJa?TELbrJMjDh8DFHAx8<7Ab-3<4 zyvlD`!BQNDae6RH^72-=5)g&Abjf5iOn}aj5|C@VZmG)c!$uL^)?`b-L9g`I-!Iel z;!@$3mOgPpHB~Q0_xivP<{Br$rhe}jsC~$T$+HF5Wl$6y@3tbMNbWU~F^wZJ(nEEp z!Tp%zVUhdkCMDQ}*b&4i9sa^5?yCIDrhyW&yZ3`C@}T)#H+I$38|f#3{gdF;?3LDS zLPR2%YQWpF6n_HcGWJ4r1Kp@O#t%1zpm~~p=@5GF64>-5X9mZ(*Nx3D>dF8QaHzMF zN=`xIKlOtkR&@8xNQ4%J4SM(-%ne|+ZFxKuJ>5e=9a7Dn1dG+i8RjK(sFs$3K(C;$ zpCI#EGm>NhFa#NG-*~x=0M@?OfzslVH#;~oSVhV4$m^Idx$mdz;=*wS>iMz zHJy)r*8zI}V3e$?87HspvZvZc5Iuu<>c^+@@oSv+FQaqdqKiiZw)5&QUq!3=)nGKk zZB&Gvowd0MR52h>%2@w4*BXJb8N}be*YJyj`hp2q>kASHzx}|gIV)vpN9XW69x~Md zwQzs~vN0RT(wvqVF(|)e5yw>7EYf+vxbzMPrV#rVfyLbe-B>=d>^^o3i}XhF!J1PC zL-OW2WHolQ1bd775;JucydgDXx!5FgBtf`#kaz)JQbIy>_i8%(bM((jsUqzICiSJ^g=k@VpJ|!Y1r!W`QNSiB53H`BRaJt#R;8mylA@tKQ-r2r zn^sV-R$44|;=HuR`zvUQ;q)HK;J1v$%S15O2jzV@GXM_$p3VOC%9hX$XzI}Z9TkFjv6u-?#WaX1YZ`>sQ2xAyx$(TQQ{=*PuGaXWNzNAA@`-J)cpw?to9NL;gUl`b=kU-`g&t8%8Zx1WJ(n<;ct!Zhxxz@9 zkMrOK3&S6syGn7o)jtOl`sQ%6shdq5xH2#`*iV&LINkemqGtHRQ?V=#0!};5{-VYawD3FHF`lAG) zNN9-jupI|8LJkxG(JVcny3k$>*+JXcBC8)Y*(G$!>xt-aq5C~cqE^jWe}FBEBm;hd z7fHd#Vn6F{mCV|6t@cZ8p20(GS?BY2=>G z11!BMyu@?2-1q61@4?2XI;go!I+9LL1b*uJ`I4D6i~n%u)>-9`iHIh4x9S&^g3l6{ z>K@TZZ*j9UrwYRaB!k7K-pKe?Y4ymK`6l=gS`Mh!Q+Wu{YJC6^=IgYcD2O(y&xwjG7HTO)Q=CL)Gn4-h@J6SkErUihWu=;_B7A- zLdQd1a&T41;fx7jp;g=ndgk>e!%O^+$-b1$hlN>2(#o1i=gAzf0VsY)LNmZ|x-FBq zTY4IIGi&i_q|PwiKt&{+r6=MuOOmfGJSo*rbm>N}8w=W75fS$m^&^Lc7+4*t^3?|9 zs|=J1UHXWyIhZ3(-6@3aXHP!A4JrdJ6NjXDRppvzann#DrQ!5z6*|dh2d&lcTWwSG z=SEOp{H|54g{G%MZydxanCBpTYf!*yTv|JD?_Tp$<+)XYyNHB za|ibIFfgLE&u&zq>q#y{#p7Gyz+U20KDw5@0*y^V9O`bp zUrt5;0$+{`vn_}4P2tCfzaK~W)#aNfRl?Fxac2M-7?CB6E380*Ere9)%mpws_Is!!+(Mp!*Y|Ul7g}ssKdR2r-mrbZ#;3 zVD=*T{NBn6;wAwaomixS^z;%BwSPRo3Frs{Ts|-^2k6s@eJE2}>v!fzu|Opfh`f|f zfZMMhlm<~?E+2dT4Jpi=8A$}LF4}O`5dH%+xAigB61X^N249lITs4ri;?##dUeH5R3C$>7 z38Iotc5+I7T>UzSD@{3a=1d%m8|N{?(yOdy6OTYk46fVY@Y~l!AnFV#_Gt+L511bV 
z2GtxTp!25S{t%+_d^0ly<$f?-b`E@61evUo`(%8w(rG^u!eZf$)q817Y#Ea+Hd_EQJ;*;}6{#;DM3ob1|v=Jce&&oyiGAmg9Xr$+O zhkQyP0nw31CSjE8NIp%Wh72Q~Y|?LsfPQ44xWf`+!8o5=TU8!)Qoz*H=N9D+ zcN($D@U6@32LclCKA0SHj`UkyBXFPnE4P(-uW1X+d%@q=#!}-sCs9*st-W#Eub*1v zZo)MF`v^#+hV)TB7vKq78lQMpxN|EjBzCQx{JaS4FO;shN!ETI8sx-x06RFr9to7K zQ$j{FK!eezL=C5tfb}}guqObdFQYk((U!~XKn-AB zFhSEhNz_1MhcKdSrxeZWC^ij%hPm(}nF$fKzO{Bb(OB+G1{JGL#?#}s4_~a_(}d;T zw|xq;GCPFxx%S4(+aZD83fbHkxzhe@IorOLDJwWjs>=ILhqktX!mQEf0=%Gztgpff zNd#xti3;rQ4|VeARHKMG9pJt`P<+fed*lp?F^-uD6UF@AoNK-d(K<)j!s5wvloo%d z?yp}C^e6nrNQ3WeVi+1v>seK7IR69~Re)j|ZW?D4AXMTAby-A>(yfQEH;W)mbe9B& z@eT+TZ@?(X%jjrYz&In|y9AwJNe+%+Dvc?kSbL)~hYJ6AuwW8Va6z5Kdn!^-wXtk^c8}YM+wZn9H9z)E zMVlw8E%Zu(6PS;FCP}Bcm4j#Eu$+s}j>!(aQ^!c%_{`V;@FLSUAS5RF`mb41EOZc( zw3V4r=mdAcGpC6k2~}G&PXr+sOm{oz-{hNea=W*cD&B%pcfIaV^A^nMUSV=tVH3n* z7Sqa2shf3vYK_$q@bWIhfQtdVeHFpYd!h-C$BG7lmOm4y#>EceVIM;VfI)^@q~wgb9(p=&4B%F zyQ9U*EX?YBJ7q|*I-yQ3=R?P9SwW}oRXTDm_+4A^Y{3MR&T&n~`=uA&L=HJSJ$IVr z+(hzdvU1x(OI5^Yn{nffI^2Gy#!T}6Bkj%Op?d%Se@kVn?2$h;}r6f_lNb@r%jhtxFh~ zRZ&AD%(_~DikJ1x>hw+@qPUBQda*D$ZEj_C&6TsoRRm{iZP)AuwM72P&CyI7fRds2 z8~p2+lmd4PJhh%u`Cs|zbr2@O;|if~uX42Je!p^hUv0bTx*2Ca?U=vTeL;pynRnZ^ z)04amn!%NPCt3n@4?PJwd*wgnD9NZz`QPsX{MVy$5EmuWTp+7-Sll=@_R$B7s_TR(&;bt~yqJy`s&BgQtu_Nr7g1?u()-lyHzj z6?hAnMM_0h@;+#wmTP7ksb7V@PbU>$)3ba%cIBo8#T`vyp`)eJU+Y$BAH&0uU$^M*E4Z_uyof)!>faC@EZ+q1 z1_bET-Eie&U8DLfxL@BmFGbpaa=kD4Ia%*e^VNE^5IXqPv3AY|=jiFEvqql^=_r4S zpd~n_0OhhU7||sLv2}J)s(rhky;_Y!s29ehCBsZ)Q^9I}`nijRtd~Y+sm_uycbQ$r zRK$CgR)g3_5R?$wmYECdZ4;X;CmWn45iwi-BbberNaqAEjsmx1tAI0wu5ujvCrs>K z2b+73mHXW9McE8h)a92XEj#KAYBwlUg)tXLv4%aDP5F_|+nfD4b+a1SWA+s&v<3nx66(+R0NBbSd-i1eV^7@sWbmG^Qu9Co> zSXC60w#RbnQ57_>`%yC{au*+CRx%yPWqsyqMM>$K#+353U|T51izi*O9N@Nw^A3rH zxVDJ3=_q><9@HIrEzcIG8I;#9AQ@^6ox<0|Xl*oXAWJ>mhTxfjOLH+1%`=kUUc~y4 zU$(vy0;6cYn6;7BPJ6?Bt}k4Y6C1>v={l3cQp3>T7}m|Y zacH+A{)p?YJbOoeT_@#KWYm_}-qOt7%+sV~BKJGsb1{rx#qSc(Du9QSY@vReql`pRS$$ z@Y}ibQYTsK_2O?l^=NMh&6Do#!ZjW1^g6yIW-T+;DWh}tr^4ei8{!H#7B_5-)zWT+ z7FsTC%$4p`DgewpxF 
z@)8b?x&Se9UnNkQ=^REc{d=+f+VC_AMO?$2IBj%iJ9hU&R^6=Caf=gwJ0UVkC=vW0 zEj`B~S8DTJI!f58bsWPN+z!dwvrAcbim~I^n=mcE-q4e}N=9^!>(~_Q{H+e{YL+QT7Rh@S;1~OCXp%mzZ%njoc&%>#_REGPQIpwpZ=-l zO(%JuJ8C&CGt?kl<;APW&u*5NAg*ioN;&&JeY~zyW+EVh;SKn8Dl0`Gd*D2vk?2)wUU4H!*m`t!cY6`RG&1ho!P&iZS@?cP*3hyp!%J{30AH z;IUVF$Xu-il(VPoq-;eyRqC6A-EZN2f2 zugM$P^3^qJ^~dHi4$Sty$i7w_8FZ=kYu0rq-gcIQSZ_3Vgyg`-h4^VPCGE)Mh1Og< zd*U#Ga;N+s&x|-6?4H%$mLf*vYG=t)CM~^z!~nO=Aq5>P-+o(?s{~2!o%yb42PcPD z+st@f#X(I8#TsQMvYTo5^=d2Ya;}F$A?eZg`OW8h^J8_D&2!}c+IB}CG#lQ4xPb

lt2*Sxrj| z;4yuFX{6p@MC_1itHs2=@A0A^^!}-~ZfvH}-5YZb8FA;jmWb>eQUB6$arxgB!@Jmy zfHQIj$?z_JRyI=f3V3{p)`73UY;|k=dvvf&={dq)U8yaWbg3PW8^oGX2QOX{V z<;7%}Po?ku9uOf!Nd+-^OawB1Kkpfpdd29TI(CU=cn~cACKGp73;qs@x0$MAmFqFI zqhq(ri-%l1b~6W8fUo33lJ{rB#25x5s#Fwb_!*3kj~snjB7XmF(K+4d48p}b{?}&^ zQPISq*|Jpczs=?#;p$1iBt)=mbJ`a(bt_%ZE9~hWbef9ekbSBYd*Bm2;{|ugBp4OE z0XMd41m(ZSMwll1$<;tA1RWi0-}T&jWlP8U&cjmS8bZbL`B|8C{%8A_V<0Y#|LPe2 zBijet=j5kkgqeiMHqPX)SH8dS68dd=QnEKsE4pH;1G@kE=~{V=wl3|P6i<8R?1{%g z9G|E>Jl*<@WiEOHy1iv3%(GE-Xa7++%9<>8hH;|9Cn9Ekf%7U{F&RzR9eNieM@DRn z=rz~d_qU0st}9g;luAyvqcX-JA!5jcxA&#wXy`g)Cc`GaxG7hXF_rl8#O>cUXYZwp zJp25&U<&=Xa(67ETogsI^cxGJE z>XJlR|hJ@MSF9Hu={UftUf@(sN45d~%-G@Nr+X40xy+~W5{vxQD zu341HTJiSLK|bU@!6<4^jG{xvq)8&*orvLgN{R8FkVFq)J=()~a;?TebI9>QE!;POMf>Hu zjVK0V3Amv7W#cQ*_FMdhUmNXxUIwQ3oc4bR<-dMJ)YFzznZ{a-T_Uu{;XB*CUYC)& z`?=_G>>IQ0k;91H1sNkIPdUdf{TbEFHp2_(*-9FhndaXAiy)EZV#PS0b=?dg7GLd% zN_t$~2PGpxo!?R6->qG3fnRIQ{jn(4V{ecH+t51s-FZuOlvslHcTG#%mBO!vO+L6h zJkdp4YN2bD^Xh-S)BixgAX}{Y(3HeDt>u{HI`-v=_F?3`crBZVuh&kjqictK6odD% z@t8PsmzxPI0xWs!=6E-T1(M-qkPTH|8NWF$*Jbtn{TqI*A54TdT-b+Wvu;q*0nw*x z;wntar!yXAUoX0O@pW*b$Eeo{H|Kd4T_0sJp9>z> zqdb|!HUUMZ}llo5eD=0-=9|!G5d&WUnXY&x-Mx^X|L-UJ>Czdfc?`k_Tkf8c7CtsAA zvk}?CS<ou7UKtej2*bA2RCYO9Z}hcGt9I3)LF3bC(@7dPrSf{(M}T^e7%PD9S;2NHSd4A zk_~wlR@UGYhh$CV5eD7KHdY*N9Ah-I4PAxtKT>nW^_LBGwXa-K>(gLX8gB1wH~-Y8 z^tWaKFd+I^`4FEzknDW5+JsxZOc-6VQcW>e0xs3a<-Sog&~tja^MobPa-y7Eb|06~ z;+94;)fB3|6ibih@0zo4&^15y{NIeG1APcDb%N=j%CCEI$T5kr(e?7H{c@^TB2wDt8@bl6u^z_A%MAeJxcDvj1rFKity) z3mbar>hN!JD1Qj8V5gwyqWx<SS^Dd0_n-aqobE`HYpWoaUFG2g1cydau?5lBj6Ua5QIF=^Y_ZS_O4FZmUDA#&zdK&v_FT_f{iO^Rno_`Oh&8t|Mu6q66ek$pi`2d$l|r z>ieG?D^#Q~-BH*CX4CM~?iosDDK3rYKaE3Dd63L5Vyg+0i}M85ZYJV6b}OmutqU;+Gx$l} z>`yY?U)u>Io`;Dm1ozBZR%s-{`xWQok(U3u7Zh(Ov!>+&3+fsYUzk0G-*lCYih;f79e}f{TNf`sy(39b+Zk4;r=iB`9C8h(zTgkRb^1t z+=ls$pa5`Fg~{^CJ6{LO$(htHoU$dai8@Mm(zfInz~kwhdfK7VEKNYkcB)ov?J>|$ z4V8WpPRQh0pZ@p0fq9xYTZXcN`j*3hv&c4l*Qn`+^96q{*j*oa_wMR0U_>rl=dXF| 
zTKw7m8aAK2rW5G!LFLM2`ItekzjUmC9FbRw$R9QwRqLwv>SS)JX)2j1@_m>4YS_g< zJLQuT9a<``J`~SP>4~_tp{SktWOu6$WeJqU#eb&<5gbaO9AZMuYM%w`|3IH*0AKQJ zI^$=7H}8#NwxJsv`BuISZ&)viCZuqeXFV2+y4&G-Hs{24g!5m9GSKNyF_ZWK5hYDA z#ca;uZ@zYMak*PKP|LvHeohACwbV@0pALYez{(P*X=zBOXp4V^Y2oTENOnGYK~0 zhaLRIFV=^?NZ4KHdz=`h_ertO$#ZYs&m7FS9e?-g`S_Mu$>c$XYD>NpqZc;ye`k_u zh@+J0OsdiQCRV-ir#d*?<5cNmZEJ?9A3Q$1d6nYCb!J4W2Tc(ds`M^Iml3dP7^K4%<6&Z9hf^d; z8~&e8;s5GEmX4!_nzWjgin;(X^$E$=^ThVB6z`U%5StKCNOSDj)2m}Ib=CD=n{q2Z zpQf;DItK7sGgN7gpAm_%<8Ek#fY12S)m9-kb0A=6ryR8r(=p6ll?=KZ8Sd-_VwXDvgsn6$&;K1`~C zTnu@>H~n1_(0P8YY&LA(!U!W5KCQn0akxn=_;!Lr{7bq1GNZ}!y^jq+x zo>iv6v8H?H(lhlwf7R7v&R9$UaaQ`AZRF z%hreJF-+6;`M! zB8J8wLd#BE=Am2lEV2)eWZ%j2Fg-OGs{LDlUF&ZJ&Vvf!0!;M!0{tiIM( zWbrTz)c=+!TKsi!MXZ!z=HVjus=1@nb|9Y;u;eUwtTCp10UMa_OAz>UQd$0g9u1B# zHQV^V^RzzjBun}D1&e-I6$_N>5HpsvqEC-8z=NVX&F{zDZqShot;eJmZn3Gj@`>l* z%mc29{y%%@e|O`6KFaz?u&&j^pQu`1swJaGh@~%HVPZVj@c~bnRARI?T>iYfgs+RIXA`8wCeZ4Yc8v z^%xjOdqZ&o(;J@DEApD7honwfnkivIcB_iA zj82M#vi8Tmi&XmevgN+P@2_1O3=|^;lZfIt8t(AW=8tsW0eId?WK(>;c}&T5Blb#T z<6nCr{@9Cs9oJbO2YfA8j&0VCW&Vq)0KnFQSv|QZ?k_lwk=m7@FmFPiNfJet<19pm=dMf2a zzUu#{sEk$FX~hz{BB z4vt%Rva=anmAbqy%yL2K^keOc#{+q6Yje!MThqiBdBY9`Z2Qa&9BEK*3x zbC`YhLs)`+Lxz20WLMehsO;1NZlnA}%O+!h*DFbrjZ;hVFu0D==L{b1W!Y!0+lsU` zsFOi@|0uMCkCP=9uiZI3idx|!W>79?DY0t2M1_Vcq5D=C=|B+i^5h29Y3blg*%Mo8 zWtEW>#hLw{xANTO-+Zg$*yM<0k~R|L#BFLGe-b^#-73LpMNwa^UsCk(;S;B=u6p*t zsKf-ike1|+z*ek}bT}aiak$vJJaK*hzUyUnF+@U_gx;(4^HbK=R?~$t7|p^Pv4hy+ zVh%hwT>HJj8l7X3u9ns_v-}a)Hdei+x2O~>s)VH>m(~1AG%(T1*mv&^l*7Qj`?~_{ z`(fOj>-*AD-?69B$zOgGF&QB&OR6PFRlq09w&kB6&$w-!47je=!?AerK?m!p7uc55 z3cCUq$Gj&hyTvlTynNIPSW)C?TQUee3k!=5FyEXT21dzm3u>s}mOIWp@RDzP8~_@d zUASjx<)x(}EF!@b7W(P>zE&!p;N({q5BCHz%GzI`tO0ZuR1c`ALG)C;G&<0I{1P)v zT#O-O*7sjmL_MMJ;PI8%nUSI4X{8-#zrDRZH?(%jX9k8e=tCe~Bat#7ZKO>PW+gH^m>%&3qIQ*-u-~EIt#kenKCJR-fGr%%-KMRl4-Qi3ZbqrTVf| zIrWA80(60B){xz;E_m+NMo~>;<6~Y6QWp5SV!&TgeBgR-b?(#&mLL}FXn&6`*>coy{_)?;4B=rVvAhy35en_% zySwni!`FkMU4*716q-V|wzm&Ihg0W8Hxi#3Nbvdz#05P1_T$1#w1hZODgE%))M~@( 
z0};)Q+M4^FfL-vJqGC63V)?swM+9z>*KSll$WJ$M^Klk99xzotiiYSCM5VNz+iOh) zGcfe^!E*U6W8I7#_tP+t#jg39o_k%?;MEwZTMnyaQG#bB|5o*HvS0RNNM8Cg&@4bz!@@LhE2~mbeB7gMvpXOvVDjKst-=3<}oj z9*gS4x2PFZWy3Mt*N`LU?)62_BQK`6edd<%-dqcv8$}UqYig!~@Vm#0irwKcmz5ov z%MQAApSHBT*d~#r5v#*_S(yj=@blEWWtIIf%S_)#Tk^4S*7@nwYf3TmoPiNk>P0)B ze!bBOhme2AyY?%{x3Bt37VeMtx)#ecJVnv-N$cU>GcZ(bKR2EDSw}~Q874`wFR*yh z+|g?-FhnjW{^(_C9;m3O=rj!E=nH&gBDtsOQ;+Dr+1}KJy_K!x{#e%BRf*le!b?kA zw8nOLHmn?p0aS)mUIY%r1O)z4mr-Cm>E};?TV=KMy_C`0lF2c2NV>e;q7>>K?g*so zU%SQ;`e+hoYGEM~15{U6?-77v_xu(Qbg|(gamxbfKy`knKZBA^>&lDa_5PwzJGv?d zu_~*oOk+dWQQUO4t=z+NIWS%pRr~EiqOQN+k@YDZ&IRJD72v1c5g6`;SZ!Ln6+!CW z`2u&7QGp?Wse5u>JSOA?yPG#}-%1Q>w7ytb4O;DrAQE65VQ_@c?MW^8<3TX<8T+6x zLMkrfLcQ}Y$`OABCQsi@q+1=UuC1Nk0m#e;IEDE3v@rDIjN^e_PwH4JY7(;`xKvlEd!i?!O*70t9q|3O9ci+dYgjb;ap1TIpzG^v zED5*2vvlaH?7m~xM=Nh=I0gk%?gqnXKG@oGOdMxFo7Js4kLU$oXBZv$z^_VQGv=n7 z5z&A8c!|G)*7Rmj{?-D2K*CXBX~z_JVT}%?Nt?7Oz*MaU_x0hO%2m2=9ZbKIpz@M$ z9fGI6_@s|dM;BJ+RRB|(3=+O&JH@1scgF4X9S9C1z4>Ram)(egXy5$r5KuZgm%nK& zEt~G{14n-TjQRP|+41xDNk;-+i$w&+i@t(?{h%-&7Qr4_DPbPjd(ENcF=O3q>a{x~ z0DDdj4B1-K@%8nU1b=S?3VU}<->AE-1>4;#S zfYQ=ZGC>rbuxG4A!N*FE`-aYGTH13ar{~=BDB$!S=`&`L3 z5e;I|SGKx0)wf)i+etK*B90Dwes}=GkT_`m^s^Ml+7omw9PM1)^Z+f z^uQ|IP(S#~<1`GDlGMF(}BQo zbZNn$E*)i1>-N&HLfyiIW|D}*m#m|rpX;<7hP zbnouMl8y}%GTrZWc46(B+2Pf`vrH{k3Rd5l_$YYM>8d0cD9Z^8rGpnv+vcPg#P$uB z+zdwpOSb*wx&2F$u-oulrYc3W;Gr`tuo0(_=K_zvTfNL*-YoPHlzsgBO9o-k0-2Bi zBqY$ie*Vr7gTt_?+!5kbIOmlueflICORuOdG ze97)UzSuc1Hfza=NK1VjlxdM_+lZ@@S!(~C|Td2Ou&hveTMZtfwgUEDnVmdh7m)>?Cfti`(% zIR^S&C_-04`yRya+Y1t|+8y7`Z++_t^=bXpJvp~86krnn{CPLJgus2viF43-dLUFl z2r!;>SLzF-y88J0k9Y28D6y+#637M^-Im-WwwFgnqvNo24`o&K@V525d13TVPDK0@ zw_AK0FispVTHZ*NGt4)4qE$N)dG@-(X&D+u+$nPeIsuqldd73&MnIt@e3y57`TAa6J{I|6=^v^)%)*P% z2-$@b4?LWatmC1}f;;U)m^+oxu=^H#tDlhA1lGbPIrGUGqgQz2pnw3niFD+fI^p1! 
z+dThKU~i$V?Me0}*6y6oq3YV|>BLLu?&uT=$8tyn#)$`n^oy&#_3lP-2q<>2p)N40 zsHminRfEZ1cK&qIXqBs@B&Y%h>I^JR3TBMnSf>NB?$a&G{Ns(YycshkrIivGWDKIKrV=VCsvmDGWS zLDGi=fZaQdAD45k(%jwM+c9gq=e8%E@q2m0Hh#_G#paeU!TrN z^WOOpkhCJ2I0(};K@NZHC(yD+of4lk?m zEA^xPlOY+xK}X-RGQmAe60a#W@xjXrFDq<-Q+O=J_|z>Pd+1b^Kg^*#-U5eu!5}(% zOk;NnZaSCv4dUu}!=YHm(BwFSU)6a!75haAt^&Up3BB}!Z-e{;h<`8sE^9~%K~&E9C>Zm6 z=#*U@7p$_xL_c-vsN;p1&k}Hq4TkH2wFL4hhQ^P-+DR`WU>Bc{ zR!Zo=J}-seyS%(JlM4%J9;uks*~%gk|C96bEL^GJLzg`E7@V-4pB_3EyA}fN-q>7i zNFmo27P6`0ADnwjvXEWmN(l+x2i1|vd9@{-n#ufT#jj{1wA^1ZeqsB%|p5|SJE$1icb*qr5s8(cV=rf*8dDFYkNyzA&lKv_dR z9xA<6&5oz9-W61@6MgXMnMoFo_vx|*1D-kzwa-Y!OuY7a3|B#(d zO*s;-*1y(mCXRcrKiPdr{mneSRaV}%wV*$RPSSXzws=pR@ku@!oA3}vdDQa7FsS!z z5r-lyT$d*5NJai;RGH}vh0j(TLb>b2BSH^74=h(2)=v2 zToG9b*s9cD|Fod*wkt1MRM>E_WQ&KVTUU9gFY~tWrGb7-QFc^l$PzJK3D*&))HF5EsdY;r zXkmXaC2p_u!Es+n5{yORx4uM!tX$_k#-cH_fFp4IHwBvwa()EMIG61!eMG@-em)7Q zX|2x!?}Hf#1Izu~^ujU&$KF|D-AYpJl`R&g(K_$TunDtoE!xk`BQ^9#J#M4z0{Wr$ z4gN18bhhCyV{nZNbUH+dF7N&Meeu^BxOYz{9cWg4gOGe&z4WWQ6W@IL4TA9_XIbOS zr7_OEOw^+tZVHx&Txqc8(*JOL;HLIPz4xWx2=65lTizyJE?RnbD-^E(&`O;nIO-!O z+Dc%n(BP0YiZ_T}o?pFJlR3AGtu5jRiHO4w=QMba0s;+j0i+>o;%_BDZJUaJ z{DgTOh7Q-hwbkNfyYKNcb}7_Pp&KB8U@Cl5Cu4IbZL2 z!`}YRw$4Xi1FM%I`0mA_Vq(H3c6t<54h#>^6cv|gzvgNJt>;fo!PsiHZI;MCz^D61 z^pTiEXOI}bvd9qT?u~9HN;hv0mFsokbt7&y3BmMnu%i;Y0!$xFfYZ&e^)ThxJZ>|( z)%my>gdgoA0bK`05|9Bj{$ad*@a>}TAJH`GL zl40uR0Z9Wc#*EK)devmzX+FMO-=inR`–g4u1@nM9U8K!L2Ye6j=j)PW4)9v+F zorX~Re7T#^%6tQ+fBny@Y^ z`GoW+-!gT>XNa6%1SI!;-jMeIPRvIR<9L69ELOlcSV2C!)^f1yP}(w@{6t)rhS4=* z@3amzHiPL9)`+ax^}C>WGnC?XcG8daXy>2e@}45Bsh+CuvtYw@gc#a=tI1S7EP3c0 zGQ5JGG-Z1?aR;kQ=k9{4=n8DCMYFc0WhTjia^pqmU+FuUsDR0gB`>$!iuE z{&ZH1jnzEt=r(rB*;8VB9sa2z?}}3PE>B zERRp1hQG#jAW$!A;QiZWz?0E|f{EqS9EAo)l|NtVf|9wQk{yR#d8O}!@@V*cBRc*l zP)yD8c5j3tN&3HE)0D0QJYA4eHDC!ec95N^HlyI_;6msFgXi%|P-!7$yLK#mu-1?6 zuxu_1${NR$F%>l2@9a`DA+0ew40IQQ!Hb=erWHD~G%s=>6~| z!F>drArWc3Dl+b}8$3#^RnM5A<2)cQc)+5GfUf6Hg$mc0JhJoU8z)>r{WGA 
zq6S1I2l!i~`1RlypnxirbiOnn!4(|`a2aOJEGzjy@I(zkTeWA1$Q`T;?IUnqO&A&^ zpVI)}KZ>9w8PVcMh#Xp$HiBK~LJ>j?;#k0z zV!f%n>pdjs20h$WKAp3qxjf7Cb0b>}QOT9w!*{}g_PYx*#Xb9)1?ylzf-3RDu+)1? ze*_T<3IxZC0xY6(9oRgQ%-L>@Z1DZ^TdJdm^e*Z&WZ@|&3f-Y2En|UJ8tfqyN?=(y3k2P?ioxz|1`Ov8=vy>{*=F;(cxW8<7abM`hr9^Ht8%BSBYH$ z79S2m6GWEm&9IA7Um-hs5)}gQc3||FP8js^X(=Bd0kSw;z6<9<7fAke z+$|slHdV>D#7crvK6EP{e)e99(4`}<)=oZ$J`s^@dYvRCrkQTkD)pfq?b+8o83xOTm|$zdNcGhkbwwX>SSTWD?SuF5*Q)euWy@mpJ)8U_%4F zspWBQ`_2kOsa<$#cqRlRE}TT}?|v~F6uLeLag{(_3rU0cItof0<&3T^bZyr>EMJ27 zxwMwvg{uD<4;3Cpxzc~+v34R#f$K$_r$O^qtF{-SkX>u-U`8WYn3+F7Ehxki06x3< zBzs`QifxJW6efm<*rkRmHuPF=BPKz=TXJv7#Eck7^fwg~6zW4$Dsi(VCgp*WU5FpI zrk4QC32NG`JUzx9*ddWv8V*%~^D52nArE;_);?w2>>4^LNo_THgK=u(5EB6s=`yI&6a3<*hbT@X_SPEAab5cAl})M2?brpdPWg0 zbMw@Kp=gKZX7KI<({3hQJPKg#B~BACr!+fB1+hdW6Tpc(l~%q%mC2??(QzXLB>O^= z6;Qld2C}6|!vm&xQypn?=Rq|(ZU%m4EK@=j?H7&1@Ihf9PYptO?QI9Pj#R+e2;Oo4 zWmx%#5tBsc3!s!eR9pv1&59*LRIdvwMCuVDmYs;J2f>sr{ck`oQ$4hM21Gw8C9$x-5>!ZfNf+oL;%dpS_(y!` zgf9GP0(w$&0DQH*Y1BRV(}^ONg9*w))l#CF21dz?>J0oHCrVa$=ZV9FqH6e7%ECc&cnZLa|C6I_!3!zgKI^1KC6lXHPy#xaBThn_vK8qb@D9Hz?Bl3eUUcN=W`hr~+x~G_ z+FfuFdQFn)hHfv4#sDWQj>AbQ@7>aT6Z-9?q)Ke-CzAh`zUKLTHy<7#0FtZ^=6v3O z6sK{-lzY#4wD6C5B1$!ecFI4kYaU~|gchTlo%uY74=p6gSP5GLM#d%AKhlx=H24aD zcl7DSq*Kqv0D&RVKs#2=B*<9$5q&%pl7%Gl`e;hazypv&aw*9zdX|C_9l+d96oY-Z zB@~wfrELGK051^M40jPjYkU`|j$KUCMC*K_D#Ir*3&S4j7rCW+ z5j;(+09R6j$8WPc;h)S1aV2uUjZzW&tds0v5v6NVZ~vnF{NqcfD7Q2(J$k0(L-CEO z<>Bbq9MU`ZTGx^-*^0&gvEyQSFM1LACy8N-Th|VbmArw^FC1-lrMm2Tw++d z^HS<`s*=wpIk7^%`LT5?=v2x7qbgQOPidg#xDh?l(UswNbl>Nb%RuY6X@hO>5aP3D2BC=iqV(d2a3I=eiz~{tvi&eN zvVoX{V>6QE%D-H?c%!wus+x$XzS$vDOfRkB{9}DDVr}EHFH;-njB)ODlKb;T%efA@ zv@O$6viQ44E0;P5>mEsb91+>4#{iHA1S?FxT++(l#7gHq`uo-61!x&#EiH-?`Mei^>LXEUm^+ z9Wz>)-j?iQ`ZXri<-o`?MMDlDylHxHBhVeuG{^T( z^dgDYVc-ptb?D1k0N&VR5Mw$dYaI)0@4u2Ss(p*chlY34&Iun(VUnhz9r3 zI|$34=?YZ3?N2UaacS0k4?cf)!Yzo@bPXl1dJT&Bxtsu?!9jXJiI7Qix_yBcilz9JSB2+W_iVaLeIdLf6fK-JO?FuN9am{^j3364%r`; znNX)6Zk=YG!zBthgvdM#wc5bGAYws{dmzh`+lO7Mv5MhE*yExRBxD^yCdc#@uOqO; 
znfeFN?LulykmcVyPteNm&@5JvMM<_p-c*dU*zcR)sMq2_t~+x;5t!$%pu2)rJP}30 zy@G8!GY_$mXSl@Ks()_#!?;$)NF}!H_HPmnPGo1&%HTLyq@Wu~6cGCbaXl0B2K8aT z88*w+I-l$6PeAmHpzK(S9=I<(VHlxsZ-5$JK6d8Rx*x=`i-|>5E@9rO-9HwZ zOSJB!V6NIh@wcGLOsyO-o-S08k->JYQ5GL0zFmvE77e@ui=xjSw~Pi1Mo`N{>uhjz zTc$Lj43fe@S8K7Qh(FZ{OE@Mrf)Wy2E9deHct)1##(Y!O?S{ORsagy{MpmHB00Iw2?qlwp7y1C#=?8b5*nR+4{+`BtmG%%rW7bhRtC@7+RpxYXa&qs<9SM8^vOa*n-bDW@W2;zfL(x4nXj*4lyx*zQ9X?rYveTr+sQ7-4>BrVNT4!HYoQ z!Wmsu%mPpYjs~}#A4b5Gf^%SAGI(R9fYJq>5YQzEKkRP+pivsj}Z>?aq*Iny8BG9aKCzERPA)B+p8yH^ftv4j~0 zW^K|?=#z* z9*(k{&yF@6EAQN}KPYht_el;f1O;L-fmEbVui=xDERn6Exd7Hp3~7 zkb-Qrn@>H(lJv>1hE(3B4<>SrtA#}R+USi~M~_nFk17|fIo%P7qp=B_8)p+rc(~0R zg~KK_ws*`cc$SqI$WE(@s&8a``m1GE?xn^_WV(t7w3PzcklKoKhV2MGiHvzzkXM6M zJN~~od+%tt-miTaAzE}INDNX4f{+k(l!zcfh#<-kL@&|Bm{B7|@1hJ5(W1AggG8^< zOJwxk>&%$)9?9qX{+{P~pYK|~@4J?D*8E}3nfu)P-uv47?CaVWUb5gATsO^Y5H1)5 zW&?{j9Z{5OEcZ(z76x2_#RSkR{>-L6t|{OgCwkyjyv^5}lv>LlFEY$;y12bN2CrPU zu{Nptunu3p`DQ)&P%_kgvjiQ8KPs{Nu@<#`u>7s8>SpaE39}146|@?4Gkb=pu6i|r zjl<|_k@l0*4eYdN7sHt1Cu}ys0rvRXB=PB$D&m@?J3Hee1wP@n8zA}BYG=5?QBAOG zIDRy)(g`1^53guuCF_`ib78MOg+0MjcME26d$W{)_m0QCs5g*t;OpyAW5i2D77!01 z1=&vQ>_!zMub43(g;|}3nE{S4{sGG3)~3r$Tr7?u3mfJYA?=h&WXxCQeMWrvZRW=q zgtRbgvt0KRKG11xAdCx$y4sBfJ?<6?Kw8`eH@-T94l~66H1c@@z=DAxKuM0`5zgIM z`x}AfE z_WgoOxn=AY{5^XF+{=-nR>|BlOHQEOMr8qbr?<8jwgJC}=(n0eRg=yZ-n%jc%3Y=2 zi^{x+*jmsCme>384V`;Hk3r;R!l|+6$U3hoPHgQ=wcCxqCB_gg13&?C+%reXZu4nR zI}&`8leqaT((rjd_}97WB9*0@^@og!gs^$gFI6B{S)9171vud8A&|xNld2B|a1)%n zUvT$R?)*Ch(%GwCIJyZKTLk=LGEi4IKz&2AOA*Qz5Q7BDT)aLGm{VW64za*FeHL1W zD@yU{`B-(5qzD%?gWOoXtBv`yd{J}ZsWn5aB^wFqeCQ6GmvIp-_^nB6uuGEKc+`fgz&6hZCd^{x$(5MFgq}1sQKdeyQjzM7e%k+z=h=V3|kkrAN=N4=dxR zoH>fXz=+NBxyfh)pxi~i!O*(=;UPW_uYG?WH0c@s6ErN@>VzCT%FV;;vt?s*>D$}E zegr0%^l|oXmM#8D5p}LNo?$zMiFnmwa#Vx&E7Wovp?u`sU4kFzzio%R`c=+c^1L+G z5`!ps?*yMD9zuV}h*Y{<>i`N?lm#Am(RPBBO-v~P(^5>lB>L_MSt@_O^qpoOe0~`& zNVslBRB6SY>jmDbxay?dCr&@WX{HftBglBx;ZH5_^7ov9(%=VEVu&d*ikYbFbS=sc z*dXO7a5gp@olrt-J4HN)Kf&4nNQcO4`4J`p)Ce*$tx$(`paHUFdSnwM+hW` 
zSexD>Fij+71P1`!*nxor8_0w)(9_NEVNf6%ZR7p|cp#@>VWJ*3wUUqzly<=MC@Diz za#Ogc^Fw&`ZK6bC(o$vNMC= zUa8M*4_=#nYO z)$C{o=Mc-C8Zjn=T3T zG##CU&AX)47Z=`CO7e6gpWYUE&eZ2{5@ z2fjrmv=Q1o74>3q{#8s$sRAoVqHlIfZL8Jie45dA| z*+LjY55Lz|!pAK@+VGU?mc7`xbmR=-7%gi@$i14sPR0bw*?`9bws?ZN-UcQKYB)Pg zDz6!Z^;381WN5$bwR7F}EV-FnK5jNa48XrS=&QMR#Nd35sm}QgM4HfkM4Ujg|3obk zKtGYD(b}!GQ~DU|P+DVfdN!G}iTWcVlbI|F(ok?77?*u}tjL&>q|>V6+u@LA*>rQ@ zbSP1M{H$$4;UHxI`j_zVfAP`#Vp9M~$Q*6+uLicu%0|gy)~le4OiC9O@>A~=;bt9k zBX&ad4sp;dl_^{-S)R*p;dgGbyUjR=%`+GB7dQ{1_`y<|O>WU;fCwKfTbXtXa z$5V{eLfcA`30E7E2cTTzzn?UJH3I+mYBWwpQczIn+Qw!u|Gr1WTZv2uOryrg7^IVU z(wx8JbwofmqV1r~NnOIj8=6{}#tQ;iI(n^7a2O8H-83d|l?5`LiW%|rgieSFgbK@c2HA0V! zlo!s{ESM?=iC^#IaDhiQ&9-LHAz@>!+mdh1eLXp@OC?jTuWU(QDT4+4KWQE6%;|C4v#hwCpOtV9|e7 zkJfWaq=&zm@B~!Wh{48lPZ9pp?My-5QlPj1N!Jid#gOwrvs|w6c3n@`E>#jWv8uW` za1boF$*)G4dc#`5tp-G`_cVG^^kD(k6(iQy{MTksp_rBuagxuHL2=O?MQ9cg@>2I7 zz*-y?LfxoPYkcH99S|Phs~1w_{FI5P4BYH^Wb9W#y@J#LzFcZO@$#rCehu0>yNIjX z1<>J$hcoe2rkER18I%9{N)SKTV@0n8^Ys2!hY$mK$lmhs^m;R1ObU{ZqPdB@L9$Cu zP8xJ5g_R>!M61N^?C)9k z_H2^VM$n>_o5KQdYs~5OSkI!_gGf#|a7&WK)SZxCi62rh1|OYbEymbs2`uq=!S9I# zHD1czh9+J0`DXzr4fV#OjmD#~;$Ou*VUllnkd$Npxf=odHVvj;q*fp)@8^-tamC2M zncoO?Iy%R<>2l#h^!c@$r!MCSu`d>Q5hg=#fDJkxTnxJ}RMN zxxPeOk|2)292=3-z)U;(#d`U(P_%6Zgc5YUD-rb@%8WgUq*^XHWmfJ)Jf? z<$XGYKGZVX3o(s^tO}nIIV`&7rAQeI3igwwT)5jKK&Y$#e>?;qq(U`XnMx5&3Yqbk zFPy*D(Alr)roLBH;!oCT0HH;b`()+5q9v091=yg207v&{NnveIpaF8t2Lk9EQVwG0 za^X{1!!wg3%`_CA8yz=mb0}$~K`Y)!Z5;uA=uW=gj7bqJv%|rf6!h68W0eC>gbXA} z#ZOG0=VwkqQX!&u-RL(MroGBdV<4OB?Fj~^?@oUEF_7@4_RX(@2#mq!8Gei4wCn!! 
zGe%V33zibU-et*$rX{6vmxQ8V-Ss5oM!qt}V!Vu(j}I}N){><5mw@szr0d{ZlLi+o zzcKw^PH98iEmxXpmT5Kymq5kvlBMoF+-o)7_eNvdckFjb=C;o$rg`vzHysdRsQnAc zp_4lyqNooUBShj6bmdKKx2R>Y>vES(J@+h>eedG`b;$ZJ2M=rpjN`G$SQQSLr&T0T zW`r_nuv`n`AybdXMk->zuqg%P@|J&VojJeCqS#vWVLop3hM=!&lNFOR*PVa9Mn0kh z*4R95*Lk&>C zp{)59^ugiuy_MLV|MH9euRju#(aJE<5ILhS%&mlnyTR#xJRl|ISZXC1r1?U2m3kBj zg!?L4hzpvL#Ml0y#5cy&p62)aOsbYw7KomI;yWxEr07rOL!z5eJ4e3X&b%LV=u9n4 zkc5o+VvAD82$fgc;mbQR&`~aOvbPM)Cln!|!<>v!BC!g}+{*riY8qywdKY2IEK9F< zd&f2MU;FxpEcsus10k}Hy-Cr=n_bfRSHRIu$#-l7S6OX7bziEKWlv%e!2MG)))bnW zni@1)Rm>WEr>sU-iC}_`VB!-&_PHc~)*fs05_5=XQ0^ub zsFj^@cdkchwl&OTl|_m4=tI!XFYva|anEYj#K~M`%o%!EmH?l%wmfkSmt!QoPtM1X zp|&fhjnvN7-5s*~tb5Z+&+fCh-De3qt6>hSaw98u>BR%-MR(t)`Ucr`1vEfgF~ECK zHTA#3xz2S)t2|tr=JG$4d;#zt$N6|7XKzDqbe}>ugUVZoC|E2MW}JV zrMf9fu}_sBALb!QCz|j;CSCaka7}$66Q?8+r+On*RRp1?ug^V-5nL=Re4=L>QCBB3 zGB%c#l||q9`se}TxBp;GndtcomjVE?Sf=*Q?}s)+dkIM;8JTc^`5LywuBqzyyPSh- z%%vH7v{42$Sf-|?miZpMtgNj5RFp$e zi|Of(j)2xiDPU6~fF|rrh$Z~ODww44^jJv&lA}o(1da8$2TLZBy>@VQ&A33RK$>yA zMH%)CRoKvF&u-Bco6#$p&~5AB)b=J~qZEcef5&Mmq%So!2t7FzG&XjzvZgA#sPRgu zvMfy~89=`Qn2*Y3g|ss{&K~R^#Jv)AbjvE9Do+ABfIWpIZ`lBNhoooh6Bkj__X&jt7Q=6jfX!yq z64W<4N?erU^HC0Wzx$*qX+`z-Iq`_}lGE}2h`&h&qL+UkAl?OMsf4}w@iqqNSHE!M zQCdLbm(2+IlgmX?1oD!MD?|nQCdl#6hFXpiBtgau?$7(%$MW`X*y*jWCbi5Pzae48 z#aedT8FPSKKSKQT9FY8b>OP$Y zOM@j4@CA^<<*RCuE#F=(03FrW0QM>=9(y-v$gURiJm75*U0_|W5@|>{lSPIxXALxn zMEebm#Zu7_g(xY)Hfb)2B5Uw>5S!4m`J-|6`99STyT?N{rH+%bhV{qtm6es5CL%6U zr|=;HfI&}86nIm+&URQOjJ|F4XZCHcP4Y)T4?YXOR%>%$aE#WlNDkZrra`hu%9929 zq*87g93}Ypm(Z`@+cmFNb~X=0mv^90|HN&6vQ~aH8Do=vIky;Lk{in0a^%zo+#@bVO1k^3-v2e z<^8hvCAP|Vx){%gPhb86Fu~%8w}XR&0oq%#bYURaH3eCJnH!HkhCGtlYX4$-kKiv1 zd*B2xiK%)LU*SX z{}-S7u_6LK7CaBXEYC~{@}Y{nw=MwCwPBYAksuO=MbcJ-Hb%z#JhdORqp31Ht*UG2tBX&R0{9olv^n;J+$lmb)591tvj-zhhb~;W+4dNoM*bcW=9ZFJ zjbiK(X^QOYA>0Q{JryJwG7R`hj5d|x3}cyQBwT?Mwo7}RT9N_o6x@YzaTE0R4VgFn z@b7YWVhEa%tgo*Rpsx?iMtT2=M1qg50j6KjL%LzQubH6|a9Vk5B@f1q$o32W`z1&` 
zX6@1Wh4$@tC24LnT5_*$w1;z2?2il|co(5hUhPg#%j^O?skh)67$vtan#u83DYwE8 zEeH6S@gOdMQ~S(;BYkf!M0EBCi_MGHAL@DCD_;-1}fw4d8}-N_kEawfz)rKYSt*ZeK7)d9Blb>sNy<1mDPIt)5ypKz1Kl7k>(Q1&Puj=KWbDPhh=w?5M@yz=XaW9%Tqj}^1%PA z&m~0mvXiZ@um4DO5ovd-nR$=IEr|kT;;LSgry}H30g8eN%AWogj~X4GpmNWT(&^_F z3}jb}e1Lj#Pp%8^V}!*=?W`&QWOLK1;^yaayU#;*R^r?Z9yIx^fkPp8d^0QA<>cQ} z?E>J4o2m0Erjna1pOzwXR$ZKoDv>3aqAea8`-p=o!)k#7p-wgY>^M80Cn!pRr(sr3 zm5Mo8=9FjhzJi=j%W%n`-6NjcAEn^I&@+-*Q|WN*@&@s{C;7)y05Aqu&hX@X_mX&b z&Mco^zM*oy;e%R3Wp5jfwr%G3rZcO;@lS+3vML%wj*qoGoVS!6{3xeVX>FbCRW_FC zJv`Yg%)G8J6%qlS_C0iF@T>L|o7vUuC?$Ut`FH}^#LWd%2e3s@ATV48XzBdhwWr*| zz7EvgOKy3pIf%B9k@%IYED<1NI*65)U5se7AI$Z>`+9U`uuvpyYD&)DK|HUh`;#Ez zfxbRXts@Y9X==m6Om2ia-Uy9X>-;?YFoz0=)OOm<2XZ{B4_kthVsFQjFtEy{(`7Wh zJcllKBcJP%{7};9cs1ccb&@Don}I{;K3S| zRlxlaEznDD)DFK*gC`jV+D*M@g2&DL*tWK=z$dM z&GPA5hfj=w2QF@42_TA;-6I?TTGsq&0kpVM8WXbLxv7^~=hiG%pB8XYh!p9nW;lU*Zo zx}J}00@?aZQbOk_H=x%uuaS3KgO*-!%vghFa>U=Of@Wr{Jv@^JC-ooL7*K~Qvro}l zS9*JV%BkYIsgUao6VkxPg6r=Keh*A-1v zzLpp(wr`!v;VOMO`M}Z1IqY|D{voP+RMYoT`KeCYt9HA4_{!nij#2VKT(o3=hOqOqsTDH$^=oq9j zI7eUD8$B=a-M}~Ql!@G5^;Q<4MLJ*?K>rFR250e;CSC_zw0hwuxCbSKs!I)O;0xR{Zv($$vry6J57!}4>r-+K zl{LeW$UlA@nY-(cQnOKGD&m?hCE)uEYYM1$L>*c`T62~9Ze8^7x_A`}Vio(g*64y# zxVO1`_3U@d1)O7eU~1amsIGU>(eI$X zPSfzXfBG9y^knWgAb%md(-7g7|FwrD^3z&n7-@uOvRECHUOQbg`Zm67Ww+^)KHEL4&kN3fgdx3D)2Z`q@kwloX|5#%dYp24J)nAEsGk3c_&I`=M=Hb{W|(U zbyVefZ#LkZ>l6p1BSwuo)U5IKLfjxSN)^uJ_?4ebmldb{POXLRV}{HJWxLQB0-ITk zo(X|p{w(JAh_X(-I=VY{3CYdA+KXhg!lD0H37$`Ud5*2s3!vEDF5S@RLzU$ND2-{lTY@0E=gHH|9!~+ z0kjIVM?Cgf{28`a2q7>7x=rupUUu^an77EsDI=8{ZxaHTE=f5tT(}}ZqEF`=&133@ z2%^1slbjY%yVGDCS=fj2`S2Nr&}Qs%3cCf7zYQoC&J`@+56TsK9pGEe7vw?GMf>ZY z7Z6dYJKjm97F*AR{hip%;1(3mgrN+83mzm@g;Y^Hr`uj01VJ$S2D>gRh^U`_k)$y- zDgX^27!cKo1mG^G)URFI-V|B(oZ{_SIbY^?v*mg+OBN?ebjh&YhnM|Vsi5{dyc0!b z7e7z^?7BMek?<_r5M><^L@*rORnt$@rEd3CS;y~1yQ7m19N z6A|vz1NsbKUp7g}s_!Sa5@NBMjJA%BKgv2lQ56F+w5N||xM~YUHdk?IIzplLs z*|;t`8$9yn)#*+@_{^XOL_Iua*9%Kps=yy9sFXeR*^0mIyP3c+Y+hd2M#ez!Dc`O) 
z3RQ+h6*o&s#s8SNhjH&L`USpiN2VeZ7Axn*KF2OYJ7I(HFZFrls-mv1&jzSFg}qwn zIC+cxpM}4_HG~~1D2U9s#<2w;+=e2ExD}r{47<6TJe01JZAgMbu^U~m}cL6|sWHwx)zZnDAZI;TLAh`|YX?jMWssPIx z1oR1fxWPos5x~ah|2b3STzOlRY;Yo%6CrQKs-*zqqn~lPWM5eRtp3dbf2R`24$WXhufblIXt*W9@ynk2&ClZpS-EROa z!pn~?fz607x{ALbdr`IM^z`&(Fq{++KHJ%CR$(w-coUIDMsAtBK6Uwx_S6+WP&I%1 zBd#}Z6h^6FMIQ#^Iv{_}DM}9w#4DGL)^4e>`6_9bxgx+g9rT2h>yH)5M@COtEyB3p z0IF78fYOJ6y@NgPwGUR%p_QJ3)q(Q(!BIL%rm3>tACfG@5ox8A$DaCe{=aZ9ys0&l z8bbk@G8XR{Us*s_j^al+(cZyr!mpJi&F9buy1o14lJsYWhWIL`#uqsL&9@7XvnVy4 zgqj))H9#4L`GH_8Q8>KQ*Vo=cP;`tHP>f2JbONXO_4_6(;4j^|b7yta>mok%YuyUz z{TngeA3Wl?nw+khj?io&d?FR3(j>Pf4pOvNS8Y2+4F+1P#{0>bq6Z8Xx`j%bmy%4k zKQp!d65zcb9V$RM4iDWVmsKWCv!VOxuHSHqty00%qPlBt&pV~Zq#YEuiF&Y>4pa`( zT`%eV4wPs7{vFT&+rbQM_|1$@3bp&}g_3E(=umnU~)AuvB3`F)|DX5N~3K&uH{Vkmk=$l2^ zdJOAdu$W<|HJ>T`8DK_j&$oN?`RKREccT2^wS+A?8PJmm;RqpbLAnNn1F zmg@x*lm+ldR~E<5Y{c6cW{@9})066YvMIH8XEtLD{P0dI7qsi#{Dz8MzT zZWb1)L~QYo>GF}?ME?sz`JD;|vU!DG%&@jL)`#(MS&A_Wmx?&Yqd(#SQBtV2-f&M6s}WN}9@1CC9K zxGiP)&6XEe&ap#EL^Pg84rG600TWXE-+#Q-Res$#wtmU^R}0-Emn@UKIk(zqMH=3& zP!DF^v{Q=K>%5HL6v=p(sAdlBhaF8`oikYss=q#&Xw_9diddP%;BQZJ8=aq?DDU?6 zwU8gEs5r<-nMqHXnIwyO_?7@vYL?ToOuW}FwMT8Mytcnn>)KGBlW;S7%Ii0g4FcVp z?$@4>WYa%tsC5;JQmFr^x|8HuT76skbx!AIMi++Nq*Ka;Aes9(T|$rz{qv*g?R>cjO?G4m5RSyM+SCoqdaG~WSujLQLmuqvRgUe;^a)Z z^NMdns+e-?mrgm9BKtvgOlH=}T`BE5H@;$kwc)xL4h0i{Z2`m~FdoY9`Pk&QPbj z!EFLGs1x8jpSAQxB}Q?9-iDk~@drEboZ4K#O*w3oh4kgPB06E1HJzXLrbJ1!_Y~Wm z&tG!f#(FEzqa`nIuZ?ttd}Snzlue=5*YSWD*d1@evR3u>uQwXA31FH!>(8$Jl}f)3 zG3sgZ{*3uQU3EXheC>Nf4O{pL)vy4wx!4V3IP1c$#oUZ}5uCb{>^hlPiSCz4&sC$J$cF_iR| z&HK}&e|e+)g(1~x?aL3my-8VPM#O?2EJ+-->RpodB$=s@!0}HWbm61oW{6B@3uQ+Z zK_9D_UQGoHFEvU#U~KiNJt^1mt9R~ydh#IhQdd$F^r`{G|BA0~Z5Vp{RQ&GL-rkKZ zwI$5^=JFep6N&D@k3W=dGfdH#H28$WOjYE2P{!RV41^SEceCv3>M@2qp1YhDy@O`Z z?Q*?u`>`<>1Unx{NA+%R;vr{u-+~5U-&_(JPu!2nRaji|<;u@<++ByW0?!qH$}md{ zLGz8HT(9D3u&@GbiPHuoLU;T6{0%0Jz?|NRWN*VQNxK3(2>uQwjRyo8Vg4N@wFi;1 zKW&GGhA@)@=~D5P%m10gymW^ZR=2_KH(0kc%O 
z+LI`{5IMbLzRl&{z!J`>lgO9tCzG8^H~+Pee+X_1a?ELrbEi4*P>z3R?9jl^Hbt{z zmDZY;sY&SjJU(zioqw(J#MHv_d@P&B+15aC^o!#lTdO7G4d2sA{ioXn4;;2IGy(}n1` zxbHS1O(7-z_?I^xD@3?<9{O$u>>On`Pfw4?GhD;c^3v^0v*>lC2jnOZSs@vZkQ;c_PyGH@Xw{%H%x1ZLcro`(ow%yZAP*3 za)&lZOK@hE~Jjc`w`Q?o8bOJ-l$>!&BLFJU!%AjWn=T{rciINugws$Nf6wo%~uFAFeIIX|NX;jKYr-Zd^_66 zb^kf3^6*{A^9bDFKEJOy@hwY^n8R3CWxJ6bjnj28ZSsr29qT_vl~3v!b?Au}Er>>y zm@icOUz@M4F?{g)ybZ#S9*k{aklv~seyFZ4O}4}GMKTZGwqpnbq%}WWymc1XQ!xiC zPau$;P-<0Sn)8d|`2DhQS@%mRlWXcfJ>L+b{zeCB2s1Ku5U9^|8w* z^P1}F7C7`f=+*{gWxuB+)A;4G1s2l8k@nodY$jJnh(ExKj2uNZn>*Bp^!-@1wM5*7 z>A8njIVdb9O(Is!j3VMZq27{exyBg6A2TS|Uw@b^GaGVLjGd0U3Y zGZPM@mU@jUQ{G^c*~MAuB`R$N0G+DGxKp5oU|=)tY5_5JXlvz=d>jc&jg7yk$t3PA zJwKC}9gA0Cp5WUGgUSwlQixTD?nO0^YiSE+-IZ}ZHcw4FMcA|B5s~YsQt`G88hkes z74H2$tmMeqHB zbzC`r{0g^w$rMuOdR@- z>1PN-bB^$@R9BJSK8q0!r(Vi>4OMAiaCt00#7NCrU;dt^aTX)+Zgh?kMN~*Kg>e`e zO@e@7y_;AAT;$TubWPMshJWNK;lymLNa(njl%%t}%Y5HAJNfn6&BL{%;Bu3agCqDq zv7QT@b@%QPrP0yx`NXF3H}drba^K5^fsTS^cEZBzGjZDC5(r#0VphRv>hbOV;XI{R zt8w)+cZ3GKS8miq-8uj{Y6V+7$5JwITm6=Gc||-2e=%2o@U{(~YWezQ?yD9f)n7PJ zT#8JwXqIiB=`r}swHGfq6y;B#nl)X%t~9|#Y&`e4!Iip)>j^z~$Dxrfzj03Q-_YYH zusc@cNScT-8dk#z^P2~QT-Lu$=0ysD?`!gi%)|vF0dKAvWZZ@7PzC&V*ntyWbx~tY z!lma`+4zzLY!;;dGU8wUgzC`oBsVdybS0mik9Fi%Dzn6?=NJ5Xp4=Yw&yCnA@Lxz!! 
z-r~G#x`zPS8#ky#?KnyP=cl%VKI~MJ-2)L^xe?+nRtB)TnH7^sjdy=;f>Ost>sl>a ziMj17x*b&9oYYmaREeW5xNBYSU17WTLR>c~ke2!V`8n*K^w!Stmh88MV_Ecq zytn)Atdfh_YmJbD{!6V0nCEQ+Q@M}M985Mk-SO|A-SS*XwXW$*f+WA>;8zvA)b20x zNv~fpnb^GZXGsE7Mu+`t?crVco1D=q>XS;{h3o3JTuu(h+cP}{`h_})ZKYW%>N|-_ zHnIdJK~X2 zn?}>9J~(+w^I^XeG?jW^CA?ricw&#-Ht#nQ0f=utF{+9JGW^)oq?L~1*i(qJ>^fH6 zDP!N*fEc7gr#_AhWHaoh4lM6QEl&6&?IG6(z$e0R^!FX0U7otyDO=}$>qW~qD|<6s zx%Rva_x_qH(nfd0XIwGTytRhnFklhO~)L*?))W|3d8qeoSA)V_|<6 z>NEXI->swZN9Gi~)-lG>-)j-z1WgyY^sgBlW}As8>VELPcakfq`+APqr40k-H&-lp zWs*e?CVyCD$F~k`n?$r~a9%zOD!@A3F zLwAzmINaMMwT+}>i`ox69X16dpWtatl1a(7P=X#>%L z^3-AREcp;GC5VvXWF~93*LN8^Jpu{bbx`o$+nt1ZH$BdWMfGlf+=eka|0dG|`abLK z&&uoTsgq@eZ`xlTw7)%f_b#9y{Lvl$>HNQX0f2J1xEq?Pyu2cILF?nwp_OA8?ka5h z6p(G4XYfY#p+E##YwH}d)$q+bH%!H1HJy`N2FvhoA6)gw`G2F^LfCiS*GK(U&7BIL zKj-Sy*1vgE_dsV*$6z3_FLko9K0XMij6L4?;Pyg4BkDlDmY6e=W$}xDQfO(p+XbFL zvy8gape4mvBmYc`@Ut}cYdjne2*20wa09(wv+?JwJhkUqQ&ccZw`|<{RI}GIVuGZq z4hXqGBtd!NbRNbarX$u^utWScT&dpADqP_50=AKNEBrSktDS!ntla_j=ibaP@!caG(B)sH#JhI;0225Z<;!icsJx^h0yi$&jJ9IgN z<0uCFep1ED+L-l4eYVZr^dFCAxeiiZmOI!oa8z>7kG1&R3 zjTW;P1#gYL{-IHN&Nf-;>ZvP6YTyOn^{I@O^8=kPNYZFt-#yQA=LT%+hMM%6du)#5 z%G%==ob>lr8rDRPP1QMiq987pDd->jiaSj#?_5u5*`>zwI+kr@yD`5yfCAlq&jC4H zmA=5|Umhcr?|hsO;?Kd4N&Gg2YVBBNnJ~;8l*-k4>%|%|H_wmEFErcyWV%bD>v`! 
z*E`b^Qhb-L0sgf#emxcsI!h~nIzW4kuHj zBsDd>b_a;3lz=^*b3`@G7=J3&zDV?KGlaPvT)2_0`_>v`ruGk3>M+oViD9zt-IaSl zzozP|dSzO(SH+5X?_N%Q&qSS*ki7eAF{@73m7-@_jegAQD06~8s$ow_XywMkaqoLM zibk9{Pos}}zT0WnweBI%SH7Qo()%KsQ#IM1PB;Do<^0D-!K62mDq`zDx6rf}S^1Nr zylTRm8KN7elqhO0@nb(<696~-?{)uZI-A30=wkS6oC zkho!TK5})zh?7e*MQk}fo1@n=Tav%G@}MhCSE5pR@|o@!i6cL8IbY0Ny60PoIRoKu zD20vh=?!OJxcobH^Ov(D1R1vc<&uVyzrXy(IJ=&ye&6BRusVQzZ*G;0?Gh+=4jOmh zQ2^j&iAv4I0B8Q?qwOC53Xx?TM=-V8_x@q-IOav(=YhJ_5<&rjO9|OYQaNNj#(^RB ztcei%KwN5|$k`U-t{&9q=8AdUcfl@uuVg!&PShYti|$55&^i8Y2rVcOd=Sa`SSyNq zSve)r3vfoMr<{Fto({EuUb@e5icMZY@%d&NqeHI5j!>d>XxBguW43Zz0z~uA56Zc0mI5$HcySqzVx6X;*63Ox2kvmU+DcrZ)gGYJEL~P{&1EeuHTE@i}3N{E4YOn_`+Nm=u_oL!=w*hg)8|>JBX5KKr0z`a=Dud>@{^koqvK#(>HWT@O0^B!P zT%p#IL1zmlP^QE5_V2n}YeW9j=ckBs4|LD}1&#kB|7Ry!;=U}uyeMkc>I81=L#Eun zIV7%kNy=To0KReg^`gxSt~A-CzvG`L`JD6MoB1*$!F%j_ILFUa*G`RYaGqjoJWI)W;e*sTc)Y}5 zsD?%$B1G|dv&o&D88+t51m&}kO3cIC+9zk7^Luiqlycf@+eX{^R&|k`t*uo@Lm4k>4%=Mbu4n z(_Q(ZCfjuKVJ-0{Klr8Hug-Wfr&MHhYY!=>hNh;^AtY+|1WAl2FJ~su^I!Jj+Q}y>6O(-^*VM>449Bi-B^ra}sL8y{h3hHO@(= z5Vie2y6b_4&%e^=l8t@yK6kwcwz3UMXvnN_6ca(}ePx(@`){lz4!3M50hEhY*pX-o zU``Bav+hL>qx?rQe&k7T{3G$#pHo<@6o(35Ux5hGb3|69-@m!rK^xzyoT{4ePLuRG z#n1-Fl+h3IgTLRuGX%TG`o%^`(*OLSRmYo|5s)KUZ>okQyfVdyuYV+8k2>^wrT%HX z#liy^AP|25=-&UBzfCeU#AoulUu>H&Nx^!w6G&quAPkrGE;X{NbQ@lkTy1)1ELoS} zTW8%)w)QljyfMKt} z8s$cIe68UVA-IuLZCwHZK?dl&_W^VI+EIjKF?mDq z$_aiaIh7YL{v~wBf9K3B5GF_fR|B_#pS@zo%A%=_;V*Tpd2yuN{Zg3su-o$j|tlcfK2adw`U zt@+g&%pl*%qAl0vJ|NdKvZmMB-Mh_<*|`8~)he}}<0~f4vt!nrzzfAFR2zus!vKP! 
zNf$bza@C@^(`AVN!ry-$?b!w3tr-fp*Ufw42;NAWeuom$2@ZY4bW_QY{6F{=kSfb`YJR#fJ?4 z@>TgfcG(X-4XGOI>-LjP{#-_rMrqyis1f{QV-ULA4s?$$R+8d|5?{Cxn-q#<kIG1V*<{fWF2bM-fXRiFOq{~rPE$c zSO`9`axh0VX_&!|t`qi7kY%qHUb8O0jHR}u0bc#(x{a!4Kh*p%`=$9vR+Fd)INcqY zCJHNWZ430QNo7z~s-Uf7Bui9Be%{yF*+5S02}sex0>au)F~?s@xQmfORSgrq9Xvm% zG7#i+zulCQU#0Bev@EM?sa(4ib(3MVE0XvFjDZPXT-d5Xz1;p)J0c~?7}b`n;3Q*{ zcKe{Tss7e<;Eq5knVmxv;K24&0CW7vX5u*&6{IrOV4$S9QK4_1{Y0RMtOvu(*7D<; zi%fc(i+$ul90=9j8cI9)?bSKs8m9$K2$s3Fw$_x7nBJoO!wHxR(jqY+-&33}n_pCz z100Cf)}}OX0%q}Wb29(n|65epYSX?pzd6wq9L5cO3&-I>%RdrzKYMB0wDxA1z!n}2 z)#v!+okBfT%@v;orev4?v=u(6c$WF<{N-P9?l4_Pe!A^|65VcTX&T9kiC*rX{;@}J zmqDWHGc&1u8uy8&;&7~tyF>@!Bi?bcAtFmv^}=Pj=x8>#c~?DG+!X44Hw~~s@cmpc z{vGzsYZ6_(RufmO&Nj9S!+P;0vu^jzutZG8OKY~@_v{n?zjzz)KOxb+tv~){pm&%8QE=Hsgv${@)=*t zz=dq8-+nBQ*0Mf=a+5xKy~gpwPIprQ(u4nrDWKj=CGaVRN&@JM=dkH@932;czjAU8 z->*5c0w=e4RS-Q%Q2TH%8=2`4E|(8=dVUU*GmgC%;4=a)8OR3Xqs#mJmpu6owid1X zT1^Fr>1_t;fhLf(;#m0KjOL&HKRdzTT7{`Eb#)&NAikkMmxQQoZ&%Ssr9ieeSF??C zveN691vRB^iQ@rV2&4OUX@MpOAz;mVyTQEXJ!H<^>XUf(Pyk#*Ed6{v_Dei?{uYdY zn3o5l3%ev)1|oV5b9fABN$feC$xV9`2GRlRqL{AMet2pOfegsHH1)}!{CaHae~TnW z=j|~6$pRea#(fU^($c6uM_Y0^|AQHsV`UuNBq~}$G!T{SAWFJxUEei(?K2?Oh#H(| z9?H?gte}W3%TbbnWI~@;rM^npKx9*tEV0ayAu>;iDrc4o6dvb{c|ZIPc2C)JS>TZO z7n^gX8jRl-SpqsumW~V{bAZ~2LnC|i8 z+hc4cOYd?s%)4USZ13<|-}caJL%JDenE74m zeLv6pzTdajtZVTHYnV0cbDzgP_OXwhlF^|n2}&Bm%!z7g_s?nD5RD&>_VFH`j;WWx zY+xSjt_?dTMQP)rrb4-mPadUu&KJ^tA=aO>UQj{?t%{!n4t$y8tc@X^<>fbI78~x7 z1Xu(=gdXR;pw8Ot{8uK+ju>{w)z0Xa3-LkpR1>NM=7J?pV;I(QmOuYvK{S?)E02Ff zEpR@LJc94iRXJ-6Vaz-zVQI|iD!TFl1%M=ARUij1FRp=kwuTZZeu(HrgDVFoCnIf? 
z?-dIBjPOhX*!YV5-14&}o>QT`_2Re59M7emE;J~N$=>HLm zf6D~ev&*xy$j62ypW&Egp5$cen|Bms?n>r-4lmAluA?SW!LR+Oc%4PZw};IQ`g&sTMrTuuFEguciPnHx~-Zd zf+*<|)y>TX!sG(uxO82RP{++h(4w-t+PSkm#C~oA*rT@@CGL<{S4UmcqV=hi$>-$f zK^kC9P0y};U*(4jgLdvsIvI{Rxq`@p=;_xzRa)#t&bZrg?o2{ok`OKO71f1*8b}~% zu)a-VlJ@PxG+y=PC=j>B*rXB(UJ5bj(jS+;CS$SjTu%>D8-ez(t8PPsatc-lSs_OUjzF#8Pj51^$63tDLMF(&mX~_iN4gOZslU#1TJ?OgRnL*HG@AtWF$h+8{AmQ3k zwSBTTTrZ|eY^Q|Nldj1eVPXk-5>Z;iFuoW}joD*eL%~n|4rpxu1|2aCjkHn>2LO&d z=s>hp59UQuXe31f3X;9w!t_;SY&p5OlCH#Qxh!bzcOeVxQ zIf>*>uJVx=#8A0Sze#Jm&`ET6=Z_pk(|zt*=`wTvy?axq0oHpun8raHbr_lAuwn(O zk8EgI%y^2yOM8Pxc_0wH;Ffdyxv$5G$;qv|RvF7c#l}P(NiT`XXmzxIO0@%@^B`Ls z3=_t6s=;JPfjGD;{D4r#$X!<1YwJ{{=ogjqS1xK74FY^n~kHy|3i0YyR-_$8%rt*v)aoc1exEN{~JwmSb(>`j*n zD|v3pmb($(cIOP*8uE1SM|b4^{JEOub#QnHXV{p7U-4>bF?JPQFVwja5YJ<7&Mg1<(dTCauGcgt?7dSS^S6>w!$oME^3 zTK|JzAdJ_A4*98|@{gm9mGhWxekFH?%%*SY9m?^Cmoe&xk&g2!@KzZWKk#uhq}|a6 zG8L63aW<~~80^SVNqBZ+?X+j5g5bG;&FI6?G40{^^Z%`jqrX3Yz?bE(NEwxjmLi=K zjvbjES2_CN)^Lb&C8IbuAN}4Vn(vH5w*?V%74&?@pQpTT4Bcbx7}d~h3KetR;M+W# znt?$;H@q6_P8>I*my*MukJej>u)&R>{{}Zu1KC*fHu;|6a2D)%`HTp!9Er|VIj+uZ zW_-gewWN9hODIw5x)>Y6ryQ7l-*veENBI79=o%~sdi^+sqK7vs6FN^rn-;#fz^@sQ zD8%5SlH17av9PLs6yvwxhSn>(<&Rzt=Zv=vIO#a-2A(Nkz& z<&!5d=NNTk`<5FU(dSmxA7zotu^9v+dUnP5J2~K;n{gnmyh6sSvEC9?vxE31KN9As8WFM=t z-8k?cuxR1NoTmWzz%jb%HYXS7+FQy@DE{iShu@9UP z#5{V(MBwCftQH+Ub%HJ+Zf>N4VsHMb3jXr|d7H42>@Ji8=K-8l>$;h2!X2K9ddzAh znrs?_o0^QgwF;)%NTD461( zivsiUy3m+PrP1jw6D#3Ei=_L(WncN}kyF`)dbL>Mrns{*d)G%=uEBeb18C}v{-nMW zUP*P=7%|?(dfnR@^>%^@8R8MKFYZ{bR=1?*p`=+YaqnJKfBz~LaUkzcwSeCVb*;bIV&29>5@kx>TtKdKskptK_zNE8i<%(64#Kt3 ztMI)4d)13ipF8&FFLAxy`9Y$Vo%i@%M#bg-3n=p3qxtP*OghtF%MGC(OZ{dw18*H1 zZ_v0-*IPv^5_mqTfya$SYdsxS)kdFv!mqZ?&Ho9tI{P)zfS&EMLs?_rc)g&V_#~e} z*GS^)w;*rlG9=_rrDwmMm-jr0zx4BviSUOnvMx8Rt$%Db<|uof|bE zT>qpquAOn;ByH$uQ`tZ3h#z3P9OTunWeJ_^ZMbehfW8b*SFxG`cZA_MJ^MogTV7S} zv1Lmo+)cLuAbg2ee^NZDXc)^0Ik30IkL4Fx_ZBBW4^%rb|C5Uy#nz{<)*f#!JXvr7 zXI>-&>b*|WviOEV*duoxI6NZGM|(2we1y74HG2c0^sVykW-(o%e(k5L4_E4Gm}G3+ 
z-^0eZMD&NPT$@mLy=y!KMi9Mx4EeqeA0KY#wF2BMY@y2qKcUOOZrD{5jCZijX7MG{ z#Q+94inBe@Hq{6KxSelwk>UZZhc;Cacz+t^1Qff=>HMp#P2KIxP2#$LY~E=#!3$kW z`NPp#^ex8X{~Nd2@2u_2wtWtc5?~U+w_~-wdqndekKgRCv3T4otgh>m9~?IT|Vd+evM*z@yR61r`ov3d%Wq-y$en@ocn}3 zzP-;h#LBSm>w}5ZsEsOErLPPtifzBVWa{V4NBubwykrcc4)eNGNF^VkQsl-t0RH>9Ot8%wwmLtFBWfe zZ#XSA-!%5wd$lp>7pKPMT}?n;b^xo}NuatX1|Q10_?%t6l&ql%-3xKDUGY>oxk{bNFD^j8L_~M3_NQDB zt*?_$SDnx5(dp^ba@cmQoA{Jah~F0hiHb)tpzipcyzrMjCax0mwC(y6vMaW8Z;|JF zD4R*B=TQiy^$$lWTx@pjpR_BsFPKRRjmwkg!0t4TjYqeSy%oB{p|q-%Jt&~sLD45_v94K;dp_}wY-6L3!kPG zNNV0w?_1=j#8lkwD^7C|1Z8e~Cmf%mxb}xum7FT}glQbG)!q#`X2} z0PCnCv>VK=eHB`~l@K3y3=B7vy*;x@PENj|YHTW;XUvxx7$C;3>{jq!prHFm9=iBC zB1J4h+8kd4->me3fw1H9fXHT9m1i204co0;%a=;xswGF{iN*1j$!|pzkepnvOO}iP z->?@(_S`oRUQ~weElz{!{QgdY#sraKgn!O5;!us58P4N9?Nzqwl;z9XcV+31V zlZlY;p2k6hF0*5J9nWr`a&WzB% zH4hAs9nT-7h+jf5*IX_qE}%+=!sH+k$yxW*TE`~oSFhq`endUN04<^^;|y81vnE-} z3$TV-46q%`gi94rZ4Q*Oy2?gS>WCN3ja=5np>Px4L4F6X`?{^_8QNyQc9VY58>wnm znn)Q+UbY2?f#6pg)tVu+ zPal0rLF{*HJ5}k*NyF0|alfuuNPfl@aDx6x#p=l!Gu&|kYCBylR^(sinCJv!R>vK$8?{*BXQwx=B_s+zCZQGnOM*TP80V_gTi?)|Yiy4CV zojN0T38jpTxRnzguk2MXu1KAM6(p~&rfT2Q@(Rp`b-bp1sfElQ9=zt=xa@K{hEz0R zI!Q>IeFVZ%Ql(NA9od~1DZY&axH!H>FU}rK#@SGPkD-Npf}s37MBJ(&BC#&+5v!%w z8N1g976=OZU*xvoLO~v(N!l!zu0k~)+PGZHYwNE@t^BM{0o6iAg|jFP7P|(1k?#h@Nd|uNYOi@@7wZQ}s6Y%BaE)ZmSQ% zV=}}h?@y?%i3kfFwG!|iZKODAX}dP3q>x0qoW0UL0iwhCpz*>5*dFGDuuM2wWw-`L zdyF^PK(%owlrJyUKI_WqMvk#rL+g{{Y=T)CEWwL~onx>Gwok5`lB1M_WWb z5Gybz6VXw^V5z@yiNAHl7@AJ6_|xR1@t%m;JF^6aI%|(SLwsMtKl8Z)9lRHC5t}||RKk+OF z1NZM74;j6_88Tz&b;SPZ_5~wtS&4m0k%90@t1@qi)sO>ka`F4$2se#I+-rS87ib)F zS<{f0ZaphuTYY<%MVz->`{KV$#6_Xt2UY6kr6(bB+O3&F^%uJvXbq1O0 zQ+E6xVj67?2*+}ZZL`hpwJkv3-r6|7=(BpmQ^Z%4o9IS-3?%2!L8l^ZMNB8dKU#;G z%z6`K5LC3_;@0;7F{T?!o;Qpi_(e8*PlTL8~b(0g?*ZaEjlt^XVP*>>vH^ zKY0wZ?KfAIUoboLRbE-I!TL!ip2A~%+zqGMTjn2gpTDqfkbZ3$B!>3Dh$uM?Pox?O z7L4|a;pH_8%VGmW+3h~%O0kQTG>GQC(Lt$_KTT~QqNBs1vY!G2YJr%eU4C&#%}a*ZdM;y;N98abz<$lNG4GOA#QTtlXKx88;K_sc8W51*KCX7 
zj;%sT)NW6yZB*PETvvZPZ^VY!w}O-`To3qq;QstWU8(0Xl##WCo~|VmC}<6{k^1-c z`aMpB`2#q3ba?niZ}j?$UnW?bMX1^IXjomAiLP$f?1U{)XFVY&C-=|Nbr4=v<7I?G zgui|MHGWO}_sG&f<< z>z_iUcGeCyW_^>o7a@;1S zSFy3-yn!d)0&Gm5F)&t>nN!$fTfqbP06>@RcSA4Uaz$O)O?zo=U|w(3xDQrU9Hqy& z-0$Je0&Nweh8qp4)+z?gJ+g-Wful&C&7oM6OJ)jODFJtHAT)EnT0@EEM zp=Sbs3VxMP#Ho|Y>R#CM8!x7vs~q6H;wPKhI(796|F@zN22q5Uf>*uYf&^#?^_0F$ zJ_~8dBhwPBx~HM$dD1RvK&8mlhae6v6*TlBETK#gUvePXZHK!YpIF3e2DB$xqC&O4 zkm>xn5gRWXecV?Ix3PbB6~<%vs0X6b#61xcx-<1w2aoFTefRDDHcdI!&%|7(Yh7_~ zERXZul5czE+r`jBUV7c$3QmVndp5s}-mdw_ zX4#tigsko^2z`CAcX*e^YqQA0G>CpYn2#=P`BEFu80}CX8TQYid~*zJ)=1C{(u4*< z5l}X<2r5$LX4L4p&!WEHD8qTrWJyE{w~FE@x?gv2aA~sq5m*1|nAI?Mc|E1HkrCYq z+$$vrn74|9+NTT{RQ*Qnzi0;hW5~;Z`BCbm1p&Mbott_EM)9!8n@(Tbn#+ zsavkhP3M$sKVAo(Jhf+A&RX2k+-?*}pWmxizV>G+oc z&CkQ;zY<6j6zS4M|IpiBdGn8Q0HmDL?GZoOC^bi_+1aOmOq2jN-wKaU9Ra$n!^M@H z_EgtZY^0_0b)30k*z-fIS>q3lJwvdQ`qmvB>l~~g4+qbSau$3Wxaa?;JZcFAlp}X(;2D8ghBs`8C$$=i6Dt|O%`MnM0R(<&HCTkD zw;vaU35k@Occ}rBqS8IZM(l)M&ql?N719w5F>uqXe`zNVRgt}(^t z(k;xatLq(B(7+V5UG|;~3{fnl%Z%?^kzYp5ZHP`WK$PH(@7m`Nt-<9BG%8q47e@8r zoY06^lZeoyr|6@+OSVqk!Sl_l_#Im-@kwJ6_3l4+HR)ccw2wz=4cD*37@9#|H%b*_ z9x=(ix6B;DCO9h$kx*2mtn^T7dIXItzH&Uzx2A6aa*TsJ(4qAL+hP`RapcGA4bMQ4 zjRrNKRvFHWl?Nzjz`?6C@D{<5ht@Ed8#0cV(%074=?(OtENSw5Ca&zRrKA7c-kv)t zDe1SyMY}U;&6M-f!6 z_^)3l;h0pvm#6}zy|@?NRkEP-KqEeejdx#t8Z9{O9_T6Y!5&%Z%}&Nh5|){&zxJXE z<0&EinEnDEjvw;zLmPV^@%u}Z?JpW?FUSP)u}M=zkwLYVzP^&YmzWN_+)#Ss1JM$Y z)u2QSRbGagnp%{uBE7y^McuXSrvFrx{aC&x#mI|b4feGT^SR{FQKN>l$+^d`%YY@6 zBBpc6k_ZGaV4P$Ko>@N~9O_c!Kaz{^lrTMyq*&j$(zosH>q`k2r@J<0C+93SuAz5}bg9-2%}zD9EJ@Vqs0e!^fs-@Uk}@MFG^r!Oii=bWSgRqT|>9>VF^1gC*##P83o+*W+|Y^s~vn z1HzJFj$33DXPsp&ew^W>o#i*zX*cJ3Q@h=xU8PA!ql$0#(ej22PshhFkS(@UoV@<_ z;>~TV#&O#_@BY1z(BSI+_a9WZx_Z&ifGLfrDxrG!{k7W7^woMelII;qe#SIq?plh{s#Uo&Ch@eII=(S?9G05Sfh?yrq!bFs-UDv~;%+`Oc%a?Rv-Wi8II4 zaPoJB`@iR?R;o!sY~q068K3G|1EmkAZN8&*u7xh1V3Kw^BMh8|{!Q~)=2-Tp zh(rSu88@`F+Q{w>W@A430&{SLMsm(Xn$wRY@uFTaeEaOBs`^^X-j`(pj(n;P%*9 
zzBb%-em65vd=o6ZG0ts#YNFH^p(W`FA-BqKrMJEgJ9D&c(jp4Qv+`W^5i66jWpiNL zoJCFrBTt*ujvH5b?S1Fm%{)k#j=;zqeM3kJS$q@(!0v3wQoV=};0CZ5VXO#rV(#A- zP_yccpbEW+!7Md(w7FktR@sE_VJr?HY=gx0m4Q;`DTL?Pi+#(bd`bl*rc? z(+$;jR4n!GqcBGkndQU#>u|O9B!LUUf}iKu_lQD7F%2hSxpIokB&;>%p~?7C;w(o! z>(P_Di}j&w^pLMs>0ZgNxZru;@^4nSC~1k))Eg8#*3bnS&5nGfiRQY+T8dE40Hft!cQ#W1Om$UoThJ!${ARTUP$}qLX2tK^TRp#VvpBm znxn%5QKGqww2uOgi}k*n;+T#)w+>dGzAQ($OuF}qVy;?>yW9?}wl=K(0L^r+Ra#J2PyEf3i?G1i4FY;eL?*(kyfi zQ<;`o^IUzc+-1w+RTfzOq)M(#{Zyh6c249n^l7+!;FhYt6Hhwdmb zC`=6wzP?leOJ)KD{z`HE{z?GyAkWT&>dOmY{7F0-KJSbRNFTb?nDrXHg@`3jb<=(J zsd_JBv*R4xki>vSFoFRT1;`a~4Udz;d-{8?rq3YV=)C3T0BT^DI>$b8wXx6uwm1j5 zUOznk`jy1;c&!J8gqd`a>l!LbgQYVMqZ-lSUFL;|C2Zi^S{)!zU{pyF4gm~zak4~F z=g?S25h&#Z<=0l6m8m`X`<~>&v6bE)z zE7OZAVPT$uMRQe0=i(>8087w^rdvsd&dYZ&d(28yDzxS*04Uxb8`lMMLF+yR24bB& z5&_6WfM(A?t}0EEs5AeU7kAcAVGVhHwqxICt8&ZAQi%Veq zV}4h57GMbqzkB|3@ZKlgc)6W=FC#TSOneV6U!cAP|5bD#(#FCevxebIW#dWjzi8CN1Q{DaMAUm)1K5 ziYCFdZ_<^nPRGh?_h)zXDx4kx-lG*d=qmeiaGN(^0p7g-tp1(#4+ZhO0f zXB!LR1pA`gW5_tqrRzl1EBWBaGnJf6_c)HmaqYL1xs#CQ`uHmNIJ#@4KLtZHW#}pd zznX-UdhhAGYO6URzDFkmP?LgNznvSfw*XRNxdd`vV0?1o%9+9ZnlhS2uNsU^9_Q1T z!?b*l=|eQLh=|C$k?1*en@;dD(}$Rpl$|1sCu7Ui3gPa8Hdqg1F@^buRQk)hqeqZo zdVxmu?);bqk1(;#8(O2;WJKEo{`YVB%Wy^NY+Q~@#HpviZx*3awq-Bbc;Y%!VG8Cx zh!KA^?-0kO5+Y^Wc~WbAgF*g(N%s|0#O~b>>Rak>hu)(>K92Nnr!?uV&^8Btx%5NZ z4vIxBRgYhPc2OA49z+m9cdx%)$|;SOO{e96?=4A(hJ1n|GYz^FXpBS?1<^Z3sjS$)bF@H8wnqu-f&oK@MA`ax8JzxBMx zU>bpndiY~|$AnzsGrL1(U^2w)nxT)vnBeIl6FY6%d#2Jmyh*&8NNoI(jo3TfGDbK> zEmQbMggW6e6%U_&{2;uOA^$S*Sd)_t%*wN&h8a6nkLER{ia4EUz@&a+pni^nPer5s zrgb#Tv(e=WQrrjUYV&AqURAf_A{!4IQP8+=AU}-5=cVvu*-J~48nh_@BKlsM?WE(I zgO%Ma^=>#Kp|Aho*UBrXw-7fs_l@zL6<*9SuV)+0VzZT`E2t})K|x`5nmr-eDJRgWVp5_U zLA8inyg2S|i`FQ9$(94*Z_Gt8lWM9q5b?BSGQVCU(NuarEMwk)K`oo3sy!NGAeUVb z3s38&6K$*xA|YZM?_E3 zO8O>Z;}U7{=w_Ybvg`KR!RsPhw6y9}$26h%oG0VLHAdH$pnr8*A5 zYkJ8aQ^_YN+6`chO9sp;ZHqLw12a4!zvrm5?$qfh!kaD$d2+ciClS+k&6&S&jm_oU$Xbhib(b z<6iTL-njk@5l}A#X1sZM#B`UCc<}1#Pn`Y@`@c{143mo;T2UweE(X85HzkI?xk`&C 
zIO<{XSjU!bQchh)GSaCueBE*_kh(2J7F>T+dtdcYZy4D2jv7dWv{aJw0p=cUmqM&w z)4IW*7c9${QhujFSO;dq5w(+fFNzM-+ZHV$)2P2{v3D?H-sQEn9GX8FQ6F-mU$aNp z*Znq-5S(?cdy8K*K>eLHMmniepLu%mZnI+dUCsxx#Mc~QtCl>k7!6B(`>K8WGR{yP z1hX~ZHkSA9Yf{1K>E^)9bo4gEgBO>CZ4r>?(gGq#6G}!{bzUg&Ph(5ndXn<3lHA46 zd)-X`sev=k@CEVjw*g)-sY@kmn}HIGS0YB?$NpKOqTN+{DKqsR#Jza%$bLL_PU~vA zfR@pFb5-`heSaP()x_ObH*r#H_nLCq0o`AE0s8Y!sHCN!EU}%k{oY%Keh91^oU4{q zc5|IkbaVHYzn?iWf$lCAg5p1CW+gJu2QPOx69w@*kSq1{HB-iF1?N8eMuUgq#M2eQ zeMccTNIVJ$e)snCR80~#7RB1XM?D7GFRm7>!t?7Hm5H?;hUN%6GbW;iD+{w(hiVK& zi5{^$9wBvO^rXP(#57^+2JjIIKOfWMR}6G*I+o3ejBjPL=_+3ijc(PD%ao9e(jux} z^yb>z+vqT{&DfjEupHVqddW2r)5|MWtF?>-Y%79T%zc-h@`%8F2||2vSJ#i4GuvZj z@J1g=e6!$0?3_IXp_Yjes7+8;c&G`^$bEgPO6Hgzm%R@=m!QQH`u*#5lMPizFlidy zEm{Q@SK|BM^72A~(us>!+}-X|tRp)i;S(DbOdgd94u*Ed`JmFC)mxNI<1h!i8@5BIvsT+IQ4N?I}JU%I866|rA+7= zE3&4f-Bb9cpMN|o3tvgH64uV2RM$tV%34?O^+TjZ!7HK=cg(fje9f0HBklP-`1~_> z3F_*s-@R*5&e5xMA>TtpJx2t6q9D^3?24jYkH%a^d!kCa@^SOf4?A;neWXPkeOHH$ zP7dl*#1HCX`g98U@)ns-ZvW7+@kP!NALa1+eUZf_R?R?Tb*$Wgpf@2cG3sbC(6C~T z3RC?Xv@hUvrv^T_0A~!Lp|GWB^kH<(PO%3cWYPC!Rs%M8e->@1J>9X3^=25Pe>cQ#?F$dq}bk@jX3-^B`^ zvs5Zxr03R9*g<}~`5o4MmD$8Maep!vx^R10sBE%|_kzwn>kAd+$vj*CO(%Sps@!4}CX*cv;m7Qm zTKP_g!;j8)XXQ&p9|%n?E8z6TQR5QRy_qUbi&g5Jt{P4^U-;#~Q(qHQdxf_8i)*w| zcESU7Told|y*#mj`i-5X<_a^V=D62>*NLRm)|VgsUw2R!I=uXF|G8|~HeHAVQxX=q zrbLox>M}MMm3y|ta*Pej`s`%s)0`m3&pviY)kLH{8Js6C|Bed}t-Oa}!H$?yWrCQ$ zunDB;%XkNteGq)LnMwu9$4I{>PWTD?2Rw9L$Y@xUk>oVWrGKdz?jpBYdxFnr+??^U z{4$%>_@}>f`f;0ifXvTBK7pZR-mmNrSsV0=KCE&sp5|s`mzM`+cSZRNvq{;_pWdM< zo@qwiBFK$1K_&yMBfc^zL~Be*vb%CNb63yHNoM33|IBW~VwE<(m(wwR>{HwoDYDp2 z{u8>=9j@@>rLF%`4bPNOeZU1|iGCl5|LuKaf0y|mV=s$zW-8t>!A>A(A|LmI=i!RA z-i?E8aGzkG3`lxMI7N(FUe>9jP4CfgHB7$N@TSTG$l1)$@fqZbO({oO3RqJl={fYw zcPZT`acUyzW4WR^uDoqiq)gNBXEY8uvuSd8e!GcnBpJRR!itns!ik&8|6DYOSXbgJaA} zyAMmml9<;-J zvEYU}=7smoEuOmr7HgExe#UX3w%F7U6*5q=VALvuc9Bl@DKwVLch45neg{{IGI3Cj~QjBqXbd2;ydR}ekZSCyc49?wn93DTI zJ-qN?lS9g=IH^It3L%ffoJlNHW4SeRJ6P|Nc%@!`Y4;$@XwQBQ)IKI5Vg)MJzgUjr 
z_?7O(ke$^<-$3}{I{LQJytB2JkA0%=>A4Eg0E(ZN=(>ZGL#7n=b4uxEH`>W!gRZhZ zozH{ce@yL&Zm}0<$&hP}0oHB#wPDeyU(hn^9<$hNphUu_zntHksc)1ulKzMlyC{$ zCY_^}eZW3h%`2{3A-)01ytTpWeZ1lG)E9lPt9+?Wp4J^2<~_N7HLMY`4NvS4jq0m& zn*xSCB)d037q^NV&u2YbU|JQGmB+R)^p5YjYlh|er#QkS^YX|{q%8Coo= zub2EMAjxoU*hy{PI=mz9E;%%OI9!2%@`)4M&6P!Jl|J|;69p9tC8A#%UY33NbK>2Z z3C{Bmq0@~%l(I~npPM!LAT3z&P(3wQS66IMn%c=^bwivjbU}(Ifhoa$WfOiON$<60 zDv6vKa%tOf6zpWiL(p|CuPF1ZpoiuGbOrt#HP|frLb^!b98kXOE;gpr9b`11)HMu> z{D|lz+7E~MlpZG7u82T(%q1=d9FKu=EkWi{bT*mlz<^<+J2xfw>%f#_GenS*1pBWM zK{svqF_KRH3mFU+!ab2IP3&mtZ?Zsu^fHo40_(WYNl4+=?IW) zI*N1z6j=KlW3YW>Lx~4Njv2>F-q^c+-?_QVX-AB(Zc%OTgBa#fxaOyhaLN-T1IDlE zY%kOk!wdEYL`2sO$b|zlbl@%CeDsO@*3N;s-FxN&jA56hu@&3E05@T=dLG)U?WCQV?QV3llocf_ zzgV`|d0K8jhA}SfyES=FNPo<>s5F~(nvmfH4R=Dd5(DJy)5Q~?eMh}<8oU&WDSOLj zpb_{_WoCemI5rSZ*7F)*5lyF}0Xa8>Dlqs&D0MuD5UhYP(u#vqyXbu)yo-pM1HFn+ z^MuMhxe-TUa=!h+US-QfHJV5$j^@)R+qqbXSDx4;1c_2vVVoz74>i5zdH&|3%^MJ~ z6)997mKfzOz&kNF*9Ezr(mo}=tFpV6N_h<>h~#BFJv}`f*H#}Jo5GdSe$>UH`+iKz zI+=VA_McCwawxz}7616)yQ}8STX%`#Jof>FHwP1$<$E1^V1gBtLo(+s(G`G=)VyPF zPc>PXJail`zJWq!{Ka!BN3LQCKXnGI&OWI%7moSvBdlf+D577Qw5wOTw@^#|9Auev zZFf2}OnvSviQY1~gEV0+w7&ky$iMO89A@kwnRt#XBYWQ!Fgj{UlcOD|WK?^Jm)P(}D;0H67y9Ix(H%+S7)o3`m%3Brp335eZJ#ioaSol*v{1&E` z2f`b|K=WNL&iHN$KWSiuZjYVqC1vP+Bp%mV4EvNo0@PLt+T4H9xLUs#lN z%L%PR{i^$M2vOTuYd{e0OqmzKs=2v#FK_ek@V#gdx~+Sw*-c`dEL_=P!8M4DDwTWAOK7a$f~q)0A>NyTlJxc1$o&MPLY4{+a&R zt4%NZ>IGi{D0tv!q^A76&j+^WO@~-a0Y@Fm6R?-F=GPwx*9g)^_UcTm`Zl;@Z{KF@ zltcKzt-4Nz#>QN+;8UW-UBIHd*JT!X=?Pew8@5$;i>*?a<_h1U9Xp5vE;hD&Yx!rx zA@jZ2%J-~lV8!DU1B1&$KZXqv$cYGI-vblE3qDdVH1QzJfBB2~nClqf4n7e92MO60 z<3B!*Xd7L+r;c9BJ5Aq8YC?s2F=?u-h~7+6i6{2klJ#9{Lh6-S^lazASotNt27iv) zDFZKfqc{3YR;nS$>TTl+C;e+z!%$$C+>zACoqQzt zC5A&`WMoEKI3b09KIRu}byRZ;$Z1Lv_j}i!f1O%UY#M3izq5j-y zclwFV1a;KTu(7r2$nmX;ijBAO$p5B6QlKOPDysU#>K}Anu7^9xXteS9-kh?diFzQh z`KmdssOM){S}(}$JL_$M@{v#qgPqH)(qi0%0X;mdpkU6ccDHs z-CQBtGx*FA@Bpt5V2}sdkbZ71dNV0^+xLDTIzi4e0wfhUAk4`!=nleQkTQ+P*IZgn#vTbkOj 
zo39Ad$`{N*H9@_0N4mDQx20{fLuPUxJ9?j_bWkZ94KvIqSH0EE-?(@BatiRHAxjut_Yb@d(%?89S0veCi`k)iO=?hJfW^Wp@ZU8K5 zHTW4_TAKDTkkvY;;N1+6U)l|b0Y~A^+j);*t)`18M8k&A)Y#07<$k#0*di(0189W$ z8`UqEu;#W8%yDW*@zqTsCWr~*()Zh=?-1HOyjg)tGoQ86yHddWo1RMnu#HIr2bMTR z^$>>!^3=C3Q$`-8b~8C&1n#Tmzg)8d`FvV^#zuX~*g8FZhfVENizAw}DmUq6^5mqB zR{nUz6eLiCBO$U>fh>wKre~>kpX!AC^FZ#A7JX0I>-?8@%_$}^{~+xDL^R~e*94g= z3Qh%NLRf*TQUi+QIa>7F4L|hziW!{9LAx8S?SVgVyR-5ww1m*YworFz_8cD=)WW| z@EpQ{JChA?Q&AWiJ_JF@?>+y}@LX^>6sb6k+=zB#xSbLph~Og<&T$~ zcjJ`|OMiVz=%kcpan*a7dAMQuEM>rDj9VmTX!AZ$dkkGPUEJv~dlF%X4Tny7dXBN) zzI|INVAHb!?FO~#Sz!@XYb6nQCm>n%Y|jr^3+|DOcvIDGSt>Sm9kYe zG2xxi7djRWJNS`F7RpOyUJbt{ zpo|&3?QsbJ_`S)w&d=zrgnpT^eL8evSX}!So6ab$g~|1*doT^Sx97r%)WQ?lVF$?H z_4Kk+0C2r{p&SULj{JmRXz$x-|t=T$N z4y-lIoL0hBn`5%r2oT#Yuy={sBLE&mqzU~zp2z6J&W|5oguH46IDdw1#VA@S<}}ne z+a^db;_kflU$SuOk`D2<*ib0?d^pqMM|=8$4;2*^Q3I<#iAlkd2me!2x`JTaq*0?K z*7xkD1Yhn<91kUfBt-iOe9rj=t&huUS^7}n;@y?W4e8wf3w@8Na%^z(3@^@LHBVo% zhZ4~(wI7jcUgAHXqgh{CmQ59P#*!XMRWA2^7a+f2f7_eB;orkb_M46XD~3K{zyD*x zhW4rNXG#+Z+F*t$7}i$%Cx@hFN;=XAUc7B-@hq$nGR81I(sa2bnDzAmSG#V^rRf!J z$}jA!$N;=7jYRo9oNeB3o1*g4X6QBF7n1*vt*?NJ^4<1E1*N2sR8*u>Kw3tT?nZLx zmIgs$B&EB%yA+TH>23kZp+lr=1}47Ozvp=Fx%XRZbm3Y$%Yo;4_ujwSE!;O96x1Nj zzHL9Yk?VJT!f6%sVJbz}#>o1GI}opPUwYY)t#(y z{gln>opq?FCYvU}FZZpkP86I|5X%+baz9w=9jFodvUnj1Fs`Hbftz!=T1Pr5{`u`D zeCPdhQBl6oYT~d$60fU6b5{>CjDDv^JGeQ!ZevJ-0J4Fxaii5IS%Ah4|?oVm^6;KQa}rDGbwtJLWb1Ma=c${N$lr^Z?}udMA`Xf@t& z);zcT>MAdYSX%;sk4(MoTuGpcp04h0al;n$C=bt!Y7|o2^V(8bQiN>mEwVVsxjy1( zF8@n@!qdzwBQoc^h`Sv_luxzUG!bc#&hP)b7Vgl4C5~-hD4k!{pugQBn$EVf9-s7) zy@lg0@%kI_G3mw=r7vi#P9horXgD@w~v?={UIqJGB)$FR~Lho-x)an#HN( zPU^iHm^*pLU}7&rDZoStc>%HM%4zZ>8=@QjGmnGjUS=Eu`aOfo`IQs+ubM>1Ch4X3fG+JPb>*vY3p0zz>XDG#@|>Va1TNO=Cw-5(Bhq;e?u0^(4pXhl9*gzG za3^(%PU65d9|u3TNhpzbvxo5=BTMQ@P3!1STwFvVFw`LyuK_%5{2@5#2*gUTjMal* zeeX!6Ru*mf9AY9~UaDrufG!urBOxysjQx=*x1wg_AJ8X13^HiPd>Ho zw5kAmO=(JRs%pyqQ3^O?g}POW>8M;MzqG(Oi5ITUzUMD15d^FsJU*OyWl7uWg$w0A z@s6Gl2cVNgU2{290W4D-tuN3UY%8F$yBpFvyDtVTyUcw{Yh-c_uz`RFVha3ATEfww 
zH*A>G^kltP-?w*bWIyB6e)GqF(Sqx*oJhcPKLJUP3t4y}GxFt}Ir7E+-tI5&`Qhvk z6Em}r6_i-nj2v0g?8G8ezDYmJ5Unxci{&eky7TVfV99MRQ@`1Dj|Cn-1->_z$G8Wb z--QTgnsf)>cO5^9I<%?WJ2*1+t`H5)z7z#ClETdtoMABEwz27P(NPHcM?b6>g0eJK z;d^tD{Wxc1>H#?(6Y(t733-&-DyIH(wA=Be$J+nBJ9JMAA74ly1 z*+wgccrhM)BtOBRGKsjsIeg96j*p!;qI8gl&4{B>7oUA%akpng41?qftyn?C6tnVg zmJLmN{E|!3*AS26p`yhbF%EnJg3k`F7464VtNkF!v2_(26)ksFjA~8@H;h^@JK;hxvqHvy2zk#+Y`t@_%V^{g!=m z1o>0kd3&#DKV5o9NiJ8`)tRZhs#M$P2{k>xyuet+bZe5SX%xX752Qp9EZ)sr{d5@r zoPgqo2uV1t)NloeU~7%`^tJ!pA@*QGm@%u|Q^^4SLxnlC(uYTq+lYO%;Q&Gf)Sb6m zf^(f5(*J7`JwK?NF+=9V;483_p5id+_9l4xxb7s$X5bu0i33gM-Q~{KSX`j$sbcX2 z4eXzmlFr5Wc!Oey@fAI(EtvN!#GyPo;2DrE{56 zrmtIr`vuI!u_!=_(7}@99~ik{Jye~LH(~fVQ%*Xyi0EfME6LWd{4_{l3t;o_1I&|z z{=tpia&w|uX;07jm0RZMxF{)As7KZCLcJ{unSyBb+qSgJul%nfjE8u~d+y5Mv>(|5 zbqv^#7QuRzu=Ck{Syz~}kHfMIeH)-oPe-;bVW-I*Bhq50#4E?a0yQ3kCfnnB-!dH_ zB8yqeS;tQ!A|uqwXBZ11an%xjxRIpaRAU@$i(?!F?t(01QF01R*jt44-A~{`FX4&K z13PDCU*?%X2a4%}y}+R_wQA7X*4=jOkf<6U_lKU#s0y+5w6 z_Q~`^q0~>urBG%max6T1d{z5B_gk>*Z5|q2jc0I)*K2iG&vmC^s1&GM(~h?p{e=9x z$o!)kX+WICR@WD4usW@--XeQUu&JB2`^)|fc2pCASUZm&%k#soPz`d-G*Lp;P*X-( z#r{V@IbD@n&sO%O1h5%j)u6+S{tfg(lz>qo7#@BaVk<={EhY7%0RncU#)-g>J_Zca zk2{ZMxB1vOfA+ZfFB54}7RM-WFfFWe!5@2a#i+34;fOFWI~6(74M+cZLy#qWbksVD zePlZ0f(VN~Kfe{IMW$PyL;V6%xbHeS);9hg-1?>Z1V3y_N=Zo(KaM}KN{7k`C!6qp z{jVBTsE!sEf&W_zQ1_m1^7||%cH#-oOD3fWZCVVh8Rt~em;KE5Q!dODsiK1nI9$n{ z?|Yja{vF!Lpf2Qi+x0a$wN$stE`))}d17wS^wM4XwPV?5lt85v!pT2zBdW(&y#4cN z2)*$_nxIL1M}T7H_!BxZ>e?6szVE$utcN%L*t?I3aJs^kqeG`bcl(}9JF>*vD^7)Q zSA-fX#n7=szMdu$(UIhI(QF%5@Yl+tE$N|n`}}87so%;&DM(%_KZ>Wg;ST@R-K?2H zWay#HP`&|MI8`1M*m|uZqaQfT|Wy*zG99b=zpFe0d&Y+Y5a4-97%zq(S%C*jK%3hPy$ibdhBEPF@Ab zYniNC3wK{D(Wrgb;SM!|AaOrCe$xF8Z2InQ{7C#K%~8&+DM!ql&AU&<;DgF)m0F&# zHR+NW+s8#&CEB*4*8$R=zdrXjZN`N+o;^HTP4-=*&;eWldkWj;)o8LXSI$p6z0JX| z>UFy*&=o&BZ0h|9q7$?kAJ)#ZXYDJl_pfDvFaDAQf7E6uIM8-cQc-OK@=okkmQVUF zVNqBIA6HpNFrJ@aP=}=AxQ)C4_x8hrpI_YC!uFRK=UXNEwU9@fAj7@Jic>lY3m{$k zqbmb+EFf)!9jD^ly8BXPm@v1n@KA2Rr;#GnZ@Vx`K&JcDty1-syn1jU#g@}cNmIOv 
zJV>4fcV|J|eXmuid*~pm3zD8goFL7QxLs+K!6^qL68}OlWPNRG`+-4e^9zeyH+f%Q z-|v=Z%ZQV_i~8%680HUy3j$$nX{&~ct&NV?kR5*zzu4GGmSJCiq^b&I@>A$cShC`C zn#I45AwX~DW&P=E)PpqLX)>=|waN*g!F9AIy2YA^i}Y#hy!&m{{N)4J`WT1tyC=Ly zN;}>EVMf%eLtT%NEs<&&p zJnqB&!>(mNqdm5&)rzXIGVZjeSgz?pW4N5Iv?i}uZy21 z@?7)kokQHu^6TM|vY;917y8m}h0mTAkxVAQgk34c2K*y%Bae5=68M`U3KT*6ce+P6< z7nr*lF$Fe`c>8QmCgXU}de~t-M^Se_rHC8L(IiRToc#{pV7Ky~E|JAMs(n{fxsdG$TFs23I&9Sx$(=PGhl2Bi1Er5Zol-9&l;-Z1>Z z_07$X1_uy8++c4fWW3Nel+@O?boKsTdfuT`TH^%BMOUucqwmvHt!evibpv)FXl`{i zp`x7(2CLj`FqvvfEAEmK80F9NZ2NtRWv=Ytd{KuQu5|C!X!zQEBBYQE?(JvCXsREa z=~cORbYN)*^p{7?oaNf!&SJ3(7W;NM+S{r`lbNYoi;gQEznIa*>Nw1HLNbLsmpa*(fLD{HDlqcsHWB^*>YpU9sOS(t zACKI>$J4|YRAtfqfP2_1k}EPNAz5sS=W7bQW|r+7l+x57!#2z07P&cFBvPEfK^t&t&KYQ+4A zjkkBi%8H@QjLlPali<*x6V4}394SOKGk)e%SLMer5bSyHp)>J(J%PUlAjouFcUH~S z>d2i7cvmXy{;T;c)K6QiHB0~USfx~pYWK*}$_o3VZl#oy(Z|jwrYdn4qQba33wpde z%m2A>2IIN;6B3qU_n3dWW&8Qv3hVW=GHy^Ko$av6b97VrMB^(XQw8C>8tH<^5Ux3& zTYdIP!6a!(^=!9UWts-(=Bdjnd`Yv7N0BgieQ^M+L#1M6Yr0Q;X{YoL(PNsvm0Ff| z-WkqK;)QDbG^N=b5L~2s?9})JWcISRG>J85=uzXtV~Fn`!By zsE*CW4Z`?{#G`gRmwayy$Z#_;cAK{cnS8q@^Q9*jimQCu(E-vd_a6|eHfHq$bx%$V z`Wy>PT(y&(K~S6oF*8fx@x9gV)$nxRz0{S zI^DiGG--C0S+1z~tKc%7b)HRRgoqbR!}8 z4O*0|w|>q(C%=|H4qAN!EC)$6ailH8a08zKD-pRTz_$UIrs}N^!`aUJbwPrv{Tk;j z)(5WdH|`c2r;sZ0^Kx?&o%kHoBIghA$--3;qT{B>Yd2R{_bfH^zS}6UID2%t!%+ZO z`R3(u-07ggnrO22M!o5qa&F8HAcH3dmX!+7!7soYKzOlYN}j)+b60_b0y>0Fqr&gx z(}`t7u%XGRTJn64REB#~$;rG$q^ih}* zBQCcHi<~Q4gffDdd{l`c3riqFLKmCaDNk1HUvuElxW&uu_b=kd8kuwSS*GXaq?>dj z;X~fNs#=qY&wR)Gegi37+N|gI@l`6*)GgGklaszOs!=&;{a1(j{$+B}EN!YqZBf2q zxhgFykFfAJ^d<#tLWFK7@a(ST&wl0^gUg+9CqH&h?eP-c>+PVgdTb)-vckjt3 z1mDZYT06OZQ~Oh0$Vs0TXYu1>Gs$oFCi}9DuC?WQ#fLEAZ=`m`*-{g|AHU_NuE3dj zEUZt5OCpMglx*#J-))}aa1uY!Fvyc~-h)UNK|-4b=xFG+NV8#rql$`~&WgZMJ55!F zY>&W6pxR1CR(_KmKR8Vf0a3u_y?h4N*dCmV5cacYVOANZDRHH70t!7OoT^VHzCVWD z#S&G9C&~r2K-Pf0K{Z@&5gA+MU?l5s5{HPbn4F49#faNL_AxQ$+(qMFk-+r)OlK@T z*KP48p0VGtT3h8>&jtAgW;ygIX?O!Hvv!;pD;w0AHL_b8bziWo%8zuZvA5eKbI+y>qrb+U-OOkhj@4<>Mzywd2ouL}_c| 
zpJxQq@lWKfCx0CHaHwz!BDPmU4aA`p&bin zYa);dH+VF+FIsDk2e4ELJW`MMXRGc4x~#;VT6O)3yJW4vdoB+%pPV zySezShEytj8-Zxb8eX{Ak5COLf#_?EXCOkHhG@vP^T`*ZW>gQcm8tmJUMt)_1oqF zW~!1jS|V_5&3I_*vz0&PH&sBw8Ry)F!6SrLop@0l9_@Rc4^aK@Pn5xE%l7Ll$T;8y z0#NSPivxqTAfZt`z93n#g6IBL89KY$1p7>gl>}X`2aI&RCJ~j zvbesex+DAZ=gHmD4Z+Pt+)d~CL4=M9`Y(6(uP;!0!2B*H-M>q>%lld+W~j(|(2~AG zmnFeHw4ZSV6502J5?jM!?j?CMVMbp|3G=IT11pKO$03bUVUM$-h$2`71y$*$|6)8# z$88E>NtYH~q;jl@Jir-%El7X-eB$)s#*65=*Sv+xqcf58$Xj8%Db;+FZ^gu0Z}P0K z5(WDdzswM1upB5@p-r1GN7kQJhV*reDkl743u(7@rZ^wMX|3lnq6{lFC@Cp(v`QVt z&H>r9Z=qVQ2D@5S%0s2)1nDY{H$!Ls$Cldvlic@a+KRK4jUO&jeN|~_S;9B6yUYCe zyN!A9-Fs9tymuK(6W(~~rEO(ENlv8T%!d-Nmp^a&y5SW$b(S^w$thSd@*zO6WXs@DfbFH#$E= z*26Y$@v%QCg&yxb65cbnu+y~Temt^0D%u98%JST%V%<8G-?*1r6%eMz5+5rYjT@w< z!t8o{J+~|ah>7!Z49m{|C)C6obR4*_W2>tu<*lD521!KrDF@f^NsSpe@%a2sPa8ElXx7%DVGDOb9dhRrfeNSgtJlj{^Ud$PT(3j(X`5SGe(!c@(x0x|s@5$(t-MFK zNlJ&t1VS3@JPth{ReO$Jc*7mqyT&p_;InTCPu2(_DfPBdlqE zy>8m#h>Lkd+TH7GfbJ66xE{aS2l!4(LS+HfFaYSnedyV$VWai`q4lY~iOXhi0|EkI z?a|`LqRDpZVXlmkbTIACWTFUJOgf7GKvqnVYaxpz)%2ixT|dTOXq;`)VaEv~z9L}x zlUN?nB^pcRFAuqsa3n;aSjo~FaSyj{HX<^bS5#1;Qq8=EPo`c?!0D7$H(kAMb1KBt zBcA$1b}j{HE|k<6+agayA@Cdc+{9>!m3XA0g{Q$k(rNv~;a~N)U*N(zZJUvz-)TY@ zXmok`cp|Fy1LdD~xBzyUTc?1tr@OmbV~AYBmUiw*yke}i+(x6#|Ej%-PcvoXZB8Ba z?@gxJ)OQUGU;`RP0NZ=cFBWW7d-*K=#L9jsM%Ih#=4P(R_df;L`QaC#=%j7@s-35= z0GX$mYYOn_VZ4NBe4|h$mg7@Ja`mt&zbV4hjtdD|3s7Ul7bBXsv!OGwqzBin&XMb~ zdHa1ZAbR)Zb-4ANNn;w>P0*&o*xa*@=PlbG(C@3xOm}`(z)6HPR|DNm{M-Vax3f^$ zZ0)?ygP6>uRc^^W?Fv0vsukQz=ER+$!VuHsec z%*X3PlDDTTk`quk?W>fRR6n!nEOhe(1Nbt;Pn>tN_nY)v)^vs6Y!Kao&e<)uCr-U0 zuJmy~^vGISc~p1#8W;dY*t9DP3Y&!1TcY}#fzn8Tb$ZioF`V>{EF5VCunq@XtDON6 z$)nZ+_CtV^pdP?;mGn>&wwDLX{dWjOzpb=+ts~sXelN4;LIcR+9OQ=c%6s*WtE-z5 zeFxu9NQ_C|vzn6hB1CwKcLJ3m?EAKA_68J9V-!CMNcd!TflVdQGY9@^=YQ&}JB zDl8HT%0#hX0UlC-7BLWoO#o@6z#0$;Y8?GZKABUI?`z+jN7INl8g&FuhQK*{)lMQd zmBmzW7j*UZe(eHHbo#Y)V19ozFW=iRzv+SO);-&7;8FCPPtfcqEmqY2=bD!E+1an^ zLv*P2;|03^1 zB#zu7)e<8;MjfO$@2*3r^KkV3{a%RqZ|@C~x4*Kl37Qt_7QSUwtBOb`%@4Q~+7cpL 
zaAuY!&Q76i=)cbvxBeC@emjK{bB<&L6lj=llrTTT|AHc1ORk zlqpoux2eYke45{XzUaCJGjylX1A5q4aHR&-BkCgE^4qIToT=MnRXbxpFaLTpA&ju9 z@&Ud=S#ve!9K&i}VPO)=#E6vppu|+ricPKx}zSr*?%05Onk?zwCuRDw%jCJ~GhK^aYj=$$aS4iG;k z1;xOw9##X59N2VL=8bvkh#k2(2{ZM~Dq<^qDz>OB*^i5aHPE`1jP+|xI`G~7L}Lg6 zcBpgpxV}Dg#)5$ znpHi*mog3z03njCQ_0)1^QVLXJh;6fi(akbWhaF6ubR^S>)r z!7@@x{u6&193&|ncZ!Gm?uTUwB$#%=QTAJ;+}bl=wM*Vf(D>O8$AcGS!e^@2wouEt zu}u8?_tgR0;PThVbt4??h*z_Qik7(noA%gI`EOf2o+op*B>>$e_U?zwcZ*-u*b^t( zE&8K&sMwS8sE1r$R_Q<#jYfFf;N=?SqUQ&new=j~*71983WkLz>Gx2yA7jv6PqewZ zyJm@_a)$6Pz*wI2#=eH!KSsl;qJuX>a7*9Mi+ENW|RXV*m-3^0Tk=-k#1(v*8 z+uL+Gk<@$eWT*pA3Bd|OABJpLfbuaNjlgNA7f7aEC4@#TN1yYo8hvSSc{uMSb*j86 zQuXF453BBLy~$6Iv7Kum@`gZ#nmyAUxpO1tb@i*Tx*Ajl(U<;gWGwKmveNPyI%pR4 z*V6l^9UXUX$JQv@0KPF)75)mWit6(GlxPeFle_&GelyZ$*x7sS4|ZQVp8~IR-up1h9i)*KRmB zL(raRL^&sBuP<8@#{1)oV!#xM{wu!jKq^~pVIdSt$myNnJGU)7Xgp_cwvEI`3vq5X{=;M{ZtwC>En2)Y7EZ%9b@#7{&Bfm?Vo9i5A*KUc`I6FV%tbMQaFow;10 zl=zh4I8lPt2>-n) zGThZY>L)tll!k|6=&t0tGy;cBwaAN>Sb;K+`jk&r?LKNux}7u)&zL$oIiQ+|9E69L z7uIE+07@QoI!kwW0$mBl01UPe#3x7&&@y3UZJajGwMPL~hPB2Ov-Z-+Nb_uK;th|1 z@27Kanf-K-jFoo3Zx}t4coogwR6-x=WEvH9j+c1(O1Xy<5kOEl#d=s-K1G& z`^-}M=QtW911+CAv#Y{-3qUJg?CaRZIt37VsOTi~^!!}t-6Ot>YOT3n9=`W{eC}Yr-e@9720LLrG%$#VISCePqniL* zOd=b+7OLD}!oNDWY2`6QSyZAabaaI(x@Ck1+0bAXVq?7@#`Y zJK6^wTvr>G*Sd^zUSEI6mvDKRY(AVcJvlEFzW*1*|HCB$oWl#SdewJ8xYc)F`pTSa zzDPs#@)q+gx(Upe6smzO$Ehl`35;ou_gm(19fOSW_}VEB&x(n?okj!S5kVeCvHZQG z4V;mea!NEbQZ0l+$(zLSv%6BX#)KRc| zfo@=ae8m>o-T|e)O8fMQ>QUKbiNLZxIlb%X&zyX<)UQxTEFBzpk@ixkT8x%WZ7KVF zNi{FwA2~i&F+_<}OWsIV7$FuY$ZHIJb0|3uhu_gvGoL%@UpkXzULKxSEtQjQ_?73iYQ4+=R{uAq0X8-jTu|}GNg+rzOw${FDli(~Co2!Ro%!$|W9DthX+(XJ zrTd{78imBi$3Gr0jh>u?5;9xIeKFp+`@o%xvbVZ`1&FiAYiSYLx%zGAiC2EkJ@I>% zl{#8`^D*ujj%B7N0h?AuP8_4@HprIaCBU4NTyg6p-3lY_2CkLI{IFr)x@36NAc_c8 zXznV8kUIh01~cyBN7W( z06!r?_yf1-LD0%6A@TNT_V~ROv@fR*yzrTI+~TYa-=5G!x8u9rz~K%X24iIKMTYax zmnEO_$;~XUf1KR?31|(YM*fiiLeZX;ol9Zq+cR>B7ZipAr(4Ae>!+Yo`u~1K_etl@ zSaEZ6@17Gkti7%;4;vJ@$}l397gU|z#vdOSA?9NWnFHU_vq6Whzvr0p&UtZOFH?&A 
z=Y;(}XCNw=>{HjV!()R~x$=3F97v2s?cnMwy4)%yHRG{gxU#Yj$v50)rW*b4vo^hN zM}zQ5LmBAw2?@V15_YcibPO|#G1je0_-;!3ZYn$Zrs29+6c&_v&Drr&QByw~Ub-n= zK&6Euw5AX0R_tD2$VB8WsCE`usLi4OobulXy;E62Mg-rw_nnyCyYRISnQKoPN(v@R z?o=5p%%o1JzL>te;ri!;L-JsK*akC!Q3P(>f$sm-0tj@dYL?Cgm%jMfq@t~Hv+$aX z*pa4Gq((I?bakb2eaLI4YZ#|UrbWRtQ}iS)+-eY2+DD;*WH;W=w(k6d!IDSA;flWy z4k6BC_ehaDfR>K*%gFlsr;u)xM?-+MUPR2<{wA)%g=mW4>}v9usg8HCZwXv+@>p!Q z{L(<2g;jB`)?T6{&wb!4pBnF2W>J8~zqPZfbygagDYQQijs(IZwwr59++8&gytGyWj9P=d)Sbo+c-JJHOvi_WQ7x+PL9Ys6?(o(`~EO(q1s+(Sl6QjbSXiz0n zGcUq>=p^soW~DvcFP^k{NC6_VQ~TQS>&#;+yV3M%FHE6Ka`G*i4igu$PS;DAP>OBigz3=+5b?6U>UZbuK6L?G5|`yg+Kj8ad*pihGvn^-3wFc00hX$mdR!UnoZu_lL&v`5=!qs2-~bu2IbOzBRyXq0mzR(7RI7I`X^53{d~s_nBDbS$L51hw z4-Sh_ZPcTLSnuYu<&6p#_657ay<4^vm-OXjt+1Cxr^Tw(X8=4Y3`F#C_~piFH9B^G zND7wJVai325(77PbID~q*K4f{(Nv%G<`aP$%_A|!oBnmCP!X&xz>`-Pr}ZjOE9vBi zX>^VG@EiQ|t7u^1gfehvenBWdrlnmd|2~!f zd7++dv+GQL*oyK{0>?2 zuR-BgTerkiNn1at5+E=0Dz1?!P?J z*n00iw%#q^8*+2lkq!Kcrd#%85&+z$OnnqYEK_viz77a4K->H?)p-xNwk-V~;);*g8Gd#FjDrZ8EBybij@|E@bL6^Rw;nJ;TNcxMr^f$0J{u>gCZ11qZY&24+3gAh##=)%Sda_eV#ws61?l#e7O7Tw=@W`3iAENEAl?~Rolwy zEP$!X8(#BpuC#h8yzG9VUwu-IzODvF&KKj5M7g1&aDiHrnX-)`$i^bXZcd{d^7Owe zQt@rDrggpUcbobpKiT`TmOn+#*tM2~5?Fm}rUi1a8oIN>v?))ngZ-6PosT~24(;s>G;z^imrnH9&tAh_o!#+p|? 
zU#r_fr)p@xmbI?K;%SFDN2G41MN%#{Bl(mK)=BZW!tC|mPXrN%naE6ImNrV(Y4K8D z+>QJYV3997Yj->5S+Gr^cp(Y@MW*lHo7ShCP-PQjBYnGF)Si!qkNQ3nztyH>*%d{Z zKf=|Y-`9hyP#rhS85;Ek%3;C&i%TJk$D0oP(>!?a($6x%aX?a!M%5*;7<}`->@*U4 zIGM{5i0?KP61Dd|e9?*>+OHmoe)UCN2-)$?JD+xKY8@^9O6TpP1xdINYf_yfQxN%9 zgz!VSbsFGcn{(N?2O9Wfk9Z71w_i;zFC!7K+fU=9b+cg6Mv$~;`pD+FTBAMCV{oY`5>-+HwHHIXp|=|X(yR40c;bHcm^v$oNkHT9-y&PAwYjVpyNW8 zUa>Lcab@*!z0H=s=9LmOpXV9VZ-~;VU3!WMyfLta40JsS0n{wUm~*hjMq_%s7K*50 zWyK*$8iC2>HlkZ!wEFH4E{zjHVA=34GNSE~%zn?E?EY5xnbiEzo#-BH0L?2bFE1~7 zTw#87xcqCeCAr8U;n5%}*YCC1MJZ@48FZ}*WT}0Q>w2P@Wa5Si#yF`rTut7O{`?E{ z^_c597ImaB8dhKJv|rgU<+ZCaMtCG$1hk6EqA1J`pJ@lDD}(n9ZNEy~kNa$^0B(d= z>V&)yh>@ZJxzZ;ax>JF#_qis$8k4&QiHAUL#l1kPWujQNfP@ih30|T9Id0ydHcK16 zsB^G-Q4M(r^rKp+_IWGEn}_JJ28SE|dzVLa0_d52w|1ZXciuRsMG3m>g$qqWO25Zr|72DoTnZ zxwWN)E=U8N*q;j&QazMTTu6p(d=cN-pL=r3XK;Mb z9unmxi3&V*sW!J40u~J-j!M%|V5{~;uzBm-3$ZiuVS1>okiVS6qRC+D>|uipb}ZzV zw+2rC4+?Lgr{D-Twz;3i9|f~kQLJJB<-D#uY@ z!96wW3FI0gWotO8qm5g74aWx%B)^Xja$^C+C)%?ELVE(?7`p9ON6`F9d?u|DWyB^A z-i`EWQ15cMO!-Wl9&n^BVTT<^ntOzgM5B)1qbX9A-pslxNAaTMC29O&Bb*Vx0yq(3 z9p}jfIO#wxha{>eMk~n)eTVQE{C-6i0tVq0B;wNW+W{B<`GW)x-#A9Qg}Tt7<-bV2xBU24G{Ypo#gcZh7Uj#eS#K9{wKX^>B5dwaX8zh317z*Y}Lt<8@g zPju^H!#i{_OG~)%KeLNbNKC za5yi*TkGgmJu|w{E;k!p&TtbTg7qC2PPSP((vT+Ymp~b^lbrHyj?Wq`g_5x!_$qlF z?|dC^orF?;3MI=GzQ?cR@_RI8|CTiRZTa24+3y00aPdVccc8G+t$Jx6H7pz?IW zKGgT6cE{m`zEdxGE3gu2`{j`jVR$m9&{JsH;C{DwhTazE#*gfjKzXi~1mLc;RdLqO z+Pv^R(}rY1(MYK2PL*ZNtniKIp3wJhFl4XV^S~B7{9Egv<$Wr9CHCWKXGM&)_#P$m z!?rY7_Zkt;WP#lVwL(A{jC%m*g6=y*PyV z>`>!IlnAU4e9d4FqzwapTo>P28>gLU$Osn%5P_`ScUYxEp^o;Bj>az>hvW=*=|Yc& zv<4}%F+yUW=l^Q#0u>|@_Yn}@2g3S%L+4trU$h?WK>L5!owiRmDhUd_Ssv?7Vkd*j z$1b27mnEnEhkR#}OgWTE?0Q3=K!_*&c1wa`P`ei}U^~^gcE@$YuO&GC>goI(z>~CI zj5@5%K@R2`kJ@ia=WO1sWt%5)=zTqwr$aKJegQ(*6{Oc$jXWEcAt`(ehD^Ke~Jq+Rt#hTKAIq{-sa5~6^CR&fJNgB({ zOjIfU(qS{YQ!5#zKO6tFGRaC^1lhrj;KSzXstX~eBA=o+*b=IAOz%exq+DE$g{g@} zAvV@*WwqdK5v?pI@z2&4Y@t2%bM2oaK|F*}+vUnS<`0*=XpHE8QP8s+G)LI3IKB{Z zKg=mE?)TI-M|VE*MPK{pEBlfFC}pW5 
zxdpY%cIbm3^$VTK`d9pqymz0tA1=jO2BZ9Z#V$dpdvs`6zYY*)X<{^o~+W@-1C9~I&ue&um*B+#Y+Iny)cBS7g zUX@r61Ju(1Q4j3|bf$uCGuF(2{YbRC-RH*Yx|c0TNpq?|yiGNK2DjMBBeQR71B9H6V=ZHju|f0YFBA0(a9!ALt~-vZ*&e7|ZU5po5~1 z)3n?AxT0*L88iiJ&=}osy5FhTa*Y4-4F>&Y+y<3><7Ud0Ek~(rby^$KbN!5eFD(F` zbu4#3-F0RlFVSFYJx#1$WrCa*3m(*IMzlwDAmmBb00HY}03Gb{&E?DfwyUL`4Vk?? zp>K3Pr_nIP4mxgHStK*6iXdygJ{)=P!+;!~Eyng(`ta{yL``i2*_muHKGI?Ize?Mm z0vCvJ%tij-OoCyv0jwvju<437)hJWvjx!6v^zhv&6hg4^`jk{mMxWmcX%P^x6V zxwn7UShE!~@^6i#^){7J-(aF6}&V3Fjhy&rKppKxZHegm6agWaX zV;1t>sZuYI{lWvKoc3S}DkzFR7^2r2tZ>~`0JiVG$#9Cu!D_8C#HKQu z406%Sby_6Q9;#zt@Xq@q`|yu;Buo2E=F#y?$BkDzd>p#j`RS2~kz-B*YN?8(Tv;TN z#sNVD5D>y*H_>q@wi#CKwoBF(mu@ee7I%X+2?NXDDpJCCo*^wq&FMQ?i}Fe4k`*|= zSDw6Y+c!Wo>{sP`P3CQED<}VNm>;rmqFIw#xV7lT`dCVa702HMOM@&R=RpPb&&BmB z$Oz6?wc8#kC7s(w7n$bkNEYLJPsv|P5Mi5}xWb$5LlH5fH*+O7vff2;hRrbtxTn$;x_y>F+IwOsbW5hAR z4~a2PvdA~IVUyi+a02Zuf_Y;cDhuarwrLKj=nJgU1>mB<8}O%_Gz@R6v%K;*&%i#&0PdGCu?)@bJmNW4O=f;t%MV{6^c|U8DDdF2 z^`TO4A$jcYrjrDk@TLJ{`ceQ}TDpQNPOVjDt)qbcAXcFTO{`3hX7<+hGWOmKg~`cW zV6Ewaj}qZ|oqkca9U(Q&a5=HFRM6HJZGEYFU2Dq*me?Er9$+`s-FAckcx@bXr86dc zW0%BfC|9q0@qwyXxm1HHy2EPiso2?`@)+x=Uo`gty|Q>3xHTnG8NNnj+Gx;dmbrKI`)H6O^FCW9EyHBAdlGP=jfCDjuV;+9hbfl&i}vze{h0gDzH=d>i+0m+z|0N%Bk<~ z@Qo8%(RchLkgrN!6>SFOndE+F$WR{&sH$?AwB}%KCsq2^oa=}0)~dh%w>(i~J0O&5 zazUaBHeXWi z)aYgG)fx6@fLri3O^|!>gm`DHY|5%Gj8v{JlZw;0VtDS}TJ;qV=^}&+7o@H``XKc| ze6PaKK;`ZKfh<0SJq_lQm5IQx&x(Ba!umxz@GR9D7WR}Xp?=dFQ@dQ4wJ!wcz4BuH zTV^Q1IdKIVaXq;jgI^x-+djc%&4Jr_a`O{;!wng63&BLXnp)nG#+lw1ZjlM1VRuW% ze6`;z__UW1x_{HvzS7a8dx?@viL7qw7Z1Z!`xw!TGuGEQl*V(m0A(6*;G^+AqXF+A zk078U^tT)M@h3t#^AE47qHOel+b(1Cmb`+3n0e0_ z*ILiU{vTi89o6LetgA>b(u*QZKzdO?q&JaXq(~=%^xlg^f;8z!4ZSG67wM3QfOJr* zfDn4<5PEEduyV7zO~rB{PMo%Dk@20Vt6gMy{*4;;NL7VvUt9?G1nrYQ zT^WHF zrdXg$rg_~1))@Z`O+)S&VG?S814Xb!4bI_)PjvLN+dsP;9=o@0-d{Rdx@bVIj{5le zHTSLA2>nkqV8<|AekvPv5HmxIeno=y`ZmiL7!A3k5G z!eYV4TF{B4`7~KN35B0&s-DFG!H?rCNg!@qyZRG^FYuCd;F{&X0ZI@Sb=fdW)^FzQ zXe;o|Ela-{C3Xi%r;#r9a(DF?!^hs&aR}do^{W4T7p~C6Wo<6RNko;YsmY7O*V!(vGPTCB`! 
zI)n1Lp>s~hYf_*Q0Pw(@`-hk_-7q-+8+QM`o47PR&%K~;J*8iY0pGXLoIV9Tx#Xd- z$_6LxubAsKQRwm>es5KL!AD!NVIQKNL|e)ZopU+jgC196n`J-W8rj*w7Hbc!uv3Y- zMh?qfT@H_9Hd?1rTtQhSo!60!0T*QA>t2aAXT~=Yjdmw1U3gfU(Py}bC- z@-4Scj-EYid;=F`UTTQwX9ner*kPM+{u}`2dC&%b?7n19;duQ*Snb;e7%+ytg1?;; z2r)4+0dM@>R@bnZ2yQh9`4yfT1}CT9j!YbOllHWm9oe6ZZf}=F_HxO3implIoq`_` zVF$nyiGw!mB6rF1f{)dh-(D&-yMtn|k64jb`RruGH|O2C#7N%abcuOZiTT?(+dtRm zO9glmkg}&j-w1((UR6HcXOWBPCkK%o%enIlTfywXD!F2q_Ut?(bOrU5*Z7GzGB=0nJ0_e8gXjY$dZF};pWgusmQqSHTid07i zB`W17J&NU`nd1I}_El#6Cic05<(1mSjg8)l!?0eCDvOqM!1X9>nLZdnKiBNbmnc`C zcIW!yu|Z|MgO#}RnDJ;J9Po95Q$w9A(?jo4q!=CjH%UYD2m4QAP*Zc}P)3FH6Q%TgR_CeEg(M^5Z|-Fh5a& zS4}pg=oPk%=#bt<)W93k7p1H9l`N=D!C72wV7A+e?`iM&D?MQ8CSmZkytwLt<~Ksa zg|2l=Zn@aMJo7i;o8Emo=Nnw@CY$OJA@0of{o2fd@f#veWWpExI37AD?fHK>W1P!} z_r5d45y7-ppMsn2Wx5t-`tql$KV(oV*}nk%#UY;~l*2{eb0E+!U51k#H@SdPEirtv z3O^oSCZFW8+$j^n7H%HaR3M6>a4ELWdo6mcnLEH6nPQ#~5Q#&b@*@&0X#xQuW?q1P zB}&Q`>7=D?mLMYTrMqeqtzhXLFlV7zuribd&3p# z7g%hg>qS|!JuD=rAAYlyN3;t4b>4#L!)6DmYK*UVYXYV#)!lqkujxbci)hE5t&#q@ zmZlzgTVD#jz0`RN`7u4$jU8t>jJ>34@KYwxM+y%k5@O60T&6dHDTLUDacnh1Or>rM z_wvo0kTABAJSM%%Ka=Y_LumtjhH$6Y9LpUpiG5nX{V7%(f>K%0ft>!&q&PKFJ>A5# zIvrd3UktJM@H+D9uE?Po+e^5hPcjdLMe8?f<-$wV<3&?>wJ%4BlvTV$eXlDfA9?(K z_fHkdW*M6;c+5eUwM=LIG~4EtW3#EkN2x*%En(zknDP`2ajKLMSwS)7^i4xVc5}CP zZ3)XSJ|8X?7vJ$>8@<15HMJ2dpY;s%sh|QKP6Zd7)y=TR+P|NmIL&rhhx;~yM7?PJ z?eX-#G)UjT9y?JNv7F8(Uox#4<20-N-Grmr)N*FGFaL&LPavll&x^Q}LUlWlhIHRj zS`BWvh1O#_K<}Ez!Gk4)cPwdZyn|OfP+-}-Y}Q2m{KNUI&l)&%sf7%0mn9rxkaWJ) zSlzqfflDUe-PO+LQXNiYZAO7AoUIR`gzMPvtNH2tH@szQu0VyRj>QfYr(YTO;rlf3 z{wAD?Jy?NAbLU_C^5JDvI~i%*;I5vt$P?$mC$V-AZ92BcdYbLI3ICMc`{)&@grm(| ztY20&m5u5iRzJ%evRG-ku0&cZViTS{?Tm;g-@?7^ljurWz@qckw&7I*_Tn}xw6bfwCqdBt%q6HbhO`07vQ7gCw?7*Znu+O7x&VG z1xs|UF3wp~=9OFEtPRbeO4aF=86gYi4D`9Cmi{A2_!&XG5 z4uzKq!hM_PWouwVezD#E@cKI}z55KiLWZ$T_sTe(ZEXk%qX^3%GkCtmgLIsU-#7<9-KjiRvODxX@R>bF5eQ*T<9!1B5xODP9VVB+tbGJGr6?IQV8i z7f}h%%>zUd$)7g1nb8WrFqhU@{l9F3a}d-YP}3*$edHK}aEoiDhLr67jh4L#HcISx 
z6f@`gS1*NBx&&8|D=pylp+m|Ikd;}hQPsugC@GXpDr)_lPg5zlm3j)JOY5($7UfOr z@1Ql94>mxXv}cZB#dw3`;-2G^vDat`6JW9N>q<7v73`C(oM5h2{wZh`NF5GVY7>Y~ z{icbbMz2?^Ui%MNuPsDA&C8P0)tSWV=Nz!Oi~Ba*SwJrDfI ze>7EptHg9nWM2)s3}a}Xvx-HMDv*}(ezz&(rkma7%6hiwU5xt5r~Coth(*&Gwob>(hS~3?8$p=}mPKH>X)uPc&2);Ny35gfcQ7lNZ0Hqiqaej04gC zPFsqV{8*39<(Z3PaY*?Mczi^M5Bcx#@u{4cdU6Qg$9Hqi0y{bp#AvUdnz`q*nrm+s zqcfh${T!Zd99XaUcko!tz=437aK|K@^j%G=O$}3<2Td~kTS=@#A^g2b!jO_j+mJN7 zhN!RH08cK;2*jIKs#>Eq7=PldE=llmth$x@9!)RqqK)ZEm=xte(K4eaqMQvKnlHQ} zax=1_y>xz8c*2^Lwj(v|o27;1Dl)G-;Im<_%S52X!b^QO#P97X9=HNk^3ND0!4&K4;t(5@AX{ZxuVS%Fxo-a|7ZaIxzP!6 zueKIB6QV&lpDE)Sir=cpVpZRsGLpN+8`_6+@Z9*bEz@FM8HHa9rNP(8g!_ii#b9Xz z%lHn4@`nF>2p1j;Fjw`c=1Uc$!;B`8`}40HRs;Rpk;x~qS1p|#pC-Rb`Cy3FXUguq zj3xk;1UxF9T_#xg?_Kc}7QWODTeH$*4&Fb;rxYpQ{ZGC- zdPVA9w7Yi&uT!;LaqksgW(``bwNGgB?C(nIL*#eDhi*g7(;kXy>BB~NXB)L4e6@yN zV!s~*j4p0I4GhjOu9bixr;<@M=ePV7ab0J9)}5CZCO+Y?pxa1UY-ipE2Yjt>l8e2Z zZAj7U+Fx(dNTzfPy(pi>f)%!ukIki0|CtCBFx#=zcQ%V5rLTc`BqMjI5|TlfD4h?@ zHbDau-RNzsk-+)*x)7UUny=j=yvhY|Jg0+tLVcp8;lV0_k%hxPkrFn*b7S8f$0=uD z3MG4cf@vq)@4x;Ono=&6?^Qw2Jjndp=l^8*{<+B~1iXa3WL_sJ!hcD^P}TZghwV!T z4XyA_I~BEgX_oSD5h3wT0;qdFcSYoLT4!Rs#e&M@K$EgiivKEO{rLwxmoL64QVJOg zIEg%GYs}0R4gd5)HEwWm1I)epB3gHC2lfe@=i;o5Tuf+oqcG<$z5;lcMkO;=O$ger z)BaTrqeZqxry-WObV(%9G@=5)}op#t>L{ z3uD~^D#cDeFx_b1ty#lfj4A$Jmj_)^q60E7Sups0xj3b=)oOW|QjIG)OGwh!2^@G# znO?)kPaY)xSwRm19k*fT`4g3Xa+7JWpL2sP?j7;23oVVl3$L18=DuY*&uTIJ6$gO6 zAH8D}Tt6dPW<0XP|6|Di@dpY8R;rhvnOoTNqgWpX3)8yP#2zkR>z9dj2APDN zgzYlW$aDVMlv*Z@@;!{Vw}W_(A$`Ld{`shs_vCKagZX=^Dji%m&r6$THt*y`{=CocKx%9jhF@RlPdrXt=WAQ8X=u;ceu4)}GQ1Kg>TF?wEx}&EhPpN4RiN&E@*9CWG5R z8Y;461<%rC2fLS}13*-C`{1~??BwwubObdZL=5v}ga_;~n{Jj@o8#U?ocYT!`8PNq z6#0c`Ywgqh{rm&qW5rKuW6fwAZX+~&z7^(rV~*<7{tBM{RM6&Bli;~8G-~TO82o0j z!|U4F4)G4fc&PvhIeSvCOe$F{$`LQUL4(jl3qJdJoK1O9^5!~-_oxpRdvywy%W&Mr z4BUL7Ik@`>X~9#V;f8zR$x{seR4gwi!9<|GB{g zAbD#(n=9F;wT|(%$isZS?}>$L3n*hBElk0pwk8T>6yj2+l}q-BVaU=wI(dYITB9NXBlDp{zk5dWYBRWbBKKG#BLuNaL4fDB2^Hbu-)c5bO5x1T)uwLr$zh( 
z%fMnj&CuPCVm;u0i_?5*2vJ;geWzgPSGJF@YLzWC)!CTbar5p{>f1F*uaO5LWI<+> z+t@edYqb&I&t%KV5%fU*gPYv(^J98(SkEXdAS&#GGwV?kC=fd$l}+}{%*cCYzK_Gz z(c|r$TrM)^VR5TIAB8seJsu!&6V&bzK-aJV=m%DgOL>-w^|krZKh+AtUI|r3Z1U~ZgSjb-T2pe5(7(2G7siq<6l?qkG_bVYm~}_`Uq+m3BKOm zj?I9{kbE8_`U3S*32_E-Dmpe?NB3g}&Y{X`z|QdIHRE%}-=AE57JP9IswDKHn#j*C z!22XPn(;78Tk)z<5UnGM(ptHpM+!-^AW(qfcV*K;uRIr-1eIdP0mzG=qBm&9!Pyal zjH_S{7Mns85%G%`vAK`VRnW2Ha=*oU1z_*u#(^8!%z@T&#Cvk0z34ovs&C`ymUsr6 z`FXi6Br72Uqj@`MNuo_Ze z_cZkT8RmmlAKtxO{%|ki!FwW&C>iBTs^eVUV4hFEq|x@eS1e)q>7flA0da)fJv}6N zw5}&}beZgHwsEb05xe~P8{x0JFdhf)E@iF|jP=6*@DA9Qpc~*z_|ua{vZ-Mo>Q0i| zyaz0N>YpNkKc>m&O%5%UDQV???fB6OmdsbQOl#q&<><=S?L6`?vyY}i>W9Dvo-jk5 z2>{4NvwM3i`0v#@H*&x=n=;TdyNRg>Y5BH83H^C%&Y&9$KWW4RLJqZWCfV8>i_muh z7L8JgP9VE98Jz=oHVE;Kx$T7KR@~#Ng}P+_n@uqvp*(*dtg!2laW$#K&*RXa+3Grd zkrdmT)R?eL@Ra5a=?Qquwuf)Vb1)6=b=5YUcpqKiarxtx+|`WI?_)3~x?*^ow=Q5# z+VV`F-n&o7+{YT>p7!>0x02kiF#uMM-{}HxX{-#3t&zsINg{VV_uWZnTCZU&@IyHy zlglQ3T9mMTFg@5Jh|RryC5BoXR6TOs+a+~*;|>ga(Utk(vs(Cxph-^<2EL%pjUt=9 z|NA7j|8j>VxPNYS(Kr7|;%wKNtvR~nY*G@VjK)3g9w+q*o2zis`&41SepcH}^GO5z z7HUOh6TrhMzZ_e8r~$i0H)Z@h)`Bq%^;(`*EL`x%1V=x8B&g0m1BxsqxZIb6(QH3{Zy}}^3G}__KdMBj0wgn_)GJPE0*_iHu>?^G=3!;mbkVVJ3g4T5e(S=L* z&6*Ev>t=>LUkQw#lOAnh_4dtP%#9tArPZj(%={4N`tb9_$mo9dUJXIYnAM=(Vg=JN z7DEI2ELx#d$60jH<4dbMcitdkW5{AOo~dU@ixOdXE$AmwAiI9|XV>rZS-JK4j{HZB zbZf|+K^w~tje{Vuwu?z{JmipETyu){M}^hr>6)#pfzJSbiOQ=hv4^%L)CKDf9?y(yG2T2eW- zLObhbGw8U;MS~fA^?l~r50RS<8kxEF--LmfENI6dTRxUSD9$Q~$8^TI_=ha%;KRF+ zi|LNJ^W~b~V>N+HRUl-hx#!{|ZJ8h>QL&uM_sK-rVQ4T?yZJY=Xu;zaR^2%b7lQl# zd3KA=&DKePYwLGv|HIWF0P68V^WNBgF4W;}cC93M7}oi|n6IPM6&-sMy%zTH`?~V` ziq|jUeIU#_?U%=WF9f@LM6b|F1+~yIe^6_#&byWV-=6TvHlxxQC;}8mY3KjSq}k>v z->Z?0sp1m}-#+Ov$v2PbYXM^>&U8OtswKw$*KSXV1ytI*z@aSY zTXa2mCfqyG*Z9Z!*@p-5LKwWhisZn;!e2zv0P_~abh3^hIj0;X)re9v8gL*vj5ICd zY(v!qXC?vhXnh;yl*AaGl2BW%uSOo}BhK2m@oYDnp`|o(&-vI~Qz!C6bJey%q>$^- zL5)G2DLOUR?Irr0rHDA%%fjVR_ND&yjHG*ig@wMzSI3*uR$c`1Dk=ogY9(F&TNb64 zzOEviSs<3x>Ey0?x7NI#Zv_J@^{9Wi2n4lJf+HbL6-;y>xMdU!AN8h=NyVM5p(Z=% 
z)rm^H8643O=7?74xq~;#--N~|xHzx3*$Dl7Ts_LL$7pvkc1QWj$hNL_ERXBL^ZS*4 zq8pUAsT-f*{Pe1doujIoetPWKa~q|%D{&v`=O`NHPdU?ENqfih_=d`Kjv?Ge?Qk-GC>Yy!thMG zx0%3yvPwMGCp0^%(CgE-&b?>Pu#4A^wtfS7=R@Q#Ka8MY(Bze4F6JQ!2T6#(_ULRq z`d8PN8TdWljc8qi0W61++c#_+E@(@zV*S0li_0K1e5oKd3WC31Lv_IV24+b5+~?v7 zyXj0yQb}@j(W@GqG*9>m$Y-DVw0=+LIuER2#ZMQ=8`bGNFkeDYGq!GZZWv73sAFjd z7Z~8&{0rs6(s6$J)5&CFs_rIruF9n8qf-`h8jGxRypvl9PIK`TK*$Gb>!IvSyPD*6pcG}3uom1&ql`iP_y}(8+Vl>Vaxx-k`-b)s6IDXrc z$Q{XAP}|w9MC`fGnC=1hB!VxiKm0oGaV5Q&ChU8Yr!xN}Zqn*#S@_~Nf`J@{qJdj> zW*Kz`vOA9f6ecpUeVSkRv5KTBTa*-+dmVHvZ~uCG5jNg1*-pZg$d3L%E_Lt2+pCiX z`Ob-lp%wSU(bb0cZnBVJh$k$PK>|%xOjn9dmt5tj$I#2Hi+i)ggYYu_Kf^B`u~}MB-Pq(m+g7GU}@hft6~&vvMy9cxaMtDXFn z92M*$hR%wMoh21CL`Yuo7$S4ic(D_+nCM;SHLD+!66Z4I zxsn|9A6N2jxbB(B5&O*}rW#^TKkp#PYpyqM-?XnSVlTNSO$pe&^Lxo-M zfQJ}p_912xWxV^M@tcqMZSjn7I=0U=Z)1kVv3vLuEw?zHk|^D;^E&_!yYwX7F6rlL zdT+U~fA$}@@{>V4a2_-B{EeuU2aHoBE;df8@uyJ=x@kdDHn%SQA5ib{?%EaB+J|s4 z|LThzo?-I8YW#G1MtXd?D7ElNO9B#l=}6a~iLO%w9I`GPz|@d>1uxx?e~S?O>NsK5 z&^gTeucemYP8P*u?1WdDsWE0np+y^~V8buFQ7<~r>p}R&|MM-h#~^t&+N0iXYg5ypdsAn1e1Z<$zsi0vKB__dJzT=II^O~i?mg#A9%!CMdVp7^ZF z3yRIrU7OJgJRd)UM@^_CIayeguXnh*ZA_EfV;s)Rg9V)z1MW~)ec$!4`pf;6$qLkk z_0b~u3l%R8B1I?eN=@GK(xwu_4)QvZlBygrlLRN2fqwSnMxD0Q*_OUfy*K^iC(&M> zcoXMX`DWABqTE-A{oVSVD%+8~g}~nkIbq@6gZ=BRg;|!=@zFY2#r}8@5kkoIyz<+R z6}G&f_77rketUFsTYoM(^2$m2WZm-a{vqEGue~}CX~M}hAhjFKmbb9fQ|b*xp=H@n@GVLYEdR z6d5v=DaECJiNi)sdy9csfy=>NH%x!fi9Y_k_k)b*8Dh05`r3frV_yyIIIO#B)*-@= zziux&+3vmFGeNKDW4@=Doj+i^i53Q(Gjk^eYjZaGA?Ws5`mlLof-CUW4bMMRL zQ>(w-r2aFbBOyQs1GQ>-+r1$w`U4!f2n z8C)Ku0T`V;JT9yvubtN&3Z71_FE@U`hM1BSsA|)cr7)Bz{%#v;dMrFbWAletskb@0 z5PtV0AJ&?`0DVgy4T+^31byz0WXJCD$=`jn3m6L z7kl1e&XX9(0Y}VN;9;VjevIqJ% z;M$J~SJ%$cjjk*o&M&Yo!MW+lT^5Gdm;@zk@Z*3%8(=VqQs{Txuy-9n60Nm~RhhT< z_c1GIt8>|R1wL;FPe(vWa*u&Tjj!K5Byr}Vxs7qi1=mEJlpkeVV=a=rSvTvZ9RJKKeZ%*K&wa4?r zB@p8a~UJ)uupwCg1*7U=9})AmBYCPUlOxsmz6B%+7<77)bGGGGeCA&T|>tk zkoID5;4cs#%z3W4>+1Veln7My8sacSImW2R6!x$RU>|O8;Vek+38{S=+-ABlX4Dy) 
zlthV3K^obLc~9`G!S=+K4Laz<(=$yvfMbPK6S_$`Um!g;~< zdV<3sMl4z1gf!m!xOj?ZV@i>*UlvbKeJ4$Fu$*)Q7_G|R*?ALLWL(iGoYu54OM%;? z(Eed{Q(M9QsJQ#aCh{6`ihIo>{X)Nt5SKas#i0jR2cR)`U%a~1zix6&Ba&_K(aD40 zb=)A9fIR(ioyurI?{9;L7u3=C!W0@?fY%!ilc)y-f`j%$IdAQvNFMz(MB+9L^RC?oD{+E-r%3|oZXIBk*-2NC7K_knNHxfFkmiD@$&OP??mq+#%9 zdy&gh?(6&RosqA9BvRm$EwTctk~G1_s^DibNh|MFUIZQuzHDey_ZA*`6brnDd^U%6 zCC+vNMHHa$hv~2l3_mhHQX68fl9j2pZ!Sprtz3Mp?;=ppCfk4N6pD<7NY?NG#uYPfxvQCG~vAOoP?$G%Jh#{35m-8((KO> ztvl(-P}A}x3vOMW8AJXER*GZl!x*ZVOG=HS$&uXZeept7`3M2d2$$%T50B=qmtQ6v<`-q~@$f`^-91?)!#jH0HM|2O?niQ8 zlzJ6@n;PWEjF5Jo@eeZ&yaG4amb{q8i4nO5C{~E3e=}~!2-FdMLr}B~Cz;j{ed~QV zILEJ)9rShAq8?>*w=+Pw~BooF1Wi#Qn+7 zY7HlNSCNuNu4r#0eWcI`g7RmllVZhD)p??YlJjcZ)rI{6`YNym45rD_W>guMCFa7 z!#5G;r&yDE9~HlUHxX)kH$-@EhQ^MISa+lCQFf5J-Kl5?a^;KMS96BaODe02l$?uE z<@gYaidsCUo1X{bZ9oweXQT-x)KHGlRw2uKwA1B8Xcm0|igp*ZZP-M>PP*xDi+qK~K}kcPF$>p+}h^B`zv%RTYO$a_JtqgpjbkyN>Y z1bGoIJ-neb%N}+Pz0gQeg1`tAv#Qt+yG5))WB}mg*S$KkS$<;NyxsD8g0rflwhCZ} ztY!`tqDU+jXpAJ7)F`l4{{#^go2PB3@I19-GdEcgVd~g6f!!V8Jax|q)JF-84}6GZGch0VKF;K6UTSsg+ZOUs6Eu8n zgUQGQ84k>>$7DU5bb&`TP6G5jslNS$3a9a;S*%ZTChW7UT^YFo>QRMCj}5JD4jo6V z`sd@#n16>O{%!7*P5PX^+Dz4YQOfip~13axZlQTo~grzJ(^nF31!<5RiJ(Y~$+e`?G9%rhjm==&L{hG)< zY#%TI^3>@3=OfUB2eO6FEs44+&trZe+C{GbV0J*LXx&gy|LCiS5v>lp zATY@D=#B!*ewS|W$(PwUgtX! 
zam?!78JDdWCr$##B)5~Er{=b}+xAR5OB|(qmPY3)jpS%g{G9!MhLzgx&yrJNI~&2z;4qf=FF(3uuHDq`kIf1S2JNO$z-7ES4KrMhw52ZpTsWSo{l>A zCf`VoB}eWMJMI_GZ$4yA`o{bs;z=cz&UMegZ-VWTnl>zPR=!q@fCMq$unC%Z09DaF zmhifBy_aqMgTa*zsM#Aw30Jic7`#wmy;R_S9n$mW^sI019J2sII?+vV9t1grj|bbb z1l`r)d3R^9?;*LI*jT*LMdK!|nULEbc#3mpz@LuME}qe~di_iFY;WjNw)Vr)bB1}P zrvOWw|3ageuAG(RpLHi3r80qzC-uk|)H)2j6(tL;h9i8Gd&e-+N#Kx|cFS<6wG|j~C^?n-(LqV;V4p&j>hv#ozb%bcNVNjEZYF&khaw2>D zT_D1+ql5=Fntz`+m&|d~a9dcA>v}fp%EC@B$G7`tL3+-i+@P;W?D6wSmY=6%_o81% z)Wsli>}eZKDgnCaakJq6ClLA@nr@0O6*J5r#_Y9EU(Cm9%4>!2auH3QshC=2N~sW0 zI5??V5{IAuO3865RVRynE>3Wwpw&iPm<=tCraBK2#H_-d9RfO;=c(8v->x>JR4w#| z(gyk0!ElYT)S;2Y; z4cyN5tH%yY9NqzB9h>!}j6L{Za$j|zFMcO-AYhR7BM5jrs!&^78W=vIxz`ltqVdcn6l3(d?XXbcF4WWC$#?vKC0^?lU zXP6+SVj7=IhilF|AU)G&b!-O=+sMxr8sFMeU11aI~#r@Uf%9my|@WF6gp8oeE}Bn|O=1TY;pUCH^0@kKrYWB|!1RFz(D;aC`5 z-)!kMiIJ5DYY^#WocUA3@J;;Ie%CwQ8xtnFPPp=RJcWXsf@9I6)UU_HVO2*Fx=0;; zN|c8}mBLBI&q{_1_uG*uUXi8nNCY>&)7Rnoz=lEh+gj>5 ziv^_Nx*bgpsznAWJ!U6WLr`;>-D4q2NXY9co!J6Y-G?LBX?khvntmpbkNAD&@#T-W zvO{1hog!6=ACtXmzKtM|-p6D-ST|N=H_gMPNcfOaZqb=)=yqqcP_ee1Z2!uK1pToz zrjA+Mu3gChl!eerMfNO|5Xo7n8nSEI7EgYIGHX4f_s`MC;7`mUNk39aGsnxTN*It3 z6$2yc;r2Xm4ZaCz0!NCVmkTVT8dN*3^kcr;NIkrZfBd$s zd&D@VEn1YXUXG0oYP@=ho-~d*55Oxl{*m78!+GupLf9)IE%R7qoOYm>>J6aqSRSj?!gR^p3#We4yusj+g_m-R=Z#Lwt`lcxj zwL~4q@UClpY6us{?9B-wY+7xj8HA3zcp8R6Uk?vsPR;>K-B`xJL2)#%K(9gesH99kY4Ca+GDfmu30@J|52y)m`TVVMv1FrxtIM(Fd*sA-quaSbEL zdJ6Qal6C0I45ver<$tJr;!+e|mC`^fRtyZ@c{37MEbO!B#^`s}3WSbZ35&+jnxt*K2;y-0RJ}X7zjb4ERr0=6c$P-nR)XQipv2Rx!w@o5@qkJU< z=hMv1;p!;RHv8FmAN`TOc6N>Ca5HH59OHQfD!5t>&Zkc_nkG6^&z&-eoH9svYJxnx z=G)}cOcea6wqclowYGd8$%)!-Q6>EwP_bhC+)vTD2%KBp9}^ECPv6cy+G0E~;OZb$ zYtyx|6IFX^VmyI$Gyw_#kbBkO&z8Srg{vO5Y&fi*I2&nc>$r3Lsi_?lj7aL zwaOZs0wFs&OV_QD(qn$XbRp9}0YbWvZjlWh=Y3iw%qW4fg6BhcAReN?AXoQy5|6Fijw`~eka^rZqk^GQF0{`oGdWT4#n zo88k7rFmlP+{Gtd(e-bU3HsqQtYn|J$2M61KiLq8OFH?G-av9(Nhdj#z7*9|{Z28` z!^fpE5vdG>xQb+Z_wW_V)`F(Bzc}e?Dcy3SB;o!5OuSX(<lW~xWJz2g%oO3;59MseRrM}90ltRV4=O~7B@T*8@d~rOZBqDBnb6%%cs;vH@HhQ 
zo-ZibBgA;hQg@2%8%1|Y*emeSQTa>V=XA)3)U9?KE8d>Mew|-|@`MWfl$%*#H|u@@ zM#|8WDo{>t7Vc%JtmO47jTdJF30^pMmLlGZyP9638sc57(866C`r{mFQO9AHXuNFz ztnVl44X^8HAIESp^vjGHs}@qh;eOJ&aFWBv=rwOvjy9j@&)46#n05lJkj1Pif*wAH z;helH%@kpb9f85)jw!WnQS?@z_Eaoid2v_g*M^BGoy~r_*mEkYNdMEuL#}=b`L3Fj ztf9wZ?O%U<&oj)#n4=@s*m2W4Iz%y)GxQ=ljHbKh&rcp!Z_Y!`q)H}}akYL~@!a~N z{Uv`m8#~-&yCe*!xJ2Z(v?c9(mJJ&8m?6D1tU8Z*p>Jzfh;9_Y}G{Q<)ar z6FgwemZYapN`vc6_)uvNNYI=IV>V3x2h>yY1RR{-(3i7= zbj}QVDmys&Iu!Bvh(OWo{NYyt$IH0bciXS~#OSqYgN#zE89Q1xuGswC4!O221T)cR z_;Fgcmp|WdJqxf&m_y?1BUl%jPdye5QRZE3;PZ0|W~q50S(NXc&15|0MUkY!c$A7fqzCWn(gu zj(96`>i6Ooq&yqEXE}V%_m&X9Q-~KMb{`+982>`4pJMrh@L}OCS!hWPftW(m%<_6b zWWID@mVM&r6B#3p<=I~ih1t%{7hd}_miX6ems*TBZR;2kb2QoQ}7#oAnNc~=q^NVnQMuf{Q;5wh{3J0bjEc(&Y_L_h0V*!H>Ty8q5SCnWsVCUOtB)Qr^S@1^aC{M8;L6Meva+_?jA9t=Z=BpTZ@blH8>qW`fg zS*)44J*hV1x?S2O?&k)v`Uv2E;w`Bb{Bm2C;W%Z>@pA@WirVJmqF&w_vUF|GG<%WJ4X)U*?X z72)x@3Aun2)rZ=ih^(nS;c3$4pg5kCn9@FcO+ZMFIR7YtU)Y#2A|lr|a8>>8^7H$@ zHqDAg7L%*T+o|f315EfXb z@NZVG6Vq~n2f=#SouM8fIsNtyMSy6$!w(hKExdw4#u35wK%Ya7#Ql5r)`DS?-xIWC zI#-K#+pu5ZAsa0Dcvnx4+aPdr>p|jP_VQ&kO}Pt^g?jruyUW~X+;WRB$0Suv2nHc=_L-9HGW@tb5b;E`{^YM;|HnMl^wC)9{ac6 z7=0B*#5QhmB1h z(kJ?@QYjy^^;)~O-=TTWnGyMqG(u?Ejr|rDX1KwOs<)YMcZcZA72SKTB%Yx|ALs<5 z(P7S6KJ|$-HEi(HipyQNZuJY2W1MZKj>UcD0s;pk*>j4WC+37-%mW*5>JKVOc;80G z4ho?8by8bVXG zA31e(XO?U{!&4p8?42HJdE}h*D-F59@&9pu4Zv-4qdQwWq4R1w5wG{_9*%YkyzixO z5+0vV`~KT`{K4MT6lIs*;Ai~lDesAotf^61#0D>m0z<>@34uY5N0bx^xLc3u?-`#P z@))UU+fLQQYaIFZ6lCx`@s}hkV^Uve|KtJ*7SMrV9yFBvBK5Z#_TPg0C~`=?$bkJKG~|YM2+8WjXpOGo@C=XKl@q zw>e=IeF|n^9VHUwbBfevUALd~?a^e;S5oKZr<3q}9SVl5nEv=;yCEJvPZn_1E1ikt z$9dmh`E!Z3*u8bqQuh;ffwZCC7wMHf2?VBFH@kq}BsEj)HlI-@;4^f7q6hrY=`y{; zGiXnoBP~4mF9%3!pVvL4^Q{6DG3uW~R5dg+h;d4f7Lm1kfERLG@6KpdS9Pn#6q}N| zLy<{;exL9H5j@wuVoc)R5~;y`A$7E|@Ykc)jI!RvVX3+`q8U1LKgfFCt2;2@lgs+W z_@e+YJ6S`V+bx!Iy+)V=ttzw0v0j%*9U#&XayqKS&!28!n(evZ?+_Va76J%d^m}Lp ziajN>fd48N%|}l#Lp))P9Fw+t=^tGxOmLqo-L~``mXy6&Xlb~fzl|g7>^anS28Cxj z9$c**dhOXzDXIHePLIP-u-eTT3niK%L&(Llr|j9dj`tDL#L0It57vD7vn+qM%Y~XH 
z_)uW4y@5UvpvAS*-i`Q4lYfK#hk-Ckx(Vf=;Kd5NPG<6s<_km;tRvp&cSHSPxr6OC zz+E)J&n9miXiE-5ue_YdO!jhdwl+46QATxc7H1<-7kiuS`zYL-CC*9EVdLb@3P5_C zqSN|sEx^}q_QrJ$S|27Z;n$DNR=oiW!*H}OAQqie*DN=74GUyH4BQozq#pWNVNiiM zB!94>gT7MH)P&1_m3NPWA>(%gVGC{rmFzd6=dh2-`fnT$(DSz;=Uw(08KZBuj`?Nf zx!2({7GXC_9q#K1+2RyL1Y?=#^c#Zw?2Sb9aWM0OxI%SF<2!vJ&{g-sWdzv$`x@n5 zkX?+7M?EqTv>{w-}aY;D8We8Rx1;)+^ zezK;~W0S$MA>?dM^;c=;eI59@PTcFM@_xo?{5gX>ulg9{Q0&a|K9|f>7i|IjpSZ3} zlrxrdI!%`LZqo73^#qMn$82~uuyH24Jz{rSa?aqrUVY8HEESG|T=W4$VN|U$8dx&0 z4KGpEvdz+7qQX+R)+^5JEtUi6!z&dAfk53^GT?2NToh$^$dpS05HfNa&9S|E+#B=; zz78g)wbL$D9fvW5H(kWL_T#mkI*WmIU(c(+RKj&X{b^}zNO zsdmiaiMz6{rqE7i+?#c+(>g=LO=suJgSh1}If{!J*-72~chXP42D=dHj5=sU8+QD> zPUoFu#5`92IZ>pxKhkQNW8Z>um&^$gD0EwVR@eK8uc4--qQ*$n=BP!f=@1%^>(u`}bbE^xjLFQbt>0@>0Vt z;NPha>dvkQt@JemcY0|8Z>bNG>pys!CL-hP1RK-#SMKO{&%Eh2mcB^(w>L(*-=MJ~ z7^Uk@Hm$+G6UD)2T8-b|JweEeHVoQ^ACbucCm!QO6`?Se$~bK>#M>%$pyXn9ldI#r z@d0_x!J*+Y;^LO6KRHrM|A4%$gLKl`v?K(J;L%{3fFa3@XYKn!Wf>L0pJ8wxV3E?`q{Tk zH%LAE?%$161hjU|zh61OaPU$9A~|Z>5uQDj@TbN$FO`Yx>MF5ru{$2(CyXg3&{mh| zSbkvsrRgE$4UwhzeB?8wrrQstL*ltGouDu5V9{ot2tTy;`@J~rX@G6iq{oF7iU5;n zH|Uz&G!!M8q<8*oZjnJim$&%4b8 z-P^7#jx~3|!{VYW-OaC>%?~VCIZozAvgSXSO!F%I{kQY~%HKOiY;y%VbJHmHaKItf zTJD#+RYO464i8kn;ABkh7EX!lM;)e8<5c^zMUJ_Re`#3C>A|HUJ>*ROF~-k-9hFqkFVGmK$iC_%OVj6@%a$o*70xSZZ}!CaT4~yu z5=~H%>HJ6T$Dit52Ob$Y5IQ5YlJ;P{nW*=8jx&G$DbSGO-v4S1U^!(LJE1C_pDZBP#w*pY(5&l4u|{>n*qhd(~}6o3BysQJ-Rt75I0C}j*4pRbfB{fkC5>)+YcX*^>WlprKN zBzyyBPHRjT_!gnTf~kk&q%WSzq>L)Cn{LtG$xYZsAjta5e{1xtdXA$sW+We%Qc!5U z?YP=@kinE9LH{mjZ*akW?1 z!bX5|G@O~S_1Jm+qMW|hq3A^!(6_!99C1gDIwRB*e4TETQXgo8?YaVD*&__-2!S@p zGqB^>M1Jy^^(bk;6ELuf}y^r-i}jk z{(>}t-5S-z%)eW>@eb7)yOlcoQ6#>=A;m+yhkBNjh%2cF(MJ9PY1M1h=)V$8FUzhW z*H#(dS{M!xs>I4vTB=1O6$LKN1iOALcdcw?U5W1X8#&If@f^i$E5e&x95hgYfF zb7BdJ%=iwP*MX1a3pBFbqZuk4-)0t7iI)~_Uxgi7t|ZuqXG<288eoz@ke!$Bm&Uv> zd^P2fVUR#Zj#Q(9bH~R=-3IaxJ=Qo55j5Mb>WoS(`ko7!i?))sb_oBk9zrCB8#ivf zQY`-0{hw5@GVJZ7aLHqyKPJc#+EprOnZl43beMNn3;mZ8^q6K=cr~L>TIZc3ErziS 
zvr`VL8y`4u45h}q0`FDaobyfC!2=a!$q2moe_{|p)?K$~x5laAEW^IJf2?IvnZ^25 zennejJarCD0F%AG>pk!wSwWB)mgv`9zzSc>p(euqqQ4kmuHkTD z_w++oSkzq27ICE%zPaoH&FfR8I9VLm0Ws9@O;I2Cz$ZRJ;)&YR1 zz7IzAM$J%2N88ql1iPVG6LSdq43i@4a>>(`WchI?HUE(U9p` zw9&S!9ICG(CwJ^3GU&|teyfkdUF!AYxkzGJYo$AWtH^-d z&*y4v7(ZFxO`m%{r0615tELO(;sW2vD%^oU8H=oH1V*xy3bsNVS%=ioDCRTt5dQjI z3T^PIb{!MlB}f>p8L*(sxR$n)_`-SrQl=jGH5Kw8k$RG2oEOB~GqF>R1l>@{{M?70n6Ch>&Tj? zm$f<<8nNKapM@z^ZI^V(oGM@`>3k#LdObNg>d-&eMm+LrY@i1GQKSIn&#IA1Gn43pY>T)AU+(C z*{Nqmff$*xXX4x&c~f9!mMcJu5#7CVKj)`p!R-w81!VT@mf!HV1v7g_xRFIzKnLux+^F7g#VQ-Cq;o(6=zs>lM!7SY`7KnUd7wJu?XTrX>QyT)Mz#b+{j?-J@YMK4s zGHvrTy2*y+g7mo7qGzz^L*M&@QfHx2I*@~pANqe21?OygtcB*L7GC#p%eqrY4br%= zU!S^jsee06A+3v>{Ke98dvjnf1JkluN%1O|=yvr>n2C6zwKG1C3)t$6YjF^*Jl4_) zuF0^iIv8amoTVjTc>?IIf}238YY#Y;eV;3h-rI;c4te*F|KrQn%Y>bP&+P2QCTvj` zJm$^pwH^-U4=YnjEI*9>832F`y$H5GBi#N|Q`-OdWOaDf4fi z0uA9{aPpO_TYt^i)FFz?%$lD&c;4sDuyIt=zWUFd6g=TKehtbUyE+trZeF2Vvy976 zVqBNNFHCtJ@?aRiP3~s5BByZTVv3|aX%qFgK)yia&r7HVhhB{WTFI^IX+5_n51N{L zLTp6f=iRs+dgx6;-NStY@|2Rn0l;3ZOBNP+-Mp1MoB^9|F0^wi??fYdq!;VUaO>2i z;~#idew~?AN$Jt&?x?30ZmFjhFs0Z1aU9u&<#@z0%7sQf!vukMb!VjZks2 zmMhiM-15fr>R6h}L*6ng>I4kNH?vO5Hc!tnmB&VbZ^&iB_07GsT!?~jy!Lbq8QQ2A zSfo<3sPFj#6tAJ4p&^n~9^wIird&{-oik>{_x26^$rNTHbLA-ppvrpyT~!0y`oA1~ z_RMjZR*Z8fQdsNS!Dk}0MvUW|35%w?*I7a-+0(dza2 zkmWbCXNL+?@8uy07B+Uz=OS}?m1X5?;znUPA??>j z3yX|#&0RX7d4QAmo9isCqhZJ%hwx8`1vAC7pRNCOm_)X*70XTJqpJK-!UJxsSa$Dq z?bzu0C#oLG=kkJXXgq0VLv5D z#_>x^b_{|6`_#+!kEOO0TE3nO%qo za&jC2?R__W0XZi7hfK+`ZMys)2<-m@9tlhgP~(YgJLrtM8Ws~Ks8?Cn#!NnLyiaeu zOMk&=hr5M*D}7oCi6eD0m19-nR`(qvD9I7X53)j)_1xiU3LaIs>+^9M=xbJ*y%H^8 zUMBP0rTq%19lbt?B2ZP3p!x3oH(j4q0lFMGqrrBQi{S_K_%k`y=jOY!4E6bqH79s; zH#MR#OFsI8AIn?@BPV6`s7~p7W16ONj5Bw7N`_bnuvU0**T}DYGnRJHswl$XI8tjD zrl=`r?zq$G>J>T`sF2@Bso|SLz(Jx@eEbZD^b?GFaaNg^tlXq_g{b=4Oza`nhkfFw zcxi9i)6GvHzP5SaOX^06(lo9Q$VnUlbY?*?vZG-~JT2qWsUHL?&*pi&adCgZpfTCp zk=Rbc(5FRB;>M!X9^v)iPDPRUmOW)$e>BUZr8q&5w)Uuz@UMpD#~ahzBD*tPc4iQt zu->jacgEp?}m4yJmwShZq4=XA| 
zxMHTEqALh9%$W#Q+!ICnd)b8(CCAed{FXn?9~e{9KE=Zr!G1jLYX;H}vNoDBFH3nC zl(L@AwU%1dO6r=B%h)8FEqWK)ym}oY^}xOUl9l4J*!A1Pp;@xIONCX#xB8ui;m^V+ zb^1Sj(LEfncmcT~n15}H#G}U-8fl(+txNyvqm?SVgeN9^-Gr3eYgg<{+yf$hyCi{8 zt0jQzJKO&9P2!kxK6mu+7n3jSF24a65MmK(&~I=`6O*NQ(9{}(6aU+t-b2>PH%}_1 zZ^dOzcZxW4F9?G?N~OtsTOVm-8bliu-lMVCN)__{Tz6o8<&GSIg3$Sv5A_QeIAm)R zS4oas76Y3`xQRBl1vOf{%H!7Ody7Ay#D~K-36U)nl9}|5|95lH^Tb9i)R&Z2C_xGo0zjhhe6PwQ_@8~)QwaG8ly3FZ3l5r*K ztVTJ=zk_iD+e?vihOsU6B9(BWRx0g#S6BIT6Z+f>zxo-q=A`r(3%4+zP~DQX^R#UtQt%2i zi5DM^@i-3*lUq!xT9;CdK3u-1iTs|G2AP>gC97fevd97Cj{S{BBgzMoKwGrkf}&o$ z(kKJWM~1Buf$o>(BCZGDk7zPsMM9uN06sqjvDAFgt z?w1xU(o1X8_$0OI4GP=OE`wY8H=O$pwLfTnOG$uO6wG+Hwj5ZsE5*_OGADmx>c?Tz z!;9g7TZY_s5aAvS!jK0#hRrCBK54r!-;MkDv>^{L66N`4s8!JD?7iI+y(7Dd68bHkF^tTSt zIVcl-rA6i0>LJwW6Sm?*NCSl{Pge?mV1gH+9sawaX}V87#cf30|I=L4|4~(227uxf zY5sD%dUlY1sU)Y%v+nAyuw!c0y%e2srW{cpcRbPTz1JKTb2~qJ#P(G)S(S3Jy%u>f zVdh0FX-tV?=zJg(gM0!7ma4Z5{niJ(ONaOS93!B?9mz2g4^21r_v(vpS}Sq@2<= zk9-VP&!~}JU}2l3o#y5rebtnu2kgxGk?#Bt=iNv76PKR1?szyzV;)g|<=+yZ-QI>7 zkM)*YOhG=#_f4`jXmzi-F}}rq!6J>kWd7F^N@t~?y(l7)%S6?boQguT?p_SW(|le1 zgD`Son|XajZ34l+r$(uu@S(nUut?JjI1&&nyIeJbq4}^b#?}T;@_F-=8*@INwDD6&)-@?dAx#Iyqx$k@4j>Nhd~1Aq3}lQ>mXIq z+_+lWS=y(MXP8Up1%r<^M9@BW>7_z!6{JrQZN;lM^T)8xHm~o^uZoA=@cVoVa2aIB zLE$HMDM_Ox;zVg*-_&*oMTjWb>LrnM+_NvQC}|%-3Mu~(npGh0f^jUx1Fwo8tqI~C z2ra2CW>P`Xylo@frJjAXlR)K93kjCUmRKx5!Y9(>)~C!EtMc?8lwSOorvs%8$dpee z1dx#$al%0t>Xj0zd>CX`KUfmR0c)(JJN2|P8mKw6trQCmhEjphM{+M;YI7GF_67Rr zJG!qf~y)5tlXe)tZ3B>{vF#fO|Z60{e+(&q>U9Ix*mKoc$tZP!%QSw2xj ztt}8;z{H_g->qu=pwX|8UAC(ei^eY>`v=Cr$HZ;M7}4X}2dAHNpN7OeST;>LqLm2l zsqLKloKEz_iHPZPcK=IP5)OYlOt7ODY=JKr5jA;c;*+&6KjB4d>FjS-=v`5QLNhq- z^bD&-Ny!QNxJhKBwDepu$ezT#c-kCi1 zB0M$hKX-q~pv1x$MK-BP&mT(59~SykNn$$~^b(v=E%?DVvd@wDqdsqEXB_`l!Rzb( zBq`?6qZtPoJ=3}EkB`nLN)x=HOC#(S`R6z|IGHvH=}rBX&NdFdHHt~2t=NBU*N4p;}>6$ z8fwYdkkqGe(uaHEusn9_=WvmEL*4eFKLhz4F$f}{x4T#TVC}niz z!@--F;IUKF^A5~OMYMI)z58>MYzM6=0iDhz>+g2_bHh6Bc}e7Dp9UhTbspJV{nkOA 
zNqalqjVt=uJu-HChVfiSTsr>6d!si(E_bDvxH67qLVTitD3k19nZab~r{8A56j;oq zIaXuasoz6Ho`%^YhXp*gYn#vPui|uNlpJ20Vp3R!&23viZuo>QPkRYM3^#Suf!cy~ zYrBayE(BU3d5wPH86EgBt#wN2IV`2S=QoxVoIuHfXj=pbKMh?Lp7%0Y}( zRsaqDWMlp=k?j;ZiJojJ&)Y}P0vDOqc!E8CwGPWxe))7A=FHiqWn=$l5s>OAfm~Sr z2R-}3v}ZKB_BOn?cMUtzd_|NniAS!-zu#Wy#6pDdr)3m5vm$!`f3&^Mw?jn&O1NK@ z67cSCuEN{r#?wR#)I>81l7UN)HCWw(w4PzRo=w|lPedNRXDYZmdEHPB=0{I&MKe+1 zPf9P=dYEdZJ^gRY!?-q9fX(<4lh+uE(6}w=jqNkfQ?so<@4-L_heYWIDU~n{O=erW z%UJ-SZTeTY`tj~J^G)8&Or#f6;%V={*kWO*^D?vMH*_&fq|YXWGyc9qm=fdM5;~*O zHSlvyOWc{dI?HVWED0_v$p?s&1iEnBmzgAU>=3h2h(_%03MU-oq&qxeRMfpp+vqtbI~~Ag)ciAk4rlX-To*){V&!BqXw&C+jr~T zxwV{1@85k?pTR@O9*IX)R`NoSU4CycV@uCp-q}trUpod8j>(9@wmJKEN`OM% z*^a*h=1I0xU-oR-=j`jURvOc2m!m5WRXYNpSEHjm4?m6!e+M{-15rg;S8dS6nwKjd zJ%RNaAH~6;Jm_ ztrwxjt$3;ZqLyK>67(&(7uZ#HGYt|UoGyUr0-g};#yQYZGND10=)DQ}CasA4>=Ego zH*}-lccxQ$@9Ph$C zpH1gHWi*vJs#+~HbOpH&3wSwlGb1PZwbouIFMA8qL&aaO#6Up`NYvF87F_fVbL>~{OMT5h=gFsMtUwQ=C zuRBfuy2Y~aJyF++5vrBz0x$EJDd*z5e)eF!58&IdqvAsKoC|!5rg3hb7}2Bu-Xrku zMgP;}8e*&rElJ{n>~bk1b}CI?0}&b^BnizOW76JpnfZ(+k6vN$t)D&J-Q+Z^b4wX zJ+cV@e!uBG8;ZPuSZ$D?kpc5!(zTd>siS0oZ|*C?%-4W+vX3grREI~nds@iVRM4(1 zvLW>j3#^1Q^Cy!I<5DC(|1}slh0zjDyKbO&GC6^8gLUJV@B^5^ZzrZR3VL>#ukS|$peioec?V2`PR>Ht^V zw)gX23msC1D2y{6y)=^Y+;1xQ*MeFB^(DHieF_dcwyhT3(Vnp0)&>T}dIf#52IlpXk{lItCJ686k(|kxZCB2s}a#+2J6SgVPz^*CJ#Ko8o{iO~`g#V-Ejys$=wl zyG0)_&<4$-ApGd(if(0O%$MNe5T`_=TaFSXp?+YPt*YX-$`wQmZ{4i6ML<_zo0S9R zO)oF<0M_3`LXKY(wwKAUd15`+(PZKa$7LYxIb_g5JAZ zCl6zbq@at0D{GuT*4Zpme)ZQ(_hwO`4GtK7#Q*$)*6qo9?`tg>`AZAhSD{c8)m)Se-Rl?6KD&r-UCV7Ea1V0@HpJe2ENx5m%OH#R9Obn>vF!bDiv(Z4-Md~Y*-M%H8SodjD%veb z5U3iX5YulG?+0luhP>7Pxg#0q9BKbL@zZ*&2{o@JxP1qjP}{cKWk#?3;|-<6!>EDj zzDdbQUsl#ZQ`q)(yi(izE;4a}nplmKDNnB<*tc-0PZO*6l)KWFBD!53t#}~bW)|#;HRg5wW*Hg_ixUKjeoT-O6V4^lS_{ZQ*H~2G*_EMgegz>Zz80;rNkv-*Q;p zyF=z)N}w{{1=$%F-E#%X7yc!aLaeDLFL>PDqHi}6yFEI}tqwy6)+c}NLZbz`=V${W z?t{QXP~=&|pOx@<%R#5$g&}iYP(FL6Zt|vkNMcQhF|W_bBrEy=vsf$S5s#LvLiLcF zoEweWw8(pKhe8|e6Fa)x>Pte9^oZU=r$Yu|j`g8l1z`PdhdE*!1c<@aeY1UUa^8Kj 
z`xJd9Ffns>eYXQvX?*(KSmWw2Rdx`4-Ur>gz}?pjLlL6{#=pVNNEj+ChP_ts^e+Yv zB~g9y_f+z5Y!^<-2I(D;<~^OfSl4}_)Eh3)hdV?MnPnq!iQpnLI>Nh`r>^Ew9P(GG zP1?VYURI<_9(h;+FVd4KttV2mLJM($prLnU5vN&lqm>RuuszNcL5f-%_w#+*7L5j{ zm5po^J}16GQKMr|b3GsH-2%`Fy2SLT=feV?*9@A8sKqBB9e8KDzf^TYH;(jA03jpQ zF^`#C*NOks1xE@r>@&wW-Hs`G3OD3#J5kDfD5K54qzeQ%WN9p)i~jI1BU)e4eGu02 z=5uPt9ToVd54xGoD5Lrss~k`L|+kJ9}#( zcPL}gfxC!n3Lv^c@8Ue+zo!5$SCKk?PJ+4dEdpj#F_f)Cq(mm>PTz4cO2RQMK7js4ncsS83tHITpLraB8RufZ zL~h*W@9oRqQE;TbIjsU7W=tq^l*9W#aw8N~W+S!hhu9GTBd?&F@@Qn+ z%^s=rHHlUqk;??{`N5%Ng*thRh?Hz@L03T|8ne7JFy|*rtYm#%bl{d!)M}|(is7tw zAVb{389zmJw+p)AS~2zwGmcgg;iG_@9nvFeZ+K|9E5x>rm<;Zv%TxCx?`Hz-&oVFW z!ehLhPN@M-yHKtY?5(O$@dHQ@x%@88!D-nthr!ux%798sZZ&KT7Oiy(8w!8!C*7rY z{v3%T|3AL~)i+|dQx^>Tk`5Rx6^Ccoa%uS#-v4Zt zxI>CEgrT`AfKkN;?pMp=>w-cC#^s6ma$&u1ojqhpUsP|eBPdx!#KQ8!&McB#^m93V zk4B4^ZiJ`N{&Ib*Q(WA_Y)8j@6q8i}lIZRLIn)wqrtgRry9pEe9PVoDE@G~wgRa#@l9?Wfg ziSA@Ra-cm~J-$l^P)L#?SDV0td!8rimUiKXc<4A~h}H+>``8J<7BfNYDC#+8B4=k~$?U*%pB}fQiTfD``nr z^69#M?~8^Hjr@Lk1RmbSf8Xfc)I?qEL&iAde7*w4^KE`7B3(EL|LtF>qOQI9512^( z$-C8R)MfOhhSB}biQtERQJS&ZTg*+?wilgWT;(n>JI`=S_>Dm3rDTA|SG@?#_?t=I z?2#`C+PDr-~%WYm`|7c|#+}HI7cs$LN7lg@ZcSphZee#r|PkgGB6@ zzA(%+hxXVrO0c(|n9k-W=6$lgj-t(NNC%iYB?tUkonb4JkapF~73*{37_lv4&ne>i z9@01J7@6TsYnsf1wQ^@1adIsKz6FJUYDneFwoVCJ09Php>E)ZhEB?BPc*eJ50PIz3 z81SK3HNOYXpw#UFA5f;r@)0B{;PnL9h1_>el8n{o|LGv*CN^B@+mUg?8)jwjnu{RO zd#`}QKVdj5FfC^8QV8|0h#JKeR=5)(bvdyK*W2q_=#qV0PAmajD+OVax2eF9GAm}8 z6vtfoC!XNN1L_kZbRg+ftg)C*K(R{4SK-qX-)r2GG#l^1_Y~iWo)tpVhfDQ4zyc2i zq4DA!?W6s^Yfrei3{t!=E-F%sMT~Y1dI-Hd==Q<);%DgrF3I+i6I(-)?E53Uw&pG( z1dA5lzkHn%V{!L5Gp|0fC>+e*^IETd?^u8Oq|Y}cHr?-yuIYPGX*}kl%Ov*&>Eoy3 zU(jYGnoHJO%9pWpSL)^2GG?1c@V9C`WHmJmBUxe+h)KmNKeh_sS}4e!a(>am{n2M7 z(Shzt>MB2{_K?WW=|9t6tDV#UPn}bby??gh(6D_5NDd&sxU!cMIv_y9WjQitPWByI zQVpX%q?4XE=AjPh%NFU%9D8T-_uA*)@*RsP5@(O^3QH;kSB3Oqi%MRm=wnYP8gB9O z_U#PXsCbqC>Y^SlNJ=v%msRV1QC4TYiAI!oU)^MVyMb}_KfrUyLC8t2^}E^P$mDZu z1*lMMF)HZM)x^YTFX3g+PVytVZfhd8z{OXh>)*9r9b 
zHej!fk+tBG`ko4DY*8<0v^P7&#a=Bbb{wvdox^#_7=CFdhkm8!bD%IZ_17BJ2gt?M z)A8Fn&(I{D!e>a{>-5kUkr{RjL@u#FLwvvB>WvL`INKlhe-oq<4{@VW)MmIK)V1`4PY6-EtKnCCg+d<`7o1Ku-jq324ZyzOSlNGWaVdK7+D-QL zEw`>fU*;nZcOW^k`4e<6u{iy+KW&ve3rjHfk^9Q)YN6i?Kzo(mxf=#*AVd#Vi{3s& zWh(7ACjRBe@@j>7O@g6J5!_oxKK;MM^?wO3nm*PW z+^V47z_is8bec@+VM5M9^`O@ng{fA2+wo zzRk5Z)vt(vM0uDk>hqq-LdT13ZGZeUt&sePJ&9D{MkGDb6mLwcPJZBhM=Fb3hP7rOcN(yPfU!5^*ZR;*Dm zr2WMJ6@TkY^T+2D#?A!!5l4cy=ycOGeFMnRIYM0E>Ht?y@_{h!ah|ql?Z2kt8)ZeO z*51krVt1Q&FU6-|25;{caAU;!(myGpFMe75S9W$yOt zSnw2?W=)T*LOgSxyAZDh-DYh-A^>X|1pMcaKW$U??b#}cr!N_{OlD+ag{6!~9|t?z zY2{q9hAw>k3)yay3Y&TrCmHk<(xCt4stw?oD1bRBup5m8M@0XT^mv6Ai`m7OowGK5 zO0@%!sd6knj`hP6sKs=dD;UetAMkU?FmKXS#m3?mE9jv^{RhR2cp0#IYPZkS^zd1+ z$LE$Ma>zrM5vJ}vHxgrV@J8%ci~ufsDquj=1CPiGb`%pURjie5?G;jcWK<`MYsb(5 zW`EO%{en}bOt1V$hOiXX+u1_oM1|2ClLjwb6O}JRr9VtAVO*l?=oK{}d%AsbZlAU) zDtwV-6rXLA815|^`tKUS-M&8fa*aTLwMt_9(htugI}32^?~7X>>f7WXZ_thMol794H~DF(Q!dsC~&UTA1AAUHU(G9n{WNV zFc9ab5w#abF$)E(sdo_>VY^;*!1nF)_R~A&-dCgn3(aVJ)LJZPs60nTzx+M97OCAN z{p8|_;1SV9BDLD9_w&mhHR4o(H!uOvgi!b|t&V)v3{62vLCvDCey4h1VV{az;|eXj z#}9$2{m$t&rp?JY9-kB7ZrCuyX9VZc zJjWf#z|CoT$y=Ded&whJUp{l>^UzJx*eI2`Bt(Gs^pzVepNW9?wCh3doFQgcc;4_v zCkvyYQ9lkG@m_}$-0lOZ;{4s|P58a1S3R)P_{y^_^WWJw;S@rLpX&`2?`fIuk?0r} z@YZ)sGtnLDFq+hQez@Kgi`ClfXaM7&#kP4?&>!Zp?6FA_P9)D*&Sqf~a=rry;4O~a zuWjgDvKaUZC_ey2FS#3YhrWu`j|mJ?shKZZI(*)**V~8`LV|L*&oLS~R3VQdHG1Ah z4{Mco93a~=wsR{U3~ImU)GLq-HbS&LkWtpn|81T2fjuvy`_E_Pn4n`#zwp*VJ4z0O z6-o>)3F8JI&e0OZF)Nb(Zo<^_+O$7wnp=9B2FPv7l_YQwEKYUVKPP3Hm0O$S zCOL9nC^Nb=@bcnR-<9zpk8>v&ViXU*T&TM|YI%zlIVHH!@0Pv*S$x;9?p$TzgV?uo zA-50ZW*A<4XAFx#FfQxX+#fuU1}>o21viH0_-fbQ|GjO_*1OutiI#bQF@J`i=$zJN z=wQCbmbqGF?zreCuNhbHZkjo`ID$W`wW;>IT)+Wl8q4s_67q32y_(E?En6K$-qi`z|7dwkEFE(*khB>6Ak zjP}=UAF>nk_0GZ1VN!M)MH!k33PpFQNq6O8HU{&zpRSJz8Gv9_B zeG1<3rIx#2nz(402*35L0LG6r0VzUjCO6h~FurN$9dF*2N7B1y;A~R`3lrvQg4Zei zwGLUs|9s@PQlpr`4=d<62yzTKT1#|*zLmS#k~)$-?(jVNJ+ZVPZRO+QSDK68O8eJfu z*DK*-bqiymDaC6C4FK2Z$&)wGd%4U*nFd^*;9_MyBWlo*M{K2Jy9UkYNzYre)?eip 
zst0#ZJ!(X#Zq#*~uh);oo!Ld)k~ONK>VQbzT8~+^YrIgnRi(RgjcaUmc2r zak?Tsseqh(Q(d!e>ePyT>=lXbRaX!rGf1b8V66aU(LOPOn%#a;F3;sm7Jn z!iEUfs}KI(iwlx_!<-kWcZ+ysBDRd9_zN5IMCkK&(>LW{KUBpuLwcr6p>%l90p-e# zcLh?I3#28fK78k&fE8_^(ICu37wd4zK`JIDl@I=>=}oFyIX3YPHYu#0Y-K;UqKuyI z82HjBG$G@gW-qs02))5JnO`AN>mq)*!n@&8SUI4yJj^=)fB!`Di_==u+yPRKZgd8j z3jCZpO0oKKHNz(7b1Rr>`0Zhbag&GmO;#{uCec`Xs`7n1Ds678N%!)X3Kf7l9n8nw z8=Z|#fG`f|RfJcS5pkt6v0_C}rI`e2P|5r5MO?}EUJNj-#xb^~STD4Uki$9J*1X;{2tenrd&%jK^WI#BCJBrRu)ZN^re1iV)$Kp`z^_Loo!ewV8{1O{^^Q&$*L&fL%1gg7Bj~+sIZWb%E96PeR)dX0jq_4^2j> z+noB1U;@8Kb=_i`oLR>MXIPzXlw1DWq)v7NNS|JIbRQ{mO{02Fi2f^cBna z^88txXHf)ECnIQ#`I*ck+GLW)&(v-|uvnbW;-g%m*z%?;FrEuKnM-J+21auhL}Ic^Y`% z^rvguqez|*UO7c<38$W?g3oQnWh5E|I=+EX-$r0`(pI}C+qmX3`b$|6__2X)6LZIy z9C4W#Y&?3o*mETl>~Z<+z_ifb?RV!ZlCck?qu#~usBmW8Fn-^3e z3-^n*XUGQSFqI*N-;C<|%_MIkE{_>nnkEk8s^`{pWusK#w&XnH_Psa6xBboE7-YHl z8?*xt(C-AdE^nTiX3-*Cp7a4et25+pUnf+)MnMx&i>*d6wxl_O#Ou@VQB3cur^?ow zXc;!oYgXyo-rNuHErX!{A7O7D71bO5i>g?ZbPGsGBNEauh=jB>QX}1rwB!g%mq<%@ z2}n0XBi)_S4FdxV12g;1_jk|z@0@ej+JEoaYu5Wd@rmcTCC8(|^Iqx4dzUlEX!~*X z?Jd;zZcH@$C| zLI2&+cf&Z*gR}3V%A&R){UEPo3_5hvcatZvD*5>ZYcn6b#eAtbtV$xavaiCcHu{UfN4UY7pzOLC?0F+V2QJ(!SMc-$9-iLXm=>i^ zHtma7s$!YX&csk>Z&JL4dABPo z7Im)YS_h|2onTEE4 zr0X7jny3F$rB$?au#rRUi9=f#BatC@Kh^Kofn&HMf5W$cu?u*Oon%XIbH^vq+^GZp z=L8UC*fYb}LGnY87dKg)P*_Iir{b)5TE-sn9MEl3jE~7kJI9o&51gn49GN!TeeBOYlp+ozg!Ilo(HYaK8 zJbX*PG8+x^dpZ6Zbhpw+k5OP9yjb0cQK#TXxeWEJ0uf=`-*5h>)Pbgc~(pW zuK4(Gz5H{RmH?lYTWF5K+HQZ-^R|W&;28OoW6UC;{_iey%LNk)>?_`!wj*&zC11CQ zv`d@gZifWpZQqjDaN>RR4f?l=AbOWG!7%A~2rs+@rT!kX6jNzRX|Y0Sn??+TOK!W4 zYKlF_y|j*1@cn}WmZS0a*s`d-D!jW+$o+9KV43Imwkk=s;H$xjNF#Ddjj4xfB z!tNl&8%j`=1sb+G$XN|q0zN34hu8Lz9^b`wz2pul^f9;G`AMmE`;EC1Hi9(g%9hJI z*sbN423iQAt;gXNr;G4wqYk1?G?5$l_zU2azqZg&2fWAVm7F|6`wAjUL@$1eI!ZVJ z&8oJVaYw@rT%Pn)HmqrlKl-*rO~=t8jK4MoOsx8JZ`+Ih4@B1gU~Nz$KZ5qm_aYMc z9Q3s(dBeG&J$kJUD#3xG7e8!A<%6@Qk*-n9hVHt#-&hhjiTI18!-9xqHLwWFVZr|U zvr~FM4P3LuA_3ie$&c7*{PrOhZ=(}A@Nw0ZMU_jJ>DQ}1q6pTC!RfboYL3kPZeG%{ 
z5iVwmKA;PGnx`q_)~4iEBVS^ey+l*xop*v*)BeF-m@0uq+yXuzIkQxyo%SZEv4DJ^ zLjRF>qXhZ^IoC%9zXOgoy<}TOT{T~rgyr!fWw~Mh9kcKG8n3ybAjPs=%_q-+b4jkL zmkd7V)N6M``XnT@biAXm&(TvFj5$*$HHbd(%S+*V*bt9{k0ma}cKy+M{L-u*(n+Q*&mislJyGTGJCl%zVJl{tA`^q`{`uktN?qX%@cb!y{DytJlTA&}d z?EDjNZZA^({%7Ixs9n-25@Mp3;tJXE7jUf1Vrdv|7c=x{l_!4SEcxQy;nk@YABb!tZX;di5XdI#O^z-RKwlV>zBK^y^(IIOj? zbi7hgt5H47c-F?b8v7quGfZkoI_vr6{4x#eyFT6R-yXd_j(6y5*P$%s6 zY>Fo9+%nLdGxU@@NQU|KYAHL1LeP+b#V=c+V;fhsPlrZ1;B_01OZm;}nsgxs{^G7I|5IgHw6y1j?qVyNUEC71o2_)JuA60j9`oKtOO?J@phZ+|HQit* z7*GCvv8?`+;AJ^XVIOI5%EAr1%G;8J-SL5st6%oTvGvxZ!<9k%A9bfTd8r}ECl zI@&fp#4GIHMVR@@+@(W#f=2v=XnbZyAci9>&ls?He5*-7Nlg*pm2?jxf4|ny_ zL;Ac4$CnTOxSJ)uo-TPPfM@Gm*W}_9*?swJ+x}mBH5L)O`WN}7P=&%SEo~}17W%z7 zR&o}z3PyvhettG)jwx8XTX4E7k&HEG>h5eJ#g;i|!+DW@KtIEJxEDQZysr5DPoTTk zQE+{W7hi2ijh$to)O&J4nG%HB?~5e43U~@!)gf%3BN0VY4BCl2iRQI$2W^GS%NFux zo}c`Hk!lyDG?2dAI*sX*V|gOPkL+_0}m98!TvTp?Zde##P!@mI7lJ zr2&VpXshFsvf8X(cMHXS>FWkNO0O{#k*Lht7cyJ?Q$9JcEoeO%IZ z3;b7qlDllJy-2wsxDa##?8}nfEq#qW>EBqu*cw-kBR6W}s^C@HEsdC7?3vF4FB@$_ zcY*%JwX=#G@W&gYk7X&9uLTqlI}}WPN2hFPUwmZMMWe;>Z(2a2k4c;JuxI+!sRehN zT8iwENZj$gYx?#cc;d&iFHa`^uRs)cz>!qji`=zZ{OnwWUozDJu4oI8pZ%dw%kxB@ zFVm;6LY4zBmRLiQFjiKlR_2iAmn4EY*i@SM=)_+(2wM3Mo9QbOEKbf?N6KG?y*_$Y zQZ7VuiIeao!!v8ffli^9q5w;kZq^|A{qDQY@I28_Wl;d*;O^I=ew(c18*q-F63!di z*|t%>-_}wc(fBc9<2BCsVBbArLvB>H5AXZy=TWv!!kV|^eC_=?WNv={M5Lt@U@P0` zzC^VEV9hU_3WM^nu=vG(U*d+5%gJJ}LRIv(hVto`4;in{I;IpoWO2vs5RJ%l5Aky8 z^73c9Fpm)=fj{2gk{ZR@)HH3=j!~#NWzwUwbfN_&D0!xv+@g2bWb4xeH?{=-0)~y{hckUxxTd-X(?#e^H_^IEB8J@I^OjbNCt~s zO*SO&c|EB*z%}Cbz#KsV2^Q#WZ9yk-DB*hjsJiX6c)Bl!w1@%R`aeBJ1~Qi_HDkm6 zw%c*uu3WT7c)GBVv<_B5NHx((s%*q{LO+#sh^Oe|AaP$px=O$)tJ4!_>uQ7BYQ;{{q1x`E%wVCSx>MDkTZOl6~U^R zXE`OV9d%UqEpZREz4918w_p~QxTZVPzJ7f3qTo)!$DQ1tJ8Y zeCQ!dY%bVUx}6xHU+D~s=B3Y;sDO%Bv0JY<`|IXzDhO#`^?J1#SnZJ)c*ruNj%)H> zWUnv~e>eZl;F?NKmMFRK+eSxxw(!UTiq*HJ?qiQet%NW1wR!of#cSGB(d56F=OcPY z3L=gI|6R%I`4#=aBRRfN%G2*?$S!bv1v0;@y#Qi9aL+jfe4mGS9U>1i!XQpV%jb?C 
zQZzGK8qgYNrwVtDOz%7@RO?d;r>I|3KNJ#Kau;WLfQv;Rr}feh5;!ZbClgpY&t z?j|_0mk#)839bvE);WVk0XuCW4Vwr^g%ENH^d+zo4OhYb(0qph`o#He?*8%MFXpe+ zfxYXTIXVL_TY=uDX)RrSh8gNDL+9Hb_J?Y&+s}s}zqP>-CYh?A7XJdp*medDfYY65 zb5gtnX^T!W&qPoR=Cvz6UJq-*Q`z>P+v_D_Y&K=$Ir$Qn{&tbk+q5%H0$51`qJ&eX zBEjt6*7KPgyf4I!S6H>y6n2Oj1byh~^V@jU(yfi=GVSZEUg$2&ef9DUv!d;t(K0vJ z3UDuH>G>Z&v*<&sayo#vq0z9tcC@OFs~^Ij z9hsw%v4hMh?bP4V2_|rxdbGn9vG^qRAKAHOIuFZz^#K@~+G!hOXj4 zU-58X??(mEZHoG_0I#1<9$xQXDXM3*B`DUV7w9NUL>Wzh*7Q~N@Dh|i?m`vc%H%EN zEzrufz7eSHU4Un0j*olH)ka-Fz5M!a2fm!>(Cu={kvwEtISDg`+78RU_m;+{7-deZ z6!3Nd%(cz0IjYTR^&Inp+!@uh zm@l&tlYF@_;n6cXKBLW^F`pA--LxL=xUNfvXmf)Cbs=4s<3dXcO zx+9ls^^kT=AFC==c$9thWti05D8k_73S?eA&KuR<6AR0GM5=#wn}zwm@%$_BOZyT9 zlHsmEQ5}8IHfPN}`@$X7uR-&{-r6=TzUfj6Z*#mh$O@nrU&nj=LpWMLJeA7DRf!B= zC=>34HJp*0_wu+!#$!R9<_YyZ&nJ#Nb15mWs;ids=V-oaAu|>23Of)RJ=E=8QxfMR zdMW9A+D3M=#e1Hp<;LdTD#1tsaTfbB!Wv}$Un``1U>Szo~RY)Kuwr{%(m;^R&6Ziq19oTSPk-wFpUa>L&hm zMa~T*3K%12DbT;4ki~m5i2L>otdY6+d>WcjbPe?#1he*0hFjWs+14P+YR9H!6D^1s z*Edw=-e%V=Y%II!8;FVxHQ7`yJ}6eD5JOLED#Gy8Udu-TKL7fozxn9fq5t}cP)NN{4X%-=$NTUe%I;wabFb3r zvH2-#@*d9~!S)UXN-_U>8JG}eUs8V|naOx4ab5#Q04HkL2yZTa&W)bySy z%Y%nlcKJd3lG^piW~eYn$KbsWq`$gFTGJ*^IfILa2dHL&(jTv%U&9(?VNb)tR(ody zjLmUgqq_V@mD8$0`#Iy)Dn1$HoDNdQB-g2R&%oLDsM%4i$_b7;-S^EJj>M@X006B zCUWg5D_84&cj7KHnX=Ux_Jme>lnJC8>7%hcd%xuzoY436 z%Pqu|iU~o!Zyd7Ut5is&{`q*B>l-G_vwS34WESopyFHX9B(j+O0z6Xl*)=JA|7Sc* zpy04sannp=UK_&^jK|W?T;a@6ubrTLqdJ--`q?jndsoj^M55M6DrwcA=)))$5E37G zMkD_IO#G`B?)jN>Pm4RnYz^UgY?2GR$R2dP%K}vUP zH7Si>^C&Z5>jhEPFZx5M9G`h#7UU5ld630#feYQtp3Jqh0LI{5?u1j;;(j?jn^gkz z4r~8w1<+Gd+y1pB_@l9yrf9dzBi^eba26l)3BOd4ndN@;h1V1G-V4Jt4i1^UCQAEs z;6di;Kkf##(2_FdcHE%0d3`{RDaX2k4xk}*9Is4-g`LDV!;`($Ym|-$u?W`SYi?c_=6b=%OjtCPekLwR@i9f|M%_i%eQ z;#ktgH<)A-^skfW=hIJ~llXZyD?x;f!P0FtnVC?26qd$gz8s1w&*ibmK=SnOd z&ENoWWX@StsQ$FKwlS?ewoAI~zK;uPlCYzNQEhxnp?YvPRWL#`$G>oxAg4YgtM;S3l9FG9ox) zu$dkyOu}4CRS0I9En0c(w}BkRsO=5%j%SVJr=)SNr`}^EclWuEl@14e^3CtXAg)pG zX0aSAj>~_YYk}+IZC{BqioxH4T#DihoA()fH`40OhXLHM88~_?)+pZ{&h{1Z&D{KM 
zY}?PeFe3l1Sn9wd^tW>@b@0EmNf_q!Cy$zLTOoIi0xgDZt;b{55W}0uoCRI^hBrQO zSF;H4g0);P8rDSB_obwkVOR&>vbxvHxN?fFs(QjqG=j8j(k@Gj zKQwtVklzJ8*ZI9C2ypjiaEUvPJ~dx3KL*?0Y5c{YpkpXT?I&fxl_f_c<0(#59opi} z!;^aWoCG;zL2iodQr&Z3)QJN?cGp4%%23hHDC;8?`7}VG=_4K}I~w z+NTODHCmwn=?}kl`y6C0_D#0I-$;bE_Rz`aSbD`L_H!}XPdE!}ie%OwJXVGZXXS_+ zu|YgZ^<{{^yZ@3Bb{3+p_l1N93=RDSxaUq6BLVRg0efKOhBYLVA;SDkf6fz|kpULj zmWAD}NEK|)gjYxBp^!(iU>x!{n^pJ!(*oe&V~r>}*bY)1v!q1q`1yAA{92A!#qE5ptzXh&d{og*p?!> z=u7BtbecK_y#V0I1;1F2wR-Z@^Uh1+tha0Fg^nWhu0{lHx>#g5@L|9v<4*Byhu%v1 zFF4yCW6Xp4102KXz;9~aSO-stqi`J#r~;ZaeU`LO^yk6@-k$ojSOx~mx_f61PwrE9 z{HgdHlz3H~P@RCUxg@!(*pE<2Y z%v=guzHsxWGhurQEeDZ^esP$t*iYi>Xv{<4*}upWJ{6AH%hNMJcw%&B+)UwRG_*yc zZw-PAO+%{;V@?%Nx~kdIdip)8BBd>0!1+FiLf1@0U0k@|zzUeTMldgX&chp`eBW?X z2|PNwcN);6ZxO*}&49|l61D$llmk<`mnUZ8%0xSDdqE zu=9@>NV$_mj6Zy=O1FR`hbxdA97*^*>AyC&E87BPsokIael@dK>m#JM^gU-5Lc~G& zAB!X8!B7NFSjDXr*EGlMk&CPvD9X)MR;_L%LX0=C*&31=5PmfAnCi<;w9I@WFcJP3 zPfkP8p@uvZEB=u*S?^?zJ>On~2EHcP7xrzvo59Cc(uBT}Tv+tm4^{k3q3&D}VF}yL zYngsK(lP*!_GY!>s0SzJF@9f`%Os0sU``&JJVfkYW7mk!>iR@i;LdC6uwA74oQFwY z%71QY=1lNb*ksvk=N^T=wZ-}zeYPK?SI1j%g6%WC+O9?W8N~MxLA$KYf{64H@qCg#rzAxUbmEFMNK8!T z^ml1`HEkA?LQclq(+LSq1zg(2r<rV+D?1f! z*W$-SqUkq*E+?eKfA+=yz(ez2Gla*HvLuVgOm#k?3;VJO%U{S)zUZ%5x5I<@vuwF! 
zFz@ymk3YZP^RirdU0Lnt_F&ZUMnQmv80uli38uN|F8r*j-T0bF4KfA?rK#}?>b^)6 z@Sv23QdgTo1gT>P7@gIC<~9-sI^M_K<3b)0s$B9-DNKn1`()}EF1<_^V`!_<6ts=I zcJa!Wx{7%fxFm1#ix{xabRV=3C1sf+IZUH}ocGm{%|qpCj_%-l{=^Gqj)8nhte1*Y z<9HnU_AF89PRMFr*g;V-fXdH*F(u-Age-}I(+zz!Ah8d1hp^TzI-oQ|h>k4jPQ=1b z`nI7BQLxn#GtE(rT;~oHvcfQub0dD;kiwKdUrrak2NgS>-}pJ9-CvbhFWJtGhEfipb5_{2xmn1gL zDiya3z#xlw8Vl6gE+c&IzO$yU2d%I-0_JVf6WdabVVQ}*%>}&8w^;Ced8}*ih=JIgDm$a zX?HV?a_bGB1jdNz7E~?f3TU=g{>Sl4A!(`ploty$@1SlA!IqaAM53od7{>QPm6UWq zxLJx<%USu@Sx+2aJuKHz#)nLo3ZczP ztTN<-*Bo1dTW*+Veo}8}Quuv%2D%^Ctwsg7B>piGkuoPQg5C~+{%n%X_050m3j?sS z;Od`wS`q4k^pBY86k-+1LT}-e2tH~oE9zUj?7QwO%aq2Jrg1z0xYsEv$sIcXM~yNS z(lew1b%@ZOegOGntSFUZ8{7FP$&e9N`78qR1FJMO1GQuVJ2@c;@HvJ za!pN(0WyH9V?galgL5jtqStQKZF6j88(MjrHBZnO05;pnLO>=4bT-#6!9(s7hXjKU zgZg(SdEI;DrRp0l!YWF8cGh100+i2A38a-t%+-?>E4drr;FVF$+pxrX>&L>@9t5yu z?Bp?`|11zFye|@L1<;IPQeM9BH6gTTz&rvbnWv)6d~;a%VNQ35ajLS}7Xbm?kxR^T zR-M|@X{Z&DH(!)&JQ%V%^VXPYfWpyaeEQrD6CTQ1N#3Xv=nI% zU@;`utRT=pd(~%x6+X=PPL&V-#eN1R>aLOJ;lc8NvE%#bcqD zQYHwhu@U+uQ?W$L(x+k=MVN6n?r{APrfC7+BH-344Nn)>GTa}&z-V@5WwPtGkM9Ba0KP=ii z_&VT}hHGXct&p)t?LppW&jt}7kx5JSn4ZD{QKT&`_=Nj;z}~%N@JBX+oc7h(qIhZy zRx^ghrT_8Ykd>y;$3l&>Z(p5z`3aoLcRs@lKi=5!mi54H&+I3HPwKz=`t&loUgFK8 zYysV!e>I99_)+h=xvS#$pK?&x98;si#upDhpDb{zWAMxlWD-v?;Wh@63>x>e3XjJ3 zV2$Pqc|UNVUUB;ESC4bVg0oFd-Qc|r*zls&gz&fgul(t5P++JrXdo#ldZQlhI~w_B zXVByqzJ5%KyrbGTS1eLNg}cXDqiZe)Gm)G_Rs7*jL$%<}!j6&IQ9RG}m$`uBPW@4( z1>kKs?yruPmngR;Awg}cS769%+I@k=HYSxaCv}F*uCXt^8~yvI?oXBtoL}|TIouK4fc7W z=2$qR5~py(DYIbB7%~Mj7VbUL$<$f25v9F9e*EzF?*?iDoN7BD>8Nsv=1=uvA&B7l zQcaW4lS2G_!4i5DRbA2Mv70SWg{$DZmsR*Os;%XCw5K8i8YY$V*E{U#~}t zF!Yyk@A})RbiTb@fow{b?ydeXu+OhTJnnq8U||2^?R%KSgO?nSur9g-_dv&2b?oSw z0cLhaiJm$)Zfs*0axX(6b_gpW9IsQuo>TZ_iHJO`!fWe?l!fGjmUG5#FU_i}HI$8Z zEjvnQpd?iQYieus2}m@pQEZtlG&$Wyzt~R8Z-`>aEYa_y@Gn3T>x2sJ52r z%Fz<-qAM%HHk9raA@HNS9g&%DmL zVUwx#X6F6oCk_L*ni~tOB`TtCC$Dx}5n0WzyGvMhsl_CAO4XF}iUxVFjmPLAEty&c z>!r-I7)l3dpWA2TKACD}Rz1G**Ah65Iz7EP^5~K93H=M@yS14t-~n)c7Eco!PUx$* 
zLj$B&npfK2LM{)Xc8yp5u)m$$+gcaVU_X*H_aOV7wcaf%l8E5vBR+AiFUPj>O&-3M zn6m!j{u6(5%KeQB_fM#&ur>0IJdf%w2)Ez|RWqGD?acXuo8tj-jOAm-mxsVBHduyq z21>=jcXaRP)%C{ysbo7yM2KJW&AxADR^)g4a+x;09y!@%J@e?)^j{WRT|(MZZV5xd z6boyP64thsL2+?9pc~_uOzYGN!yFcLeC_aa5#kn4S1m|eG1_KvZI!FnrpNNhH#sX! zEOn(aeJCFi%6`9K#@Wr;G9s?Zj%yngaSkq9!M^L~MP7V(x~Aoxz0RIJlI(-{F~|sm z-+pH?fH+;Z_)Tv$wN?kJbOffhFF;jOG%jqgVuKajFhjM{`QZG3$woqz_eKgC7*Jig z4p5JI$#I1sGkF10y4lPnL6!|r;j1ar%*C>`pBK@;ByMk2H((W^haWfsyXW5Rn-Z&| zUVi;csn_eJXW)`M=Y@~vvR6UdH~n&N>;U%|SUJs`V<})`t!)%2U$o+);i`E)|1?hc!@cvlpd<5m(Yub7HkvUb^LE1@K3%V zlC_Ud_ES<9-<=W`bM@6~DtYa~zZL$JTT~-w&-k)R$^H$0PD3{OX9^w{+YS9UI-uGl zDiG>%F8|$?SL3wa*;gZRp+qbsCd0lo>d$>rpI~4%g=LS4<(w4dSYw`y?4TD@pos68f#PW^T`%u%h*~3-ABJX1Hl3V<*TD?! z*B}gw(!&0rMsl?7+f8r$epV}iHB9`#Hl}IV-{n5a?S!8um1&`1t&N>VF@ZqU0{rxAKfvvn7I5$gZ`vzdeZMh^^SAK^1l^5ByFs zb+~oPF`@Ba^ZmhK+xu%v;0=KMbqQdk9zeD)c{8wgZoUs0k_H_eXA$djjLt!uQY_v< zqdBO=>NssiP1i9`hy=y?8<=m?G92LLdEJr-BzC@{b^NVKG^xnZex9-Gc990O>Qa|q z>Bu&cn?~Or*f=*OP1RAn%22=Mrotyc%y8~Z4ICGZMZ-mi82)tyr(&lJlW>HP@x{VB zy7~A8gyZuSz4Ch6rTlf2y?2(f{6REgqxrD9(B4VVP{T#_&qPOIT2ik&)5JRv7#KKJ z8KVl6>RRT8Bmdsx1e1xVbs0lbR#11W~gbiJ6%kA7@>oY z^_(-0U}(4P?}NPE`EQ0U@s_R#tId2hsvCb8i20hQu($h#P4Q(wd(Y{10deW}J0m$j zp~^$YB>zrP964j1I^z3yZ@AOrrlDl5Ja0S{2;B}d{`krwFaNM z;#=i^G0p_7`}k>SRl50cbG7>8ujVB(YddWIX)OcEcmp1Mu4N z*ptU59Z@QkN{Ey4w_c22$+S~iet5eKYO;^Y+Fn}7yeMswv8Gvixf3~$pdw>GwQ=C( zjsr2$_%tgE*#~cpV1u(HYMLg~-^Et`ynk_H_)Iib`a*z@o>3{2gUS`V+;(KA6ZIg# zvGwHy6YRH4Kxd|{IZzD;}0Cv|&r?P7^5$~5I)Rnh2Kd3X0fe1UmS zOu1UUBZ7{L*AcL(%bWpvkH#`NR_a}p5H?uV=`mFKOL8V89Z*}!vL{j?iO+QZdO9?4 z?tk{{=svSLuwjv%%JE`=(;$;i>Gva&hT>UdqtbdMDJ)~V1ydp%)F;E!gSQwWT7MXG zhhn+J1owjUThadIXC|_=*JInBeq(v~qMY|-$Rs}EUo|iXaQT^gUw8!Zd7~j8&tSfT zxbbSu(>b18#x;x2{tsc7=*}Uu(({wNfS`FIb#y1=5Z(P!lH3K8xs`jxPpjw$d$OrV z`X-)*;W;b5ZotBazfCpeC_Gv!sgFJ-<2e(voK-5f%QWeke{#?;&nVX$9xuU{1duM~ zb@UC1hJn!%KE(Vg=OpjSP0c^=eof@iW9usvyE?K988d#rp;b;vGr#DsdGnIgM_9P& zW@N6wn?y$9>@>!7i_dKTagPq=j|0d-R 
zov!eUO~1p5yhA?V5yt9Y|GN4&Vn96Xt!5OGUdoB#J@qT++3A-wTh-f@Y570N*HQzI zA8u~i?@(-D+J=>$EF2wfxQN}9t^x~I!mH&kSGQcx;%%k|qezSP+aKIKK{>R->z0ZW zCQD3gLfM*t&SX*dJIQj-+uZXkQMW+sqF~9QP`2=uX9ayv(JK_sj*jy_{N>$}0E)t$ zIB2;SuVBeZAidpzHxwTzbO;6**06)`#f{WFv+9}@kLqFg2v|<#ARS~$q0R8UbB3fu z=PrY2Pq<)S&#_R9$^tfS=z-pKTO^fV^^_BS^YC;&EHnrcp9BenxH4s>VilT({1#+x zJ>_4lu41`E(U3==8P%5~!0Ly`zC+9Yx0DAn?m=T`f60jgE~qv94|%3)%3n^cU1$)g zxvnK+o2V+1MWiuTGDP^x*g!saOltTA>mf>!9H*<{>NFETY|I8)3f#%pgEc z6dTk(g~x^E3YPjRKWA7{|{iT=XxyNY9I`bPOqrH_Ng3tMsIiAXg3d}Dw60l#sVHy%-kmxp>aITt_}@QZ2|tof>g8uK5o_=PhLyv|n{8z&wc`?IsWoN-5GUT|p0OJA6al9FFT%EC5)3&nr`<2}8@#~*p-jxYS1opzD zweWF)sA;9Jb+H@!$g2HMC7~EGZ{+hVOa|Y@@LbGqgMeEi5nP#NlY1W%_@rqAKFt6% zL)w&o8B@b50>youmlxx*)GYTHe%BoKso^6SBVYVl3_3kiSR5*Zaip&N?0D9AgXB$s zO=z2~(c87&b=CSnqWv3A3|_!aITUMLi{kNX_8IY`GP2fh6k}f6vC8C?b!C1_GmS&{ z=T4`+^jxVW5$Y6>mhO%#QWp@Es%ihPE~aeP^dK8Cenw^FO#4ukw%b6)e{~?Bhme`m z^Tp)*gD2HA-HAW|{A(IamMiu4a3fQ^>d{M#)%NXuL6}i}b7$?m&7Yx;WVs#H6zV%n z6?Yu0;p5JUM18ftgGiIHxaQv92{X)=;Fy50bi?nva?iBgXGHMiYfyhrkE$%~p2OMB z+)2JBUr}OBOl(jDq5hvsJ{Dr{h2*3)2ALdst#FQw!6>0z{O#+DH#4?<0F9A4PcDF2 zrG>OQNPP3xWwy}h3foL!Q4B$1J?0jcegg75=1@U5r_2Bh&oz35Nul2+{)^*LrR%AK z*LX1{RX&sng6%BH&zo3V7g8y^S+r|~u>X+nv8}>&s@dkFD}6{p<90~&(b|+^3H3{+ z=Tg#xS^n^kql6my=Db}HhvA&$5ady^>H0EOu#LIC(rG$ODtDwUwopw_`k4^h7#^&o z<3CR|25s)Z96vPcJvG=%6g+jKxMjRG?JJ7-6xAsC$H4S3Do`|lE94L5B6yA&e6K26 z{Tinu^Ls~-Yp6AddO4=l^oOKk=Mr8NNSdH^2WRf&xB0vh=c}MNXsqFn+3O&=V?i)a61mU* zWv{)6hK|9g>5kMZ$*v-5>Ls$OM+u`c2wmPX#aG^nRh6r084X$SpZ;al{|8?>O-fhz zWG`>R#4sDWRPT1#5tCpjji1<VKN~mb&Csgzy#}o_DCPVYjMk3MehUc@> zTjWKt>AQES{EMZBT>q62>;1y7!B8q0IpR%BvzH6vOX3og)vv#K_|8(ge?MPr&nT6(7Uig}~6*bEI2Qaw3ePKV8y4uu{@J!6p-w z_I;_CqVclh#gi+9(XwQFah;MHg3^Yq{uLi=rim@NL&G$KeS`VDD9tY413SK}7XdD# zXgm(N@4P7TUfO0!g}1KAJYYL#QxaaDoLmQ+H6ETI%ac5d&TIYrg=3(kPB}1*$Z@N~ z1Fh8T9Y8KWZU3lcyywx3eQ#9sh=BBjVGf$x6Q0N2b_d2;NJ;?`R7h?;FI|=}ghFWr z`6m+M(%s370pXzx5(DP@r#H0y1X)>Irwrm={doAKTYvK{UJnvimH$Y_}5H3G1I@;6wM!&~lS}*_TumZDvp!e$^cdEhIX8*|# 
zP9-F(pT{}AGb8cKJnXq6^Sl1_BD}12Tb@+woa9q^Ev%9v;EHfe0~1D^mcXZUU?P9T z-iQ;uEB1cqZ?XJj$JF2aIAc|-@Y0tTE9}NQWXM3q|NS#ZyQU^0;`ySBD6_AMnYpKe zj7NQAL!!meFM9EQKDESGS~`iqC88)iRvg#$$8QPWJD$=V>7eTAUPP~~m*NFN6VFLF zk|PmB!dPA2E7Czk&vP3M0vuu{okMxylOv;~$_Fz(;e;(H7na81(Rym}H+_3eE)riM z1O@itoxCR8Ke5O_r<&9@(|Qe%tlF6`nn$pCYc zZCD4)*iYg!HM?23Ntj)f6n39tTz`-4(kqS&_$8wna{exrw~=1NMLBn$H7b97RQ2J* z4Q2f$!uVc62(;T61LwS&uNxzCvO?0?iE8nels); zP^({mlUn=dLCHXVFY`G&?NEh_RwYndv40a6=#-Y$dVFFs?Qp{y9g{(`*#HGdFhh(1 z@&DpzsML$lYN})KWH5>n1kgc8;w*OKIkuR)Pn@J$FEXXv-$Xhg4g@tW7%@ZN zo=d&is1Gmv29lt<@FJq$W`W(z`EBk`)f&2JVIV8Z>n*F*u^4N+wB8_ zNxj?y(0lf;Eswz@x)LPTZ!Q3?n=AwJ%a-YgOFo6kuQi>v{k@JmLt#1lM#5C4Wydh? zPO2U?X{txM9`R;Wv(P!c8%Cr@1#ttl#6lFy3>*rb;W{#v)!h zuaO64%sJjrrBr7}l`yO}#*aAX$m@=;HR69t9`VQrb)Zw)8#xGx%CY$~dfDJq1X0od zB|<(*4;rHa-FQf&!=H#@%BN}n$$%NHVmJoW^+{(`~Xi70Oweb^G!j@}@ zAyyl#wvw=H_lCO;kY&>Ff@WlPvfa4ezrR7vrob-d?-&Np8SqHN0@qn?jQ zzJb&h!&on22Aw_J5Wv{!5@b~Q`EJXb(OOHT3E}66lJIK#0ef)6FRM{AOfTYO4XjE7 z94u7U$mO-x4(MIaJHKOieB`;!A}@n4N(h+ihxGx-G8?AsuGB&CUQzz2fIWbBncC>+``EG+KV(~X;jghwkg@%cx4 z^90bV0-Kvy;POxsb!10b18YhZYljcp1zc;Bkgt+6OK{7J5c*g&T!M zRn+`kPcr@qm?P39e<#&_&MuNEmv$vRkeV2$|azzdu$fy zl}c!QR(JVkE!gr9tRIIK!?Y4w{zlMTH@+b0;WVwbKSaid#I^_rI@xPrd*FQb2@M*bs)xqn6!1gI}Bn!z3Mnb@w^Y}JL zj>Iy$sc9+O$jG9b$t=k(i2s=ea%v6bpb*OTfzSB&pWm1|7R=9#&wpeB-qo3!_g~VL zcqJdx;f<`~`9jv(i!o9xskTQ*c=di|QY8%}?~{p3y-h0p3>H`k7kx|UY4`Ox<;Yor6v@f^vly`krv(B zt!E<^hp(_j()M~w>mfS!DXILG*A^S9EE=57nXpo(gsCR+P{>Fa`gImhrTatloUa9T zpf0Lx<6udJ7iR8NU#EUveSUoBtaL!P3WPUbkiL2X@)p_zt@O~OIdjKD-$4^k{f0~{ zmKL&(BJjHQ1ugD&plERrdI-D@-~E61`tGQvx29bM5d{(H2qFq7O+b($HAoeZ-lRr) z2Wg?2hzQbqmEM~W0qH^%>AiOd9jO5#kdTnv=sD;8&b{mX?ph>&gy+cO}K!q4?8O zbO`=6t7hP?&F0+H-MMNH>@N~9=Irh!5WEic#ikSR&YftXttpb#cbLL?aprRg&7%Zd zkxusa1D+<+=ANxic6RKvrgVE=>JUvgUj{v&O`v}$lEK!#)4|}mMwdGpCO`F*4_4R9Ex?tu?mNI-QDf}IQU11MAga^ixvP$>;`efxaFhb~o4$lD$k;)t!;m&uJ z)6=unUNWEW25w0QG#xdA$vUz5rzK>d`s7Ji+zG&8=TS1=8EfS%vN(<3?=UkE#7jn+ zu;2@BIc1(RUXIFIPIRw}DUoq>Fdu)>*gIV}f@>BE_5r;)PR&?KtVq9@T(tqg 
zVJLbzz=?y;XNTXItP)y$BSKb$ts(R#&@5cr-9LBp7%sMF~0L z2IC8IRwyACkzWsIv^Q-{s&H1wl1YSI8x+9dxqNSN)_|c8_$h@jCFaA@_p41keLyGL z*{=;bRk*qQbB3|cl`Zueb5p6QY7oG)rDK;bB``nJYhrd4bcdFm8Sz!I`6gNHh`t&xFfh;KZk){Ejxw<34*yuAxNB{KYkOI=}p zka&0M13El8^`l#0!JzeO?md<3wL<8OI%;UE3ojJrLO$sz=m)m;3WcHCJ+bXjcz1~m z<`Dz`3*sraY}WKm&okE5Eyx-ehslxYn(27aj@ajSN0>=AVTM?ep4jN6K)X@|>ZOw+ z%6Q{|pAqNdVwn=3aPyBcaHY~NoPLvKKLkXLUf|8@hHa+Gr>zXY~o>f(;nBeCx>*L2BjFuHS5sglOl)fax4QD3^Lg%ti}Z_%iX> zt?gp6&(BrzbG+D6Q}j}mhYmg~GjoF+;*&;+$6(MXRzMee)WHrPderOCh8@wC!DgHx z@mQSvjo23tPJtE4zCS8WeEWBObShF)J_F_9lUevU{739=Vc~|e~ zGUweY-6#AytEr6~!%tW4Gq>FAw2?CvUij|`R6xV;OMI*nx$F!c0j^%HKRb+J zZHw#;=Qc-e9^G90vN@M2<^lx8T zy#Yw{TvXP^S|&gE6>ITYSpbetbX!~K)#iA&^xd_7pG9`nHwZ^^e~2v;^0`H>KYv?O z2`|IZfrxXEY~Lz@aP)oKe274oxUK%mt1YGa_=e(ism4Y70;6jtflcPJUx>%$3$AT- z`{6RW#|fU>y?vR+4gGmScFf}Jdu8DM6U_$_*_mmphvS@S*_nBI+?6?422^S$G=jw% z1dpESm9>1ISr`x!l=TV$P&sN{UVCD?cl_-1**>kT&g@?8_RaWS&)zq9pQYW@G=avT zr=xoKp{8=vhiz*%b`>uH6&&n-ym15?eco-ZLjV^kcBBi#`eg%z6k1$zm~UsTt`VjR z;&COhwbjtBAs6Wzgznj^YCm3R)O{_y4y!O)Z8=!%#~&|c*iWc#SCRx)Bux_H=@5T0W>V(xT$YX9 zRi)|g4D5Fn#J+t1V^2&OGd5N@H6Baam8`_54;+^b%k<*0*l|KKcebvs7d3I`L+nj= zi@+g`f!@*vW4_~*X~g;m*qQCs^K!MhoYuX9O>eJH2C7ChK!;1@5C*$;HlT`Bk<>}LZ}(*7t!CG(>&IaTXR}AQK!zH0d2FO zQ>K&aVaNdBv@h*Bhz{wi-~vt8Zb4k0tF_O#ub?IX>qO7 zZv;%Ldu`slmD}{uGV`b96A`C_=yM>4Yg!)NPph@)lb_=jOZ zf7eIa#zll2oE@cEYqa7#jG-3qQQ5ORb@$^Nx}hvaC8}ahyQcwOX-I+CY8&{y_>8@; zcYjl)#xj;ZJ=29xlpia_J->A$p31nCf-h$L{HMexybYSieTS@eUk2aw9{X~2BHrfR zG|5TM#)eqD#aX>87=>}Oadu7tod448Wjm?vy)A?PcK-l#Oy%FAM(|ZadaoB9} z{&hWzKF({TZM#=b_kWrQT~yQ&H7q^-hTA>&U3OBwoR;lE$YI1B2mRPW?ZY8qEGEs2 zO^^GL0$Enw#r!xl2C-TRg$M29!}gWr6AVQ5927KgihG(0Ghgz)1I{sG?qKH#_{D@( zl;#l8slpIc#y<)w4a&?pPadweQWGoEiLvSyc)hd7=$<*(#6ue??m9O;O!;P*V!?ZS zGpbE;Ki2QKWCMnfB%~*5v#aF$nK1JeIe5cPd?|&5#v`M{68Z2_GiOC6@Tt{BFs9`i z1dP)aruweDI3G*2dJYL}Cc!0db@PY@8scc?&&!`mMtAbh8SKH)cgLPo6`w-xbl;H) ze9x|YlQ@)1Xm6%1i$&6>iVQ#5m$TsgodaaCaNV;Ui~GmRBFoNed%`>#&!E6Ft(=R=%gBe(iC zhBLxk00{S-UnoRkl96`Ze%srL;UA-e 
z{C%QbFr@oPXgf}A;|{aWuaTdXk0qom)^78z$?HntXWVbLxt=xmLV_UM=yFghHYRwp z;2PC4HlMx_LI}KnNxxfy#v5S_-cf$PSE`rrA!(@k7vn@nSaFM}KYBkbE&u8nvslIU z@o}P=K&=Kb!-xKF%Cs$yH4+W?;X~nmgfvCcu={Y9Bq{^~7GOPZ04{fwk%J2YwFuQJ zp1&rz2oZYmv&Ke0%&ub!D#!=k^8q)@{1Z|Bs6VxXtP&roml;c+aGhGM2f>-$NzJ(t zTDgpFNYQ{W=;`#KdNboyo7uP6&Y?-if!)lYC;k?pz;ggT)T)G^1FfcQyES)5MzDQl zjlFEF=G@>XqUx~i?9MtTWS#%2KaYwWe+urx%*NcO+n^p&oRNnlkW--iZPSXuyn-Kh zBqTC_!Wn%x(X~;-12*e9hVV(@l5*B+vkn|tQ`q#<*W?`Z(KON%<_^#VKw?0W8ZSOg z5;9S?Nr`q!423X;76aa+th0j^c>q{!z5&F}rZxu@z1Vtvaw*w5)rnpO?~L1hl2TVf3`4u=WlHbK$@0gqd~T^jHP8Bi|j+czvVwY20H z44eQ}Z$8uiV^Nf+Wuplg`*Xxs=MJ|P7a{z9Szs&x4Fi6n{kb?&xWM&9LU=U(KB4T# z>_Vh(a+Nm$ctzBCcY_ZRKoQ+W2E0IW`l2VEEBhS~CVS955tOKVj2z&FMAWfi5HsI1 zU0^Z04$UyddjzTuogkh77!*^Y0`<6@^?#sY|3V8C&=gtgf0l$l`XkQqswtN9G zvl7C?5XYb_()!P9-!|8qIX2*O7k1mrj%BV?`I>l;m3@Tup)>tkDfp)V${u>)Kjd?M;hX)1W-k9?_LU_D%t^&q4scp22yZHo?u z`9{o&`w%@8g~vu|AO}ZvAE6q(c>yJx~T!Gfum*t!NeQzoidG|djlk8=u7;j?%)l?!r?*?Szj_*U%l zvBeEXXEZf|w}Q^R{x+M)7yBYK=T3~m6s<^g-g~%v&Op`x#fhKbZqgI@lOPn_{$bh~z1hvY!`*i_N zjx|4?4$Wk;(W%?^HLqn-GvFD9jKF~qY;PI2Ytsh3YA5xEPW-{a-E6PNyNob0Jou<+A+mge60SWT%=kd z-daDbkiR2(m@gg2#zE{5jy1;J@J>Dn z!qAsE$D(g%U#(bS7-qx$o*~GEnK|%h_tH1$#V?cDn(A;=FNNCSKK9GZ z+^6R6YKa;*%{0K?V(R%2e3VM1U)irSa<#)#H+=b;KndQqvIn!|$qu`4f_KgzEGd@7 z0l#1g$@fcLj376}iXC`08oP=I3z&$C?6E}>D&4M4BM#Pb+7KX++iq8yHE4|JqQPoo z4}JY6>+R~6QJ)#0-CfPhZ$hVr{2PSnvS)`mko^`k&WANu249%&hy8>Nz7PRpS(gaS z#irrkzqb;v3ngB>%DYf`c)_?yzq`bie}7cxNxR4DW#SycvhTl>XZ$7WkD-SVz0MUeGiM<(i`Fh3)@B1!xQ1p^Ju2Gqx zI&u<_pJ{=t0xz9VB323Ae_HpX8q>M#=r@>I!xiujxknGiV3ayS?hL>!Ben~x68wy& zFukE?)cEyJ@Tv>6(=3tlW)Z#vC0qg|&^9B!`FM}Io^L7UOs#|iy(e@?#XVaF=6Zq? 
zQ1RecGn!D)b0|KP!vg%XZ}@<8dt^R2KKqu`Y|o;qOOhMU4NWGY4X{Ti0tnG6XN+!c z0dDsfw4QR?Lb_MKE{Ksv#Ur;&kLLOo!(%biU!J zpEZyizXy547mO=ZhF-HdIxYDo>y`{TG$TcC&14%aWLU&E&01Gw{s|l+Z>$`4tS^2pduuJJs~Z_Ey^1<-#R)qm6a0nhMW20Y2-J(^z%-1&oQrf#Ki%&a ztPU7Wf3;VuDk={g+=r-l!`~}ZS9C+_j@8bmec|_Hcz3jCC2!QO;A1q;*wBY2m!SH88OeL3a8Pyj~7GbjWN zn11H!zsP?V(|7-FEB>~5;0|qNnUPq!;dq+3QMBNMf8Snlrc+JDAu^1u;cn9-$-XxgmfG2^;Y}`J7hT9__J~n}!J&px#sbH|4NivvE2L?MquZ?uQXm8iMI3Bkm7KUT7BYdU}t+8Bo5*lho{1+`dHcMw`H*2ia z7+_cvNWbA-nXX?iBu8Na5VazyH)bM_cj#%1Kt-m6x)dGO43hA&=?Gf2Ca+bv5Y{qj zt;(dn25)$dZYLHv%mE`T?)v|_k~mJ_jjY}WWiuhbyXN3uq;4Bi#LV1up6LoN=Wxd% zVO2`$RZ2nW>370(xVL)3y(W;U>f%ca|I`BXLHwxtHl$7*Z-?84Dg1OvDbqU!kxuIO z;VX6^UMb?yqt-Q{-=7ajU|ysdT48oSt}#D;*lxXGd#Ae|M_0Lc3^<$?g=zJTGujhS zP`5mrvp8BiSRu5kO`?@pJ)%g?@kjq55*8i#XXo+$BKl^`Uw|(~3yT z&$*YJs>k!IfgW;sTDf|BI;HfNTey@wHWRi1sZaEnYzVO?S5i3%kD20ek$vwkv8xfC zQpt|J^`vyFMMMvC)E00JP7rPWd{Y7KGX1o4$3y`|z2<(}2sFpVe61j%Ds-etv?#*t z^~7F9je{nu^c#xx`Q=n6#j8FiV-3~4>md#pEsht_!;hs!YODU@THba!8fLvBj){u< z!QVr>x9+H~I4|ff6t9!A4D37sSiZ2{?KP`oJMq6>pjENKoyz9n;fIQ6BtuI@8)w_L zUUFo}F;pdCOO)S8CV17#cfPvD#bP4^e79Ny2$?FG_AUG=NvLuO&F3?GJXLcB-IDK? zDDLPCK45Y%_pDXNG#>YW^29i(>Okn|6ZJ-j;{#2801RBOg)`+4W_obm>?B;lAZy7K zO)Wi0Jt-$cWh>YgmZuZ()0KYPAtfy`PbaK2iCm$H3;A;uYk9xBl2r7#0BJD ze(~!?bPIvaf0GEnwAZq5e)eq(8za;^AyWnkldzZCB@v7AngtOb>pkhz=5xZ|`!_vz zt8EX#-iYU#bPNi#B<p{l-=qO4Lt9y)L0@i+Mq$B6vB}*sw^2^f2n`GkeDFcr)gU<6 z3rmJ7)N7h6#i9LupF$|6yWsK7L&B z%)t;^QFeo)YVC$~xRmx`FG*ee2GQAA9<&?awk z*jWbrN485V=ygQ_*1OtF3&#t-7uYmr9^$ne08WdPf*tBC1ET;>rZ*3dwA+pdEw(eW zUO5(rl-K0Jx=sQxjehXljEP#A4HzTDwz)I1dx%hyT=Eoo{B*pQkL`U>5D)3;&DU$* z7ZZIX*!*F`{+x6t_stFW>_FTjfjhTt9g7v466~fo`CS{UatOyjXgx-45+Sx#oxQqN zQrKS(8ku)&EFI}d&RZ(><|q5V6BGZM=y%Q9t4qWgz5_^)uOcBBBG&UW;QH*=ur^P! 
zFGh0dyE0^ddIko;CmfmW>G(_cp8?06B30`2UI)QEaO|;C?E<)Vf#Xh0+qYl8ZkU|i zJU%#s51#n!&r=xOoKlUGI@Q z$OW;2N5!|Gy5T-FCytxe(l5nMU?&4PP7UIopiIte$^J;?A$JRpo6!V~`|P7EAhyT+ zYFsv}r>8jyZ;72X$ao?Ui$rJI&_4CbLF*NjcESNl<%A z_x~?df2*i zg}s#Rc;&NOd-S}he$kPjlZFwoTy;w6bKeK|dJiW;=e4Vhl#a^RAFxB|SFo&i326# zC*)YeZvvZ0Q&{bXU7u6_WjjZ$T?1U?ky_^+!kN4^LbWW4%${kgXpI3WUhm>swiLmw{GGTYrp zqH0_T00E0*sCRX&@zzZg{$PWi{cJ7`V7PwKN_QUM^E=|<;{l{<<1()iOY1KkH%qwr zEyi@BRa%$lRPem*)1n=g^O*r4Ea%>m%NJlp4FVfT;vciQRqDzc%^$23&o z4bzoj^_3)zBvAx%CPLr~2^*bBo22=tY^Z|cF7fj|F{6S+G;qou0~oTdiGrDl;EEF9 z(g=7gTgzvKpgXQX;bA6v?H}s^PlhY`knu4Hl`)5hmI>j-#qf(@Tz%{-(FyPFSP`@< z4X=^{l;wXXTb3g;Xjv8gFiEl*X&`2?P#ej)y(+%bO2(FJos0_#_vS}8Zdc1ba#7fm zEr*Y4-eMyaEe%j!2snk3b%0-s{6pbDC=T@}`DjP2rbo8&ZbVVj>1$Bc8>6E&HOsbkW z%y|BH9rzUw?W7l*vUD!NJ9GT_|1d^|!IZhx)SQv_AwBD19#l=HFBlY{LK)$w8vlsa;5TmgCYgQgN5u8CzOwJ$hhTUgZ=IB zh2z5XT7q1vCJ(~Zn4cBygDd2VB-Kg&ISK;3-qzKz5AxT%fi_)A$wA)%cYC}1Sjddc zncp6u1b1uq$-idTe!Hlmov0=wv#tDKkx#I7i$5)jpH@L#sE7Vfi5f%%X{B#F?+LOD zUHi!O!ChurX>G>|Vip+~AI8NH)TsENyklP!L5QWcasyU?7yBYQ?@rr*MHWDd+`12$SqG#7l>>Q2^C0{o)u8A(J*f;5@uDhWN5+1S^Q#XI=8j!0ZSsa8yazF*6C@6){6>9U=1(8hHSWTybc*(uf$ zt3Paw!%qH+_@bgTfZhG;B8HN{JQ1~;+z!V3IsKU)EAmZ!zt2;^6)b&TiZzBesY=FZ zM^AeZYH$C(FN`U$Wke_xv{$Z7rJcl{T1zg(-jK^-P&646?#(idC;H>98Aem?gEAWT zhl+)|vTxEUq8xTtdm>KoVGp&{=87yYVA|6<)ZUSli^Mzf)n`#-_hG14Bpj#HMEIQ* z^6eenmQn>Bn%X=R!s+V;kXs+!qb$?j(eadqh9*+#B{u7cRZ zWC2L}qNGnH6X4D1+{FK)T~IR_Sz5()Pe0~1OHQ}J*(_uN4Qvq-amx@rt zeA-pdE<+#YE(y(Lyn-3*;_34FJZIZ_pkY47@CMbU*f-*L3yx`)GD8PC4>gQ>bdHt(;LD)cH)-@eXr z&eg<;V7E6OH%T>dDbsG;x)SXI**7Jha=H-s1I>^+6iAoRrk<`P;gSbBj4Ra&j8VP= z&oz~6HgRhTF>!4Qv26O{h(%kNIi3IUP5&|i|NgEU999%3 z>wNcvXTMS1sVX{*Wl!&DF*nzeRFNb=Iz{+f50X}4?^`(+BW*$YjEw|Fo$U*oBdOEH zTbKVx$#~hx#G!5;B~1}cm@yoY)}!@^Ua)GwrD0#z%jDjoARk|>q%B8LwMVe=3}YjL z=N+pkTN7p2;-qMqz`bpCF|Mk=)MWQ9RFJ^J^eO~#+4V4xFJ$Hd4IqU~NBs-0L>fiH032n(9L}6!3lDLo)7v{r2N&3Aaaa?tBe+g8fu9%i@32&DeAMS=23b zonx0D)2N6A#Nncvy>UYKAwII@lhL-MegVsn&J&9WjN^~NrR#ro<$sfjQ?x+Zb1r(a 
zyD|0mUCZ`U4R6(D*%5^?V}G?A-ac_Rw4Cmx92<>iCI50Ed9Av-n);-s-sPI5&%5ei zDD{8vMifQ1Z=`-TD(YINTsBvq_9I%Y>glzYX&DajDZBMo48mO+L9}7N*0O2vQ z_o|x(;-Vhsfi687>Uo<#2D(8^2J+YWBVpX+pj!dqh;cP@lvr2X@WujLTapD}wQcTo ztS)- z%7%^Q6g>^|-ZQ=%YqwtQvv)|fTQ|)+e>}s!Tg;KLdYzg>W7XM9Y-C!J7lKi?HJEg| z#MW6bx`;R<`gvxqZvX}xLCtOSH~JqT2gTrle|=#(l<)N(Xst49>#A>q=Ip=IhtR%$ z9|`2?&_>C+bA+)wy`Gc`P*t_(8Z7B_$K6paFFFi1V}-#Ei#Ide+R&RBFdl4`=a+dF zz_SpKvKN&G1$wHb6FcbRp;7yikow00s@Oa&QFZ^#wJXA+fosfdYm~*cUjqvo?Rzy- zXI7K$4dF>CB02tC%xOuJ&`0ASK?1+l&%v!*F%_3EY*m>JIVk}0JkaA2Q4MDuUljid z(rv~ym{io`{(cK=@*&HJ3Jd>4B*%3EZ=zbHT4Y*e%3nZK=J`cZwPCPX8u#S)`vSRv z`rBT*5)K2bs>wnd9)x@;MiGf*b(f<)Z!3LEj#hp!iuFE9rUw5yuw?w03=pbD2GQ@Z zA`Kf@qkVm9I_zZkuJGIke#o^4wLYDnAemZ2;~mcerXXz6E!P2Q6FalG1gS)4k&d(~ zzNYgV$G3z{DEtRi$J>uX^kISQBL6J<|1AHY7z}|`B`lly@KT9c(fBc8{^=3JXW3is z&lw+f+9(|=l7o6AJD_j-KLDRihz;<36#3Fc^``Lk<|{V@>3q?i6Q{-6CoEmJnO@8!OtJXoYqI#oVVuFpdUDWSggTD4?6Z?}2( z6gB+&*&#)N`bFa!4&E&?CMu-{yh%=Vk91)bH|v7A*m9}QP^$UsfGd;^bTHT%x13Bh z*I0(#Y9N@JjbT&{JqWCj*>NceK_X^6@5u;jGO+G$U?F@c_vTf=H0pNZb1W?sjcvL< z51h(VAcF)LSXPfv*A_GCUlPeMec2-IRQ%LW+~W63|JlO-`T|2x;ny})(}RM7GF2`p z19h-JB_i0$!%}WhG+?4FLM*&je+KZZ79?KzfQocqWwxthG^0;_Yi}UzhTi^X1(>kH zbj{Hk&Y{J61b%axJGJk|&6}YwNK$zQK@qw+4V?LRlj20l!vew8L2AO`k#CV3B_pLt z4KL}Gt$u9`3;nfQf3&Hw3>L5I;B|*|W_`G*;zi!zCS$>6NsqOlx;)8L-^vQ+RYtwM z|F!a9hE8wVY4smMHs#4Ylm1z%t*TANm-RDsC9;x0hE@l=2me`nZF^9QmjoDQCRb{( zGtsv(F#Qd~a1_OLfWN@ZjLLrD>bQ9Gw1pZVprBi-7I;C^57odn3qsT}@6LDWpH1B8 zY46YeYyYp!Z?u)iy}!_$3Q`~subnY)#q>)geVt+_6$T1QlTcPDq-qKjjs*5k>~aeJ zHx+u`#VlMW9)W>}?lJ_p@LFdak%E8&c_X3XTOtM^f-o><0QKd=GHdjWrOs`L_ z$@Rv4-d1n|vrPWMZsMao%Y*MPWfrm1$B8=W#&t?%dU8}cTlNjQ%|mWn6?Ojtr3WEj z7K^fqx#E)E6Mk1|Twq?N&1ZP&`Q)g5@fS(%qw1v?{J?JvGweg;e=!*XB4|cl`;eP9jYz%5AZ+oEz z;aGM4OaLWOvT8Ks1d1cAcU_!g*!Yh{`4_ms;suB%J(5?x-v^8b?Q$*JP6PGE-z(4o zG=Q!s?Q;%tV=`p!Q{z|y(+1T&HbI>$%z3)Au6ByLrTwLnxWUu$iBU&TSix8d4_#>J zhcZ3hs2h~=@6YSU)zeL!8ET^Hs+LU}BI^V@v>A&=4wFb<+GFWSKfafcncA1r%mOc- z|JiYn{d0$pH52xUp&Fz`39t&wGSmF6>aHQ!`6x6=ji8-6{Hd2Zy^}$;#AYy;Pg8&W 
z*Kg6ZYwA1r=$OuPawStc5_|8|G1U?Ds_v@nH1{AVRYTIOp&Rl??|C;Ri)MGuHT=m` z*d}&G9P5~>pS+41k3VxbYJn)mN5wk4amk|{F8zi_nFd)Kq2;&l^Csl;Y(kCTtP~wB z8XYYuDpQF`&f^23794kV)~~-4Sl_Pgp7+HeO#4D2xBSl2OD;3sxwAetQACF{{EF2X z>D;Ph3v&dWiC@F84x*D6a0$*WjZ5sOAZ}-1D+zy9Zu^2|`{#%LsgrvSt>;;;slR9I zuHv!#0!Qa}{h>A!%qRV^c^zOM_tEjB^t4minuAt0Oa#CY?X4MA3QFc_A9B@+Z`FUbGR1)oH_x5|9**oD;Jh^!?1#& z3(8oM7fG&SMb#5zsW-{fBgvyn!G*Nf3W_*r`FD8j@5BovDbvtW_}n$jks+6pHc_**?8A}EKsqUsq1{bwPnMeT=&k0*^oD1NYaP&v)-CpDqzTPddVs+lN$M9}rq z$*af(>)bV||0M9-0Nh?0VU*i>4vcjBOE~p8Af3(Bk6tY@DIR5gBO|wpbjmm@@*A;> zj?(xYF_=!}W2vVKO{Q1B6{&ib$(K}AH2(`D-MRM-QeEdMEwy+!Z`_!;eTOmEoVCyZ z>BFUxRZu;VH@Wlv(KP%h=MOxqqKBRp%ofl7yrh~C20xo}?6`m_@B|&2{E{k;b1A*7 zb%+7cs*rJ@appg8zz5lW4)tFyu^qV@Y(hC%?|a>if8o^`HL(Tv#8v8W$z}`7tVr1{agD>xCAG|MBCx%6y1~}G7mu)1Ao54 zM_>8BKB9NMIJagErEht6rjGs>?EObt0dF855+n)=*m14Ft}SX-c`QPP-+;@sy=y93 z!fE;MsdNi(RjhLCbE|z_PKh32?;K&djsuMx<){DcnLw|?y;-80QV?42@D_VBUD0|K z-d&vn4g7%0%( zekD|vsXV>o^f0DTHd$J82JYw|p6uS3X%^8RFiwbV%U&hL}Y9_8GU z>2JqzE(sQXKKj&in)-`gC4j!~oj{xw3r1<{yTI|_~cdMi*>P%~o$ zHFN5L@#vMejE2o8kZVZE>ookHx|nd~^FZ z?s@^MpF1Vx@I2zHd+Cg7kx5RV4i1*T zaU*)UaAtc1u!;jxt)6_}iy?j+W((kmg(NOV#+gZa-IoHCj5r8R0IqF9*@5q3ua$!% zLFG?L^tQ`a@011$lQ6{Gf4ONlALx$Rs+7k2wB4Ka=~S)>L^lj8HKk(>F)nnmJ6Zo1MoezJn9q8EV(a|gJ07;{Cqd5$L-TTt z%Y^DJl8ozi)(Ab@Lqy3VUEr6}L_i-rP}gd@*zgq(PS8Bo6rhjyKCatD9Cbeu4tfg8 z3bvf}?Ee$~5Ihnw8;j~r%i8GTo6i_aDIZIH4f1HxD@;K*^?4UpMIfOvzj!yue*9q9 zBPEUFAUuD?Y;pC3Qj4qp)c*uqZJw$nvqdcF^!{Z zv?1-1VRrf&a?vZYllWz+(XXh|drY1u8)x1h(7R*G7q)j}8T6i$%|Aq~g~vrD%itNV zcR#(K2l3$D>^@-y!ZVMz@R2z%E)Ms#xYCXbO*am=jnh<9=x0IVZSFUB(?S8Q73DrQ zoxziKN8lX%{|BUCnZf)4xT*-gT`M)ul`KyaU)HK*j$#94M>(w@rnWflIQ4 z04?v!-9pQ!ot&4xgzJ=DA*wsSlp(5~@cjBMrhR?}yOh@-&#xZNT;r_1w8Pv_66n-u zO?t`5>@M@yH{bGp^|Ov8+JPTa;YT@n1D3*}aXWpUY76V6JPP)2 zHLU3$r1S%$@2{S%JdYCH+min|%5cmY8;LT05PBdN$h5@!o;AhETo^f>!|DNAmSlV% zXf}hRQzYLS?R;8;cwtdak87jb!Y4b1Xa=s9ptA(tgbTdcR>MxYUC^`Q4wVfCfZ0D( 
zQ(>GRKDXSGO~4{!$yKlmOPl-9JN*x-Y53Jfbr%}*HlCT{^X@0pPB~aGGT&PeGUP;)jv>{f6mN9@s|K@fsId{ebD<^&8U}vTw|d*c5`;D)PugsP8)&Xm z${jd)ckC{m=wH=lGE|ysiQ-pe2Gb~V-GG%s9KeU@tg6%?$FcYFZiGMfL5&C}&= zx>}XS1TtpC)h8WRqtG&0WDNa0xAQL#bzg#hE5)84B#Vs|?^Y5MdMXq}Q)-|Z<;rOF z+BYl&EiSRh|HSa)fPgR~YbCVeKkn*ymB)r-BCjXKrM@ES0;__k=%`TEfNsVMFyMNM zd3Wl-SwF|y&k3j|^u{oHCJF%^!LLFxw@U)IB_`;onW&w#69N(dR4e~klj z8#bOT1vC*psYPZ1)2vHEK-tWMuw^|m2do{nBvtptuu1F1gS$LQm@cR5uy{F*5JUe0 zFY8?|zphpiGYGiIcms~jfUi^H4k=NwkV8`J65=8Nv^P_2i5ukqXQTg{cEQCSD^e?k zD$lgJWN7A%I1)z7nv+d0O=>D=hjfHwz8|@@11)fD?AuV@rNP=u6LCE_$>xc>c9Sd5 z-u}gY^t1qb*x%{{*Mm*&dDuAK^u~Qq2o4V3Tc``xVT=d8dL5P}>64J8dg+m9b7t#f zWy3dK)Fwpw0lc^?H3se3ANt4NvvLQ;>SqFF+=x@1-`rcd%}v|;2c=!kql-itiZ>m; zNaj9%$^44Y8*O6uWMG($c4`4_2skI$f_mq zx|Z~hP0~kq(sSum=s$eeb(`h(19z=T{q(ducFsSp(%RSQ7DI&_CE$buQh$e%8zp6H z$9rr~3{5U2sXplH?w)OQXQw{JQ-6J_`_mh{r@V%e;RWLnK>ILR>JhD5dAB^C=eeg7 zh%p2uFajL*%xnQC&og+D7H-?L5t|)Ij;pcmzq@;O{=;OGVG71EYxy?rJ*8d;{LR7s z3@#y%=IhehM-fGh&dEY)-e&=0biiE^-r&7&lfT#C|D|lO;NwIN3+o?O2~xdDRNgmc z8QISOO}O{$gkhDMHu+XE+0AF2v0oG3V`%`HH_P6^(t!5Xow2Ki5r|Z}rpH>e8WVqC z)~y`wT%RTmq05v%^7%DaXT^|!y2&9=klb8TBF9msD!Ce;x%eBBtvmMbZ1sJ8@2}KR z+oHCOo%xbhBO9aS3~);`Yd;&Qkx_r)FVhLbN3$W*w1UYUA}v-X>;9?A2`BF_Vh$@O zbK>|NWytH#Lz{di1Y1iq;!teb`js$mxHkb>{I6&qQLnoRee_uv0hur*h$u7D+RKZW zN2_uooe|#A6DlZcU$0Zl&-d7~uIb=^*qg!_wed*PY@;jh3G^$fWU{HS0@e!y%?5_k@Y44Hx+ag11u~U|*Wronz%y(Qr*ncc| z?D8e`wGsCoj$FgIHYMxy!g7Fnudx$19Lm!X^4A5D24dOg-`%TvJdhqrIQV>(^46G> z@BVZ!gg4}>UoYS9hsBEQ2=Ld5d%9*9g(9?sq@+5A%p}VwCwf7P&DhSrq-SrRM$qHO z!h4cQUp~6nmeAV^)YYz6l>IB5lqAHi{p61ZMP%O_d%qpbXsB^_{y%hm2{hDy+df)^ zWXqB@vXg1CZxbb3kq}Cz$X51!88fmkA;j3nk|jdOzD#7vI!cAe*kv7zZOr!ns$b9Z z{NML^`b=`>@$pL@Bk>%Q;L&C6cGWHMfD%P==*6a3|=w-Vg{d3srFIu-{Z{LA5d zjeElNV=Xf>XkP(WjV#`NFfQAuv^Gbo=C)3p?%5X<6IRs*tAq=I7{q0y3v_(;%ci!I zQ-DCaL+!1(rkHOLG1EA`bgps(NAaGEMPDQ+&IWD)qOI^AoMD`EAFPg4s3D0QZ!z!7 zVP>#36|NDvjwsJo)H0Wc1diS|lKr1fcH|+bO5)wS)=pl>0{1rZOeFJ4y)q?6I62Ml zJLHxM>&C>)GT+d6KW~E$Im1{vU(aMH@{0P-bHxx7iCB|6y4CwE5x*-WOiGnVTyBvKf^yg(CJT!o0**vwQ^X6W$ 
zcU@$!3A-Zv*y@&98i>9WAGD!MZ+nss>=Dzi`+c)8Z$_6M|A(XSp?wQ@ar)U5{&|azHT=Hw%Zm@{E?rjFND}sHk{B`L&T5uLTNkj+ z_S)FQ{hPy-dU#j#Hpb#CA39tBgvxN|iMmUTj;zeAC*bUtP*SEFJ!^84DU#wzT6c8- zzLlV4w=hC46eFp_+W!a!Cg>?H-oZR)H`|QOlrJL?;iLon#9GCLU;x4%I6#+J2V3f$ z2>mv6Dt*sJb<+?i#4~b)$z#TyovKC+L+*If(U!m(|3DjCs~&a zhwjjr=tpUWM4Z+pLO`v4xVkrBMe) z{=Y%TpaRA2pgdf(k{@+ztGkz$fQHU53f+^!&rd8w_!0Wsj(5`-^JWGrd49WM?IHxYs((A$kt21x zIeBfoIpOnV49nxzEck9FQn;hL%E$*hJ~%vliK@KojIq7FqqLlvDd{gxmzt@6z-K2j zP<5#}isslZb)CyZ?|kY-la5APp-Zx=9r4-CSs73o;S zS517-I(6w?9eWt&aC~t^g}daGfgIWBllFb2_FKyFLX*I6}g+ z7)}g)>2Zh&)PA^^GdtU{%U?~b75{IfV{yM2B1xB=oIJS|uR`TkI)Ik;*jUhSVAjt; z=CkscrLfY>7wmV5vweH18Fb&hggzSA|4g&t!wRTEiMVj}Gjq;iH`PBDfjUPfS|(ow zS%y^{d*n1H9?2w7WVvV4-+k!o<#a(io>x!;L7moE{b@g^@tVqRzhEAVk%rc=@Yi;W zhXT`gC_OCBrsnr3kxR^h_eP$zMFc#fDE>hz>jyQSj5$j3U6W8Y>h7D`jq7!!(@ad2 zWw=V%j>3z$YB&XUQ(NKuhgr?gSTTF^jAu0&|A9eKtzQ?uG)fJG#|s|29=bMOo9Ucc zKd;_5Jx$f@i?x21rjGhLlagGM)%gBHz`r;o>ZgJK!u;tTNGpA*iIl0&#?;3lK9$6f z%8~@5YaP71$(oXaXt{{d)#k7@+C_g!2#j%o^FuVK2*23lELU_>Lw4VY($a!^PUX5s z1h4z}&qj+KY$H$u2o%v4fRL2nVx)gYZ2)OzeOu08god3ceS3BvEEFQ#)dFu>2PK*Z zYX-wq#KT1V7xCQ^4%u%Z@b?v7 z=~DLcfM)q(aQuA>S3)kS)O5~)Rr{@Lu0tIb=XeHU*?rw=4k2gR%^a>G?!cniuGJ35 zSG46x7MTuyNBm}*_qR<`7)BUW2_0IC96!Y0xz8#3@+uERBz_(kpqxydeM@5{9kC}9 zSc3*C90EK%bMd|{p;d7_M!Ai{^OJyQS4zJut%&&#;A3c%!N4e!lQGk`;7=7188Pm= z3afL)obY1`OU!$`hp9$7tN79kn>%cMS?ng$mN4J&e3kWH7Q7mN@*S<-mB>~gdT_~y`xaouY*6#T!>WK>+AnNSn24te)Ofw zmwOYhM~+`orZ4rHh+j5aQo_xKuL*HZgCb&*gvgp{#!1(rSZ^-kI1fjd6D0El?|t-! 
zA(6^?!mKVorJ|0BXncjJ-p)GM{+pcX596e6$AbeS>J4G@eC5*=G@b;jBerBXNYBxqNeW; z^QC8fa8|3@5s2OZoPaO@3;$RR)q)aZM;H{BfQman%CbXgU2GiuF5ReAKdOZZoSXVU ze!%q3hHyqQgZSZI- zcuapdGPj&AmVJ~UH>Iry-GN>?F8kVaSXvg+bASU4Zhz6lvn?&LqfrA`Fw$0iRwYTC zMuu|o&9SVAL1i=;Gh~xHW4Oo}t@<-=h@vHXGkGBqRB+brp#nx>mGgUd>HeH*ie?)P zD}r6eTJIu^+_Jcb0Cex@hq&mjPn$fpQ(Xozg{NazcjvBg)DeO%H5SvSRfCodTdl6eOr?mfvkI zBD1pqDYP3HDFMddsh`Ih`p=gibM%U;d-i(4j9 zbOe-RPwxBjKMp(7Q>>@~3gb9_^QN{U^w%lE-p`OI@FMG0_FdNJRg49^${uM!f%`O< z6fleFh$27yPKtC%Z2-~!NIXT|g%x~WXhg`k?<`1AtsFbnGB?aO2_xjaEDT$d!=B0= zUA#D3kjQg@*GER%xrYgHvUcM@7o{Cx6nvM62rry!+BzfW6#dfPXs#AX8Ix`6STT3Z zzc@_}N_wmnw2hWee7AAqdIIyHuf_Nh)@<{cGu$22AicLr`?4^aPR0**NIe)4Dngl_RP2*TJDYU zRZU**ig%*HisHLbTb0*O=&fikvJrKZtGu=jvrAZA&%F)#tL8-^>zZ$6XD{sNbGi@t zijP-2Uls7%K^nbzb^T@QE*3~N2Ont3=Th`eoUIs;I~Za@dpHKLm_}B5%W=6|2tGTH zhEj{TzSxP)n-!1O=jdnB%~4n9?dRVbYd@j~@ZH05pT0L&Yg9?IS`{*fXGAg;oM+ky zSX=1ajhqE#ih9>tm(mx5`pN|G9~g;c!W2rF*>NcP zT4yvaYu(}E;hP(NjLGX1fZ&ND^YYd1uRca%tuf^UYIBG8ia;m1O;54K{ih=YbCv&~ zPic)!pGLGnZrf|@R6vnDl8J9-&$s0Q(U1ItR4GNwe}KT>KOj{>^a}V_Sm?CaOc?w* zIPPu%Q3c+JmTTJ!&ai`IZi)?T)2htpwUKe~8&@B)V^l&qh_fE|Wn&T)HYyiZ1&pHk zqFi?HJ^m$VvbHmvJO^^}PW&a&z=4#DU=C?An2}@ZM2%{`slb^d+Uzj=(SDGW|3EiUrsFL{}x>=ij zDShk93=5pU@wh40a8P=?T%e_IAzh0Z(1D{uX_-8xzaRU0a^;X3l z2a!KapReD*pxckpa&hs~JbD}uZ49z}1S|SH+9U8Me(gu18ph_=q>OK6r=Ku=TC3db zvpze$PJ51LBF*1`cqQb@o${K%B^ehLzn5`53LzaSW%{bSM6GyaTT<8i@BdOgT;-aH zo4>SxDl5G=&~f*eP{%jv-oxkgLpDA8i!_EJs+Wq0suU1!+3)uQ=h#Lc#)tV!5P8kS zQDN2x260c@@CrMhbz&eTZsc-3AI@Z`M{xBW_zN67Ws~$#pg#D$$l>9QA`I8LwfDJMkVbuHW>fIjFp z8K|U-akec>{lcvM&9Z+x8-7PfR4POQG2ZM8N>DwFf=HVJti^q;cjUEV7?DOovVr*) z>$dN?!jjkI6lQKZVV+(zXb}_@9h{$cS-!i#fmN>()(O7xk3dFo-etG-3CQ6T0Bil& zQgWGSOX=Y!Fn{xCQ0Zk{!IV$bZH+~$-)LMJ$!=Cfo1uKN?v-U0qC`Wbi#mwr%g=g? 
z*-(2kj&2dR^%TmCCw=;-p>l!0=8H`)OcPMXVZeK z@O59kNUI5(($TakCHI5_>hYxa(!(_joGN&{k;708o>L%Je9imzzeqJ|Y;E&v4W~c3 zOX6Ay)7M5^wy*oI;lzd-)C}3Ih6s({3VO?9*`BZ*%hDroAeO;2~DE;jXdfLH(UY5G#g@Y5pI{cVvpQs`B$CdXS%PDTqq&UHEFg$X@jyq6y( zI;x9A2y`g~+s6LdP5Ps!(8HhlL}OLmQ`w5`ffgiT762c$llG^2#tyonsOB?A&>opJ z*{OzYjC7u;H`WUSl;*FEWm)P0&$}9LLipi-DjpaCm`zpJ_?&T;%4Myx&o_l1Ljkrh zwQJymZSKkpGa3-_n?Y;WjfDKmW|9#AZc50-xY=+&*f?wnT1Qn-vDna%j7?qt&}dsXDlS zYV8MVF%zx^q4#&-NT4YRn-UJUJ@s-?X#{t-GW{%)O*d`qee_Xu+nm;LXYZ(Ukm>5pKYf?cd3s)55RK4hM7f{0p6)gZQiP= z;P4RR*)GzK@+gEiM1MOm;MK5SEb7uqr^3n3lcVNO;mgufLuL}X{Q3X3ij-JS)!kYf zuV#r;PCu)|nSy?WzHsJZbab_(r0UzcjQcV$sS!`5JUw^E0EwJKqqb=VeKOc@e$MVPK>T2)Ikb-4Kitb6@Q1iz5Ym(UnEiM^-@K{ZzG&xQ@e29CGc z@DvJ@pLk{iozi&_$&|6$K(YNvPbRkHOy;mMV;pbbEU+3F7vxX;)0qJDxaxb}Hc$`D z!HE^Ignee@X}}FN1L~2+%D0wGz5Ey&cN34`*Cy6boZS?D$hXQKYjr%WwYsw&soGxy zf>@XH{cRRNHGExAF>LK)LuY9}e||_6d4_K>+qX6OY3&!>1?=iNyd8_S;cVxMrV;kP z!uT$jQV!otdCM&!U5SU_1}TYcI|t%zAIm~w?D(v+Bj$K!7_%v(v@1kk6dB{O^UEln zKj^s_5*h=7&L9H&?m->WUiTMobrYvVzW44XG^iwPHa=+-*Bw)@Mnj0?$^*QZ|I5nV zUq6-XG#vR`FsDAM3_^Os+^II*_l%ifn)?3Vje_D% zyjwW%oJ4!@26-rP98Q?;Y7`aAQR_*!LAZ$v(dp8ZlC9zS8Ry!nWVJdI(&n^3Z{4n~ za+lZmc!f0;hbC*5p~bAcj88;9G5X8!CT7rR(AF#v#g(syDnsfwe$+%F$;-&R`)}GR z4t3@X}@OMq5!~Q zyDV65CAsPKotv9+xXX6GjpW-L+R}M*UCa=-Rwcn*ekSCd=aP*2W;;{0UW;Hn`p8lKjee>ZB zvW;;CyT`{OIIlf&b9fnuZB6j~NpcbBSRJG2!7|IIE^{&1Ix}`(Es;ME{%iU-a<|M0Q*4O{+4{RW^*;j5OAVJYM#G(eEKxGj&niD3@ z@aEntJJbruZz7ZH)ASgyqa8=J%wxyA&rJEKU>leCyL-B+)7lh!YB2uc>9L+cp8pOV zk`fFxjPYvzF;_u`&rf~44&2}fjk=&c)%z)8=B@$q|5L1o{65*cmS33&BGIOMI6A{>~`3bP%iEHKemi9ui(??0Isw&Rm z`JoX30VwR(n+%@esGY;9*yK{W-`UzIk?aw8= zLa_I3$F*wbJvqS>2|wmW4uLLR$&iM-dFTI;4k+L8xU2fAxN@^J<-il(Qz%=Ex;kzg zeYgB;pc7@Kdlx&^8i*gT2lrnqKS__%cWxtnybAFBgpp-sq@ny_ez16c60mPH|1hFk z82M7|nYz~9YL2pohEw@6AozE2(C^PF>qaTod>C!>PO^tI>C}8X@m#Ih7xy^<8g}tP z{TIWdt_194Z88$NCN|a@$a&r?gKr4s5j_Wnrr^u;gWUgp*n{C`iXp1#=p`9I`4gfC zI%Bq{Lzz71Ke1r#dmvTaUpLq0hNlx;Gp5^V1?DSMu$ng{`oe-6v7r4u#ZxI+->JByn?-1C>V!@a z5YSyzYFHr78u>`$Q?~~b{A@eBko+H;A}Xxg32u)l 
zFg+1WE-7FYGc)seqZO;pA8-IP_tDevg5~{@h`gIz{}Nf6n0#Ba=>roCsT>qltNou`K^(uJHYr^qhp` z)226qY@D6mv2Vq-N`^ey!r!!@K1#!YKm*q&rsF)JtCh%^l>s5~S9%@o6HMDUB!Rc~ zX^O&K%Em-90&)|~;pw*PXcY_>~`P6wG(zZsnX&jNgX3E!P% zwM6FpzHt=an|-je7X(O`>ergXg{g1zEg1}M4k$qE7H22_w74Dr^}7S`2#W!>is?F~-nDZ9LTzR`H?_{)W z)jKG)`egir`zc%i$r>DifV)S@STWrRnDD4t^ZsTTYFkQ`xOx3mKl_vp>A@QlE?(~W z?FB&kCM$h$=_B5aT^<+NP;W_b&eYaPl5S8=?UN7RXS2@VU6A1h2dz z@jkBVjbd0Q)4X5nKO-&Sq{i@*aDrhEejmOqTxpedJvh7BXKNe{O-1^xUB0QUecc>P zCM)Q<$0y94TmX>=tJg3%ROSq#-}dCnv4KG}Zo40JRY5_!`7_X0YUj}}&h|iC!b}gC z68rf3c@fD+t#ZXy4Ig|dd;S1RX8JHce>_2LFB!URuiiEEu=!f=O;OHhmPCtLVcdx> zI^n`AyFbon-%BT@z$kMk(|IP+73wQ)*OMtwiOP+Zk z#`0<=Pmte#-bkG=j&Vntpxgl7P=US?C~q-Ohle)auEr2YQTL*dz;svz=0Khc%4;cmB>>ULQG{5s2( zi&pmeTS2wZY((vRu|BVhj7AoPiJL>0zTLrG;cl9!+8+Zq`+`;OxeB_ywEzx`F}7c# zDmnAjZ&U?4Ya>A6%B*vW-UN0daeSM4bJ?(w<-?%*kwy&ce1Yy12+RyMZ@j)&dF;~e zigW^e*MA6^rkPz)Hplemoe3Y!if=*EmX<38=H_ecjl%V6JId>Mr$YsxFD97ywwGC} zp(ZxFX#|T$2g)W{FBQ|$(xypF-~qza{`rsBujjgLd{Kq(`hvD0Z9y?E3bwVpe_15< zY>|EHK`hEY5k>ex2W~a7o`~mz{<djsS~=6@Fx&%qx(3MOEiksx9$V$Y3d?R%jSV@+wLg(Q0w;~ zuQV{oD>Hy!mKlgau4dD?af~yvD)4B`*4SVL`6dDA2 z@IUPYl4do(?Ap@liBZ!4^rqnTi8|b8Di0C@bK%kT-oA6 zg012zz8kYP;9d8PW=as1S5tgJrRAa_`S4sDVZQNY&)~m!BpjC43!y9_4r(e@(JA6miAIYXsU_UBaTz3|!1!$AYu@K@cXYAv%$W*Y&>>}%hmv5Q3QYE;LNm}_cB(jTv z#J*=@HoOi-DRY4Ag(1rKSOrA06&q~DQiwRGsg^gOzJEc_e>8$47)P;#vOIQk3r?Id z8p={1l&Wl@DV61M=BSWO8&mEpaxQp$4jd2@!H^bxz*QY<_eATI(P6rzKbLy|z3}C` zA}tJ#*8>=r=X5UqB^?1ops>?MMi_39;C|2Fukuv-ZcO%q;4=)+H8vzL$Pn>@fnjTI z6m@J@nMX`i5x&8D^qYPyl$P;sxK#TgzMMz#oom)ziQc(>#oi#G|3Cd{&fQr36AwBv z7~*!iwnJUh2DUi@wto*U!<$JA2+o576f_k~?zeB0Z9Vm2Z+!wkfDVcmvI`cXn&t*S=~RI<>X! 
zCNeBMP034S`C<^8EM|Bq2!CbJr*Gh!=t7cP|3=)c7mV!pbSQlABjJ@a2a;2P&Jl|7 zKPfxM+VF5grNK&xBIOabA{>WxL;=+yg(_hNY#RI9{@d zvb*o{Ob2PpgIjZcv{P_Rua5jXQDO|>u!>#`c|;0|H-vv0*Y~74eTH#sYQHrB0OQ5u zO+n9?@Acpl@KRXpcrAjmVvL-fB5c?`a$~+z1nj9lFw0pQ+eZTcu6+658Oya^`A7}N zl7BJ)>K04z)AFIb>yZbYhVUd6%&v@p;ivG;wV;4Ajnx7LrA$KKD_O-PoYE%;Y!ZUP zm|&Y?K)39%=`RafP1Fv@~qusogZN7y(4%|xuaqcS`#q%iknFgEqg_Sb(j<@G(syerEFJ586D@>KHT4CV~oLhNJuqc~&r3jtQ z4z7<=i)z;nnl;;BRIS(wD88y20;!KZ!eaDU+mWWv7Wp(ZiEpI=Zk8VG6Cw z<()!C(kj1b&u9~*ruK902t2+78=snIqzC@%%+5t@jrAln$CZs2t8*|5Vmp+c1EDj! zhp|i-zV9c|nR}Gn{i)W2y4!T>G%eSCZUgb5iWH=RIg8CVCiwzIv8NH^Onu^myT8Bd z|NoM}2zlq=v8rnNOF?wk65br+~4Q7%~hG9 z?`Xk$Hf}_$rmK4@TU+FD{an7ZgECD?$?_ahg3hvy-nFneaZTAC*BQeHB|l?&=WNKN zG?y*3v)xqb++T@;dt|pDHm>CPu_~MvR1^gHuVR!6qCI(Yj}8x>Ml$JMj9#k4VKML| zPKEc${Yq5>Iaf2w)ICJ>4OE|3Z(o57W$FJLAgs;e!1#I1A^Ek* z21KMQWGEHkldQjV^V?P=@TS01Dx6vJP}*}|p5DTgTdx~C4~<-UPRAy?T~}{4fakQ- zady6FKJ@e8_OFBf*b%C3j$+rdo3w~o-nw(3@?f2p3is}H6HhQS?0oU{?CUL7CfN_Y zt13?xn5s)^#$tdQ-Pg~HxDnp~-vAMhtYw_o^THK8xVO2AY1NTjune{`e$bN%f?mb8 z%->z%S>c;8b%3{8T|33f#r8^ShP$+qeCaS^x<+VYX=s|inM*9&7k*($nT{beMZiz- zQ*LgH1&rppc)VZ{%K_n|&R*BGC*##18J?zgZ+W+d=oB}bmFMJmQ4Kf$z(Uu%2Sa{R z)~650Z>0(lr)j;s*0h7o-@YhQhq7P1cu~X%esrk6Pxjw0q>l>^Qnha1W@Hn+m5;k7 z_H4A`#B(P}?a?-?lTB;`=8B2t76Y-vlvT!Lxl~Wszo9@gwA zRl2x#c7^&sPkSY>Eqk!aao1pjxOd%aZMnNatiISFyBkcw+57iD0Od>HTU-qEKE#xk zJ}tVHqX~h7pM8HTm^qjR_hA$jC@&cIXsv-LTirh-y`8Q#ZHtl25kFxeK_1>wnDu z%ck?hJKqOl2}BnWG`J=4>m%;t_cI>&k7&6rZueu*Z^vsM3M!Jf&lU@tB~Q1GnsFbq zo_V2l8n3~7ha+KR^2;p?)uH(HiLuc(CY$T_`{z~m_wGW>h6di;?i5PA9Q2RXxH^5@ zXa!czjG_U{F&qAgIG)?olrf6hV6^n!)wXE(#dV{FgZ#}APEranP8GpF(+=8Ah!TL3 zvtqSdyk;%EtS8Zl1cl_=xUqtSt%JD)YE5Ylo8>Q873W?QY6;-oNG-kvyZezWs|+_% zfd+nQumyACom1>Z9{Z|vg}b|MgQu)`wjcqteKw!bc2_q}i11N$S{tcDzoopFtxvr{ z`rlR)w98a10Nf`jC+EI1uj>@x&o)u-tQ~wyQoAR|B|wEXt=f&I%0tSeh%GYQ*V>FY z+{q3MB(-V;9p`zc3GSquU+{%F)o$neZrmz4^8HHK8$zQ~e4JFBXHPv@lQ1OnV!|w* z2b_~&x%;?+{OMcg+!hVyVt*Xo4Y*8scXuF5DRlioeN|&QqYC0e=3terHk0#2lhX1* 
zLCG5#-&J1857?G)qN!XM$mf%<8LYuT=(zSV7-KJ_KdIvn%rkHOb@Ri%z-}a_@=dwzm)E*GJ3H03`aH*u z9h-N`tRMw>VOI;>Eg8Kpelx%4cKXtnc<^t_{03qKBl@fA%R{xX0na(fEzh3_g;Ujf zd$;UOj&9{iRa$S&c2%sM;bu$`Ag-Pf^iDvvz+_crj5>LqaPOdm3@B1G?CdGG(k@X4 z*VPSrd3z_V9)pTJ7+VdZ>E>+Zacn~<`zVy=IvgJN%$vAdDcB9N>!&n;VVOItbnEs?&hs}H3gHofZZ?#=^Kc-ycP*y^L)vv7)Hnt(-}(|C2q z&G+Ip?!NpXQi)-1P>uF3ya=||4C(gP<_qU1TW1cC)xeiR`*&W;M`>7EvaC$6fc!f; z|Bsi*@dR_0?zm@<4|mevK5S0?8R!$ra^;OQUvt1O7pzTR3*9TzPKM*fsu#{r@wwf2 zH|gPg8D>H=w!zR}UgJy`b^fGuyPCud>0;`d;8N)yP#_5tO?kXN;yk&vjv8((=Gku- zelM$ld(H%X3GDjy?~{@Yo>_%wW^kv2U8ytWovnNJ*2n5@iy~}rdouDaN*|(QyN%5t zES4s-9^K_A&j*(F%p-SpY3XS8L)}&uzIGgUoUBPE`)sX%^5lM}G73HEhUCZVhP72| z4-z(a7@B(<3`HHEI?ZPzEIVqEEvXX5MYFdHW(Wy~AF&Ew6)t%n$~FZCj5XOZ^gf|c zdG_lmw$Rg+sdi9K_Pq&P^o>bY?V*3CGy57PAD5>~Am6qReUaajZ@s22dGluB?N#*R z$}{!uX9j_cLL5R3r0hBWpQVq8x``OgcvY(`t?vgj=?^o!-Ie(t-m@39lH)YZ$< z1YhqO_yo1+kV9c<-WT^t;w;BB$voOUb^O|K;{9ksAAS1FsELzT)d(^CnZU#~0w z!4C6cC-186Pb+T_v5~))f9>m6x;~u`LO_O4#D3Hri-*hmz8}9)gfv$h?k%DrpPCp~ z$ErNVy6buuoPN~#;yUyRX$l3=1^3IUCZFGGleAKb7rTBqv`S1SwP$Q$r=LxFylCDa z9R?b5U4^y$_62wuW&|<2K`Xymi3tcxdA6T4xX7HF*OpjsMU+>^rNga%ygyg4ePlYT zv8OKHj=Pm9A9swlX{_2?s`m-&ggg+CY%qIgB%jX8z9jufW>wC$x($K(`oQ% zS9$vq+qHgimsXJe>({TBz5C8O9m|wQ-GV~|>KF;i1dIANY)&6vINmlfOO1=3GX$Up z%8TeF+P?y9qdwe6&YSsGSU=cas*-+zZJ@OLSg%ByPT~Rk z@c`IA>ucGNY0DU)$CwBFhkhJzvV;y2?E9WXzQh`|ACd$ez&zjff-|{m%1*@_aizLm zIFaI>>1n1A9{gXd0{G$rKRBbhtZ!w>M*HIyt6WI;eXvu<6n%fKj{WT4iwksoT3+a{ zsW#O6Pz=ekqVO%_vG;BKw)BogQf#HKW=>G2{vKYVmAB4j1|~_pRyXbim_O+^|IMnS^cI(^D*7= z58e@l8}?9vn}SYY8t}SSQ(vOjuiU-*&UFY}V4}`NB=MH@i@bWz;D?#Y?IA_c-kp-OyZ~wc0}b^*3alAc9rj*S z_3N*3f}nfl3{a9RqQieEAJM5KRZ>0*E~} zTcgE1OY&Iv8w6+W32$%jTgc1V=CU^@QU`Z-cOISQ{iXYEalxmFzq2+)XfV~5Bl8dQC0vR zfxpJ5hS7ID>Wu9*az^i~58d0m3`)lc-29V{u25pmDhT2Y#2cFn8eJoAEhf#b3&H&J z{1s`3tYH5I1p{~wE&`M0emvV5Ep7hpW!+%rcwG<=4Uh(0wZWOA8{P8q$#!+DarejS zj8o$toeQnn?)R~J(Ef~C|9HES|1{~H=rodxL!s7JXsn~P>`chTc3oQiJHOF3{4`-q zzFEH3bJUlAcANkaO72yXINn_wzhpCN`EjYTIew*2lHPFVxY21tid5<)mr1`=X^Vk) 
zK9#YSM#hOc>gzqvHTdB!JVD3K#b+u!-;NQFGBd3FKD7$PAjKYOiXE$ZIHvEqAaMNn zD(0-oZw;pPsPJfYwmOR8`rA)@dL%+!LQ3BKcBOn zz8sBZIPv)G?Lrc%Y#z#zk(rsU;49PnB&qzt4=ygQa%>DIU@zRQ-;`A@)w(Js7{X?avtWnk{CXar{19yZ ze5KVx4qRR@ycU?uzVfV<{8ixfN=t7Uu;#C<-2U*bg-NFr5Xi1ReE5)Rhw-()qYbU| zgJws^()nL>wKs(BeL}Qs|7?VPy)biN44fHA5Hq%h<)0q)t(?&ZqNTmrx(AP0i8yHK z{b<%uF$M3L1)n-`;{r$bVpJUp2rfxMK+i37PW2n(v6IJc5*hAh6+fMalUviW-p(Rq zSKx3i4cKpjg{u_W-sbb#{&HevSYsZ+Nq+&PirW4kVCA15fo!iM+t(S*W>Niwj<%`E zobctoSn*}y!z3q2b3*F_m2(QU(JK*>rWMjzOe~=Swm-VCJ@%S8|UYr?RhpT zd8F}ULc^{XtJVuM$w$$DqL|IF0IyEfEPAz(gyMffhzJ_n_`pwHWRTg-i3m!3R?!#{ zv%L+ibKq_xn;pM@|GwhxS7TjKwzn2Ip~knKECU}NMX;CoRuK!;2GS-!b*6Gl_tR&a zd)atmczj8OlCh{Nr~r0epkZk}@;=>+-pR_YO4M&`yo3)5(%lGfJEj1g6Xf@6iF>ZG z^yUuDt?Vn|32@@v(T>Zdl4v&E4^&?9XWS9B+SLjxDiV!XvSX<-KbJZ2R|StxRtQJ(Jrq|B83|kOn3j{c#IFV4K0mBDrQifcb0=2#gMou3Juu3%!5; z-eao^J8g_4p4Lyzn)bG86XM`v3&3hK??A1jt+@Lyc0?(x{jj*_M_gjM?=|Q3bit0J zr0mV(p-ClKh1zPHo-;ot_Gw-k*pIi6sL++;*F~RQb@sUUEoJNc&HCX)zrSp~_lZSq zTQ(ep*KH%=ipOi~(V)Ddq6md7IQ8g5%84Q6Ice#{ToSHN`R+OZ2ogjwq@nnBuoY{+ zlwJ$v!&Oq!W`9wxbT8ymr>qBHI}%5>W82@MqC+-0>*%1c-ErC{%I%CqpN*x6Nm0G; zoBv<4`HNaGGuqOzi>Vw2=G9>ivVAyE%QczI6IlY}eP@*q{Zc;6(f4`fYFVUM8691| zh6rThLf;%DMJ)yTGM#lge&gM#4GD3VmO7;7q$fxF6@B{AkU!>A@J-CV=9xY%WuK&t z$Cab63@3Li=bdMbb|Y@ZHZFr*XPsSxq=5748cD>$S65e+wq7Tfl$9-nb6TBGyhZb} zcl^CG_ydxE>W1Vq0L7%2ZkzK&@_g%O@Vtomc#{YtZGAUCZZ#UF>KPgNZVzON@ByV?kd z@q_*jtyl+V$H#>4&TH6TAGkuDCwRw3wI^2ic6Nmw9Lfu;i)i{i_Hb2x#Dd>ghP-(& zHvDa>X9Rx#BN9eFn`yRjes${jhw89i+NcD$3G3!h{FX~QE z(q@Ft4eiHov`8ra@C?AXp_ch5?1uo^0w8KKfXu&)S0VXSd>>7moRD8z<(*DXX&!4S z&rDs^i|01G-pP7%cB>e8#1Q4S%MnAz=v+{~7b(~x)Mihle3F5`potSs$J-3O_%voN^gmp_)X9eXuzwlMcr zXkb|E<=Ip=moO2~8=Po~HS;YiuRjv!|1=>oX7xep@Ey#DB(~H`K|f^gwY^K4nz9m$ zzPz)4x_Ty@&*@=vWQ6-S+4wSVc&FEeOVgxBc8{}|rGY7j5k*ioTh}RnUoKhexig?{ zNnoi6=`wn6iD#+x9~=6PUFV&j!KkD8FtC@-t9#bXq3P5B0QVcooazK>2Lp6sS$Et7 z=RU2%%SublTl_9Fa7s^iagMr8UlZ$o*wcDUg8xBVTx5<*L1R^tV=_*WA+56;l$~UGc|nJF+lpZ}G>aE4cFW@vW~-)n?ujJ$6kp 
z@o`UgkkkEEw%`bNxlYMGu4n(wUPvyT>b+(84gB()DxnAQOYKrHZP%PT2T1)n8oF{7 zP#6VrYrV6QjW-M2$uwN`v23S97WUyoye|~z>Tio%yCVd-E*m(v!oKjmT>mCs$*CSC zAeC@6Xyq1%^nD$fWSjo)@=3kIG{VLf-(5H66N~FTbEVo}6+Cr(TSb4%b<#t^XowwyH;9X1O@0gEdXnzZ#MewFR`{-a1$?Cy-{M4!gOMkrHV!nfFl^ zj6S*fZ}FZXo3}KkBZIqxT_@!G&E7tGlq=9KWK9=qXs&5$`|aI9d-E4d%Ky%IFvr8_ zCov1?i%Vq?{1tc;#c(6nT%q<x#iu z*IQFRbQXxZuZyV$;rtLRU3VwEIMT}u8wY{R6TJM$Cs*9C75?}Qj8JUq zl0{VjXRpck!MW2nx!_-oEVtUE4mKkzpY*-zjq^P5V6012AH}gRn zm*^Pd7>}TdrS~^&bLgyYXG?Mn@50mfVpSVUS?YY@aI{a{2l`ylA?0+Dd$` zkitWJTjhwUiNp{WePF~nfoj;t0AX3nDrtV(skirJ;Qc~V?(H<9ag~ha6e#L!JP>ez zssqkgA~R)mZ!7r$a;btaT7zkhLT5{sl`4{6Ty5C1K`y?n8S0GbvNHM;jK2x=lBcK7 zD@+}jwvlTqX@&+hD}W1(D^TQkL4}1Va6Wp+ zvacE*7rsD@a{Pc-`i9q#o6YI`M4d`KHx;7&g3fvuEw8e`tpOlL;DIFm;`z0{VUI^D zY|g8Js$Eyfxu7cs5?kAf`e?;u5NY*_f%4W9Ve7ab~Ocfa6flJF)88A)4K2dkw zF?^*|+#ac#U5056t1-;J>95K`FAP3!)98Muf!w;jlIkq{%IDLMW_*jP%elJdy8v$^ zU0&j`8NBb+6cvaEyw2c^Hr z${c$7^qHcumH;vI)IKXu4yOB&>lIKcPzwO~uLphsyOg9qr3GjfEjk?}OVrUPA9hX`zSS zA%v1Z%HC(=?|shoocDX)bA9Izxvm5g)?RDY%sn&r%-EJXm>)aWS*?vj)PWIE z8?h>RM>*PptU?jcZs@IUNraO(!SjVIveHtR0$QD;Db#X*JgBenS?L99OeLk##G!#j zo;u$G-T#;91%cNsUgwN%pMXV=k0~uJZMtj;S3QS>JoY)@)cM3XWRsW5wbSX~<;*JE z(1@hH@oMy3HN`tV&g##^#8Tu+%(+6BnD+D`#?QqKN&^6l?tuE>H zIaJpMUZg(hqrK@4q%dFP;Sd`(7mB}vJjoXlC^eve8Gmv3gL>dNhY=MU?qWn3oNb_uevDMl-Ivn!xg}L)>$U`r?)Wn!L#~@am{?nh??|~P3HMpp zLeXK;LQ+mn7cz0a4=E7tFcPmn=1Fp>*kW_y!JDYp3wE(EF*lk*Di0sw$NgoiFC82i zR{02>tU)GweE9pE$z5G3jy5AhHCdF*nNFJ75* zmuwVciuIFqLoRYIm5yh|nyjbzpT?Q*jUvr|Rs!{4+!@f-#jEa#Mgkrdq7bcIPpQyL zq>|C%t8cmsY761CZbr_IoCx-nf>pGO;SlcJ=&lXDwZX#4JhDxkIDF?)$tsm^n?7f^ zcIfcb$gyMe!*}U&%I8i$xX_umNvpHpso}h%c1W$mN%z>qd9V4O`>eRG4hLk9@W5Yj zNZTa;ZQ&B0>AsO!%lce`<(Tiz|DfU`#`%U*%8O$Hz;(18H5`JQt{^CFWRW+4!o4zh z$K?fSxz+g6yK#z4K{w(Nq8yZc1=|tBf;^Uidy3>{G(*Qnv&Ued-x;2JW8Y!Fy#4iO z_R7m@OFn9obCdm5jVnkD`v)l^Lw)*XHeHV7u9IRsz#11D%eqwuX+-;~yI>H$XTnJEMvg|F*N#GJ!#``_K^RJ1%0WA-cX zN9x_gvWg@22aFGu$VQ1B`E9z2O1CI~^6my|XBc-moYpC>9Xie?u}0KRg|AllDFRR2 
zD2-MM{plD10}7Q2Efkf#Ux=X0XrjA8cye*&S5c&w@Zpo8(Ok>}T4bpDt7+K*%xc-IPYzUr#ZrFuDnCT363RDC{ zyQ^5F1sd4HM-HceknKxdxYyL7GJ_us#qPfzCKpmlt~pHB$3F*{$}oT18qBH!jBa_6 z%lM8>nZ;YYIcT_A^T{0xR0H2x%xqRk@kGyuza`vrSdCd(yWhpoazd{7KMUnRmVA49 zaB7@*IJg6n)<`g~#`IN%lf5DhPyg(woCFq>%?#&LXquJXIDoe zf;d%9*pX;65NqO5BvkJ?7Sizi%cHiWo@ZCwTArTo5YW+ck+)Folyj@%80P8~ueR|m zUpumM(Cu8{emMK9xO~P{B=Yq@V&wU~jsHj{bZL9g<~^gl%wuQAwqmpQq)g7>?JE)F zvVGNu{>T%>ugYF?TKGYnPDU^;IL)@#9OwPBs~cGk&^+cRZf=u{s~XO+Bh?fl1(NIo zHN?&%^qTmhJIA-s9pj&-!k>9rp0K!`=2fFyhZ^_f%ld|qkt@#JIB@E863CKeFy;k| z3m~+e?i_7f{>p=1YD*_g+*1NaT!+{rY-l z@11EG;QTrZ%K;D9odWuF6LMnAllY^cpy!^LU5oY|t+zw7m!6$=x_VeUq4n4P$mcvL zO`((Gu~U+N1@r;Qb~vTD0Gq`}&Ibp@ccZIcJzq6jIqhR;U+#N^w;x)_nW!W(_T+f4 zSf}S$H}i=E8T;EyIAl!XByTNkUZ2nwA1U6?z{WHhp;cW_w(vD6up8AHdDSLPpOq~x zaGG<)ccGBc!dLrZoF?-k<)BO>G36Jm1t~HQc*FUVAnqHj|464-)mLffO(FLu9d@fg zW*CdWaQnL=?e5sjI6<8EMM=rI63g(pVAz9}HFM|4>JYE%2Kg7r25{Q>7gA2HIt&8; zVne~br1d*6I6R;Nl`g4{Z0?PgqKtbc#({^I>Gw+jt5S^~yC7ME*2aSadzDt?y&DlfDx^Cpbn-$3?_o`p+{g*$rCAZnG>SU&7r+>*`|BP*M?eUy2 zIuObi6j^H?vN1%<>nmSMym`uD)$4g_@jljCDY?}2F!p_I|Kx2UYaa=84?+6Y+?W8F zIRC7xU)vQ|5pHTrTadr}T8B-_NdIk1WcX}ljAHK>x5)$~7sg2mE1MC+aPAMbJU~0Gv9wy(6!V z5$qTvtH@xMEEuy#^)UlYKV4`8zX~I(z^nJY=wS}C`uKkP#fzyh@?F*RVhg+Z{lD}r z;vxZ1Yr*4KuWMQf%ne(wvVUn&GBAzsKrL+&X4^Q?7&13m%Ceeh?04 zBUFC-Uh-PKWUC1+h`&^7wH^i{FHdy0Bnz@mh zkj2?(e_`JfOf|JdM|6YaR6TK@1Pu?G(>bR)TOB_RsLj89zo(O*TU?S~?Wmudf{54N z=5p!ejn5u6C)I@>0g_piv{!%i7p_m%PGPw6U=0uY^3ftD%iOS<=_8E;pEJ)|I^NQT z_1IG<8F-ki&CvO{mIy6$^HZDd7O{m#UpCF(ISbwsj5t3S^Xwb&0bJ!`9%XF03eO`i z-%^NTTGu}3SN!3ZP`8q7Yc*^hZj ze|tznFyeP`^3#8=cH7`?g@q?*&t5$0$|srtB-Aib1pC!b!x5zv!~6QYsYQvtU*5g^ zxg?tYGwh+Rv9^_7y!(%huhM3`!jA=##ZC&J42Laxu7%pwlP=VjG)*CkH&z^BuQlqS z&Wq7BpV3-*~5#Ht7S4gZVZ<1 zhw>VA5UooBz)-a3n+^`LQ|W1%B6(vincabwRlGLMsq?$fY)L5YSGX5FN;@xUTTArM#=d~YMM%Sj6B}18;0h8F2NBV zYnE3eaU$Dl^iys zVn)ZHjQp_qoK1ewJqZ@0;?a)pZ0RQN?zVooi@ufOcqHYD&~c*!oV*994>!+0*xX84 zSiJXpFlzid{*0#r?F9jO-2mfzw^8m5gKY&PcFL-?_XSME;a=}_Giy!KY~mCRe;-KJ 
zfdlOwI_JCL_P>&)j3Q&H#$(}49{6%Xog!h!4*GJ%asqMFB0faR=CI)FTenC<$QBB_ zJa1z=ksZn<+F8!|eoQ)Hbl=2%smwFv?7$w)5(l@vP-QWTfKqPWEPKlFirG&+hPWrn z7Dr~H^ptk$IorR=77H!@_xSU!4~=~6HC!ZbKfWhig?a%;?aSHTz#_h<7UOTv@1JQn z!1XnB?V|P!WE?Ytp~Pq}*mZ^{$B-jho-`Gk_O#Jm@(>t7vdrU=Mt6=U6<0)s5Xi14#Z0bO zRCs9=1Mt1PL$7(mWadTE^zod->@bMLMB9yL`LXSR+w^}DF-R|;tH)>HM3y{l*{O7U z(#^iRMdYCYew>2rJfn4UP2oGto*h@7Q2i=#aY8O$)fe=aJpiqa1s+-zP*Gz%y3)~D z6gEN$@GPONDtf%=2oZv5DV~fu$wRn{VT|1a>MO2KN>7jdq1841QA@%KrsO@hdm->= z1q;Ww%P{5K`Qs8B0xsfXQ1nJEZlGQg@qUv6=`QivWACrCYkQ2VTJm37uao{C@aVkgzYoRs#%u$|lG~i&ML#x0`)~4sIvuqJIY4V*JwUk8Xf3_VV zgVIRL$(gH<)SezG=$dfJ@l+0ZZ*=SGDW~I~Zb6iA>Vde`Yyr5(p~EVU=p*30EY*)O z)KQ3PZIkXU+GXAVPrW!2i?ZKO!IW%in399XkzG!EE;~HJKYGv`9k#YBo~Q`Sx76YQ zZZr2cEC{dH0Rld63$6Qq_YBVw`aCiL5O&(&Eo$XX%+LQAD~)!bE}-o@OLJzvbLu3O zTE)vs94AWro~$6TpaQT5?;HN&n6sNK8do{{nywfepvJ1B?t)3Kf%|OnnAg4AQre7R z@tBXm!I6SHWVhv}0hU9K%|^mSP7AO@9Ano{N~m z%eH$M8N^u*{LRag%875s6%Jo?g=Sx*?Kyn%L?xf#3PCyS7=cHP`xDQaqGEva+CE2R>qAcUV0yO)A^Rs4eLCh(2 z+OD|$!~|)&xCnEe*S2WfTxd_Y7PlV`+AZgWd{*{6dabZ9u=(>#sMi3B+J_UWWUN6)hWgk8{Q{#d+v10eFs3P0`Jnxxeegee%+J>+T_+l{NKQ#zg_Xy5_4R z`^6Myd1l*VwY1vgk~nJ0DMgVF>FfLER!5>q(I?9q7ApNGu8xn|4z;O{jg~b9!{pcN z0K&M)&T1<})QHx7O2)=e$Jm&!+-)T#CnqlcE3U9<9fU&(KNdr-S7e4ZraVj0{@U8k zTM#P?96fyUbepgLVIKS|u0MuP?-s)M9u-^_Gb-9p{-ORjp1X;=O-@H8!U9Nh_4ma$ z^c>S-kFLqW;eG?W?R*PWO!YF2T<57vHEdA|13WA`4r{OfaRUB3WCMP|TYW3#<1z+= zCRTYn|0?bo^myIL%mOF6Q&Mxn-;Ajopwf8np1*zM=#|g6KHd?&o%->64{!Iv)mWQo zoAc@tyDZfYZG+Q}MmEFeV+y1i8#@OF4us~Bh-{XjPE7)o>9i8))c6OtwROD4#E9=h zWp$A@)HHM{V^_mM#WcZa&+aBi9eN4z+{tJ|4Z)KfqH02?gpcP1-muh2<`9hFA-VJB z!G>$yv9+!Bf}~{X&-{eO?amr14RK@L-m(sp#Y#UAabA2M4T3i zy%ljg_12FbA!*)a=XZBy)FtBJC;ZqklWR-Nsp*`_4P&yTxbjM-FB$hj;GDf zoTl@6^T#M1@%K7A6}(eyEpw%0Iw^N8Ekn(7YeLs|by}aKJW69vO}@}(<-=onxngH6 zH1h=2geJWSp~pisg6b{mlXNYttUg6GTo4saO-hPiUuV7a^HTbyobUN%!Eo5=rS%dV zjZDo;PiSm>_SpFG3tQVCRp~*&?((lai|wZR2M$Z~bk4J7ayigWxs@WX4)YyZh_;G7 z`zp-5kSnX;!4^|~>}hX?@~>iIvNx)n=Do28=E`hR$bW|M>QMC_W@d&E=b__%hd!nY 
zr9b`nNI_lUE6g|g&}P;0bcFbM%&pU;bEl)fNb3B}PM+MbVc^t}8n=!_U%n+Qa!YG~ zyN0z87fA8Mnj3MXGQu>13@5=c?z5*tj)&F53_6$<-rCxFQ8a`jJm#U1(QQr5{WE%c zz3@qyK2*28C%)yf)qCyg?~xG=OQ*w}jbW+jc|q=0!W?(X84pp6yxHNs9C(UPM{x~~ zDl`f9`|qowY|2*1sgu}rBk6bUm*J3mR!h-`loO{vRS<{I?S=oNpCGSB=L$uZGRtr@n?!>L<@GD)S{ zw~H5#PyhRsXXLGVHbEX$?Cw?4DEmoaq8g{4oBz8HNDq~F-Bi%&0^#+Es`&c1V29_6 zctA~y&i0<`op2)Zcxxoe3b*e+T7~~+B>wf1F|l50u`gcrh23#QoJki8`dA!~L!;Zo zi1J_wb6T45gOY|GMkg+Eg%3#Ct+5h8f(ja_p$unFsA#!)qdfr>RphQbmA{< z`9Eyrf7;v(K#2Z3^42l?8|YZ`A|7Pc_C|d9_G5THUc9P<&w}083aieqvG>fJE$mr_ zJT~rLa7;BZ{`aLeeZy*}@}8bY+nc|JG1`^b%|!2dkKlI>5@$#_NggIYCytOg%VSf0 zn0-DF&x?0Y2W4*!$WR^GZg zRP>824rQpHp7o#VcK*}j!S5kI6NP0Pw|PhMFxs!AwwagI7Hn#8mi7ZuuDU$h@#hKL zHI)(h+q^>4!;@^EAO0JHVMxt0A8)myLl6_eJ>GK2%x7vg;(oj^m4Uv18ab_RmQqnG z^yu|I`MZW)?BXS?jXI|*D8`uXrgBr}u3;HOt~uWbGMjrt#! z`(OWs5wXa^JUXJt!{=vvWUN{momg|yZzYA3`DEvHS)JU`rg)yVpgU?+@RXa-DLy_i z#k7y^Ql-%Ya&GVcNwfXe*Ef9&FNxl0;>ig$SoN#n$z>ZT+HmHyoB|~W4miqRmZ$I_ zF7y$RnpI4GH3RQ1>^BOdDRg~UBdk4>G+9pXTs=FnFR@}L<4Gnp+O71YIUQ#6KKy^! zd_+x^!%0QI0pH`Ha(;iVA5(0e95otT@19_*I^6!vY}MLrE#8*5zJ!@jp>#g}v0C1# z%Xa@qaSd=}=KwwARw#$>6aYR$roFR##r@5&QL4W3f@Ww*IN`%L&<-x|7v5rsR2Z(| zu~E3+gK`aT_66O|J#;YplH=U$ep|Jjd)0H$#e;TYJFY%v?f0LAPW;ED=YPD}Ip9G` z!8!vx#gEq>m|0yfrtEGbsA}?UJBsZ1l>fIe%d|a%-Ek(fv=E z#Xrge{Jbe%6B}wUB)Wcat)WCmRVq7NPDhj}S6!;?Plz=iPOc^=mOIT@fr~*BnZmz; z>4PH62J(%oP*>pwwgF{-|z$_Nwh9G4^}o$QkR zF&(E)y}{~b{T#*WHS#9pAZVIp+s2{w*|qX$d> zc4`0b*9y*wu8%U=G;W1$ut#$MrHXCgt>Z_7rL|vSc<$$#_^LX+UTMUX&wT#Y&l4(* zd;|n+dR1Q%My-3_O~CkjherM}oj0jxDhk9kv$M7ghlldG<;F{I{|<~q$YScZDl^B9 za!TB(h)wkFb!=-M#*E%AGX5rRu4OO#98*PVl}y8#J{bPrtCbWc1m3g%ePFT@WbryseZX98+GH6bosi?<{9&=vlcE@jDi3HwOb) zoo#l4G<-Ru#h$E`x$JrL=uxiMj^k_>7AfZQ;rmtmu4nrH(}p41DUDWxjpjBEEV26! z*{Hec$&81suRm~aU>%?Sbau9U&E^k$FxR0+LU4Q1d*p@iw zr%OJec5L-Vt4f;nTlbFXjA7M#5pFcknSf%PfhbVQJ4Bh~!|XqKuc3-Wf&6HeZnXcs zwjcAujVYCtU*D5mLk~t<{ps#jbFyBVAk|g8eK=`$cR9-XdVQtb06Sj1*kAp}*tKwzgqU?7QMzlT7FuGFmN}l7%6! 
zWl&A9MhP6&_&xCZ{_+2(%d^N=&2w2Ey;fCFU0JdxzWiiBD`ySvV1uZE34v?d1H-uT zb<;z$Bs*`U#&c|31p|KGQa7#%T;g+!K2oZD*!$eOgOf*P>W${j!Y0t1=r+FM>stGU z_sw{4+YTI8V$`3m?c$}pxOkpWyuZFYuwmFcjM7aCYi=zYq`HGihpK@n)dYi_j(F>-MbKf9J&0X>W-4=Y7i~MdE={Me?^^sLS~~`BK0P z+^OlB+Fe?}{|yiPAHPKi5C%Fq@7SCh2e%Gn2fdc1+N9avP1C69F!$Y=V6q>pmYcKd z(7rfv)@|Vk;WuN6Ci^yW-+A48#|F+|E62nEtUx*>b+{pRFB_i_d z^oc$sfgO*4bLpe$!=KeN0gA=(QP|o3*Xr1FBo79v(wU`I>*aJxIq|utHCQKnTiij_ z<6M##?FXvV^-R z6eL( zeDESx`^QePBoGwc-+#89hZ{~n!8EDD=ZxXTXK+wP?D4PWbd>5CTzp4|d1I(zvnuxrxIB5C!*z8;_ZI^uzgUJNJLx-zkBJgO0)O}Rw!@Ss#c8t6 z`Hv>9gI13W^yKp%PsZr#a?HJx^m+qNa(Yk#SxNJ*mMz)MwkND$0NAyB8)zlvwrZlE zX{a&{;t9xVK7u>bAH#M6I9B!(W{<|ndPg#UWeFzZoE17?KKr>8ow$=jKs-b8RG1@m zLCdJ{}vXFuB*tlKL^#ckXo5?20MlC?{|dzh(^HAtpfN3sbrOlaWBv`Bfn4W!x%85;LRAl6bg$618IX5vtPpVJ$laLzKnq}cT#tz zU}{2dk9G$NDX4Kycee9u8xep!-z@^KUcD+={K@{%Og|bA8b;qYHSO5Kn>MYEmiwC6 zSw*&R!O0TQCIn6{hrz*Nk;ju7?|6k>W$k*6z!@H98o6*Ip-n>cB~X9-<>JdyQig2z zXwQk@UR9VWL?5RS(4Fz}nhcbZStnnoCv!NG{2AMoVJi`1In~H!;zd$~9|Mg{n**w7 zrdML`>O!)=30<2Z3gZ%v)pIWu}9jl-qD98OLEf~d14`? 
zsiut{EZw;o#N`x+TR~LKQ+#p3?LS*DscgSt&$U~jCV`S)wKw2Kp!Q_>Ij$|>&+||6 zPO}be;>FXpKo@akVL}?(iED zk=m1PIGX8wdAIzqCxj~3Y*^_gk#B0GW#r5pMs4q9IqLi|;1nD#ndU}9`p` zO;hnstEEb7)eOxQ9SQV63CSwEWHLC3+;lv#Dls7kq=kL&iA@4?a5yYfx`T_Aae1i= z8JOwTXmXp@N9riYa%b#;TNKP0D0K-F-EZpB;0o8>x_kn)zxbUJxU{k~#fO3vIy%&e zEHS2J0W&X$d%6&O;2zTv0j1+~*^_@C+Nj_esSW{ts3o&}D=&_-LYdsCP||9D6*Di8|+QQ@aB(bz*bjT1QCkO0J zRk?i=Eiyr7VQ#5E<2C~Fy;lUc;oTOHoNE7}RXUg8skU@ugi-yhXZD_nQ%4hg&UtUC zWv}eEYvF=5-r5y*o&!{t?ZGaeM!5bogo+DpW$i=~l>~gIBPv{2KXg2URm-&)l#hol z&C;lL+X2E>%Ch%nrgc_g^)PPOx+8nV9aS||HS9G6FKRxJ*cbt=e6(aJINP1dh60%ptQ$&8RTq6x0=OncU`Yby9S3#R+v9^B#j7NPcV$4ZOQ%-5-%vx^_86rq zfWR0f@D5^+?eknMUMx<#cE-W?IK}&36BbMtJ>f<9PRNNb*bw#Z05hp#_~SAl$wF9BJin)#$X|9-E<(LujnG zi~wW)CVVD~eHjk(i2)NG7;qyGQm@xDF!kT1t(YngxWt%}-ZmkR8kpqS)k59^>?LsH zK@6Nd*_{X%46CV89x5)4BL-S?teEY~;T=S9)WAanf>V`tE*cJ1zz-@jlFNi;ETbJ_ z%1=wJ1J?REHBwEd{opDfEK8O~?yp~O;chX470W*)`DwUgxm_xrQ>7va1HHHvJNT2( zI!R}|zqo3v8Oky$I6%P>8EgL1xbdf!Am-4nYFb-HBR!{~+MlnPkCw3cKoH&RA*5L( zJMtRNC$VMdX9?`!{QdVUZEsIGfID+_#V>j&PF)NI0cg7ke)OLE_C=pw2W=z?)W{W`8VvRXgEFTk)1WgDM@HOXQ+bp zn{M7}tLR#zY-M_eXUO5h?(NA4<}fXIV>;qwCfcQOui-&ej?<8S)}1mp*q(>cv5kRR zlI2_?mGtisj6<^yPIR!yv2i&rL*F7XWVXP?35K za243wIb~vTBZxUa;D%$YO>$QrRHsF@mhxi3UXi?)ML%0*dOgFY-YXs@C=iu?#hqyk z#CR_ayH6exab^(SKvUN%7WHyY4I(tlAX)$pqD3%nEBs<*fA!nXz9AeinKXJ~8M@n= z?fq0Z(A*69AG2>>!Gu2?16>IkA5ed?w z(H9LI@X4zR>%iH4BMaJYwE-%d;g*m{u&9sMer=x_M0o7kFlD8Mx16w$TXR&o_Zw-l z*M7^q$?sx7_Y`11wMejvG5g}B+XTS^oZvpTk$~^T%|Wy|m6*mT78&Wl@p(`FupEG~ z^aHH?BvG~|s5H}}QzP~^0?ex10^N5+&ED|D+@Kaa)E)5JMj%74S?h<>v4P_yQ>!6u z66IXoRr*I>kJC50n$!{0%Wns5z5{R~^zzs`H|F+~FEgVYc2`O7-4cBLuN^jJD1tHJ zOBhmra2!jY7Rl@7g(^1lwMZXh=to@#T^#C1 ztaFq}#$DbL*_Gs>Y|tCrz&62LM&)%t&=($kb_$JUSlO68KD9vb=LTgHc0$qSF{ZA( z3xuIU+7vrD4vl}Y&W2w$hViXM^r-k&QU=k}kj6fW;;Nh%MUkK>won1>kp?*4=J?fv91_qmYY$>8nBZHBTIcHUQ5*PKW4KqsZoC-zWW}t*=3_SSyuw>aW5fzHYG7JKeLHtmmH7P%)lFJ~w zJfEv)llYE}Z%qfWBXZhnP4yfMndQg>mJXfrbLJbf^Nz8T2>`g`-W{o3El|E9vk35e zt<`MzAieJi_^BlW29THXnD-)K{F~oVyG6nPyVq}KQsr!^;R$$NTtWye)QP*Qak%0@ 
z4L^JNIc@nFE_#JDSh}kRQe>+*SZ1YOwlEeP@BUfjeO<6Y>v{tSU(d&TNTS**Ef* zv~jro)v(H|4(#w)3~c=tJ{${~I{(php9nE~?M)fQ)SHw;k1M8yC2M}fn91ZL`DARW zn^dT}BQ1>qEDl5`tR)EnIh=&;lhQiC?L`}Qr(#t2J*~VcRJUQjqqQv}*iZz_r;UJ& zfvr~+u&${|3=rt|0J+zySyYappqwg*Mg_e2!@ZX+^M>up-_r^F?3K)8^x0x_(;{Ir z2yWAJr(!D$c%yjQ@fHOn{t5RuQsprwTiyj#tw-gI!>y2-7CSd^=ibZhO|7CmqA&l% zhSA|`Z~D%laxto8MU_R7ahqB^XXZKiLdWwJ1!QN1DU7jlv!5+trSbN4Ibmg0ZsiyI zRGD?hsve^tFZKyeUn3Xtp-uTqx~oXgi6J-0bGye+3I|T#By1%7J^tCxs9LlN&B&&c z;$Q|z!1Ymbq-34+A`>4&Bw>S*kz*DPS2-|BmUM51E$e)EO(Q^{ z+)}0_+vD5aIE>E}pL=U0HdGmwsA=e^{PeL3a4V;d9oSZPr!nvMp+<_5-aj0BQ%CgL zZU)69B*t~a-S<^dqjF?mOAq+0AW3OARen+hQ-J`454vLNYwEa&%tBQ*sZE78_55az66xcUn@94$}K6u&;WD zpn&LdpOwzVCwRh#w_@j}Sh21rGQ|>O@?aM;&gl5~x-t67?TgC^w!3I&WIwy-K%M<; zEZX8Zu&1UJ?!ca~{Z_=>;mH$t3zfyfI!qiiD^v4p!TA7WZ5%}Ja+p&@=vu{1t>(=o z$W6QznQTL2!(2JjE>+QIa>gfcWb4s6;O1!$Zu(X^cnSu;rEM>K!Eu(D@hA~ozLFn_ zT1?C$H%Mc*?#6p+^RpCp+YRJ0rDA;)1nA?azxQaA=p*a3#*C-U zs|4-uY5=V%jg!9SPgGlpiEeT{0R1Kq7=m+icpsUNK2*6#8>0!Fc3YM*YChVt6?CP)FC3Y8Ad{vsfz*#IjlKZy9Y$@0nB@9rV7 z(P=emF*6K*QF|I zU%}a17~rUU4h*>)xjX4f0{|k`)fvt4&3xnPl8_upN!z!NB0l022R41 zszy<8UT9>sh{eL8(7a>u^)>5mqOD!QNr(mk0x;%}J8yKIi>cZLZxW*8MW5P^3Q*$7 zc*d0R?1f#U&3QwZ75%;nue=rxZJ!Mcy6NcPH(9%TGG^<7%f$p0!^%6kmmuTHJNCnV z?6pK_Ov_=ypF6rGL@@msKrX;e_#x_!|V4K%6jg~*sxJhhAUyDkY1$S~^wLl3{ezc1gI2pES!wDc}SirRf+vI$v>s@M{j zTW?m}eZ#mMGIL*qypDIOEM?O7NB;!ijjc68DIL~Pm>`vUWr3qJfpu%yI3E@nn~_gh zo_)Ao%#!2XkzwDYcskw#9)qJ~k5r=6qbK0*ju*)y8SL4(RR#JI8X0mExj12Hc<^>g zNKkp3g!IUUz|Bc-B2+$ooXd)S`&-F03pX5G@IkPg&ecf|Y4HDq_hVPtgJTR!*6-&w z`GCw#HbQVMQBJHn!0uk{uxqZ#v!!b|H`1=PNeEY_Jl}!^ z9Y%-j^`d4z$0f>_S;82)aB+B_4#^Fm6w9p+B=gYxJ50DJi~%9 zs5M)qIr%Q0L9GA&8h)bvWitxZVCpB=|3KPzunqpN859Daqz~z^LyIs0vmCysWMXJ(EiXL& z9m?moH!MN9X9k?YHI>FeN)3|H^Wq#WIk04Ty^Zvd;8U|}Uk@HN^c1b2&p<3l#)1wX z?c9Ah5L5`8NAM)23U9hAgU9Y5=cTa>XG>s@`9+BdMMCACE)R0S4Kmt6)qaVYT(e|G z`*IagJ&4dVXHcz4F~E-NX`g|Wn;-_O>{i2xT{H9|g`jMf*)?+ESSNh<@M}n{W9{yr zKd`In#-XYe@pG{C&n|tYJO)Ri$6qs(`FC*MHwi-8cZj{yHz(MZ}$(w5k=8m0Q_|KxcAeRD;3Si`GguBAICme9$;S@0amlD 
zX;I2(@Q2HWUn*_au-B&|w_38Hr9na>J%EgEMie0lVDc95+m7}62Kw;BvY;H-zEOz1 z3HOpUieNWI!%b{At$}EF2$!`@mi=}6XP0hR=2jUsBe=ph^`86v{AZ}97UENSX6Q*m zcFl|NazxQ18URV>KHIx@S+%1lqcD{M>v8%B6-Ukhj^F*Orr?bUATAjaU zmHhpk=**4VLuu)Gb0z9K#&*K#&RH>D_l2$=!-(UoZ64H#)b-{-X%{ntWo3<4XP2Eb zfoCP5m>1{pschz4vGNSR3pSc2>A+M-*0&Ac3@`DmnTRZqZ3`tFRc6Awh>Zg7KymA| zdm8{hqhOsqN#92tN@L8pKeoI&BdKuN2J(Fjz5!p{w)LS=G8l^<4lW}-xJ%(TAV+~m z&b8p%UzdL0IDY5j$J!``*kqsz)LjX`C3>cBF@7nvUV(})#QH8j8y5@@Hrcm0b)skT zWdYpg_72YiDtV5X(bIm3S)Q3u@FqHeanbhoI-wT`gD=1husb2&IRs4ngFIpD{4@=) zcj=q5P6ZD?CJ;TE`STTF&WR_JmTO zyL?|ny#(fmu*=k;kKgiu+@r+C#^4*H|7>=QfzI0gWg5@~PMauwx70D5NVx zXw6eY$_8Uec+Hokpa>n6=N~KIixg$#jv7CCbe;}R9Tw}l&Dgm37^n>(4kA;X&7fZw zp#dmLlo>)Z;>{R!!`O>ZKySd}4(_LiH=`#T+}}H*|3WXFZD#z#y|sR*?~wNCmPo* zMw^`*HX2%yIgs>xQ+pP4iM<@+J(Tu+fCR|)L!GXemk%j!%g|*!%lEUW)`NRK(2L*nq1K2hVavuV_x4d4g;N+x7-@>=<8 z)v`kR$C-t#u1qtY)}&q_jh+a0ltLTl4qHO#KO(Ep>=rK0Ie;o+v^2Yo>ttYCTC~26 z>F5mqJ?dHm$=@JH)OJMooNS~Rj#)Bcm zDaHmF*3_-WpAV2lQgvsBdvFV2F%e#kClnyh$Z=rU1~Fc=Ay0njDS6Mh{o)Si@$;Xf z34pdFqsk>QI4H~!`ojl_*}8T^v6%TK6m?|@wS_CRF@sX5C>d~= zANs`zBI)!L`#^2QX?ft2JCF_C>3e_~^s-bLHMfXW*ilT6g*YVvNxe5myI+of?#GEQ z09#Fnd~fG8RF#WsXac*{`Dg0i8gqXgTP%8_b_iIW0Eoi&=XbEbQq3As%>XRog42O9K1 z_J2A+*=zbc4HTCq#7D?FI*p5V?x42aLq?9=I&fb3y61Wqn4?r@O~+~vW5s!Z3x}dR z@cN&xswyb`8NlvLVXf#HdB~~wt5$j2YYY0cNFliQh-uJSu!2- zlI~>UBMVM1<9@8H=&W6_-js~N;h`84P3THLVwMGXMA%lm-Rhc4t)6`kb)ER)-> zLw3`kWE0;I)`XM-z#h(Rp5**nRovwlyC5;FI&Er&xVf!P0BvtWopfurr(yiPf40SITLu=u}DH@uJp&*#}X;W}<6FloYq`d5 zjN4-a*~P`AMo#{-E^0}e!Xj@)zo=YEqhF+1 z=5t6_*wOEOQB8ZSse4@HsdD#y+o+DiYU!+9TFVIOB;bD-!PHr)AZS-`@^9tJ3B5(neA4>R%@Fty|B7B%%?DRGDb_?$LhhDN^j zH$z6I4cU;^M%V8@xK{zPzmWdObYKw0wyg6pa|ep|#Uo{1V;u~kq52MBfr?DacrphmP*Apv0R0a|QI1c#p?jmy zp?y;4J%=}dhafNlT2~}6mWF{|l4PXItnpF$0Pa~D6^0s1!U+C2`8g1y1cs}6y&xB# z$EoP(c(r7eYGnJ~+%P^bn}(eYD5}&7L#_9(9%++OC{Nz#2zq-y>U7&hnh=6X^(&LF z!GYUY!f6Y*R@_}%uV>B^9s$Pw=M*V+a@ySj`~LJN@qfR?oqKKqClkKZ0gr^s*ZQl;=x`kIgKNB*I(XlD7cgGJ@CgTTr!|8 
zd|M%koct`3u-RJ)CKmE05JSJ>k)#>7Ey6r@=!ZV^ZU(cdBc#~yB{2!u*||z2NTdqbqK$W(F-kD=>zo%xh;!+m5qSzjx9)VqyTCEpz7F%g2zWMY<%G z%7^GfQN-*d{!9m|!J}K{WR^>&G)vOEEuz2fP^)=CK*0~~cyLgt=aaJz^s70$jW~c! zF;wVBTDv;bf`R+hj3NJ=3o1X*z>v)EMO4((eP~Y_wnTPX>kz@bA%Op2;@aE}?hW?R zr61=`Jdms&$Nw>ct?&7vgOCUzELPR8!j)HXNEXl`aPn z>Ai`HbOMM-?;RlsigZ*I8?piENN>`67m!{=f+D^5rl3@50-+`(dw<(=KhM2my!XEE zIe#)TAR}3G%{iZ0*4od^+yUZn{1>l{A(QukoLF+3*W1+&_1?1IA(0OmV7_efHZ+sr zQj7N7)HAyLdLg%_vJw);N$mqV*cXR4ts&31fDe_K3s6F+SmX84lvR38@X#Sq{|ZXs z%kJK!@Wi9~rqKoj;iR-+qg9Wa-YiSG>zU*B1|K59M*d12`e9URDX?YA&#!ZgOkRZe zaP#`MK{!N>s8fR9As(eI@f_5%lY3fumyoGATdt-1;wOB-^e6+CFSE&L3rMBAL)gF~ zU4T)r;vUdU$ix8lEN>uI%u5-=20r9?pfMrU9(aBPl!V!`L4JhLy3sXYmayFn4#pFT z4^jI0b2wzX_z$ROtp#LCH>f$F*@mo6jPRNDa|$Q90?)*kU+-<|IfQ;JIPkayb-88d z0;b@@`AgYA#Lscz<=dl5koU>PEIU6?2wX!bJcjPYWd?mX^+Ojh)vp0;`z|Id3i+V;!fZ#(E6bpIMY|8^2Z%m;A z0!r}3B$!td4oWU3+XMQusJ(C3P!}VC{e8XRBY@7b19g5)hIRBkBz(JX6_d${YM1UcNb@fn;?v@1Ov}cC49ZSu;n=jIi(yA52K^Ev;jQls!Jr;&kkI& z`m&BP#8MJ^Z~Rz8Z2>g1zWEy$J0th?!a<8uE_mr_h(2%wIE`nAC2?x|fFmD@gmjM4 z?vTq5Kz7*-`c{q65!a&9g?jdqY@qoJr1oyR9t+z#icQ9?ef3OD-QmrZBQ*87#r_`l zn+KVr$yY$zBjCesuqDLC{%t{TI)95CoJbzpy^TlBYbt_ zEX{6zy#FUj2)T_##Q=><%?6Fo?lZTPr>SB?$4~~ybTY~c_$oB8Z=ince8_4(Q{9l# zD_sFe!k=H~KXTQZIBGn+iCfHE??k~^0PlCgWYFjV(6}h^1V6ezG0mP83BmpF80f-@ z6sKKpcr&I!8m+?~vO#OrVk{b(zyS8Ko8<1PN@SV&qg^5jM*QpeJB zlcoTl=>V;cb|*_f$h&|s$)`MmMD5$I|E4ctwrnUoZb9$6qz94jDh0G>+3mxpMNTHw z73x2vHIs6X+SVdU9sDnmIB<$zb-J^TEgqDhZq@cRxx&$hk$$Ip?e|@PT@ld`>e?Uq z@9dy{QqGDG3G(C*nJ^3Vy}6UfVp&m{?;Pq-H0IkWZA(HgF-6WBjWDxg|CuM5?~q(! 
zCRc19CmIQ5+McU=4I>$b8AUuY=_le2UN(@ZR)CUzJbSCDg}wKzWqf}RJrndCN4~Us z6a(@?9;m{~kcEq}eS5&iey9j%#$R@moOK%gM^*|B)uE@M7Xhp&+ym(6KYi`mV`f5R zi(J#%%VeaJ*y{ugknucka0___fn^hcXfY97s21Zr6m zXc}E=wK={Eq;O9(hFL%bn^7~zTB0x7G8ZA+L$gjCv)u`1#vQsji}Tn-88I9@*R<(5 zem{wR4k0DM__qChj~;N!g9T8|1=c|O9?0pz1yeYfZGQK&y#s7!E>*=FmqMvepnHen z=@SJ#jH8P*Ha0RY2b;~ig>wW{qlPBKCTwGl7!vqUJhrlqf4K=p(N;sIbUX3KFL#kv^a_wb~lq;Dd&HS3)$Fi;|Ap6qe8 z=qa@=PCh<@rkrhKIYLVg*XX&!P3q{rQ_4Ws&8C&SGP`c;HgJx0>Q3wN%0zD)SJoQO z@jcZRyuWwccgvh7G%I4>HdKl{ruLfDsNTHj?|jb6a%a1C2_==+{P2a+zK>iP$m|p1 zT|~>v)w~(BAKUU;mtk23_2@)Gr_&@kOR)7+=+$%P_QKlC zwa8*ig@%kloeJNREp5DAU`A@lBG22Jn(oGPz-%%sKp4+)U%&ij_;Y@1Bwal z<%Hxh*rqB8g4DVSbLFAc(B<2}Yh;MH7sJFJKrBL8MevaFThqrh8XXqSLnU|NCRY=r ziPec9Up^!rYepRmYIx*;?{W8b!`SN78F0+n(9kVODA*4lUM8TEvj`cl&SWNz;n3?4 zoB^z(=Ar4(NU*q?SD~dAX|xcuJ;Oe8ys(T^mp&uC1n1$%0y(ymS8tz8fSRQI!V(`Y zV9L&T;n~$Bzfiq9jP&-w%s(SQ?oi>?$W6#4B^S3n&He^otNUpe$3MKAYG;fb9Y#Mp zU^zfqCzpJ^zZ&5Y&e?=p?cE-U;i=bW%87675r-%1B`mq^*?gV; zRFBcaA{!1@8LnkSKl&u?W*We23n;s~vhYLHQpKkZL%oEvPTiYa0rK_E4cu+UuIF zrdhL(4};60w$B7qOtKda^@=`nzhVlKKrK%BjX$#UgpX~o;QPt5RkXJhtq2>3kSDe+ z0J=h?cZd$6`i4UtfQ75j`t|(`0-Vj?;eleOyLX=yFN@qZAOMOP5WZp?>g$N#vr~GVGbKubOtZWfw50QshJ(&(_(C-mf z8UrH6q3<0T=mzyNLh=Oi|!WW>rb)dJOc+rM`^dmo-CG z0feocphdoQ?*_y10TKq;O~e!w?=!$xb>EMo53wf!gyaUui1|A+2AH1M$D=gzj_`U_ z;IqcF2YMw9LA&s?C3zh>c)$dIJsia5P#zuso<(y5|aY9KdaRxI@C9ezIM2 zqckoOO07aV@fS|Sw{7W7Wg3DRgi58$ksff1ZC9^&YjBmM@xvf1(6v?JEm>tFM)1E3 zeGC;;0E$G0or%b7M*Op(YWDY#7WIe@X@MC7IQYp{8a!|7xm2)4qo3#*rU2FN>h*5G zT{csDrw2D4u&118xWR*v(BoF`o*Tsn(%;2b!zcJ1s4qL&?;$;wizn##d(^0=i+ucR zv+S(!yv56%)!pHT1AFCA-2TBUXJKc>_7>C2&q3aC@SI#~ZMG5k^El4*By)};V|}Eh zMeZ%cRT!hNkCQ5gN&v%im^VPp^C;bmdk1>yG#cHVE7;ZO%)3HbzSTfN1>czpqJC+= z?y)`lI&LkvPjyfsa*=3L>)u^XqfRIRt49K%j`P1poS@!eTspy!f}{Xjd8i#1+5%En z2f*!gWA<*S(~qEB6)N={3XUX%#~q-SB{KjfzK!{+Ox`b}$Zpj9kA1Y!7gtCQ$IKaG zw3s<9MkPPr<97JO5g74fjbvLxS-}ShwLgo8(OG+h%uHai2U!+kRCBORps6Cf3LV~i zzvWLg|;#Y6ki&9wmYa)|%wGEC~vBchGF=E%Yw!-4jNi?3fLs1vKVg zfE6{!(i1Hu 
zVmmEev&~kLcJ(TtSIs1~3XiwZDVU?hq=#o}a-BX%1*7_&)>_<1H#Tv*#3LE6Ja9?< z5zY>IYnqLDAhL@;(;EZdLi$@T^^_KFiX}{U?z>$H`FY4Uap7- z3x#@8v=Jc&*#;aX1};plPNvqM7?K??%S&zD4qJ-TQ^fw=7ps^G$%rB^m3MjSal3o% z0&Ak3&cm3>^x!P*xhf@}n6RuM!Om9~oD~<*w0ynM6|wX0${uG^1`sdCo>Nh3#I)~AzZG)I3yhX)j%As*|NS;Olwx+mYX~$-z0J;G>A548(%mQDn z9v(N@uOWZ061}%;;+u@fp*qR=5JD*J&EFFC;ze~BB@xwQoBOCY^mKS6V;1l_MCSZe zBG(4c2BSyCRla<=QdUeKRutSG=IBVH=drfs6E*+dw!fj-dNQO=;^g@YvHbo-w4x~Q zow(_5k)stWq0VbkD_kMFp69Ps&S5UxjW)@XPFbW#WJmFc@^$6(gljw;=HqFv*CVNdc#^*0{TtHZG-Ouc44T9wN=pIH-)=sYh3a;UQN@eBF(+Zl<7w}hzl z*pVnXI8~}p_*)GX4(f3uDg#ys6@8`CoU}T(Icf_=+!v_xnh&NImiuVDOzXg} z#w|*{T+s*)C0hS(g&Wb)c4TqriYs(|4oYt*9hBiRw#1^W(?Wgn-j#OOQ*ToQO@oag z&Pl*(J9humJ_!}m$B7a2RvGl$y1{{N8kt*TkFKe4bkyAE5u;2@WK5M3zIM}AQ0Q7I zhnkiR-T1l8V{hIdHt}Gva7F$mLPFg9oXI#>-p6b5)^|e3=-vW__JXbvx?kTL(Ehf^ zzvDmP#DdF=86qdFh}h3ggG?)BPFwzU{wsSwt{Gy=G5@@)-}&mt-=6k#6%^(X&GH?S zIxo)L;)q)hL~tnua5+*{TyI&WG_?GQuPTPvLXjc12S`28OOfk?pDbJ9ep*hxbcx06 z2roPr6}+J(Q!ki+8D`XSzOnIdry1Q9jU>*8ge|d7A8J(cSGW;QfsZ4NM&fSI@Xo3U ztafAv-JN<)7<|)9TbqcIBv709V_&ylxI?Xb|1vK1x}Ysry$+x0NYP^S-;wt>q<9Fo z%)EYrxF#sPQtBfS>fDr3I8)e-bslYu=C5|@FXhs?)9XE)qTBCRfLVHX_qlmS-*Wxd z0D0EJu{uMa%D`&`Mb2_Kw;jb|t~j)14aqU0EPaY!$zYP+LfIDzOAG7|#jWG;i92|v z^VH&EqUs~u35Oe3gsq|99*s|rrRgL`go?6bON6ovX#K|SnUWdD0H>_rAwD;OPQ0PN z+Zgm(=&-UQHitO6sHDxLSAZz*-6=U;mE4fFJExTKp?w^G`P<)!0P|}2s|KRm9C{kz zz9j{Lyd`TFc=YUgzk8m0e3m_#eb1F(TS)FLj8>v0qmbau7;uG~z5>y<9=8U(1@QO8 zKOYl!o$CQ!Od_#X9cHELZ-EI1<2)fAMrSHGy7I_0Hfy+BDW`$U=>m1py^l0v#vZJ? 
z0e{)}!_`$6eI-&N;`Bzc_exmeTkrmm;U?BVNg24-gD6tpd`1GCe|+EtM1y1JnrG6w&x5cmIYFJQGhZqVEO^u{wQ3C0lY6>4HcNr>TSBxhj4 zt=E@1``!_AAn1v@h0TPJ`Mic$gKvDryhByrR;84wYtqw_nBzzhoO&U>5J?k$BNNZM zYGpi%?C-m3ATf#FKhPs}qKhpUbK5dF?N9q0K0;D|?&!x&s{90Vc=NY`iF>vjlsy#V z*_8x1d7N>?6j@kdjuhq%&6PU5*X$tyU4VB+x%3f$TEnMBU#2S4$5QMcDBpa*IDt() zef{^h{GAwxK69KBr*D!seS59H`MjhK-cU(}%Km|?U2~KD@VAB+{p2_x*l zDvf<2ktV@MxS3D~0?H5ydQiA6=X}Eb+xE;jkonw2r?q-r^T&pnsq=U0eoe)1l(%%=fu)ion>x|f}YhCERFm14& zi5FdUIb-KBAy)aBaGh{t=xQ=O6(u%e^5b&$rk0QAlX0Ypm3B`4>5Tg8^~m3m2O!(E zRwB#UeaSA=Dd|h)JB6vg3F(a!KDA^So{Rs%;m%K@_>#KF;#g;^M#XvIagXLd(w+`B zgI};FgpJ=vQT5T4lB>bu@l!lxek)JI1O%D7i;J7C276oBc>6I{c5*14U0k+a+Aek0 zTHdZpH-t>NppJE9v4)~9(e(**G`~`qRuxYwBjc4`m1kDVbM6wSd}dV252|;2Q)aX4 zs`=p$GxujLAsptXMozy}S6?Mt9_uLwF!j{vKFz)+QT2@`Z8N4Yif)E$OhUw}9H;s{ zS&ewcQ{a{3TFy&@e9U*OAvShT#FQBI zM)V7#;8u~+m^YN$6|PMhau`TYy%k=On_e|Z(fx1(_&R<#mKD(UCH5c1MEGb=PKV8i z%$SMJNhLw2AWTuF+kUXJy%vNd z3RE0He%g30-0K7tCr{V>+GFQ6p)4VNvP2OHvatjtaPZmSPJ>b4A{oklY_Rodb`(ZL zr%w4Fxu)nbalH72yl5x-gsFR)q!cO4X-cU9xrkU#A#5hG(*3^Ev3*foiH6BvY!+Pz z8#YeOACCEoqXWw=?=FlU&cgK{rk;KN+_Rz?s|F@z!-nVsz0W$fstWx6`vp5`860RC zxot;xB-JQ3#lLoIc@F%Y_v6`OD$cO$hfw73rSNhY^x8cMO=Py!Fq|=9L-MawS?tBO zCFzJnz6`vf^=0JddOXVbrPTf2dP9Ak%Iv`^SO#HSHw8V zGRN>$&+?SXGoN0U=AsH*>b6|-4)L|&Wr434cnKR@T}O4Re+Ybm&oNdxtwNCLxC z!279idJY0ubkJR@7ZOE>$r}>#Lu)FL| zKPLS|Pog0Ffj*d(`{|Kpp=m>tg`W;zn{cD5nC_W;MY4Z{L00}12gV568yl&wYYT)C zdEWG#I;R+zrotm~5h>g=q@w-Hdmczrow#nEI&bPq*W*~M^P|l-2YO)G;wnd1wd|-e zVv|9Dw?De@X;8_iZ$m|vqoqC{lf*daoBo#YS4^%KV9%&XE)Gh3q)QaB=ECcnAM?pV zKCu1ByFp$khuHZgA5o1Fh%iGu07lE3P{n2Y6cyOt4q}NK9`IQ}uK=$`n}Sbbs)=Y5xjp6itzrAn^(l?mW9#YH-4@1`j=4^+)EX z$lf(_`e%7Kbqle%rJ-s<{8@xjmSRng?s<|F7cSq9eO9u&n8ZKy5GGI3025zYBGUMJ zGS+Dj=RG&#O&Jz#Ha$h1oPTQOHXPO20;2X|3LsksZmknv7xt9>`H|ZC7dHin5w)S2 zG7;Zc&w523{)L3|Ro-qYt7tDOrD}>Bi18V-$uu*oH}vwThcY71wO8%>mJPYJ6y1B- zH?tfjGz3TxqPNrfG@UQET+-AEtEoNn&%%4zF=DfnJa(Qh%}`${Uz@AC+iorYBb72Z zLWJnAt{u5k^91<) z)YnI9zx3q)Bv%#-j`vE)%|A>U{bDimS2YFnVn)}VLqv|FP#2NW*cSsc#QEC1q#P$p 
z5xc^kyMfN@qeTvihR{t;L64GLOn6DBMt7Qcm9V{V`}ezUa*;bZMy;m(r^|hR2f)9q z;{Ou$0jnL8(eH+&b{VhpyBHd;2KyCsuS=xY@8|G@IjWJ9)nH5*bBLKZu2#RhjG{+* z4_`6u>E7yOMsOnTW=G^SlRdjx3}Bv!`CvC@G&JRA{k=^&*S5YUTg__f*x&Skeme^I zaG8X2zEJ7vOJF;N?`iF-S0r6IsD8n9A1m!?rm>Ojy&Q_h;I% zR0`iefBmK`a;WNmDV%N+12&M$$A6xuJWuJ@Vv$-K?04;XxSq7}nP>&6#R5^q|S^B=>&A!rRTrC$3LP z7t)dPEviNTvd3tezD(fQbKPav=)}CBN?i{I}c<&{7gg$fo*Cq5Ak% z|NGc;CXUty2FHrwhhFnnhpoTev!-M;$kXLX?b>*UPhnvG==Gf&Q60@UPU4guv1|H> z5Qp~M6U~Z>cJt?$CK<*L(^vK;SCkVh@NX8J-2o~;HB>;tH2B;8@l2nNn0fQL-vx4F z3DeIe z&yqbGwrDB(c?;Kn?`pb{0cir$e=fbFUYdoT_a$$;kJ}B+{k4t6UEY$HLwVI$^{+}S zc7fZchP|`(hP#FTw*~zdBa`y0U@4OfEeb0#EYIg*Dly~(5`M(;J1jR`Q`U0Ej^pu0iqr{u>G@kYzb)DB)gsIydxCRH-YraHcH$-scmyQF;R-(Vdl zeoouOCFFae*wnJYyOjXBA9<4*JvZYh|I0%D_p0X~wFq&Y;YMxj$gS{==436Cy@wA;mdOAdMpzMtrIpeBEb}fAc%TzuY^9r z?S#blhs_aNHif~LNo~wPQRV&&%4#5e8vGUZdv*zei)U3ix+RE@YfXycQ2|BuBiCz>uW3R4 zh%42#SNb8}OIp_+KSr36*^Y}$->SUopnfPdyJdke)xZ&3Q7B1ZVo#7Gns+;+VT6Z` zuE_GCKW^5&Wa25_O_jd6l3XZ({h)uGYxKz>Z9dC&iIX1Fr$3mtDjXR!iXh)C+N}sZPWM{)le(8qA8(5<3 zQm(yK9h5ffan_vgxf+=&N6HpW-mQ73EgBf9&?Fe5J66I%*lMnBkSdmtx-9PuQy4np z*tzXz{YRrrTc%ON53ot32i`xSP+?&1tKSlA@lxrT%<;!}jFe(<;kSK5%NlQ;b#IRQ zs|)tOh5C2cLQ7)bd2@&gg-UEWQl_dE2N=c!8Ah!7uels-eU)Q#_j>7Q5hz(9Hpz7TzP-M#|pf+Vl*L^sgFH-dEO0;@^;p(yWixZdSNoA~La47-p42VXsT?d!qfW71F z?f`=3iB;^Q(CeLKyB|ln&bbYgoXole{%?%=FQQ7N!gNSjp|yw)R7>^78zI8UWyAs_ zskM@*OFlzkW*U>EPO(W89M>!^^IyovXcgC8VW(9cTXVJ<=|EK;z&ljWh+0Q>_i=ROy}pi>L2V_IoA0B3%<7mr~TVTtaYs^;zbDhQDsJs zAG)>xTpHtS>$ARFUzb&(a8IeP)Avm$1Yjhz~}uFqF7xi4&?-xj~9e3q#2 z-Ye5m-zD?a1jGCtUhvBJoOQo8j2T@%>04>2`&qw&u69~UC9ov|woznGT#7X-7BRh> zm@?f{4gjya_@7N(pm4$J%GR}p8Ic~-;glmr#fPI^sV_y7ok;$R(f)4>_fTlLN>r2f z)`^-ig->3k_l^oOnfj+R!8xNS!N*?A{|{v0Z3b4+L4#8k)_iz@Wbv8i{HeL~EwLN0 zBYHe&8S}*Q;%SEsRJfF)bx*3Z8VULBJWXvg?692yjeAT~PFQ+jRCJ4UP8h8-u1>wS z5-@~9)ons9*Tp0bHo|i~Yc0=R*?7zuo=B>?UO`I4vn&&$@@m!Cr|Zhz=p8V<63$Vl zkDt+%dibng=5;%3ZT~h?HRgFg+xSzm+Ag;wn1rQwq0u` z0ZsoetiofvcEboUu7zI?CLYSR*rpA5RiUkY1~Lx-9%|#N`}y7I#Fub#^u*jpm-|ND 
z@`_t(-{gAbYj8!s@lyZ!iqM2R>(c+*LjQ|7TZMZ}Q$$}-_j|*q<>b>=R}z08ZrX?| zW!MVvCR`;SmS9}lYDxBBV0CE{ELg!5CJ~F%_*`uF7v95u|iZoX!@vxTqcw z#68SD5UOv9lAp}y-S&1`-tHBMWhU0taS{Qnvwz{duj=@|NqimIYCv91?ML1I@6rpG zl9g!zt}DMgf`2Q6s;5T^WmRHwxpXE*9`RAp+AS1bp&~=5KhvsRsZk|LZ@am2^919Y z?zL=Y2HtHm%j;fK?z!WoM9V{F!eTa^*n@7IP@R_9hgDxo@4HX%QCfO(=1lFF>ZXU2c4Fy;>>u8t(qG01I?6STV;#Q&_U}6D z|M_k{#ant&Cp3v|HF12wD@-=ZzR^ZZf`uEi zduAzr<`GGp^O1hZ&#ibO2ogS#DJP7*U%)vER25>ZhZK=LGmFqceW1P}l8@&kKI{%o zu!fel`@9X68@RZQ-u`DgB}Nf_#s|{NnV#g1?1;W5o$_vd zBqI6V>+M-1{=3VI%I2AM%hf9{Sa$aOS!3jtuNq5G3qb{4f1cAWZByW#Y3NasFWtIF z1mmtVwGWj2aq{bQQJlNw1QU~!9B`3zO_;YM_JWlx-+kd3^!Y-nJD2{@*ptiX6-&Y%q7G)pJ`nP*b z8v^OpT8|?s=tQS3PnNV_HyAZ^YWvp#-bX%^W$W^%D6DF1vPy0oK^S26Nv@MGQ$u;DSSa*vuH8lf62t}1F8^l@i>OMP&!htyUWEB*f9ADqCYv*%X705 zGTXG?J*`16i32_Yl#-NB&SwQ%DfRF0J#h!7#N^!mXaC{9871I?A@hF|WgN3gqq(4g ze`ep+PvKxVpX6d4P72&LmgBvb2_DW}b*$ z3G4sL5}V(HB75VMfaKs6tVTEW{fe**a|V)-+xZL}Do~R9j|;DKJcnZGYFi2aOfcbH zZbs!%RUtBtXIzQIwBr~x8w7=rOq$9=d5JWHvJ2?uZ>ztg5x5pml;>Ig$X_<5bm#H2 zUCB;S$sEa;gj>JF%H1LJ!n*+0|F?(UNMY@sVnv=6zNAS~!*mE8b7VY*p-%mEn2w0$)_Kyc41@4ZIWAb~-wG zQ{18UtQ#@(une{bW&zc4v9GUn{GNA?qyO>g9=Z|#$0(AWno5LIZ-_WchNye;>Yikl z?vJX5#7M_vUtS3^{o?O#tPZ7S&C9LKbOHFF1w4?V?)?4E-3hqD+YZ5BT{8+(AQ@@D zOk!MG7H-)#wM_9Z{j~;jX{N+SCObfxE$^~-P&Y~rZ+hEFJ+bmTH0zsa(!_X8AC9|9 zI0D@$5I^)8^OS8X{sUi}B6;p9S@Sir;+PkWeE-ez{@35;sk}{IP}f4|gD0}oI|P&%*p&1n%mn@NL^Kp#$FMs#)IY0z*0k~@#Oj6#HP zpbr$$3MfAg%T0uM2#5PIiBU63M0B)loZo(Qu8=c8>in)mwKCnWw>XR z)mez~u}peVcKvJuFOa`eJtEtD`ib+3d_zZ3rt+>n+G3GaQvDx6gj=gw|N9aFk0h^B zK9!88WkY=51&Y4!A<2BdyWCupY}TpF#o&%MTg9DpdEG$R_=8ogM)tCUMUh~BZoUr^Udkk(mV_sH2zUn&&R_mOzE|xHYtY$ z`D9|nGycJLf7!?o70zc>cCzd_*1FWj-YNIr$3}_x3@vxk7?O~u5(EUl?vn-lOOVlB zi2cPX+kG>G6=>V@=gaMvR&)EL4C=l^^c(>JLh7`t`V>@&fkhz#0_71V#U~6C`2tkJ zf4T5ySLIz*~RDaEHd_J&eLJ#%5w7d zm3s~YjU}i;6?iHbPbm<2>y6Hs14+rdG{15HrgGgAMSyS%g>f=dd$%XzKdD_1hHzrHAW z@bA6d=0g=Bi>s5*e{ww>HCUR(-5wcUqep0paMK|u{e3DuG=25yZj_7~TYXHchp4jW z_P^f5x`*c#yYnTB9jqS0KSj=f>ZX 
zNT5A-7^Eqq_bTuLY3Mr@Zz9!_8K&LISMNoO1mF?>M9M!u3zcJxVkhadNBCbPkdCPd zztokMxO>(xy-v(ODAqgNgUeV~qe$v$oEhIsofa1Jr;QuJ_$)ARCQ+^K2F~RK-q=?` zdWxBkr0LZ;Ab$!<4s_cvyP_2Q<4-$P1`m}U+XqtWRgfuf?M|4gtB9o$g!!w5Zry%E@!Y-{>|44JcWST$IqVDH> zhL)74_;eZ-YmhM&7eh?5w!-dNVCYz@GK$^*x5C9wR@6y`vF+(Xr2| zA95}7<6#;<#DjlENtv>Po&#<_Jw-%r^_U zK0mRtflC6&M&da*IQFlLzD4(Kli*c>O22ekAp@m1ms%>RUx@u6^Rc|6Dg0^R^WQVW z|4`=kIaF7T211f}->NY5j*m7Ku>(WW+}nhb9a8K7ikM@}U9>S-xGz=xm1~zb6&n9C zLOeiQ9STz}p+6_2igp8jxqu>j6%^+m!+6d!lAa}zUJ2=VZRj8h-rts$UV~T!$Ji(n zWu7W!8B(as=a-Msl%k23YTu9Fm0lD-2^V_R2ELhRKIR$Ii+@u%pfln(Eu0gld04e0 zclOI$Y9(YxLL>1%^|U^q{@6)Kzm#s@%11*Q zPd;Fonw~=$qtiMkCX&ER9N=(hqFB7M{vx(Zo+oeMpzW8-!9WraUD%&GNJ(2f5PNAp zRDG-Z@PJ$}nvb|!X%x9~1&H43CquNWc5rt>aa1Fj2Ochaf$!J@bg#EMmidmLXbl^- zXri--+|Xsg9I6b+8;8#=n=U z?~e;QMzaC~rHQ_3+p*sl8ii9!zppnpV7dr+E^T^>MKhJ}Z0t z(&l8tTS`(AX3jn4ow1xHKG|5`{B$=76$Df9ryfpzX=;+a@N$3nDRw!Dppb8}(O zc$lmClg$gAD_a#?;u>LZY8H9qVIuZnaDHMYj6p9gA}HwjD3~%kH5LLLM{V0-G0Q$` zp+jH4MzDG`)4kWpaGgQTqAJUO5wU?ytAGQHSuq=A&0Q8(`|dj-~@v z!r4<<)zu7U;9$bkgAty=!w`ymL&}>MpD||mtUKK~;MXBLCt~i)t@urV~ssl1-`N-IRFj-1t%-c^;a*!)_HcCHQLUzLxzey;^Nlo>$( z`ho;88ohO`!twy^=H_-hQ9t-8gpOODz{z#>n&iU1IKWcz;n1?~`Ov6QJG|^*5*m4q zp-h!*U)Tmd!M3nmi&nI>l3J)4Z3*ZIQbQdHC>t3u3@EO;idZ*Y9fAhR#o(dHqQ3}! 
z72q#r+~}FQ*U{3jJ$I6(G80>F_5NIDih|q_JJ+Y^#O#LS*zih+{+U+DSXS;qza+8~ z<`iF9qcQM@t&m-M;l4a(huaU#<7$V;jP6!0BG3KMi=_=Z!xkwFJmeKhPk@&$w-Y8kmgq~~GyVpZC`if^#zv)aKXP+HkFlwR%LR{wQeg16H+?!FKV18zX3c^9sL_}G`esGfA=2WF0GjRxtTi}Ze!g?t=H%(vCR?E|ajm$Tu{lu1MWd&5Ort(E8%&g=JK zg7fdSWZBTh`g(`L(U%B#i0~<}qTD$6LxE14hX>v*3+dasKJM}Wrh+~7OYn=br1xO@ zY4b|wyWb#LS=sv=W%YI)uU`^FW@6)u-(H>VV28Qdl>k-CwvBM+?3(1+!PPOM~{E zOpa^E_UEyNZCBHuHtmhhYlJnfnO2zbMs-8%{u2~JlIA8u7tS5qH-g*X(S zeva{jgvS%!t-|c?N$BH1vohTLxXZ*hWXnmWA4p3VblUA>v*oel@4Em_C_LAkq5b_y zP$)t7%d^gO>2JOS5_lJqDj}ki#Y8LMUaCR6mJ6{~VsoSiaPs!oHq_MAeA}2mhWAaD z7@C|L_DgkdVKFMuhHas-g>DZN(R0Km?AlJ z4svsy4f(|(snf6XFN5hsf)ASaN19uq$B!Q`#sstfqWNbyMmO2fz29mQF2{E#7(n%2 zUoG)F!Aaz}>l*$mD-Pbc(aer2srl1*)IBpY0NeXP`p38Hu;zslBwlW>HGIcykU&WV ztyTHGJL!99h95M%0i4%^zQOZlZsb>weje>NDmLDMf12*a&&qqSqbxta@MuW6VR1k% z`T&4O2EhZX&JRd0vck{cQ?vy~wf-6#frQ;_Mo#wro@l)-{&?39Dy{Hny&cT%2hZr? z*#X8gglwMvlu5%h!W?>LB8(F5_^0?2=p+)Qwcd^r|LHJ zyy?Zx=QzD@xt(n6zqaYD+}2e}n3=tC67)^$n||k`P!6t*`+<(L@aU4oO4sSjqo8bC zy0yab?EJ-!>?<^9U}cpLjB-qZBUjt4dd893-^^I9awWv{x*`)Z&piG-q4T0z4r`p!~L;m<1OdDDX508E}C-1#mBMu{pAftp=Y9$Qz1Fr{Qs1-i`>Q>JpcYmv*rR8B4R&d*^p{Y58BU#|2ZXn%#Ol4snzsRr1U~IG^#^&t)gWKCfp!A;5Tzr0Ppq+Y6;a^bs)l=`w!@Yi+T4K@|DiP~C#(5dYzX)=1k5Qa-17K=# z@PxfJfO3`$#+UN5C!6nSe!ho%W?#=7It!i?wG657IdJiVw2FLVxYZVfQ72)OAnwP{ zwU1w)%$NOSZ9739U3^NL@p8qYFuEktMY37m6V% zn6M%yaK_q)U#CeAm6|g@e~h-YY^(OnY>Q!+zq|tks`zi!&(lGKMd&O(la_cx>TRc9 zMT;yP&r~YSKhxd3;HQ~?We2EDX^qp94_zBABnK;^d46%o-(Nf<1*?K7AVJt{0cN0v z)riAAzwiXVJB*z)9^3hm8)Y}U#rLPSenH8?92yTJoC+l5t~1Z9W=A?VPOY{42Dn84;DKYt9l`76{;w+k8W&R$IG>oIr;*s zTOKQY*N2CP-Dl=&(T#)s?eMgcC>2ABO^=a0w%rXBiY6wk6`Xi$^J*J~dpX9m@d-O< z$mYN%N3Gy2Cg%d^$%8yzDK&9r%$;C?cE)SD?FN!s%kxw7Fcy4DzURZ5Zz%H)%_%A> zy&P{eG(F8yiw?dBvIfzx(;Xu}i$}`eS4cDfuRVfTt2h8Rz*C`T^|S3rE9?24YFXC#*=H3oTE& zj||p|j#+qB;4$a$`~}O?PEMgS^PItbpTiPLIk`5(2m4>14IW4fHMF+21>(HY0x}8s z)gMKVaBQbrQzTvPPXpGGiMQMS1g}dj6P4!gz0xPa84ymdDYd`42vM~++2SRiVHm%7 z&2A5|Btzbw(^k^kU-pJB*T4$xhheQ6Cf|pL_dMOIz|tt-1^8U$K28+d1EYmHZ7|(# 
zRK`!SPRtUTnpyPI8VWRn1-o#JHn3OwKHq}Xd29Xtnw7$jwc>@J?|*hQJlLkhv(w#% zNDe!V^pLMCYtOw#uM7e#{Arh1;T-?tu?xQ8!SNt9!~l=3fPV8(tmsmxqCt`|^#th&)Mid<=xlL(GWFv-u>Yb1MH+UM3b+~YOgUSH; z-TlH_LW^%KZY~~7CXwPuh%=r(JbPAtM=4$_-xgmx>uz+v&BI;E#meOp_cK}c zX50NK2{?RQyiCgfa)qaPw_`*J;Gl?-#|ZH0WxURdCS9y>dZ7c?8gqdI15~#NbQ~g< z%@imIC$%L1v;&GbFG&T@Mw-ky0u&U!Ho&!0<>;AiwXJ@xVU_NdUS*X#W|NO0I%L#9!AJ}Cf^9@)#`y;xf= zf?hlyxBycJ-yS|dTmOHQeR(vMZ}&G+rerJ$9kUE&Dnrqcj75kvh*K%bbYzy{MutS@ zGCN3yWX?RCNEC9Ec|4itnd6N2{oZ|_=UMCh<9*-X`>fT{s>N|#_qDIRKYQQ%^V!~$ z$A_si$#<}U)_d)p_@wajrlHLlkLLjp>9hcRh1>870(|1z2(i) zl>yD|Mm+8Gy7PK4WYy%O*sHN>Mz;pJ|Dn%g<|eq+5iHCLW5OtDoa;s-~8FfhX`Z9zjw@apkL} zjfctRGH;*HF~zoQxJV@T>;hJB*7M-U>h&$C7dlpx zt{}1ni0Q|)7ZqfM`|jH4Spsmxoog8Jt)+Z{ zoOiPh{`|ceO&^D5?o%8c9XmH&D5y1MTf7#)ea-a?NSSSAIvsQnj@sY6zPm8(Uf_TF zYd8lREGD`xH50NK$anpn+z(5RAy{u8WlfPa%51RDM$W@$tu}MWhZwKlOc?HThn=6) zW$?!m8Gj?utL+Ko$!{r#f}zvAtmPm5+xK&ZX8fVyfr*gWSjx%MP$v(Z*aDa|WhQ+g z-^bZVozO{1lU7_ANqwknoRf(C(EV<@jZMD&*rhf2Uw`8k>=Q!mvwT(hFdGi4r+v1{ z)Yi(s!kaZOxRK~*cJb07sMhtDc82G|T{j`MS*RmgC<=~y#jqbI)Et`tBgUWiU18E4 zh6t&OhrA~}y|a-Fp`@}Ncp4(WFBTpFqTwlH-_+ruCWqPh;M1XzJ&CZU{mTHtB%B!D ztdG9Rp~-tN>4(ln(oKKkL$%xAARkRjOE;C_>iWRYC81rERX%?1(x2am`yr~ySn1to zF0qW3W%u4-puK_=#Q>9}RhIoH+V%Ifu1dFlfA~`kth)<8yNEydXWwboxSyF(0IB`? znPj#?c_|r}5VzEFN|N>S!MD(x3#)@ZCNx$$IyozXWT4OJhZKz`Qs+2cgLU9jcRkpbywF7RA6Ibj!n=C z8lR6$N9Z45*Q$ptCSI&M!)GWDr^BauiM7fKZgbyZV^em?7|6bD1LsTYXj-@>j9W_7 zmYSL1YjW(&*f>E25lTVtqQR2J-rG}i}UdkKx?`8$$o_KoTkGZjB}!4Vc1w|P%E zLRI%K_C+SV@d0qA5X6Vdh%osHTKu&aA<6Y11gN!c*ah2Ick=9Ogmv(wS2`RC za;`>9^B^F`(*bk8v;esB<3mM2U!KFD)EXLplAQT`@D}pz>SY(b$a0V6oA-Tw>82<= zhtZDSn^TeSdt*Tu=^Mcrm`r9JJRMBaKH zxW3}rOPd%X7ET7GXHs5I5Qsp>vUtQ zaOzA^DAbxo>}Ra!>d-V9Kk*9o)QD1WshJI-wx%#qLdbo*{?~W?amP^n6+lT*qZBoI zK5PF9evU>0Jhje&y0?OP6szgQK}B`Poe-q_vfciJf*jz~sF+}}z1MFt9m->;KMd7i z>NEpAzQvjrc?5A6x_u|g-Y)q9Sc8Ywns#FfWlxyONEFI)U6W{AQWSIDDL{r5Wcg^P zeX8pJTwKKihak?&&nNU5XhmgGYbw6>uop*i?CFc|*sLM3ewRC91;q9XjP(BQDoh?) 
zede?MjZaw1?Pc+AY)y>UggZ`?*k`7n-6D74-YUx7T!=8~jh^u6f79W`_<(r*U2||% zFI6%;_Gu-#bPr9fv2`6v#kVEsI-r>EQXH-E;ds0oo84(Q;jaCg!_^%e&PAQm%J_&& zF4)N`2D-xFa^k)&u?rh`x01K2}&5NF<3xa#KZm3$WGu{HLAgatsy;-hnG9nUg*<+fuGGSbw=aU_aFGl2QBM zZR=$?v;gqXwYGy6K(S`h3$~|@=+ogiV4Al{;ehl7x+QZQuCWEAc=o`_v7^JMuW%Wy z)pHmca|C{(s$WQec+%oSV!7lhltgQ`?>Pe(22ibSx@<-iN6P9`bmzJ^l|JG$`Lem5 zGjR6j6G7@!!1MGfL~k{A9^D+N+_LXk23ISrav#UOU z0thP3h3n`$*AKzzIpz7L=7E}}6mrl(dI3u~b%Qb^{kNpy*Ad%xaIT(HNa%P!==zyZ zfNVT7Hv9K6BIhFw!d_{7Egqd}D#!vL8H2woiu7p1zL{`n8GU|Xly}hTWBTly|>LcJwq+M|=QF8qiJRdLZ0hL|U z3H~v7a_bcjTA8l_&kfCei>8_fX7HVCZ1@}HLl1Bn{BTCyz)eVN;B%p_qYpt+!tnIb zd;XLjw3xK`8D8dZ3^{ki2L`#qIUHM4@!$!|GMmHV3_4oZLUlMBWUcGgYpmchX}>0| zNS^23sq1PU3s>qEOD{E_;1Y(*X$iy0GqMo*gBGyT9m@4JeH($N9fL~?8Z>$q4==Bo zF%*r}Z>=V(06Fg#BJ%S|&qcC<2DJ~RNuy_eX8d5PDS0b35y`V;5|FB3x28bd+PQv+ z_G71thFgb9!N;TuFnJ$&>MKK&@3`$}@Z zs#AKS4~?y`AW!L8x@qdpxPmxop~3I8f{B6N7)pz^)KryjCt==C{Mm~SYi#ZIca-FMOTR?&pwW*fk-fzucYx< zze@pAE~CrFYiiV035!F|bbxFMCQ-i@N+jrAl&-AQ)5Ek8ROh`pz9n;94JW^cF3jV` zuA<3~D@OI_ai9lOW!P~0gEFrkG~qLMaoT{w6YqtUyzSa0fAsvJm>NiOPk(7on-=eT zoQ~U(Cm3uW$A7+V2{rFY-v**V{WTM(aHt9>p~WY*SOpF@&#iD2k#T}9B-?B6ozLK) zeLLWL68`nmj(3wtor~tLRuC-M^g6QaAB&{)+g;AU19S2&p##|h^<#LOeF9f0QVqULh>Rcyt8hyYEnih=tOY7xJF{A#%x9#CHne(LJah-DiLT*I<0B){=~?I_m2bjovS0nQ>wt@F zsoN1~K<8OLIH^K+bG<;BJo`km6N)FJh9E=s8%|k|KwMsC1@c!EIKRZfoE=IxJ%q)p z*fbx3wOIWg0Ano^UuP2q0j=Roz2a@DFc4QVpZ#Hw76G(6uNr}wO5CCEA%2L-+sP&I z^C{t6azWx)63ARDm1sLxtL#5r4Y|H1^GTIUQHOCATxlt1%9VthEtW=qy%ZW zVN7(mY|ABxdlv|r70`soCDZuQ=b;`_G#KOovF%IE8s&nx5e(>*`@Q0dKlsB?)D4n? 
zFm~1%>khBRQRrP0#uU&VM>+tetg@zHUFI5iIa0;YFA|f9nI~-#53$tE;`+b@4Vt3= z>-)9qWfw+kkv_Zfql95{ay7EDL}(v?1%c;R8{%&zpKZj|+Gx)oZjQy$HiQm-moukQgfph>#b1VoyR{Y3LUK!3OKUO;eG zA*Yu#v}xn*)B@?;Vq`<$&H~aHVq2A$a3hzZfyX?%u?7%s3<2}C;Y#jhu=2K&&{Pzn z9(r%H0eO6E{s|(7oX6E7;qruA_sL%fE7i6#Y!BId1Z;Sf11?9I zL5owsV~#S~89!+`Y{7+5VhgR6gMS3UQed5VvdvKSB9tTztAE-8F0XxO5z~3}7r@W^ zu$0N^K?@B9Xsp5i?}5Ma*N6d!1*XT1UX#^6LGTQf2#N zQAtoD^%eB;J@WBA!|f92AedmA1&QT#LG>+HtUk~3XAt=E*MDczJZ-LQ+)vzuqW9*f zpm`$l7LzY|aT(|Ug$>?ej;>&8kU76SDVXkv-}iV%Dv+Y|@rC3F%&^V@)=1#gYj=pR z3VASXq#EeJ7}~TQt8zDI4|y31rS2o=qOoG2J_001X%@@=8gkGfDlK>HbsnJaUY`Gq zRR*I;NZ6ekemOCSU`EBg+G|3Pok3X`OcPA2< zgpC;}*Nq!H_B)N*F0P|r7EkdlGww7sllIqf?~r}SE_lS&ui3?Mj9oohEL$)_2FM0% zMDNnJ)4Ms&R%`ocousT91I-{4{x5J)t+-36I(^(0e>aPfXQANV0hxv*%a5Jt5>I0qF9?P~4;$h5^&YbBUf$l<7A-xq6iWsNgA-908 z(>r^eve;^5Ep^mur~%QBa0GhUV;ljOka}|RCUj=8N!q0y`oWU}>ezRyS^@r)?$cS| zGEe$6WKa#fu7kQKuAtw6O9ZL;&PSk$m8B7sU_`Q}uhIHaU| zKIqdL$+v%=upa9HbqfIT&Wb4D1wW)V>gFs`j(r8Obz&~wx_ObS7P$`GSKcxgRt8zA zh#w4W^4&M^J^hG+Z1{zplTgQ+NirtX{j-ZFIYU2e-9;^~9A6aaMubH8f0i9nCPEd3Ppokc_zmdNq7l zV0&Bo3BZG&U7oCt@F@%MTnj7s6Ig!cXTR!gzcK+nTHl<4zIEU)J}iOi z35&daj@59`eUH2=#Ic6UJM|8^2mJa?p)L`}PpILcc{~(6GmCzYtRO_}0fumx`Mov+ z%EL&Io#^R6+AtYfgzwkf(YIXM3*r+1;Mr4~OP*Bc zEryt|k#@sDloNHZn>b#iWaScD_U=lC9SiNS?jBP;ENK4*>zV(9?PMeL(%o2lo@k{* zL!tNOix-8+NauboXz{Fr-pA8he0QrrletNVz8*AD)$Qx9q`u-Or}M`tgV&~yTSQpS z;)q22xeoi$7s{dTZM!gp++Bd)=toUJVc%j;W|0%k%r9^Vj-eV1DPJYmOl&9j8n%Nw zdnR?sgZ6lE2E6LHkLU43potF$OJaOK_mOg->Hqi0V`wz&?dkP8FPC- zbuA@pV+_SQ4pow`>h){Ww!2Nf7c-(QzYk(3U?JM7Z|gWLB&q78!l8G>eF!zNb5H|) zQPjYCu2HYI@4Y7<3bt89P7b^$)JD>n%=nM(!MKS&09swR_qM#2f@E6jGL>{RxRq5& zs9h#8v5q0Pu&LKLUC9Q2pwM+7JcouNfVR6Z;Hy`KF)ulmPUl+X2#h+^LA(%>|njG5#eivk(H;da7EwhrcJLpnDgm1?kH94VKJ<=6BZA;*%Cc*uSow*=?_ zD$9V9(xZL-Z#|=s=qer!I3F5=_8UOKB1ShNS|4EMwFX8rwpBxED`}y>tY%$z7ZLYq z@xY6_Q2%`NQmhr&Q)XGwd}n4k_8t_q4g~i@4~C@AV}nkKbVAb5=FJg0lw1paTabSg5Yrf01{F1R5`d5g3&=xI zf%9)J-zNj%CL%R)|Muv$tqHSiWclukxf-mBpcN!3^?23rAix9= 
z{ql(>@sauhY^_BOKbi&0RB2HpWRPihG|;r|-2r|DhpixvZp1n?&`rdUexEB(jX1m{ z^+{0lK}+2CEN|_T_6V5B*;z)gc>eW`>P!K$`IO^u$pxqWCCL+{LrP!8ASpY+@a3|p z!hxL+z4vZ$-WYw)@$hlia*X*;UpGf#^T3?zGX=J!o#m!GwCj9&eH>CR5|ta)c!#i; z)RS0TE}GOtpTh@k$=oU95J|67@U132XPGjUOO(G0Dcop3Py{JrZPYa=Vv|g?nFaJ20U4t9o0F6e+9sPjA>V zl63)iRbC}m_gq!VljL*XV;ak{CD`vtm1@OEQ^@c8Pkhu=COd||X)ETB;D|fUA6v>N z^TCct7PIhMRaK@C7FQ$f0Z}I%JQ!N0979_M=gEgrQ0OPfNGJi$BzEi<=0zXZ&^+%4 zuU+^iSp<0{x=w%|7w?Mn3}bqDmI7m5?lgk(Ye1@(`^hvD(YN!smRFNc8VuD9^S=)x zruUFYETPIN{S(wL!&pYM)$Jl<`yk4o%RauobYJQ>c*9)uQNzchm5=&UpzdD?EcMv{ z#B6HEpd^4|MC&PmxKoGrq{l%LVX@ks&_`2RfP^!)lMz2|G^wY%SW~W^LWPRoBwzhS z;OmIcUBjgh?ClnpT(6p$#rs3{ABXb7j7YPD#aJXE(PMWZ#K`)7oRhBh@dfB&tCeCc)S&$*Z#y-=~uN9@(&_ z2DB77W-oQ^j{jAlr)oBr&Tv+|ipsoTwD zq!pBJ6Izm<0x5g(IRhg*i!oTYJ^ecn;tmF%rd&&- zFPMDvf)1_?z@p*v9G$3UJY-hWn<*>$V<=}Hy#mS)gE~VH#n9&f>q1NaF-N2dNK@Rq zmv?eHP~_Zdmba$uu#`X0LNiloG&|%AN>12}aOTDTe~1hfdz6fy2p z%n-t!U9%6V+Wd+31UM{a0y*)Q(C#L4t)MK2*Pe`Cu{{8kXa;&2j!CwFBIZwpbSL;C z73x{gM1vx#RjF=u^L%#Hw`U_te%H}H)X&N*yRblKsc4t0q&_m+9z(3?V?|X5cre#>h6! 
znOQ`##J-f|PAZMLl`9h%I|}K$(8LB1T`GA3{i9odMtMe6C&K4BVJqxTMncb=lr5pn zEfBXw4=3A-kZp=mO|?XN7W3_ILbPA=2dkb{3kVOY(fwNJ@H)pTWSxAfd>8(oka!N`=3y!8Hfx6^&-4cz-JFoNL+{H&dJ;fe{6=g2U!M@CIR@w=nL zh}!x4u1?g=#~hXbry4ttM&#z2Kho0F&36=r8cSu|GByS#*F99s@__h*m2<$1@<_R28=sy58zS zaM`?9Bk7r?Cb%&Og$3N}d=rbco@CBV+e!balfDdv@?cddQHW`3)Uok#c$g-pT+y2L zi!7k*?S8fs`e7hCw+b%iJ*K5!0u$C(dx5x-b{gUtWbbG0umfM+e5zf?P3JF1bQILz z!c!lR(%&Z*`CHOBU&BSMmI88_Rp*$vhjsg&RKb;`^}96XdG~IF6{P$mY6(8KM01H| zGFFh%PflzEla;|C{&$0XE3tBqHdk?}`kKn1NkDfD_Q!S_Kc$Mo>C}7|Dj>myh;J+$ z>oA@Gz?S$egGUN(Q60o1(dY8PF&<3c?X&OX0coA*u&~yE3$*>k+@hT~<5&EXPl$Qs7SP-F3 z4WSqx>3G}bT?gQ`2#_o&9 zlwRT+oF>4o*-RdkUj-^@$Zc{!Au^ES2uYn8JqqznA5?%(^`VoOXXzmmMUdub*K zXR)RNTeB3@L|!GK00-j+`>`TGANcG!KaNpYM+Y9auGY82%i;L}xKuX|6hdB5*c4n0 z;Gnw%PYEk65nS^7IkgXivaQ|XPQe_=ybA%(_~C^8&a#o;=$lW-y40Hk6EsZ4CDD@B z=GEA9fOeY|cIZXlIMv(|xOsDz{l+D`b@>as@{t9|24F-brU&T@c8;O0<-czFNbf=E zCZS4@nbw&^j4UKT=}ifTUr`D~G;Pw|r*u1t(&SM__a<$xp9C*8;tQqt?fL;C z^p~^2B#*T`UYH+rkf5Akb?wKSt8OnV4{W5r{Us}bmuA@lEHyuu0u}c+mEQWjDS0;_+@zlqS}uQVjpJX$ zp?!cMp*=t6+0N~vGvt-ThzKz+Zbo8@JF2GZwMmN|Q6X%p8a95SjB885s9FJZI;idiP<`>;?^#DQEv2w>JUDeys^ua3pzh(I zA>ddgIGjv0e45`zWna%w%HbVlcl?A>6UoH&>esp+O6KT4sSO>QI5Cf2-}x^ob5`%< z1p`I@IQ1$;cALIfeA_KqX#7@5$c^N7@WDt|m+5+{99xm$MpF$*D}^Z3X9`N;Bv;zmFUnP97dPp z#aQz7IkNuiokg0^8LanRH+*iP)+`;X#aUz|C2^C&bQXDQ`!MqhM4a!eyfC7U^I|wv z)vWA5awXRP=DtCt`cmdFpz`HG)nDR~)=o8Ox}X%Az)mk(gp4dkmZ#fNO8>E7Z_-Z2 z+LreBb_cqfIlnESD`%{^^h~oa=-OB=v!Vc}tvZmIr!sa4|89oIDE%cfd73Lh)bZ{O z3foF7LEkhu^F`j-@A$F#O+SB<=^45EA=9+M$g6d@M24(@2EDbnKi3~`&D!@K6Az4M zRr4D9MI0!wn>ELzz2Ui`M*lVZn9%w8zuc++!nA88QOxzzGF*L>L2=YldE1tumdgOQ z%D(u%xx}5CQO2iBJMP!qu4lk+JyAiS6Hni3m;c>FBihNSPL2o9YC zP;-gpu}J3`&hopTBik>%I_5mfxEPvl{@?S}youFmTGe){n;#hCw)zwN>D_i1k86h~ z+Z#HEi-$2j7R<~LuHMEOYcY=`Q7+<(tP&uJ!~ClMAr;~^TVkXKj&n28(#1-Hm~-0B z(996=3@i5RNUk#}9ZYt3~^NMhh?wgUmf?jP>gBMr@c#8&j zOOQLx%c&leq<^-Ln@_LmRb{xzEKYvN`IaNo!zgrOcR~Fq(r`C8G(h-`kphFtu31KdU#V~Ry?&Skj6V}Wc2_C1DyUn=Wd8@X49tj;SZ90k2&?)$B 
zk=M~96k8Yq2%$#M+FyQ22^;VKfLX79VFAPs&wmXI6+H6*A^2m^5?JXz()D={{%g9fLDK_b=md}eLCctTsQ0=D$O$4 z{dc!E6-V`sHMx-iOG40p|7pRXlOS(k@JZ_p?3;s#^QEN9Yc}edn8^KZtb( zq=pX-fGN<%HV^;E1?2SO*KNn_$QMTyPVg_M47LB@cz9XqzXTxmU}GoWf`3ATubDr? z-Z`0^%Pamcf7J5|Fzn&xF5=PTq&{ivPYaW`&}2(3|GJ6!$0~vA+dr8?e6@4}rD$LE zg+<>x>S*P9SA5(IIZ_3}OQ0pnTZDX+J;Yh~1G(dOZ6%nin~xzospqR`QSK`Hu3M)@ z9IKRwNOe1xCPnyDf#D%5faUR_A3O`gHghliHm)*;ihD9}Xi;pz#%?K*P=#P6eZpr$ zwtEa`xCfD9!WJTH1;!zN-QWK^^y14%nv;`vs$X1qCyxq#TWtAgGVQ{hDzCFcC8!9v z!|$oHvo72g4|CEyEHCy=nO*U=Kb&bjxwNuzrMi(#Q}dQZG5klfaLA!>=wspF%>jp7 zc^BwK9Cp~lil}mUyEDRjqSfXXt@WU-vQdTYzKXh?SjuQ)*FWn@|CWsAm0%V(V0QZ| zBw@fUoWpQNL_CZkA=FfEAm@&OmZxqKtLNW_z1ttJT3fTgJh%OMrLKMUr%#`1+1egT zOFQ*zU?3W%1wB~mJ8@LtqM(>w%$Ha3@5_IV{5q*@C#M&EGQ6STrcOTv11;mI8(A_c zpHLCUWi21fcfEysl#dUUT1CL!h!1TYWnl_qZb`L}(7N-})lb>Q%^`5xc|${E-{9bz zjSUwIzZZSxCAs(B1}c~Qz*XO4f0ia{Y|boVdfhyZvyg?JTbeP*B94>+9Bl$SsaLwB zw3xLMnCV!wECHTtS}bwmhfNc*P0wVT8kican950>;f*Sk9m*-aqhnnkdefR^-i?2M z{soMuvs-dljNh<7b#T#x35EB6sAaDE{90~l2Y zzUuJ~(OPlp8w%sqvU5*0mM^|@@d-UcJb7Wq(8dJj6Q_ysNS#U{cT-!t%LW}r$}8B5 zEla<=J}z4hcT|3|t~#kN_8MkEey;V*zWGdF{{v6v+Z!I9xBhZhH%eVR z4)(H#%ynb8>u(J>T-e2Mwryh>LV2IbJodGKZe1$!Q$G^-*Q|Rshy7t;0`(JDv<Z9Qt>HPWgxfQK3 znDEb^;nU1HsR(a5`eaUr$@qhw6tkyJ4(F$WdXT2JCw=!kq1_wA27aw_920XP3SB|x zYpfWG`zZve`^S@7bUQFT_!WPWR^+jBbi3nHHs_8jjtOL#YU_I|&Ey^TV2o8RCmIAl zL7&t0z8(8wCrTl3+}Z4-?6KGX6PTk0<4pQb!MukEzO3}9dUs4`6x=*secPrrcbM-X z$*TL;;NWq?cnb!y?87QmoLcLo1(ny(Qrjir-{gDM)=*~lH^x4=I_L7)W0xMmOtEhp zjV2ufMg8UDbCU`lPuW-vuXHSnfd$Cj{?Fm;tEw|^Z=+f#>fN`0N69~gn`0kL=N|Tf z=_5-|(fTrb#X3wae<9Gzwkb<&z*DrdfX)1;jzIt-q7>Gqj8c-^#dQ1#p&YrxO8@!end$e$kLvG zf9BcZb77WX6yLdyiN%GPFg}|vRMKw6gH{HZ-ODSW;-r%{lyU#46Yb&O2tp0)mdY-@ z)3y~aR`InxGUfB`jT9wcR>FikhtZ_FOvf8U?4um!YZK?0QXunLja}hjx|f+8L1?X|r*>sr?>E{3K7tA1&xylsF{qTHEaZK3Pgs*4;JPz0|*rVO_>nD|kz0=MP#f~qegBowBT z@2u?qCo>l5SbbUpYGd=6{B;ICOcuza3@UNg=D~z@H1e0}LVmWYW zVJ4@c=YJdbVII5sbEYhaVowx$>!krvcVW6Cwo}M^B3PG=FT1g^0RQi!`6|!*dc=iV 
zqegwjF;f>Cz(Dyqg1BlGPA9I)A<00;cvDn7=!ILZk4TD?QPs@#6zcXNJ!(N?D51{a0vQ0THcH{ zF$|gh8ZrInB|c(QDl6y!;iU$5PHX<5VgjEE8|SPjeMV(|4%X?BQF~^QFmmCph8Xb*jVyBYxihF^i+6qt=Ta!M><& zx+C4S`>g8hAgL_--H>QxXRNr$fNHX$hYwHqQMC5%E6MXJ$$cZggUB7+UB*uahKF$Xx&8U6&)n$C<-t=pD2xA7}V{$S+3+^RV>HWU&7WUNrAk zgF~U9Ok6%)s8$*)KME=tG`&u5#Qk^Fz(7Os(sF4|W7TVi+sCb~ zZ+TRE=T`lGdCiLN;#kA4mYhTDtK+{KYC_(?f8AQvyH}Meb(dWdHTu4&f~B936>b7{ zWB-iYtty|a8P745W1NOtcwti;yvKeW@AH>=!}nLnLxZ-bytMQ*LM!~?%0L-j+)yE} zB2@9cfZ~e*&S;@uq48H&6`k&Ah94`^zwXZe^Nr_I2f6nZIj_iWLZ?o>E-aJHwm6r2 zKRLU!G^jyalvcQt6Q*N;xb{2Z1h;lDT`pr%p0G^e&UbccgPSRnv&~7!TASuw#_A8! zJU6^gIYN?0cT=&4NjFy+9i)RpwXQK#rwxpMmW-1TDjj6}_{8RNT21E z>J>*PG}#J;ij_9^>eNTR(6AGtn5{MS_rf?Q_2)YqIrCd9>w`B)B`$Z3e?QE_B_P8S z=(0nsnjOX;p--g_pF4CPN`*DLFT_6@{CJDF`jTU&7E3#O+Ly=$d+XjignX*JgjSJ- z_HN&i&(Tg=v*mW@YX{yxGtfL0KJb3xPK4W4micheu6gd)`?>>fSqJ9<^6XSJ% zr#b35T!THrWm<}ES6|GCsOr`5j@qht+l}WVO&yX@amlK-|GkU*7+XpgCbDDx_p<9B zRwn$=Fx?ptG|JL=Vw&_nB|--#f%^JPr~TG8>&PomD+Z$SeXgy&DlhfyogZ(vZuFSl znVLyYxthu6ym}@lBY(|C*tVyL6r#_F7(do%9?{j7zbB)@70PKaAp1-1s#Y_{XcFEg z5p(m4Q-fFn`YF%HXeHPZ{~ceah=nX3pS>T^x-384YdF@8LcRG=R??at1*ard0;VPT zW;7=HP{!Gg*suC_KidjSF2a{DEgTkHF=y@a?>Zgu1EMKYd#@dasq4qEq@Vl5*>#}3 zQZ-q`xZg<53i@!9e!SWK3}NKSQ9@c1yX2+@YvY;%!??5Z(XSD2&E$BMs0RE4&uDtT zsHZr)n7N^}skfDP|8azn%WiQsuXD+ykL|pk+?}Q)^34sW{B3TI{6ghTGv z{k!~ub)>@TIQzx-D*lAkDr`pT`mCo)jMUW#J^Ok2%oQ0;<_GPsxL|e{qHPiD%1XdP zbmvu6(p&>Z|C(ORHKxDKHV`_XS5!R)c zxusG%x@qZfPvbXZS}*96fi`f@Tj6&3b!!O@Hu}pAHv5e<`V* zcrAgJt9Z%P2}vepw|t0R_nthG$T32H$$0#jZ3l;beM_lHEcV}Jh0P%OplgnN?^NMoY=uR+wnp%XZ}Z8h7{AR%gyy8?PHwLNov60?bxp5J zuH1x6PO3&@jv8I_7lXK^6n&yB+We*R;@EyL*5CQW*U7xHP89#jO23=mmh|AnS7x{1 z9;DQ2)W0O?AOKg!5**{Xo&xO&_$&k`Z?Ej4bAR7cL?Z_uh&~wjZl*>ML$va>xXR?O zYqIapbCYxbqKeGP$rMUYRoz0Gqbmi0?)*4dTv^IP_i&{Ag`kG*CI6kJ$>+q@OOMvR z+ws2gzs#Z~enc+)N0302$p?fHa0dkU9nTOksvk<@m)(rt}F8P)B@$XU1j|A<$(vJwjX^9(MZ|# z=>L32ra5kV(lo zeTIHOK1pL=Jl|+4AZn{BIRyRhEdKwNU;?JQj?Z2X+XNg=whmNxY1poP>gX5|D3;wZ 
z^!D|L*Y7)P2h-9cyV;qF9y(&Ln5EY`l&(k{h`T+PJ*@SjKV!?sZo4Z_J~?tzArQJkKeF)q9#-P4a%P zeR5tv%cB?K7cHwE84ImvMn><(zEbcpRUbw{aLHj5_q-!i*)`)%%7!WH%|(3JohG3W zmNZ3MjOmQ^9q!O=uOu5HNuM^gEMPlge!CNuWTZOmVs}FArJiIqVsQpkK#%G6{UW7$#&~i3Lw{P?PN|1-!7|3&O;PzK@AH0O^ z8QGsv#3OT=)mM#~)jh6p@P*aRWK)ab-&PQf&MvCX^o{b`>6EhE>Jk`#`fNEmBRwo* zP0c<-JrWm4u)DmM!(o2$0o_dIH=052qI>Xz1gWXjlpwJq`c7@_jwVzZzI%7`$7QG( z*rCP{yiT08X%9G{g~M7{F^8CrX>rQ?eH1^Ucb;2n!Sr%nhDpGR`2;ykUcqC;^aadVZm?PAf4NHO z2;%xZRjO?VmHCkyl7aJ=eg%91#=;N`kcf1XHB?nq^^&rIO~%s}F4MW~xYh6PqUPS= zfNjlf6EDy&Z}XIGZloG@=f>d3NN2VOfD{xBzxpljpXK~tzkreZlOC`x&G0r{?ny4d zyhtn6xm{-_tg4!>*dO6edeeXMx!vK4ln2p&)It=G3=2$1yc!mG#d|6AxRLhPsVjZo z@~w>ycu6nsNX{L;gzp{iZgKmUeQh2NTm)|not^4f1+0J6R5u!WNh(jYzGT{oH1SWs z0pPC1n^Nrzh5W23u;E$qPpewE>;b+*Y7&M0yw(|-B2Pa0-u-i?sp$9w_1Cle?uN&| zh1IKse@(&yp&{6BK`a}M0;Q^d_=31Zc6%EG?+j~4d@q3?J+MMKOX2B3RVLz*P0{si zwZAK|vbLDtzv=z|974>!vvPT|lO)}4`&>BVj^s?{ss{4n1?2^w#RpPN%08|`Ne2s5 zKceOUaxg$)U}$5eq7yM&dw|C^c&>(k5%e>o$6m@@S6}?dV$|p%M?1os@s1|{uNxIR zWm3?;Hr5b(7+3muw258t=>~KEk;jX3uCOA;ZuO;>wt3|;;o|ZBYcH--9HVdx_zN|d z#fFY?$<`ZJAlxo{O_@DLvi~PHa>{j^_7qapsA-ir)?^Er(-Jq+H)=C3wn-QiiV(Bo zD!yrT9KX%Ks{BYTU-$oTi~qOlO%pp99~(>6^izSh5m0aV8v$2q)e+6pQOZseT6Tdu z_j}bE6Yf|Z<$D!Lcf|IX%KFHAp@!<0_d|r*JhhAzzC5vwGZ;Gx)_ow7_9WTsnH3u& z{+c)JaeMg)wVx~erzC9vyTgsOn__R>w#^eUm~L49qfH1^|B_9#(dh{~t=tS=Zqqor+ z)Bc^j@A=O8|8?H)Uu#$_*4{Jwd7iso_jQ$YePCg6R#hVR@3OvsN?%Dq3r6t**6SKfMT5>m@v_-tIf3yD!)lVI*{VUhK(@ znNR<7ob7}fqaI4&dV-L1J8gV6Lj8cDw3Y5U+2!}5qUG#;=gr5D)Q`^_h*Y@XW|MzdOc|uUYIzZrnUF1d|gC3ZU+5}RqSU+k{hQSy{ z$%B{ZI4#3i&xu8J#Gz+W$Bbk=369Rx4D*|d6XW);c5>M!i=;w?9i;(zZS#%F%f2|p zVEdMbe|8Vj^n|Ie=jrp|8{cy2wdZBut-V`)VSFmgPOf#tWzu)OCud=K@xGaI2!T)= zF;Y5#1}cJlLHqaW@zdBMfN3Tn@^eL@%$Fap`lhcXV_}=;3u^7IoelE*@yyoBj_+dk zv4RaH=J>nN>iONd0rjIs4Wos}zK+|2&OB}Rf16Lo_2p$0C#V{3(D(f_?5UKh_hw$z z@QjOUvi?02%jT$L30=_yIEQRVNCDJuS}VsAJ_h>VChPp=$!As!c%r(_ zbNU>*b&Bgh|K^mF^Py*Q{MXP_H$cs*Ii0QY4F&)3^XHJO%BfB3O5KIg9Y(=X8*hkL_$Yadouu9FZ9Yamkd9UQjJ`@n z`T2_#N8}X5j3*lVI>nMCPTFULWx&{{ 
z;a;4zk$ls8kdUI=H(Cw=5`?QPJUYPb$@_=A=b}!Q-AB?F4SCE_q%$<%d@2$Qb7VQ* zac2NbY1e`K(G~Laz8W~C+fs24zpN9`%NqGZaSrWe5M|l#sX_jNRwago1*zc{ObFZO zKsJpbXAdLa-H`11P+OAJ1yPCa0znl*!&8|&n<87w!?FpaxWDb$U|(mfd}Yb&_{6YR zvO}ZgZ}mUl0629M%bZ3^TqYHOt|#i((_0<79uP+uQK?Bvv=WN!)#~h%Fg?Y52DMJ> z-$;ah@cqC9#)k)Iud0$X;)i(a$($)~PQ=>creBU)`Y@ZxfvRn|gd{XxI3r zj}A&44Ubi9?d)#+f9yDBfg2#{=o+m*yv`RukWGlxVuC%r$1hd4Dp^o-z@i_1AryRz zIwY<-{+K6I_eY4$xe7P>Sn7hY1M0?Xp^!$3G1c?uZED@()wHhuwbI7CAPEVh@gm#X zhL^>XJsiIPQfuj2lmJbjS)DySz}WfVVN7mirO5Mh#`bt8I_HVn^Z=+oBa%$Sj@LQ) z6hR#CY49Um<$Qt`ZKd96WD=$%Aq;QrJ_}IRMqSG{TGHWv>%S{yrCq~ZR2ngWjR&@DXUy!{kcKYUPqnyRIE zIUvB(4)Wp?mK&2(1xTITLn@x!rmOMUTnZ)!z54x*fMFck`L3Qm`zd8%Uv!>>%t1-< z!h-$5@&BxCpAXN6_g@%4i9PxSyKySODMRVkhWh5!EZg;z{2$g8*(Jfx;Cz;BE#u}B z))MiCBE6fDC;KpalO$g2O7wi$Nl_rRShS2R5l?ppB&W1=|j%*po zSMle+K}QD>;eB3dV!t=eXu5L(D9nyhN9IDhs%G}qm8;(SDrfzE?otxQ1`CCxNs>a zr(Mq(W;yx{NR0{mVxYNr9X@+>eULYQ@V8DDSMn5&u2cfb-=B@lR3CBoep+ABPAJEivUZn41g!3B)popK(F=IXwvKmP*F(UDusQXz4@^z>!a<~9{8nAu z=48d>w$O!?!@1h;eGA`=v)i`aL!J3QjPApR{^T#%7^s?<5iY_u3@(nj0am52$&f`s zA5GN-C<2}4_m!1t?%*~-C^|p&Co*j7HN!l)jL^k~$8+KpSS%axO#WhZkvmHqZBaUCY^Gt(0x?FE$R?GT(dt$=8JE z+{8~Jwb-reHx;(Xn?yCj_9wN2mc&d&jKz8U0qE-P|LVCBJZO&8XI&}ocvIKqGSlTm z!yJl;wm0`!B`SByB{&paly)Ag39HZNjMI#8sQy}b7dlEq9m0WG97Gz}RQ9~wrh4u| z5q~X&GK}@hU!ehF4eIL35UP`OSFPw3N*zCr9tw7+&*J5Eo-TP4L{AU@`px!+8Rt3l zo8c2JlfcYLYcSpVNCBqJ2i*g3jT(VHSt)kXnt#f7;gDpnQ~bmR&ZG;Af-SW+QsqPN zPly?*_=H#>8u_=#tKc&?+ zVOwl3Bp>A18gh2nMOAO!N&9IqQMr*n*>w`by@E!9SbI6*FxD#r&6C!V$MfXPRu}kD zV5%;TRyBw8fO4J?CTWmU0s--iW$46V;XEEtYi-+e!QC0wO)us zAz^#y@t^Lj)JnxlrWW3uXwqQ?r>w^U%>j_T{c}o=4XT$7dnjSLJ6s5jsoGmg(9dOV z=!les`d?Bc3|>k~VaHH=H>Ec+7-@$^TVQ?Q?b_nDOlPD*FzEEL$rqG7V6uW$3Eljg z;$|~PNVM;|%%8JrO&t7XESli_`C{?aE-3$hIX8`cI(vk9Csza!I&oFTFXBk zFhJs>;~s&`<{befH*xB>O~Arbf{Lmr_$piA<{rDpj$iVELz{dPcjV_Q5mPjslHIQ> zrj=eryVKcby3T#ti6A=ADHX8y;?4h<2o6HQ>n`)}FL2{$qH*iA8PBnXuY#e(6WP7a z@3>SLPHq_itIxWy`es)wpR>(KgO!FXhz%6iNZ7*)9??h5w;;&FAiw%kFPvTggmUC| 
z06AfN(3?d)+X7B9P4VcbH%ap!SMu2PG>ZJU^=BW=`3eQ?A*@hNFf&bu0CwBgzK=4m znB;E5>PasPkfg9}y-S+<0p9raLPaVIa07M0&ZCG%4kT#pY?*Z3Zk)(4e%0F?tu9+m zmN<6+Q&mwp9#{9-i?*IRLtRx>1yHqa64q0vOx7uTq_hylpn|$%@K~E=++!4M7_p)^J76lY_i&GXTLY5_0N!PVwp#jncsZ#VpY(<;F zzCMJDaqFH-YxwGM$C0x7_bvZsF&{}bk5r0})1ECHrI4-UC%co)e3;-G_( zfHVl7v`YRfvqIwo-CutUK5MZXUs}1}Jh(Ec$QUY^Zdel7dVZ^FnJ+tlq}=r2kZ_pV zK(}`pN;rMo=gAyO%|U~Q#hwD!4(YztuhHl}+xUUi`Dj$ay{&BLO^1i$`UX%q4hEvo zYu6Qi_4glSMUc&in9>Yo{XjTeWK}xr{5-Lw6<`5 zs@y?H=aC!gXQ!|C+0}9Oz`V0G4e5a0(7^9=!0*dHwPVUssXWXdwuKE zO%5w2CRV$D?S#>|HZWYRH1oU0OFm34gK$^P7dz|>6|mG-Nc*#)LqiV&@OY4sA-?9j zH#-cvBKZ3+YwJhB!BbO`QW>(D5W?P@?zBgm-fFQy`ZroGVj&P2W&b?$Z83W-qFaDh zNZj!bS3}vNvE6+gC9u{X8!?SD+u<+~qgyKP%W9*?avN(Roauf|O|?Wz*SFka)Mx+< ztqaeMiDOM-;ujmNtK?xONvaQ-*YwA0phw3Vek^A%H=ZN6j9fKupx*I??|uum@E^RE zYoU_j8yh4(_H~GS@2@oX2&crLeJQAFC9Je^McAP~&ss|8t)a-{-K7k|k|OeH<9gvy z)S=>9^eHf7pUEpLlfKqb^Kpg4MZsg0G-z?=kKK()xleqxqE z(RVJr#utp$SB;Vt#Y5<0>JiCPqkGo-q2|fIFKHExPf)T?5pvRa0nic_xlP9?iUVI08n<+NGwkf|ruzIZYmFt7X6B*@ZmEt}FnQ>3L7l^#bg%EgaXr zz`vt?=(XQypA#m~{|gHML6ChG`Ng%d#2v;NRhDd?9WEvUOzc;mE-g0be{(kKP~uFG z+I9oNcQ*Lk%G%oL*SyS}ee*CWoTtKm}}E?<#a#M*x-*Qg?8%V4bS$k_6u}ZN*F{EO0M_ zPEsOZ6la`gxrE$3?*sS_=gIo4q9^D5{HoLd?Sbx0<;0DRAI-~T%j57aB>!=p3hD5c z?}FCZhzOt7ln*+ipo3v7TfJbqZi(xr%8M6Hx_N^wJmF89!(?9!UEbZpXU^mPdy|2P z!BqHo0fdjzyL$-qEeD-EOJ4Td2<6v|&>a^2&htOOADLgHeQHj}y1T&}8Lqwwt240X zcq}w2;9qeK@@h073pLYk6he3h?{N7m^8F6zvfj>Aex0tE-LuhqVRCd+C`;~l)Jw)S z^MBz`NJ=-gFtW0OauAftkd~Ib05y07h&0FOJX6Uj(}DryL>ZP2`?9+H2E~DrK0nX0 zU=WN^5bPaptv@q(wx#ECn84Eez?QR^9zL4+i@#H6B4*8@jaM{=zdfJqrr?t0n4M7* zuO)uxjtjl9+73r(XIIr~ZruI5cWtBvXg*$xcqJQx;lbmM9vuJKSKiQeF20k!Tau59 znCO&Z?>@TM;>VxXG8C=D+82Y#p8t118%IslKXIEYOI%r7v2C<|6t6Ghxc(e7?tpmK z{#Fp6v))WfN_y?Z=m?V!qzP)QscaaF*4-e7&?<4@Cnys9nBg>u!4;Q6>7{N04W;YQ ztvX3%4#Nc-PV9{*pl(v+E^}y*z5&Pb>Xzyp_bOBL*xN=n^XUqm`=b5%{k*Mwi`_+{ z$1lH~h>oH8+A(urIuu9~S5N)*XV=Df>2uc?DoP*P-jcmkdVpUYU$~w>WXD#>hW%Wh z@Y?^iYn(SwTW@p9SmQt>4zwd;tbT9-pnUyUTDo^Rs8;d52>F;kX@3l8*g)fs=zo@% 
z?~Z}vJbSK>%3wk`Edn;nEPyO5a3MFN5(}7N(AL<0?a#PZfiCY$YWN>T1knyd-A* zBjsK(v!80MRi zSm%{~A{{X$-aRTLG#1(h)Fkz4q=cG|@0~fGM{-%H`nWFl>+n76rS;SmO%%9EE`b^%{HPM8 z`3#H@Q-itWPk&0tQ0$)e>!wpd>F%2%xt1yynJ2Hjc=6&loOuc8aEK-AGmFkHR3LTc zqlJ}XeV)dSxqf2qyWp(Ru|a-QU}MNp;rsn;gDl4>)>ZNeo|9r!u+r-$fuMVvyr*@7 zVk1O9ZmB8Kp8a`mUB9kEO(wT6iODsT)DvkKQ)Ri$`tx1lf}9SnJ$U3 zChwX$PyD=+=4P(&Gvd33M%Ggd6?3%m*r~g!5hAyBxtWU52a}DaLh)zjsnC<0n4Dn_ zcxcwpl34q(4<%)&CcKQGI?j2HNBHk;OfHAILKQH%^f_gYKyvc9x8AbnXN6D6Ld+#K zK$R$o;qtSVZRWOoVt&C)$EP^;&X3x(ju6N>8!c^SMDoCpL2^9S$>PN~C*O@?9ZvNZ7vvp! zf4Uz6ECX4AnhhLq{=8G}HwOA6goq$%E6k-^+jxX;4kQr_zw;hV78xMZ4+y&YN`_!; z7?Z3?nnk*uaR`W-;s%To2f9N%ZX6c*5zk_lk^Wic1zxBGau@&cdd7BHe(URl+S-W>{Gm2dMz;XcJbrVCBIhB?~` zP_4uVkH~;Rq6I}+;2@-CB20X z?~>-x>)E<^c-^bW(}y-D2A>9kNP*j8!MZr6&Q7XXAth#Xgr-q*pY;zFts7u8?+QS-7ck7NLi5wrEdw(a&Y$^wc6#j8;3W( zP0Z**!*={joKi#sOCV+1epo=m{mxw_tFn1~ao zg~_c(E~1Br^&EtlX_k?Lu_JLC{0nH+ivym2@0UEwa+m4Or5^L9Xg$JqBUF@H%C=9s zF~utTADTO$d-dyFkMb>g`e29h0IBw=h~*Ye^{SBQLWI7gSXYLr{f3=fLxg}WGT5_q zmP(e(q8P72jXG;ei3wug@&!2Iu-O-78C>5=W9T16{=FIbldaXhyj?ZDV>4FiQBpEb zeW+)_2qHL)K*nxN983;^onSFeV@2*~7}fWgG89~H%GfQpq-#`V!YZG#2!~yq%rymA zTQe=6b_t#^Zsa?uw;{d;&sI#%$4%1umIgWdf7(R8&YAQ4vrn)&AUhk7hVOx|6w*y3 z=wyN7u8B!cX&>p0b~Jl}eqLjlX3S1+k&ZeUWj+}Vf_;U->w|&$yllTiPAl_mE)>VS zdw6j3iRZ{7n8W>cs~#5icOw))Rg5dkq7bSHUrpg+6%YoS=MX1MA&kh~M>;6{Y%(XT z;eQJ+ZKWnOM$5*tV?`T##6>w;TpYnMsYquLAbZAlF2rM>N19|qx*j4d3#eeUUB{9s zurPF)1rU3{xh#|Kkx;pvUEw{*V9Y#sVa;ZgFT3EIMOgKT(Pw`x8#yRG9$al%DiZ%f z?CG&Cm%JRO{xthRT)R#KwbAEYYASfpC+y^C!|#G$FcBZu;S)UlGT?FTG5sOGlH0cO z`NDY)u;PBht|F=k`4Y5hS3MaiC(iRBPIIbzm)&$*&Z6p~_0n|r1Mq?@HxvVtT0~>3G&G-+XO1^FF>m&S1;sx6{-}R^oQC>gb2M(@oS4V$VTyU# zfOAh|tuCNWK~Cbce0Z05sjb2j>xeH8A+Qpvcv*Yyii$$_BA>DAnz-L+f;HqQv^)4- zb5!L{5JuTyfQb^YktL3UR90#7;q>bx4w+>2xi7u{&hdXWO!@8kx{#cH@;*QFzNqq5 z2m1qA`&~C%S>;tCeZFyvetlNU%fgaec>y4^Yk<~-n4lsCO6TS=`$#B8Z* zrOdhrrz7jeLU;*BCiV||e*7S1T(2I{f>M<|BnltK`0Ni%_i~dEj*5T9#*WIO=sUT7 zJ8#&Zzi^JQS4@BaX#CINDM;BO9L(8_lcsOHTNlC(7a`TldS1(CO@QI88md?-|2Gsr!eZ##j)C>I&DAOf78| 
zbFt?Dg>Y;I!vOW{#3{2gwtq@AKtvVS(bv)Sw1xB5Z#dj}gAQUo0&gUPB^z*cC{u@HiizzdkhlOWg9=UW|H_+*(O+2yZs zrsNhJgO(5@l;_3j#bCSJNma+ADtaRI?J5mgFTS7XEYTinEhDY9d2Yw>X&YHdQ2;H2 zTED$7=u)vt?(zQPuU_xQOYH}TrtFTqc!1enZC4|{{;Bh0I$U*aI8e#1;?GUAu}ao? z5*1dwsh#FfH;|QT`8@#Rah|Tva+xeIeoCZ* z0(EpCYyTos=kdjEf94N@?(e7ZAF&K6a=XF>-FpT`5O%>c3Qz$SdhX#FW&Agq77yE3 zL0Wa|AslYZE!Sr#SdJ5s8)|Z5zE@u5B?pF9AG% zGxFH%%(r%DfDm@#k*D;-_m^v(#QO6D!Z{AZ?xHASXHU!0p&Id9cjV^JKx8g#2z*t% z`+`o1^eSnrh)Ol~S*`TWM>G~S&DH&&75U?%Oz&i8-TdzN~+CWy~Y((S_x3?;0 zs%>hbkL6Y19nQzR)b;^(lKts?PecFgZdeH+4I^Xg2AlCQV@pIaP`i)3}|L zjK$eYU+kY!HX7I5k1F=Vrs1 zC}unl;C|c2iXYlJEb~0X)u4V&4crt@r!(4$4YrfhGY>fDG6Pm(Zb+Ve@x8Xz0`IEw zlEIFN3gD!_gwX%3W!;7iCTGFIReAuPBhLL5MiN#`=r0KiFnr9(b_n>8qXjm39#O2O)pm~++zih+=pnmP=$AT@qwq!b^@Jy5!8;#lz_Ym=3($e$! zier%zQXFkdUGzcRsqFkzss|6eUbYodk~V7}P8Qo(dBrQoDT7i3>}oXuH$ zlU{uAp7b~3b&WrtKU(zxoUdV}G_2T#=*n=dVsLo8Benl)AFt_+C~P@FjfGZlCl9g?-otL{%iA6BGt6!O1lsbRV~ z1j>d^Q6<84n~*u?S?8T1Wao#4X)3faG)-?Fs{Me^uneu6^kLUF=1l;(c;HAd&;eUw zXCwn62W@%{K8iWIB??7Uv+qpYnVhDd&2qcr#vbY;1q0h0ZB45YmO*;uwKO9G3*XWyXjdpZ&{ZLl+rl3H}mo52* zQ2eYIAPok>br>_OK3iVc-c-fTr}Uw3>gl=PCEYGcW%p0t1*JJWbHM1*(TOC`WJ8xY zw}=OYX0^vLp}Pm21V5VwcOKM(hdZJCq_+}_$uZ%;2ty8mD*jR>0mIS>vH6 zM-qXP7<3(>5onk;eTq{<0RsY_BZbxmDw})*r`vmbmwv1|?Cit^Wdd5J!GgCs(U`qL zm`40|ndJ^2F3%qSSH1ZEu@9uL>lGCh#kS+;FG@Wz6wji3@8t&Tt+7`dA2(C5mllUN zo`t+0qVQ(wUeh>?XO0%Ue|6|`$$HZXE#>t8E1`Ga zkEXgPvqNAWa+&PP=8Xa97j~A+Atlghf55icw^^V+5I0v=Y=~j>wN-XBzyv7;orLrP zR*L)1Ge1ydTNM$GW3`n<6OV z?$f(w8mWC>>~JTRwe_Fhjf@a~JSmGBc6uZOSfcZ^w1MNa7lQm=1d;9or5-Jz#@MV$ zv)z#l0GD>=SA{j-LzD3Pvi1Tu`?mfg_%j=K|CY<}tZnT{O7EnuT(T|jRwb<}(+s4l zk`m@^{lR$1r1QG!m>UY+(FKYCbBp+8&$k?@( zrU8n#-Ad$;hTURql@6K5O z<{5jLEShDH+g)@XFG%-qoqystiCf z*Dt(Y7qth#6N#vp-}>)!rob!^Jw5*ByyXz`v1BhNM(h_77Vpy^bM+_;MWx^}(H>l0 zF0em23`mr>mOSfneDMMWdIA!s*%G}7Lp{{b(itjIc zrYTwJ1Q)c&pw8-X-op`CWiQSs6Nk8h87~1-F>OBOiz}lAN z-kWqj>O6idD`v+POEasrOE-TfdnW@-W8atNW=n;dblu|-HLR8Ms&x^K5F0D_{Ta~B z1Za>)`n7s^sVH4;r3|CYhRayjNA2jbpN_(_J+R#$D}-p1%|BHDMFoXo-yQBBW_47d 
zg4w*_8`{jtm~$p_@2%thv&3lB{o%*QwD?5v%aW^SI8%m^sk6iMhA5L;a}G4*{? zD9L)MRZ`(S>uRb0XVZ@%5p`d5cy7rJw?+dPB}*UDIiG3Vo%*9`^1`s+Ui$|5#IcBI zT-fF$P1qS=-}3a*n8!j4?ljaetzJ894uw!lv3CAoohjdjD6iyIq^71nAEpQWvU{Y& z>TdK94I891nG5|s8{2cFh%4rx%i_&SN@172r04*NiFrTmM5t4j2`xveI>8OOnNq;s|lqqn*Hg%fLVa5R6q!wG0FVy78`uL&L#N zwI$n|+mtWwP86~3rCv4fY|}WDweNeUZE89ysIP&UejER>d-tfsjT*f*rtSi@5LMHd zw=BEVG*z{Cyse|r1fX)X)=5O274CN#Tvfn4mhbPa=rYbv;{iV~eqRKG^(ui+2jMSL zt`DwW0|qMds=ij4S2JjksBXP{d0o}VIj^)-9Y8%k@A&}x znf$AS6&=u$Ir>Bth{fl!X2|7&7Yd&qts_3<>B{p=Z$KLh=$Y1D^Z+`+CGWveD}9{)o9jC2py2=esp23P@Dz6qnJ>3+OPn zrh~d|!hem!>rdJyjdLu@ou`{>tM+s+u~J;1m$d1;1a9w)dWytdfY$|a-fp13eMDzV zhpO324&@cN9&!{^KqP^vI9PAdDRD@%%~1THmO2zwV4!N#8oc~>S`E8NfSx)3<(*1W z(?EU1jpmu4+NuTmZ%*Iz^6K23$qp5Vidn#TjY-eEw!v7955E8%-{XdyWkv^GGflgj z3O1Rp@f_G^N~qOK65VyDYD*fK7C_GAXtzX%~wGL?$U0NDvd$!mb_F}L&oFb(=pCWo}Gq05Y=)-IR z8;DQZ>@bINJtV8?mFDX?%BAPmY1vmvPt-6M4q}e%4-5787?))6C@mu<0TfW&3|4X* z9yXPA=;fK1V6?FS7!ldH6w%lvAuQ${Y9yP%D9%qKOWrb2-4Q%AM3 zTk2}gz4oyr&lhtz+(GnijC+p3-m}Thul$!=ccg4(Z%4FY4AbS3=jRjXkiZ}ff=I}#Rl=W5 zh%9iu%G$#7!FI(>6wa@Mdn@>}tgP7oK)l>}MqkHNMYM z+d)K)h6ApPAB>^=VGoi9D~zFOyR5#XR$Y!5ek+1&By46iYx$r0eKSx?BHvQ|zMRXZ z3x}}XGi5q1v(9$;Z~<9W0?ka&UW*Or5^o3+qA8UOK6g*$GbrX|{%4jeFMO~|F^VaG>aG$M zBi9Ni&k$+L<+)8qg5r3c13T(vM{lgWzpt!=G2cx`ca8oX$c{sc*8O1L^^l$U_VQZ? zG%ehHiZ(oxsPF*oCO`O}SmNj|5~it;*v2^9{q59`C)PRvI#EFZM*xP_qgTU0<! 
zWC-wJXRG3@!*Y?$O_5wV&l55~F@(Vq^8yQR;b(Ka9>FkT0kdGq-5^I`&DF0$O_ z3w-^y`JqK(F(NfvH>jJ=^95e?RVxhRu~8q9Px(PaB=Jg%dnKVng5&O!&MnItx7N4% zdFk>_7ag*$$CQ)=0}VsK15`dgb$z((vxy@mu=l%b*IqGTz?$Y(B)+exmz#NqI+Uu+%3B!QscWn*9MTKe%@>des9>P9H0V>?|2tfors@5VH+48Z_Mqf) zGL!}*I;&(f#xD~#UE}Re1~6n~V0`w=*RPjAK&1#$mkgBtM8u@Y>dIuHu}9r8UMlcN zZ(9Ey+&|0%0OOzzqptPGZaa$Do@)o1z%Ih zI$g=Z19s+keeJLyK#FB5tB*g8h0fNQfJ(D@KV%cZ*~p*v*Ca3icicR)e|5DhLY~T` z;-xfwz>$fybsKjydUj2s(02R1X1EJ%eSQ6LtuJ%H+L}y5!$4qfmdb0vGbdD=MU4We zNXiS4k~i!hWM_`$UKO*Jol4315;mn<5L*Sn~G(up~LB(__1`54gu{F9!i}RU0B-)LXm_d{3J)EH{9b^Y=4`xH< z+Zi&p>Ztz6-`c8TXhPDUe8IAYbNlv9TW(GngPRJv$C{5rXzWBq<&oVY=}ZZR&@*;t z5^u zPR_@^l)^OrFgT1=hrDTw1fX=zgT2I>nF3XPg?e;5+g`?{LlZcOuoSbur zuZDTK^RsE(a$*-dwbRRM!U%TGdiRnvOX$7lj;?cciqkDoixSMQU#^etj3Q1hSQ95k zkIRl~!+|ljyj~Fn2re+2+trsIpyojrJ2s;hs)UjZE|C_kNpDCPF|K{AUsKdG31Eho ze!0N1>s{Hh`_|gDJy6hlwmGr)xW@h}@Z*QPif{qy z+*NdYLiV-6$M*Hu;+N|e>$h>zZe3!a-V~R~_C;DfF2HxzVsnq&yTmjaPY%{i%-F<} z#mOfS>-jMg;~pd-eVP=wQ#spo@Z)h+f6se!G!kKfu2Xp+8pocVzjdVQ+Ve-C51menS*b_ImSj%5s7cU5kD_ z>sD>$E|Y*|$psRNiu!|894ZSvi-FZUP@AgXULT1EXk$s+5Wjn4#4Z_`X+`+l zGV4A-VpQw@;YIT7;76p{^v%@qsQA9Ud@Z_7C1A4h`T7e&!>v}sl&E*Iilt1>6UXKv z$3ossdT*&OFy7OAd2)>GN#rHdG0Rt7NasDCx9eH>s`9T}fC>qRy%$F;NKgV z%bafrKC=PxdezaP$Ljc`M&;qV`Ptym4cNXwPRA9>#}kviU5fc1D@Ik4z6iwg>y*QW zrBJ#lsL%ZQk6F^)cIMxCtyv88r5~aL`wgGWAID;kW62+veSCfv9V9GfR#bCqT~6)f zQ6?zGZNZY_i~B#{k$G7!awUr%iR2;gA18aHi)9Cf=AR~g&Zu^5d)mO?D_V&MF)BHC(=F!VtL zD(QG50h=zvLQeI&{0mdu*}*L|j5QEqseQsT`|Q5KfwHZcyFIJC z5%U433jKQMz1s4ewV1=>)Ovd@Yc412mv$C5jEyOwiM>9Y+`MN6r~|F)i2l{l)9;yX z3%<*Lt7X!q_J&Z8_1r82^-Q97yh;0DeoQttFx&@;xAgLUnROqEKB^=?Ac#(gOe0*I^b;1wY-4!uMtQ*Y(7rZZhzxC1pm)a>L34d!NCA5e}2(`CjJ4^C$)H%~)P-`&7{R zTYS*tr$(7dnjB%r!EH-dGn5<-lqofB2EP`2E*phyL`s+LtRF;#^98=0`=B{5-p&}P z;4))aoagWeYVO3XIPJyCUyBQ!A|)4YuOe~W3XpL_CSUrw+%EL+cn^foMs5eDIua?k z8jp`TK$TbxBZkaO)Vmyle53)@z|ezPODt3jTen*G!=lP#=ERiE{7POq$FYWag$t$> zfs<5(ZHG#BZN*-@YT38<*r!p2wBM(JT1UGH_gwAKD|(XO3Ah(^27cw6pZn>c7v89O 
zugKok31@|xd}zp+*}pASyYkUw1g6QfU6NV{|4rXdSQOLfZB4akSsk<9b@^~ms2@s_ zwJbD@K33b~Zr2m?4D$?Z^ZADt$b>**6+0eLMC#=~y7emM@U)YZxt_UCv@*rxfooa~ z&PK_xNbS>WSH8UvQ|*8|ii_VAIlq?f;9jtKoEI(>pWdTi>cYYN`^fxx^+|);{q`jt z9a{a=uT`Y`{Ujw4r(k4arkbBz&!Z(3zg-bfJ-IEDXGQ0piYl#hrOg$mM=$TTk82Ov z%3n9ffA{W{b0n9#R9>U3urBosEeAf;w&qUFw{9x2DxG;|XZLzK z;LvTMojLJpdV1@^I=t4HR=B&@_~V-4xRnr+d{F}C?3`H_aOAOF9QrB&a{h63b^Cy4 z)P;8*lI5iEf7vY3P2BOXp%ic{u5Mp)UF+48vd(Nk%oRTy8eKt*dsbc@37TwkWS}fr z2VMGZpogRBx4&-oU7gn2xhPG(j+_Biqt_7l_*FQpcWB%o5_^f}x`^87X=iRxhA$Uq z-q!pdzP>svs=iy>K*c~(DFpt21Rbq~>ql8h2e-l{bKgsN9Q&w5_K zBgWSPh)gkCXN5{tL4t31_9df>N^8RtQ0alY7i=;ObgrEbAvRu??t19fuLHBlb0OAq zFi94n3;?l=V~^grzwvmC?&+cJWp%YjUp}b0g2$fvM_I2s>c z-#8VGdmX`w$;rt)5gRFiyH@t4%vxci)eKl8(&6*rLZ+nN{jq8c89la3u?EPOp_UIB ztfwGcNlP8$4N+Epy<`fXd*+*!3llcdF_Wb=TiknoTum6)@CM0~7%dLbin|I9nREJ1XAP1ZbYy%Mfz1;(6RP+0bzuy|9()es*iVjZ`mhLW$+Y(A}RJND|7#`OS2_$Zjg@9cMnxSg(c?pLINWuAxwhqH3M>{Iue$QNa)2@w zaOkUMEvWa}gd9e-XYd$R+iqM%Lk}x1Bl1o<_a7qv{}+O*vCtKB=CXqukObBtP-NxQ zCETtq?I4CqNHgvYe@VeJ=``zC@WUBAKrG@oTkqHunC4~qPy*)3XyTDqsrL*dzho@9 z{u8vYM`xU~CB==eMYmt7U%GSW&R#Q?`1Q9YxYr4HqfP=4V_Q&D)~l>wM#6?!CrrqP zCu`ew70}C%Uq(Kf*d?9x6BV2k7*cKR)NlL#WJlb#hr!xv$o;RoLzpq8QPx7n^^tD+ z9s$i=U2GR8+Y5V3ib@;qy_AB-&WU#;c2AR*;4Ba`FU#bHixbzg^43cN)f6v0=E!`- zlW1QOh){b&d8(IWp;ee5!LzsJG<-KfJDv+d~jtxE#-T|QSqVnmJTQoxZV zZU2iG`vH-Z><9JwpL+rq`eFT{R0H2toqjPh(cH+yx?;Td@A13DZ_$xd6xSJRYX<;e zjg-svo2?(~8uy8O?c3D+Ul7+U4Tw`w(_O!YIITI@C^Bd?7j4{S<(8tBm1EiyU>b7c zZt?>5Ux*r4*}EBpdT^XdJ#BMK{9+ho_=8O{C7QY<z*XIxK?c zVFm}aJa*^W_vXK*FzvnHX}mjT^8F5y{3@k5St&ao9c{GsU8fDH*c2tf2NZJmD#c^7 z=GlW%(1hTn?*iode}WA{;_v1gpu3qkWZ>9-8e~{9u)owf*@VBeWU*vf7PjwlcMH^9 zZqW%Wo$;b8-g2;M&GmEl6kytyaqLyh{;Yogcngz*W*uhv(4X@Hr$ZN~&by#PlXnzn zBK=_QuYiWd?1>xQHvZn2B#3g4Ih1Vi`4ER?6+N1GcK!PG4;B$`rzEVZQ!1DHGM245 z^oMGbe-If|KNGkg{Q}o_O1*Seqk>6l}ZWXudM_*!`q3&9-*$ z1g~}qdhpM~qU@oByzti`2DZeuBha>=Bcnf*!|?u555-fJGvr5cted9y>dNb=IzOrr z^N_{WeiY*ieOX(qhu;sQQhzV83%#g2+j!`?Un#z`2%NW`{_B*;)FUf-HS|aH-PKGL 
zj@&OU%VB{7NzOyDYMG-V%nk8q%iB_Dk^Teb3w;Kmw|%#7*mcB2a#}s^PBj_3^$?Z` zt-NQnaf3PKJCwWgWZW^oX!_~Auz1;>*SmhFyi!tgmqwLDT^0#IXA8vC@bpUzgc7eN z8Z*qxlb5v|aTWKJ?ClV63}egC+~eg#??rokEsYamP0sPv5!jY71HWc!!F6~FeLn9? zf3U=qk$BfNT>T0qBB!n>!dNOvW<=!X9U>yFf>G-gL>H2M3oN(Bb@F=fG3DUgT%sJO ze&N<+T~f>-Y2GG8mzsw9w~3AN7850bvB#v9u}k~CG2ZSr8anFL7)5$BbEx@sxDqw> z!-eJm*+_8ePebG(K-x(5>K8L~p=>y?28{q*PT}ioYtO-KCr^6k4v{8T5~>PRzMcPp zygK)HhwBXGq~)46b=n!hT*ppdCeyK=K+0?*$+ISAoiu0-s~B&R*mZUPyxfyJbjN>= zc0YN!AC z;c$VNh-e>JkS@(BV9~w(P`$XuUG$lDl$SW~J^s62-o0dI>DKU3xX%B`lI23+A-<#I z^y$m~C|_$S#*9xg{i|XVzNTYQ1@|WY$x?Xq>ILKXH^LRrZL-v2^i`(;=KB0eFTVygny0yRq|f0Boj;Ss*@TVEokm7ZcH0z7v)Sug=O0W% zS^NF+u0CU-{;Rsg$hS}u`-rF3tYIEMHJXaLvb41HK*HZVS?y1$q=P4#W#ghAM^xl-abNOuT`A;`;9L36iv|G0sy8$W-?IB;@f>|0k@cU# z3qKF8K7x@fui5T0pl&Ef+GmP4=h?avudo|E$q1w8Lo5HtbJ+|kpG_q3i@N9Pg zCoWc5S=qPq7j&CrB{uEh_l%+Y?MGY&?{?Fc>Nk!*W>grCbo`?Q5cl8{5fBKZS9o-B zvSZ=p&Sq!)uGwZdPyOiR$WjI*ybJl><&8s3IEM5GQS-LMsQR0w0VGy~f*mM8`0l|QW5ez* zAJjWeuRJv>mskif%eX=J1CFdkTjS7he9GY9p!XqI5oa)*j<}$&6SH5EdR*wjdAe8t z`*Xnm&kqO+7*Omt5br!L9=~k6X}g+!`+^YTPVjt$Kkl=&DebeE;C*D_t#876qRm&=V%fxuG-JJa|Mlv3 zI6L=Nyiyl_l9;Luz&NZTu4ja-;B>WEUVknF|KMDHDIS|B6N10-0&VTM2cpm*@=mzI zTN!QX-Eb~3Uqgc}Eb%{G8s`0hB$%yqE8A0^)pQI=Cj|{+0!zeXahIMzoFm zUem}9BN?b|Pr^Sq2R{?T)t|AH=0UreP0@|?uUma3?P}xzJ!XOaAP?x`xf+(q zevg+{v6JQuI1J>>{51TtIh9j7R@>v}uN>326G^iKTe|Z=(2}IJwYA;% zT)VY(bM?w{Vx(A4D9@ylAUMA8_H1-kG|*y*d-C#0@X6wV0{|F);+2rN!`4Xu)%%bI z2oE5|06+9OpjRXf;wdh7%Xg7V87LcTtWl+@JJp9s{UW0RRUaC|UA z*d~sGcp@hnY>0P#P<39zw8%C*4utQT|J58(h*Q0h^ZZ@m=Z8_0TG-8hfTDl8=|3-R z6pZD}jEYx8r1|I78=*3X#lPm-dJYpAGea)DBY7leQmdU}&I;xwmyJnoH{B1iQ4?I+ z-Cq?6=VljAk(Svut?{O^?g0G>D~OW6ssnj@!b<^R;YWnP8TG*X;AXHwa2TWP-JH;v z1=1)9m!PmP_Df<@w~8U!pDRkFRfdk7>l;`<-+re#bocE9TK`eQmJRnTL-|6Z>5^@$ z)8B#6-r;Qbc4RyP{kxN(1BEkobg+teY$eZr z(d!aiVLN#7we8JM4t6)Ju}&HJ(e$Xb!JK?!e0sNf>+rHk z9#*|I<2Pm}>>#PzXS0-YH8$OzY!f^TEI{=h+ixpwQa6a9%E;f}ws58aalQOmI;q>M z)GC6*pr6?@Bixl+QB*%7?zna+~ z(n^V=2)z!`lw)?$;DvJgf8#+*K87O}(qt>i{MuMKK$It_3n@wyugg#LicJCIzLCv0 
z0B&908tryekmLBpiGhAO4jOFMRhl7AMMrmqVb2c`$*FmI3IoAn)|%`71_&KKH7bs1 ziE~0$jU)14(#%~}3{P=|=dP53LU5c)VNPxWpkhoO7#R3Lv1*0avOVw}7v5QPfLwSs z!gd>g!Ypfyx`#qx%J{K!K(d086S-s(wqVM$%SWL@CbZ-}SQSrI#cHSRaNUq5Z3o>5 zQ1v$Y!s^%DdNdvJz%if}3GVyVA#}mf5eN;I}oqvfJL;V z#*4BXwi>9ki=BAM!)tPV%Va@>beXB?(n0eJ5h;X6n&J{%%5%+*?w|MhFKa#14I`<$ z$Rx`;TYQv#BIoi`wFA`t&l`zaj&dzO?j7MO*5Vv+@&FpXn~tmT?(dHzN~&ikfTt>C!zap*^|V z@i$K-Dy0@}4-RkJNA;ds``qjQVTQM+yy&ntcO9~gj79S&aci?&>x6?Kv^_)jJ%#jF z-2>h90kR~?Ij6GS%NqcP#COlGx3;b-A$=$gO6r>WyVgPtOu6Y$r3S5uoPcAyQ)Wlf z$cI}qvJ*nNre9O5CO%c?NVWNfSE+1if*m_xTDWl!xTiftg=ddU8Zj$nJR-Ik z%6vXuUj&>=bsP;%iq93WHZ(#Ox5}MpSjiJ?^6?Yh?A}}~-1&z%-$h_Qq;l4;5mUSw z1uu_Cvrd6P;I20vrLh+&?0^;4ay$-El!AlnZjb5Y@BnNT6Q;crAN#vsAFn?VgyIe% zrfr$e-K{r3$}NJ8_Idoo1&v_{1YB*gd!e<`uPC9{w@?ItjX%bQax$m;%1 zJKTR)jH9?E_cx!;l+jiJ7XfJi*@zMwd)>fqi7g2#Wp{0-2FM9hX^ z%p@&=Ig!Xa#H(s1hdGEk${}FPy)@m@>R?1 z2tvtrM*T+2JxtH^T*r+$JfPvN6S`@eTJ~k3GP#?Q;IXfcgCh)~fme=ZBw`#~9|c_j zx^-i&kR+m#p0+cBH@?}3qk~hwXjSgIc!b~1E+8BQwVdA`d&KB6of)i?Ys_)j@9ZD< zo4f@W&{xY55Depzg#|(ecQ$-w6GBlwS19DbygSQSivLqO{m&2ZC;oQ!;jC~d8IkOz zgMRPuUU5?74m3n4GR%i28g`51@scxLIM76s98L^Vu?caKtIB`uLJ*qs-J6`vXY31# z`$I#d%%jp=-A}%2+upqduy98pzw~c!*w_>D*LFeA3UsamioE9VMm@vSLx%9l6wrV} zINxo~`s-EbqV{zz#U;q?7Gke%5-h{lRVeX36f2llvsjPVw(6K@2qAH%9T0ECE+zu$SolAgk4PEiH>@`Rggbf=_~wL{LAXa<+4WHC`ClOLL=dq zSV=3!@QS#piQ~GQ&)^S2xH9R$yDq7jRCx{gbq^Hj*NWIC%0*duBSk1wUE=PZU7KxsOd_JRT zDqD|8){Cpy1GnO?faVWMrq_13SWY*zGBU(HX+p@-6eRCV)>zoM+J~Vo94G5jcFgr# zdMIx>&U}yAF(3R)uCO&x6ZdoBXAq<;WNWS>X(4Mt4%d7-S?^8WzB8xY2&=UJfWY^b zU=FzPx2k77XvxXR4a!NtFNUUj(~uZzDeOA9;`hY9girh~eqIWPbAvBc`)^tUI;S@Z zFY^qdrhkeeVU?MlcHfuLcANC&m?OUtC5V$ zw{+Hg0cy_hoNLEp;PeYay%4LrMx9t>od($x8l9p1(0Jr7UQJ* zXZce^4qGi;3@sWwc2Ija30%ork`DM5EIfn!Yf`k$=jU5d{(q`OZVAvh#V-@b4(?=h zS%GT`Irgerg6gGs6|3`$i)x~a>pu(Lc}y7pU_rU~g?%T2p#y!up^*OeOAGjT-z56$#wT`M2BN zGa{nseeKP($Ij;58`;`-UimMk>H{k~?%pz^a_JP_v7BGNGLNoYzOnNZ^O;o1q(lUdc#fxTT%IJLFTrS&;|Z5Q=xDO#ID{SjM&>wAFbjp$yM6O6FE1R>MK#hLqjtRumkEP 
zHP=nbS1Ypg=YD`8=U}nUnSb-|0U?ysg(#iRve~PKz`|!g@jUopc2f)m@WJF>ugv#un#h5?d^!WE87fZ`<} z;1S2Yib)IYN1v)ERNUrXsj((Mm_XR0kYT*#ROTKD>ZpjvezufWY3SV_<=~5}YH)nd zTK`{#B8>MS?NCY*a6S`6UH&ps7M-(Mlvy=7(rzK^pn_n4 z*5%_9vt|M^6lnTdf{x?xuHzbv#=rQYm>{;ho&_J@R(_@L-1+AQ%zAF|IG_+-ScG$K(qlklx~AB~nmb_+r^{)nQdc+57HU zVfnKOrWlgVYr=86H>9Q^hxWdUL7%j`fQ@*~fXsg2q32xpUKb$;3Bl!Q=O2&W&~zQh zmuRKpIcdUXGgdz~>6ILIH`Tu1MV?CbB)G=P%wN>+bd0;50&deN7@E>?;zx`z7pMrs zD6{pHWK)`{2tU$lc^xUo>#bidc+dK^=KU}D&l^Lw$K{xXyA5*nuA$ zra+qgvG*E4isYDxb)>&kx_>k1O9qC5Q5=^Tl@W5CHT;Ihf%X>WYN)-dO7%qfm=~oK zc&YwL%%8*sr}qDNYPW9tD2cz_+#Od`y<@$tGt+j(#xZ4oS8j|e7v9kW16MdX z)d_*^!qg9GEzIr1mF`uhx8r^{I)P=Hl_xv~sd8@_P=iBba#^Ax4(IfLg7@}{Z|1KV zy-&H7)FdrD=8l%}t28|xIIeVU<{bw=>3Y{mA8v6=nazpV(sd5MF03) zI7*`X(N$??Ay7Gv|KK(|eebv}4e0*xH)cN8qWaHh-po5}n*l?b2Qq=A{m^t0H5v}P zw{3=9sh2-^B3K$J>uz7BMg`ckQ|b)axX3doi?JgQua?#x*z0jY-tYRdX%^_5b;2-; zJLr>HHFfnzxBAul0ryYv@ThKjcQk=ce>aV7svU2_rttwWpmlgZjY|l?{glHoBrWOB zoe1GJ%A~r3dxc2OaO^AkTQQQd&~*)E6Ux#hhaqex9l&vc6kFc#`v+aol6peW=tqgMXHasD

A6GT>AP%!3j+FQGJY`>r)=i2pRLR4D_i?O5q{>%Iv+Xd$zhF#Ai7=93 z5(T?JVdQBO-On(bKMU)JxTvX~x{A8Eu__@6glY>{8xD#2>O85BtL~|M#v1?>Mhtq^qEy zSTHI>-bR^fcmVW)UL4&)HWGbL-dLyc z&jtM?=cN?%r!k;j@XZ9fX-k)iUIJ_C7OwCCAXt393IOy0OllZo1+|H)YGz{Ml>qK; zOtpd62LmrB6l~aI@@dcg<9@#|ePb^rOI*MdTgt;3%_7?FnYduo-|uil0)42WLzc9W zs_7U*!SfQG6dL#^!x9qiBIT6TP4jc?%2|FVagJDSw{jj!eqUZyrXm+S=U2MoyBUMw ze{c2agN&sZkTml*Itr@w0Kz)i6!1aSs>ZwR$NdmuznmSJZ7#{OiLhWLOQ42^5g^W` zilhZTEZZecb?JY-LDjZGywTr+h;LF}&Fw!Ol3V;7E7AYdTkz&*gHUzEoD&aCe;Xp% zWC`v#WJ3+IkBk^d{BUi*AFJT+5rB0^r=_LUoYXq3Z*38LGATNQ;r8*bvzzGU_Wa<# z^-W(f~)o;z?}sGazcBPgHCf{k)QGf zml5qZ5QR=+mzn)p|TH_5g)tnvWOn!a3r5taci9rOv^bk?*sA>C9hk+V=;Ba_w)`VEB9JH$h7Z zHpfZqc+s;KrOsyjs9 z2>;)EhJWiNK)(K!dKQ*m6Hj|C11L~ply$vjl~P=n`O*^-s}7KSf7_RAU=gvzI77<5 zGk9CWQpkca#;;T^EK<+^U-bmE6I21^-edc|+u9!jbB#5(g=-&<9=Z$HEuvl5AU5wGIe9@$xxhNYK_=}4sM_xEiomaF_7jg*o| z#4{t+s5};W3j1RkfP7KY9k^xk>*2u zH*Qij=6{YNiCrF=0p~KXi~Zyx7lLx9zK0*dXj>a~&|zh7Epg5~*^XJN=>n6+2A^1} z`NtBJU(&n+E2cLumP?kT#o%{S*$Y}I?@_12i0zww~;67^M8RkN&f zeO?bMchE5I{&PGqN5K>=(f3=S3+RBz^hnN(y#V6<UEYHo!587vT0a(Dj0IA|IX0qio%2EZ*cgeWza7GTW*U+D-8lhhTW(ihw_Jp0ndcQ}{Zr zF&-Tz+s*%yU0S=<_UkQ&STU?WiyVe2?^!Z1e%iZoHjBi*1l*^|p_oLYvk1@yeL_ zygx8RVf)d(B(aTx6sTmLJ%YolF$So#WbV*zXz|755%Kr$Rg2!w$oQXIxqAiql!!S? 
zeLl`rQShIB`*Z}?x6THLJ)~}&Y>$d=RX*Z=+J}PzD3DW<_S5_7#=0$r2_ba;LG-vi z#XEQJlF5^1dOd}dzY|22W)&28Umrj96LOqU@&w166{q@8eDmEk$@DrP459Ol><&|I zen8z|b(k2eEs(d#5N>MwI$5PMb=gP)-q` zgt;nWYlqO$CKr9?k{)Ke8G&Ij&Z_f&q-_7a&;b4(Uo$^$|E8mpMY^98v}e1)X36)3 z&DxVMnvEJ3#XxNn(Y7p3hZ>0T?S#pW>|m`snskHCzPoBEcYq{qA8`L+E6 zs|#&iTA)|NFDfWs^W56LP_BgToebBWofG>!iXd4EC=Pr$PAq07Sc&*PiES2!d*LXV zMF|_PYbk?qGHZT8!uqGu;z49)vIp_mpo{F|w zLT^MZ7XG2k;G^jntOLG%<)!*j^m@Mitq5ZAX4PTJw9omx`oFdw`6m0CqzYCvrnzS{ z*+3w_QQn02jS^(nBxQQYFoP6sP6Eu#wxn%`0 zBnVryny={P&SXCRaHWux_57lsU4sR|+{D2oD?5YFxuB+}MSLb$w3p^z(g|RIL}}Q-Ag?I)q6}PIq)V)&2UFhnT{3J__nn^H24xi%b2xku+bu$o%zuuX{y3 zgXaCC71uDJ!V3$fv`<|Y9GLsQ*?#^PZASl|bme)9%4ji~6w3*V3XiHg9w&doheXx)G3=cSRU`e&KzX2D^Lj=i%?6ITg8$ z;bNgj@?`ZY!68CQ1RZFZ%g%;)eT#MS7z$nG^nk_J+*XZAGzU+Ul9R(aM~+Y_=~k3I z!UPnDwE`WZ%h5{+cr*u1llo1FM)#?k=&K(K*K7Hru9-eCscP~OpPg+NMK7kz63S?) z3M7~<1_(!BfP-__NTXC!i9<`*3K{5|lAP|PfEEm&IGqoyUny~3iNO2N#%afyK6z;+ zgr-F^(lxZ3xT1|)qO<>+UhuL6HsBqOfllOmdgz5ozRe$u_c8s{i&P5ie8Gf2+6K2- zC`0HsflbH2TsuG>b~T33Enb}Z!v_aRDJ*UPPTJXZzV}1OdQYLz0OkB^cJ{*+Gpcg8 z@mG!b248^PQg3r~c1FcUaA)Okx-Is*U!4AaQovY;n=5xXV-%fuM;JNl^ZXpqW&QZD zZe<(>fq#d`{KU5aW@o8u(o3gH6uS@#VZGSv+8OI(Jd{lfJsS|d%M?AMx8P03dX4AD zWk&gF!J*yo30w=$;vhEqV1XExe6iaE>KFUii}KobFzNM1We5kBHP6qJ&;t;D%|ZB+ zvn?B`-mYJJ=?6Aw8g?LHA56gBY8gdk5p~|*xLn!#c80{XkjR$|y0lR{_Y)n{CLzj>`~%RUDx=hSOu2$s~c@rmJ7 z!k3p!5CS^I2cj*`xO{wDf0H*qApc;A1b3xa!;ZfU5CfY>(_Ynval_EP@hXqg@&>V0 zz192VWq{d?kFWJ!Z!Q@eJ7aS;HNLZqyRn1v+x4a2_1^XUf$uye4?Y%H6L4Z%^}CpE zLJNS-28lyuwDY|h5mS{w$G_qKYy-sGQ22f8sp36GKbO(}OSHr-04g45Psocc=Yuee z$1*(xYya|6lt|0;VY9a(rEsExX{T{N(&0Cwh{pRWzjBmniwd;x;J8|6l%i;@mGTFV zNdfQ}00?2)>w{WIMzZOu=-B^8WnvTv-|hNR+!DQl(C8|(q13UQdLFce@$FM=5C~1O z9-K)Vd(~;zw%x=!yd1vtFENryW3*=1#j@+_1F{b?g`+g1hMb2Nii_6SkBTA|{uUz9 zjTY!U8F8@wki;tG|5NeG?tk>E$-6ge-bKT~lsEN@tbqBk>{obrnDhC&F)c-ceFl>D zLjr`h#AQ--jcBb(2QQ~*%0iQB)k29%oDj`6rX1||`q9KDm8Uf<<=0v%JDulmxGdVG zX2~3Ex)AqzZCKoDiTu-zD(XvNAROU!uLPVFPeE)_31h02MUDxVYIGwXhih#~h 
z4(QeJ8{1D`)-F?@*>gTgyKyKHZZ^DpdKLZz;__IsK^`clm32Nio75ODHsPv~tNWew zOL=Lp%ts4Hy&B)%x*B*Dy3z|-aK`UAPokUe_9UnABe~nk)k)sw7?W%6XavB;<{{YNd0Ov=q>i|UAD$xtk zWVBOLVrFVBK8Janpl)n*xPExoX!K z&l5RW2~VDvjcdgh6ckuqVxT;8ocoqyUM^CEF}OfnyIkglzPIINdIL-kisBe)5kq2k zJZsRa$Yo*Y#uT{ED0paW?5=c)9)PXT&BKA=+nVDs#Np8}s~Rq-?~>esOp34DL3zfd zLpmBbpmG`-9>Tjb4aOUDB%IA4VNrl7U2pxuvGBnm==9JSm^=dJ5wmE%;hUc|zMgi1 zp6p7sT7=Nc<_J6l2LU(^bbcJvysDmNy$1}R1GbzGal3eI(*wzK0zw^*d#^59C~HpV ze9`R?pWDvpT*zXqDpy`Xf#@KN>b=C#yzIoHMR zS2y;=9s?S6p#yY_YT^G*6kw9>gYL+xXg`+gzrwr!-k9sFcyNmQNJl8M%;I~965k^V z_*Aj^H>cnNCjOg+=Gt97Omb|EN_@DDETVmRipnRw|CPrG=!soLqrDVCM6v*hR%xH0 zk}K2j>g3b0Ul#lB0@|d7G|fL9byQ|vo#|As{y;DddFD+J^Rgk!oYG`kiXc3kp>Q?x zU(5i?H{c@zO>pA=g)Q=n9>>Ubf#y5QvDXvOi+*7#*8rdfRR zKN|3hTLJVW1lIfny19l0S_Qgl{_(^1FfEhs*yta4$}&{@MsRYr0zM4Y`hCkT8d+%A-fV zT6Z95a+y!!8@A65msFpZr=ViYO*ljak*8N*L1#CA?7t~XJ2?Qd1~LrnMzTmLp5R>z zuw2M*QYME}9aXX!bp4K6A%gUkw1UM@jpr<1P;#dn<>kO%|sCN%)WHME1 z)EZ2{2i*C~)4}xG+AQi>6IVouuuLY7oGvmRYQW-aGcViFzm!cgNPM8^82|IO!!P5q z*{?71qwv7@_dZVxSPuk)hFAC; z*Az=Sa{`2&nufZ|+1}CvZwwDmy-|xM7?bRgmtmCs_bL4Y+y)FJozbXoO+1kzujd)y z*ezs*8|tog)3+1nQl2}Lm2Rwc9y@BCRf`|5sK00VJ;z>Nlor8NbM{#Ke@AO^s?iW) zro0|2{%1j=W+H3^maI#Q-{mD42`av;Zd=AobZb7IvsIMudj;i?%l?vDY4eUO9KBia zDTZuvIO8#2U5C7fRx0|Bejc`ZmWf(G`AP6bc(?F;eqN$kq6eTHa#<W0Z3_@n05C%9R zrs6hZD_gc<3E7y#ClZxK! 
zaPscQWXy7LWOmQ9<-8oVzOR(a zd8hDqCgNXMoOhmiE$?@AetSHW*paQIl`Z(q$|X)tn?gSPHbVfxuZe)2jgk{TZ&mA# z40_U<`i)Wr&x*Q1-FsnyrO*`8(>h^BK1RMJ1-U1$$f_3wXK|27?p(kHZKw9MNS=hi z;paKBqw28YXk18~85RBoE)t_furv(eyEfWQnKoIlQ!Unbjw?%OMx5oPuTvW*R5g|I zT;_OuY%IW@k(8i_5ND6kqYHp3LkV;aoc!fpCw!z=ubqMxl}09_vkfl29L3pJ_8{@T zF!;reFH>cdVQHY(f@-g-#Knvou%QyUTw?>zZTvn1kh)|;a`D|Bbm6NL z9}}&zmNFfV`qQ$eq`(^|)sg7UsrbaJb$4N&$tcy?&lr8Iy&JL~H z^}m{SLpY((2l*rXtHU_g$u>w@-v(89VzbsI_gh2~4P9p1LN=AzN<-`7^Ag=q1N*a` zO}$GWxR(JF)=$Y za9xK8HlY7hp7$0@ern+?p!dZ?S*i{rmfs={8bj84Ak96t$OB zwkgY)Ym?N2qTk9);$^Uq`CxziEzFZ8BVFY6&Ye`M&opKn*0aZ;E1yts0+9a`5OvSVLyYBT&t>z>u<}qpTB>Xp{~Ndwlfos;Jsbw z&lvOE^_~ZmRoCre8trJ?nB~4a$+8V=_%0!RB2h`1aMBVh`E;eo@LZT%a5EnMnH_c( zf)514&iOH8`Jh%C7^cU>zoZ8FdzC!fsWKc*^_=T%z+fcDu;_V|tzUx+FYec> zI;ID9c8|_`Im6xEqmiX)@WpDuE~vd_5y}qKL=r0_A)aBb1}os^MG6F(!)e`@vj>*@ ze1Qm9_5CX^>u~G0>3o#0bZt7LTPY(2p#?AoO z7CYQe5atFjmRrrE6dunhte4XF^XD_%90L0ka)zNq?^)2V0+u}Po^KLbZiW{3AXw7k zJ@w14OrBU2O~T5LAO)<9z;OB@7`F~u0Qnt&_V-#ra*8&c4vDZ4^qh zmzA*2Czh7KlGb@P0%N_W=@OXFVT!5#MF?tM){Ak)hZ*Toy$Q^G|GvG)nqv_<^yejm zC^k|(DpYty?QWblX)mN48zd5@Jfh4OD+=;0XuLcyeQ=->ypbTX%jW3n!W_Mb+h2-o z{K*xP;CSP5^$hHUeOx!6nBlHcGWyn|^YOzh0{&X)7qq4$P#W$ae z5HFs`#GbO>t4I|%7*R`si_|ll+I1w{!WK>nOUR+&b@uWu=w(?+0pJLFy=8p%<1<q!0zyw%)Z-PAoxk#)N!wN$&iL*&^bHJX zm5mqZF`aD9sh1e~^&dq?E>unL%vXaBsc{I{aoO265_iYKV(Hb5*l(GHe%PykiK3AR zbeV8erBca-)MIC&`z4@joo^;FEzl{P@!V}VIzBehL-R8}n5c%=MbFAMOiJqlGQ-7H zCdDtKVt&O%5Zn{3wyUQ|MBph%8sZF3BeW#22oitQ6xiT<@(!-=!Q047>U+AgkQG8F zGfGx%p5&YD?(V)f;3?GtctYq`;$>FKa&iJ;m}nd+anrFGZk8L*2geU3rKDwg5P22+ zb9Pspu_aCY!04q$i5`>k^AmcB=9i}ra*FC`NvpM0V9wyVuCis?Al%Q( zuNtsIG#`S?q?ukdZR-JU5B1-O>OUVLXb@Yz5ox)Fp>p*XA;|s^siQ}@Js9S4(>^W! 
z{Vo(WjoQAOeG6z*>X3U1xYZC)Uo>p3bE3v=+-e06VEUlbQ&^YT4%1Ebq{))RWzn$fd?cMZ#*FJ|BtcvfNH8;+J>#5paN2r zDn+_T2Pv^oq)YFh^p1e^Kme|!jMm=BHxr*m0_=knKXEP zIeoLCbO?bozG6-5?0~cQjkw;N;0Yf|rvhErTrDSdX55Uu{Z9)ao*l$d@g-WaQGIrD zfw|j3tn@`MfKzgH4XY$EqV9meME|A$CPu7A=N)>}O4gaEdCJ(qMtzV}6xQ#HZZRp;mzE^}VCc$2CyEueade$DcQRxlfmKb4I2{I@a| zXsc~P7IbGr=;zEt-%#-O`soxDXIxxTB2Cz?WrD0ZuG#&W*Ts*!wY?lav-fgq+!Z@` zExb)!o`6okpc5eYOZefvn0-mt`F^4EY{(BJVS<=7Qlf?F7wz(23}63jFjX5UNlL-I z)G*^#2uz+3gj@j5HVRU-g*VC`rt#OJN=hyh9IC3QH=05 z==EFF*$8A9R7oXyK$jUy|J3++ zzQE&cV5Y|pCTY)id3kvac$gmh{9&;qW{ngD&YQ15cQE|XFd>8$h!Z*7Nv8Sf7I`re z_CfUGCQ-{Zy5R8m2tA3yTb}k%fI7=`OZ?X|OQg`7*Xii!Hdwdqhj!&=gS(iwTQ=@H z)u$%^cmQ-PlK@VYp^ueW7F~h9FKvi^cPNS3$-89+S{7%Aq@*8xtU|=ovgONeP3$T><3? zXzO5dDpq1Okx%*D?0JrUgGE$Ac$g*U|B7I&6`xd8 zbM9-#Gw>y{(Hi@nC;ZFU`&MHf>qR|GL>8zradXz-)+#Coj)_TF&k)wl19lZ^7UJ$Q z{)`(Ga>;zoe0#Dc*H`Y$jb0s`bv=@d{XyGR$;7EB9UjchQdLG1ayFPGDIDPAxn42Ge?^eolMF z>&Rs7YSGrxBZqUhA`*i5+by#XCg#QlfZ|$d@ag{Mv3+yfJobM>B!9iXj@B2f;`F$j z%vZ{wUKEbS1H@TKz56cRNLc8tdj{^(yx;_Pm#ay}_5-zQ)Kkbk>A)vRKNO|m%Q=%^d~^hf6O|MqbC7nMn@*(cRiI*B8VF;2wUgM$e}JBL-?q@qBybo#kTsh znZ3An$dTR>>B@WI=L#YCJ!6_;W&CJ0B!01XDuh>ET4?l zg?lHf^T?-E+W-G9FMv#K3ZI3}#4&!m3AQ>oE%Uq6v@qr5r>#k?r%sp&H63M;q|9QE zyr|!iK|{NK<33qI;)tA~$^ThlOT@L?Dr-A$P1KxpU9vuK@*1s+-(ygxc?AalY&fQG z7auPV_Sd?gYZdCpT_}*ZV!^nX?ZDgPEkVzG87-0V_l6c9B7NAo(h_fUB%@n;^+?kp z^YX`ZT?h5-97gmx|2`}QHp%JH5{6yqWN5GaK~VlDRVLUT2*1R?yNxpCP`N#N-(SGD zvi_Jzx=^uBek!PD&@ONSsht@l%Bwv~6iDxVWoD+kDQaKPj!_&Hl21H}o4X~pw4o#^ z1yX~&@Z4KJ7Fpb*MXt6&f_U83XP650?L-`-YAvu+nvebUKMXycVK@dTusZ_+=78Cn zCiu?%+0g&{Wta;ae^fFlkR&|lPuIxN{Ob$TlhxnFVJ195>C%TBHN9uLAcEzU~_Q?nTs<9%drAEt{{rTE3Cb zsQx9bzPA(2Co`X8(?Q%sgpRK*M;oz9AEEzTx1-!0(&2jJKKuVgXD$ebQeC8)V608! 
z)EBCe8rQ)=#80O_Me6WShAkEEL|)*N_cbFLiXZ}NboOhb1hXpktenb-Tb>E6dnWRF zbLVmWl8T*VMRzDIUeM`8s}I4dPBo;PU*|tF$cy=6fo1=m*Cy%ld*R6~mPh^H!br=; z${9Ne!2XD97utL&sQ<%L9&;D{%DUule)b>b5<E5uK8-L zK%&DbT0_u@_R8qB)b3MTi7}#(KQ6n9r{a6`F^YK9ye`;AcekODFvYdkp8t`(6dm-o zwd!xl#j*uVHESckTwTUjcE+swdA>$No2v3+{(}9m`^sNFL%!Td<8zsNm*PjOhe}57 z|L&fbV%)RmVlV2{mmVcU)VfTZ0GAOwUPY__C0B>6NPq-{Rzgc_|E&L1U;UeC(wJFh z$U)Ba(QPTAw9jj+ndkYv9>=ZaUXnFKPV2J11$Lt{qZOg+yLl`A;QY`+*G0-`uOc3? z@aC-~)UT-?Rv@=j#9s*}jMtw7`<8KQT_-r`T4caN($mWOW(1i-@AC?Yjz7}Z4Un9BKQPpgp2=w}q( zyw-l@B-9Lkl#QRzV5Z<*X>#6el+5+y3~?=;vFxCLF*rvH_^6) zFXgJJ;nhzVfB*5^v9Qe??<%^W%N}ucfe>@$e_L$+#TNHw&Agh>hMgB*BxZ=L8A4C1 z3VTUGr%&Whzfm;T)mK~Qr?ujY54}AIO-0UA?dv!4uVrCqkAkL_T0)wl7?O+^1sm8p zYcQxU!r0;zZ~y!uXbRq!H}-|Y`0;p>;uvO$O()Io3H)y_?VJijh$eRF3x0RG@ZTfj zCPrT{O5H2EPwv&}=gEBU3Xi;Kw7fNWaiU)rjMPX2& zo6$y+S%#&||BMO5vwg)Y;-M=sD>CovcZsrf7j=8S8i!@|{8HZ_ZzN`2AEM!JTpXsG z%?=9Th;jZqq)y0MQnoXQL_7sDcC$QcN66+gR`f}Gj8Xo2+%bXn)UaDwo4kCpFEW&h z)9QZ+tJ&@}vDO?f6>G|@`eafnJAgR2(o&u8hgO~UZhK5LY(G!ZIZI!qD|z5HPdxPa zMDw}7qV5)Goee zzMwzNICJFqUwm-zf~5??DT;Pc$-DKW@{H>KUM)LINh5#(?!-XnZSAznB;) zWU^KHKQuks_9jKDFUKS0&F110uW`h%gWAtlW_giPb!rt!EJcE%y)x;gCD#T?(i2t* z$~B}hf-_qf3r<{>{Cf)1EbV-gl-#)QLrTnKrNl3!|zRY#X)*qk0R2u!)RzJJ=FW9FZ@h|f#U z3D}_0x%=GPr+}@zlqT6I_9>qtS;oQ15MaNIZxc-D=c6WH|AlEB5Pa+;XmDkL)7&jC zC-nT2L6iv@aw`XF`T7fGc4JRLXJgwg1pUQqp_0(EEwt_Nr6XK7-{z9P5ZnG`NZ7ts zPS%0?j>J^eqmSG2>C$| zZnysglqn+BX)aD{{lh*5Kr=z=9YJQ=53d0uofWjNj<@^?n*Zwsi9015rweHh<)yRZ zAEz+TtMt9_15$g)zg@O=*}c0I1C4QHq!@pv@?-9FbtQrb>7-sGornW(hhXmY)< zq*w|!EalO6)7JOIyM{Q<>@q!{5Y_K|Fv4kj&@aH92U4y`4CTp{%iV7AmofFBU{Cbd zz=>;QvgEF(mJAJ7c11p}s_m^l&z|ys$)9SnnzRFxyqgpJDrcPq)Kj*ul)+;8@|9ht zU7KRPsMTvP)@L%LlMu`;H^|66AJ!?=Dg*e+A#7;O|LF8uZ8Eu0#e==rR}RXch~3F& z%34R!7$jnJp})^sJuer7T|AY$oV$&5r9U?fEQ^MJJnxIq&_n{z{VCVw|)p22zn1z@JZPl996V6-;QHwR1H{}*_zdIu*Fq$ zsvP&dv1|Aknw8>`JA~g~9Bf2w<&p|`vsDX2?1~4IhvGe#4sTRR<%4a^?~LwD4*xQK zHM91>XNTeCP=h=PX>+N;8w~*bNZx|Hs`*B(m2j}GbwgC5Ch|hxK*o9=bb%k`m&$>Y 
z^y{9E&pL&7IRfYdo&8o}eV%ogEN7*|WxJWS1$dB@Kf#bnQy$q)!f{ zM5)Omy>D%e#gomr-L*RTR`tebteS_&rB6qwJx*|W3_yuE)2g)I~vgA z3_nNqf@v>abspu4)A~%a9z*_85p$i=m^jhvFZM;=E-k`j$(}mKEV%hY=OaY<*4^|~ zN0p(tuduqr)u(qX%uJ2l4xH(E7^ch}TXSyPGT=U z))poQ&cCLZX>_+d`R|CVe$g~Xx3XoI^fVmrt56e~Q;NQv9?Ez_{~(($(ul@GvkCr6 zqESKrJZONs^3#$5FSQ_v8Mz!_;cG)00{itta^I3IAyG|i~cUKY0?j;o^ z7;!8)#yjtH69Ya!f!a9c$6q+l4)oi??EziamstXE%@!ks)gro~pmk15!hrijNnO&H z4!8^MxSxzu&4C|O1)H1B>Fm1bPmj|RFmpnnfzfUzr}`cLiw+NrJJXBr$cX>IprpSU zLQXsYTOTwj0~c{_Q$3`F#hFP+->m|k<^nwQ zk<0*n@OgF)*ykPRgZnAlkLprcv6&GY%-c_83dq-pqum0Qg#Jcp?BSgygRGf0+=9W% zUTe7N?wppg{ZVu;YVtSR5(|bO#hR>FZ_OgIDZtwiO&e+EdhFH4SV$Wb`4uUTce(kd@V>i_xT)2+op1Ox z)wq%*X?2*x&P=tfChh!Vg_~T3svf!Lr+geO0#myecb>qciDM(8_+eO37z~Wj*V+1R zJ7kIFnSfsCyf$Yp?lwI(WV$!gg+m3AyURC35D} z#~_!0jlgLxlM6ZmyV#9Atj~e;1ab3Vne)|3Do`)v%esz3fv?jPbbmho zN;iKr8RRl0zn=x}5COrv8|8^Xh@)`(7gsrY*W{kb9weV&){w#6R{hS?$j?7l(6H98 zl7t17thfvw5to_9Ol8)Vgj}2?Fx5?k{}s@9KY{qG7C$zTXYFGZb&x3Mu7=@N!`xJ| z$Xw`gZ&jEmc_iiOLg5~a5{{quq8>9@E!aR+Hu_Us9xzpFjA&Pn}n z5xdI^J2HTuShnl0K!JW3%AZ`}hRSDS_;YF95Y9sGxE>tC?6mJI9Ha0b&onkf;DX5p z@o7~!RecX>Io_{^wx*lz%Dh%wg-u0EW4<$3VsrNnwtY2MHty*2Z^Qlh_s;e&yG%*H zo&rAMfF4e^Ki%Au9V|;%!ibn{?Jf*TGOz4_JcMuwyGD&emJ623!3}a|pmRO!djqPS+~UUa0*-3jBt*{1GONPy`T$$LLuEWx@VysJc| z1;e_-LP3n}s*1MMr<5Ya8w2+Abia>vl5Gk8SFM;2wg&F+fxBdYLy7cMA8Tu4K!jw_ z;_qMClJEYa(cyqnU8wB7ww9exz0k&?bBNZ z#b0=fhR{4(O5vulgDaqejk3Wiep5%G+uh)eTK)dVZ4Z4#_uaw4^%b4P=FjxXsXRE8 zsIrdkCl_KqIR>cUJD-LnF7hrJ2><$sm|U9eaU4!-e1y`e_N9Y)Ysjk5)Z*~Cdm4*w%l?# zZi%*(IyVod4X`W0h{ojjtue{C1I;fj-%Mdq4s}w#PHAPo;6SqB-n@)sbY&LNO|dVu zc%FdG8~rJg&DZ5HlvBJrhh2NP#ObL)1R0;ko}=8PculOVjMiRQi3tG zM>y1aNXyASQvR5PCFqp$+2uLsAB3++C;aI5K5lO8EUSFqNgnIv@Ss3aEZP^<+t%fy z8`rOhWgd&n|MsA{MW*jXIc+-5 z42;jYan$!JVpaHU1Hd7fJw^DLDE<8USt~nmNI0D9EaL2e6VjvzKKb&MTV^rT!Lm&0 zhRbGtU};3EmTIO}d4KEx=F||nE@SBz9aUTF-wjy2vaxHXd6kfa-q8Rz)#Z_mKvR#yt=qnq)=$ercrlz2DHFRG;(T-zvwnwEiqJ->btneT!jP-3cgJ>J@;hI7*7)*qbQq6;Eu9zM20b6@Q6DzpdaPz1__)3tcOk 
zQp@Go8B3uh>=BUG*0^oA5ujW%7UDW`EHjm-b;<>z<@v_)**yMPGp6@NXEdvnm>LSa!rAJQ-k>X7e=sM^YL z7n#IrO!YpS9_$;^uFgmq9bJ+o&zKve5l}OC2kZ159?}54yJEF89}0_WIwjw<{;%xz zZ)Jvf0sOco(k9+GI{Ibh+714fT#D^CZSK;-5-9y^NIFt-e~36sO*L28m;gnaq0MP+=xy%wQh&aZ8lc% zBeWikLXTnOAu;-~by?2|;#d}W0u<7(?&Wg1aRCoidxQ=olV-=5^GN=wHQZYfixjJE z-$&PqrQH^)`A{9a+8G=04KF!}W`i275g!bU-3bnU*U@47)xrmBnq#`Qf>}4RaG?vR z*Ozw-7+6h17EL`Wv1BN!neA4zt)E^p%heT%9WipL-ui>x=q6qEpf+JBfQr^cN-mPcCRIyU$+Jmi;J9Pv?Wr-yGXUPxuUlUcr&Gu!E$YyCd-Ti9 z^y9~9lW!)5a%Fa4)c`oAwYaxp@8%hy(Ew`<(kGKsm&mmFqq`8D4xW_08UKx%edcS~ z$Pc>YXqB05Y~xr>$ig?>5tV89x-`l@b&sd2*fzi=58ksnQ3s0BT`Ng9&M@~GV|~f@ zo*>wEc=JtF@5H3%?wAMRR1~$w4<)2Q)rXJuw1ejjwjgEtcHH#XHG1e;3=%DTjFE57B*&~H0|O6TpY#O( z@e(#JfYo~eg9Zqk^`qxj^t#p<*F7R2pTbJ4l{&*@;zs6jAW>={WJwyRpIR%Lao187}8$QS%hkzGZ)vI>_afwE@xXO0Z%*WjWWq zB60!@3Qzh0zF;g(QTFLoc3skKO*z=lv(zTme`F;N3c<` z?Ylb!E8BQ7#P`cXO@%^W!zVu3jh=k_49D zuCMyuqess4^+@=(4^PxG4N&z(OCsStGBOh`OuwcD&0f?BK9RwXHktDOvld_#+6vg_ zb^g5XrB)2n<3k}czO~jUO7kFdPLJ>XaA^yFw-hf}e6kqzei+Wt2{6D|81V#YGn04k zeZz_Q!}BU55hxSUsNiu$=0hK!G~m%4-t^YpDmC*Y#PE65s+VX8wl%WD%p`ec!!h`YZ z%L(>KIkT#%jni_wt0vf~87nFYZ~uB6%y++ne_{ede>#yMbaz#8LfR;-ykm$TEqSBU z2-Vx1iXU6Z18Mih(`afsUQ!)QPr7SCKk%UGEnn~@DBt8YP+vy}?bfonX)r?M&DyNZ z%b8+}__^k|fv`5V2d*YHwszHnD~IgVqTY7vjAEv4uheN_X+ZQX!KP}-Y37&^j!ENd zoC8reT_FpWa=}@AeQ_BEChFH{FLL(IbjP4OczkESyz$R}h${%3JKH4}&mG4W3x7BKoT3Pk~#O2+5NNHDn9RH~Y|Ets0>x6A2 zflDjsTrsJ=0j>Daq~eSB5F5^Iu4laM*_$^8j)IL0GIBrmy$WM<^X_X5w-U$9oDRNQ z-W07Xgc7RGZ@6yAfzI_0PR8+QgU8`^9JVX%oXaJ4cczEEjImHLYb8xh;b5m`hf5-{ zO&jd~#K2U%@6o$~ouPTJ?6`M{)FZLES$4R2jdXp&lGLm1(HqO$CEBLr0$vY&U3CL8 zSB2cTa%98@+BY@|h+BIhNp%9&CB+ON(E)_1Pu0vHKTly9*QD}d$~^@v(sS312qQyj zF?e`#rt>E$Qdtrg5VhDQ?3L>sCm=Nwl^Wc8AZcYc(X-0a>1?)wTxaM$>vNO5%iCd! 
z#7A%Kq}R66@V0QTvOnf6EYi5+YcS|D`-8_e)Ah=@C@9-_^8n`eQzH7mif(`FagcEQ zb|a4Txz`c}YMjn@+f2hh!ubJJwUur5S!r0%h?iHp@#&DzKa|-PfPy4!1_ZkL~ z_3*FAe5}>)?W!-f_8_n17*3B+J~2kYV&(NoV7;@1D-IYj!i7I{Ijd3%@bE&!q>YVV zQZ~wuIyZMrALy&~t^@l>O;l}iRC?mg8^b%(K)6mOp9X>I+m|i<)AoXnY(bfyJ|m}% z7x!}mvH7flmz8(tpH&q`Q=!51pm^%{P{u_q(1{@v1b;ydTIvzF#zP23VtuA+@Q1Z$L=N5vSV9CI=+uyavaW!-!(q*eqOd~lu1mQc-HXz zruGY?40EefxEB+`UdVi|UhM(IL)31wUtrn3ILg$K{eM_f4QPqz; zL)CMs-wO+_75FKW*A|=$ZRooMfg7FHy|P#z&$e;)>CwaseGBH zdXcB7`OfU!DN|M5J+pYq5?=ekL3B6_(Y}s&lfw;Vv97TgMcON`pAJLYx-a|gy~3rj z6htzP1K2zO>?{~(r+&Lprt1;C^gxGstYwwLtUrcsw-xJJ7oxhlr&JHxH(ZY)BjL3O zQJ{2w;ySOKc-yjV8JfMnmMa`YR8|GEI$TB7(=u+JyH_35J3!`KzrFO(P5G;e115pg z0S+{ds_ngyyrFeb^)`3^fI!VdOC*nFWf6ST3V`Ar(v2<8i*;td^lSEu%D^M_)js_y zKb}bw0kyRsY)_I_uKLW!J&+}?EJF5!>Z`fC1Q zzNKm7w@-OJS?H;J`L}{PI>y#h!;;exB0feBuGh<3cYSo$3q>7LU}M~CE11jyfa8GO*;kKMyT;if?< z^_-d+vXVbe%Xm#rc1y;!;VTCdPMtZ7c`xyYdKUWm5A;4oYj4eD^zlPfMD4%;_^%cH z$kzFD%3N`5=gr;5*(=!qK-g~cUT+Q}TmlkZm^JC_D>+;3(}3-hpc?d@BKbDG_(#<6 z39a$z?C0Vzh}qe?<0jPu;uZ6!%m*Wj|sguqsZQd2%RI;KhMr_T875ye9?GJ+@+7 z!nz!oPf=|=5P$gK?po#}DZ5pOp15>;Ao6U1Ph>2B_<@ z!G%lo!OkV^64Me6N z6hDz=5d$)$fV8)}*3IZ%P*~CIbk<_7`f@dy?6cIw!f36v7@euxYus%NmPl>Lp3KH>pw8^8P|h?M+Rjlx9#3y!wx4SF%wo#%f5ed|NQH#=qLm|!Ac5imyH-PqGEFcxZ zqX1R@0iP4}y)n6%tvSM1%UmJ$QtcPWX9b53!blu@mIot${Q^b1k+X74(i!MsN%A(KZ z*4`g0_z=_r*2XQ1*}2(ehQpg{+pKPA({TEQzENkblP(i#tf1S%hlDeU`ny%TdSeEwZ%OO{^T5h-x>!M? zKcK{Ky8T)Y8nA_~D4F&`R)0|n#JXlPPUa$zHzRF0+&_l;aX{9W_twKeM2Y#+YM^oW zMw7#t3{D-*l*ZC6rUEoZT;Hx(uDjdARRk&Ton2usfwC6;a;PUC*bJ}CgFh!!NXe;n zM9&&`@sW#qYqm-$d6oJ1th#&HWhM{Bt+afNw4sd?a(}zP#Py&a|0=a(ifgb{>WSA> z&(7|~%KluS)a!LXPbz==)5uN4$1^&!2Rp2a*xLKU#J)`|<(=UOFsqpy^XT83x$j@! 
zJ@qm?!K#=Rosnvzj?>(2hH8?e8&+h(Xapr(SyU9u&2)p)2!rT%vT{8~EmDw^c=_N1 z<=us#z2;0;Q?io5o+;Zru-m(us8z=4KQR?A7NS(4$TZV>P8ul@c^(4sKwOGMk2NQa z?PPv9c+cB3Qwl58$NRS^hMP~-A2Zfjnsn_!oKv7ONlrEUB29#EXIqdfgt0ZC+#M0( zb|&dfh4q~PFvNgR?Zd`cjh%8l5U>D-yggb#uFVIz;2~-o@Fou))uE4d5{xU#=-A-O z@;;fjE2jnZ6m7}kvg0xgp>ln-sDN;IPk1Zd(3uM7Hb2h8VR;m(SD-%vsvu&{8u4%V+6UhK$*0MP2lZf4F72_oIac^L zH8Lvj^C1*SD2*79f6U#o?$g60E!=ZjwgKRzsK(!P!oWt@Y?~& zk!5>3s@W?Af-PrKp6e_PtdlErUn#j9xngf=i2|Ayc`EW~AJ8hjm(tI4SNDY6-tSDi z0@2Yj$lM@4|59WHDwam>1=^n@0O4j#tZlpV8f&oD(kf)U01Lr6b7g;CJYACBppW1?9WT!h^j!#bJtTGi)6D*(9l}1qLd%ol zw3;t+T;?(OvNgQ++^(a_)Baq}Imf^)_UU7t2+ zAn8tX=W<=5_?fG+Fao>(-IY;A(zoD&GbNM z3|MBx*)J6b*+&^=HU2J1Dbqp<&Mij1CmnMeq*Hw?kMVXrP9xirmXH~i7isc zY0utxk3AS7RNp#A)yDwOZ{>b+tI`>xetW`KvY)@StzSC}rIvfK8s%6lt)R8<9<6bq z)7KeqAf-Bozl%7fI@!l*%mkGB7^scKFtQk=>SI`Ez0p|IFJs%6rWC0rSFXi~s@-4( zsd@1tJ{?RLxd{gV_J#Hzp1iHk4&*8~n18cJf1#C_yPN%$D;;_EG`SY0`T1dO?9oK? zS&Nk4`h#Osb4>iaW-GhEprF@gxNufX)!&-*0%I&$un?=6*Y+MfR9zc)$5U%_+iDGL z_oVWSx?4`*HT6Jyn#*Htm0XGoBq>05kPEwc9|y_9mYR;i*&d$;PLqOR9rVY0cUP3> zJ;)=DU3-5tWL*~(y}233@Y&D>QkyCw-_{LBU!|kB))`Z4K$ZV==#146dTr+tp9#P3 zXa&iX6FwF6s|LVszZ=7_v=P`hQ;%CHHnQ z*{1Y?1E;f93o~zbIWnSDH=#unCHSzS%=_y7qtae(@f(?3Nk%Y6Pz|qFjJFbZy0Mu< zNSj9V^slc*@- z>+H%>Ii6F+ILE^7*PdqRV=j>!H@rtACxi0NkcB>GwUfLZ5bgwjH{^OH7o3 zWAdNr#lmfHW7mQ%&oKr42zCQV&sg=e;s~nebBE3~+Gj&A=;bb(vr#tQzCWZ6*lHGM zlw~$W`_a3^WL@>V%!?u2GbowKn{Qn)5OXe~sIt4|x^imde75x)wWqwXfca=h6KkZqL)2erQQ^~hde$S;)FWv{ zBm5v|;r_w3-I>(y{^we}=-k+(&`VC=QtpG)ij#+&YuqLZJqh;yIU1aNd)>g1z|;oy zzFP4aiQ-5zyr8w={NOcBIbKqp4p*V~TR?Wf`+${j4zYRlib^4>AhIpcefr^lO>zHS zGDt|p&~81ok6)E9=y+L~6nC~lt@-wOTiTT6 zV?|4Hw%h!-rCIi)5@m;?UZC%FqG1Vq2f?sYY4!nC-k5PWlPhYD&2E(H7qTcJKJS)- zdiJ?!atssk`1+=r;J%0xGZlbme=riJtKJ-WCsuv;tr|u}f7H1V?Rzvf>E*x(V`9A* zwIv`pDt&~%riKwbBcTyqE5LVT+G?_Zl&yK6PzIdIO!_-89xdBPGG^|Y1T?^36F90_ z_J<($mXa;U?f{q8PBO?n^GvSgzMq+E^zvm)_Mw+Bj^1uv+8_Mx%!-=!H}h|s^^htc z8stxRkFa`cQWSWq02rWpFmqkGOh{(^qltZ-U<)b3<*8&IIUmZu=(!8m{w$bM-6%Ei 
zBpkYgap&D#%mhe-nVazBM30>B_R^&5%81Ep21B+M_G6YBtbC(v+m-xz6e+T$<3uiv0RV z006L+y$3w};u-Ey(^eWY{xr~SIcV=u@BGeX|IGdah)3BsU!-q53L92}O-A~q)>@NK zj*3i9bR>plpXQUEphJA;dTVBUei~PpX%mKo#Ql~sspN^=LIBto>mdy?HF zYj{L|wOH&9-|pL_ZJCIq>lZ&kx)Fp#!Qs18a|6%Z%K!lY6!2u~8_0)gdv=$8q9%HB-R1jTPjH=bn& zB$hs{$fT6KeYfwreFXO%kW5$ndA3lmd;M*tp}~ibWEJn0C}=C7UPy+`1R7_R)k>M3 zpv_%Bn-G%Ae$TBbeynXwuFks~`B|z(y^vP)Pk-qVQKlyEb;TX6;dfI}SCupezsRB# z+67f) z!ZKT*fK2q_oiw|x!&`5v`u-@R)F5Y2?HX-k#e|9bAM~e>Ig<>8&|Vxk_{906ZP~t0 z(9x7wYhDdHN#-^q;G!msnFJ%!ZYw`&JMREL0!4M8YxPVL@TwltLL`d3mYJC7?5I5Y zkR^{7D25jZLv@dy$^l#couvsz_7;f_x@<;p(sjUgr<%rFQs+;w@V&iF7cqUsy?;pH38MCC98 zU@_4-h}fEQ@y7f75E7Dyb)*k}GPsN|EYDSUta|Pwm8%ZV93HpqNv3Cj*hSsv8=rq^ z;cT>6Jug4p?Td}I%%SE^1AAy*Nz-VVD%9V2)gtq$D_Q%#ofpG}j9mqnMc+vva?7Gp~5X*p3Uijkw-o2ztl2e(y-=e1~5jMH%-rSmas7_^Y3+ zR+$oWCcvDuN>Pu>$0F&i(RiI)2hUf@uMAW9bG3va0ZBXj;-9v7_})$P(tjf2$F?a* zv&L>!2)T;><}1ljxZk-Qb-my^vs|&TZ+ZW!{kX&7`x^n58z!7a8%!<`Qzqh?B;3Ir*G=gihTLHl7f#PGCUe(WInqblOf}CVRTE5 zaI7nY{)Y5V+fhx&_S^s+%AJUUlJj-8&@GSyPS(@D68M*2H&X7NJ&RECdMo|;gH;Y{ zYep8k{RnwZ3<^h{k6!iJ3K})|Ubybs^{Qof(HaU))CwltMnX<*fC%RlXmfXvM%jnv zVuESzJ!lX(9ED5fhSwPsrl{?kqdZA#NA6H2AK#6F{gR}|g*T_w&;*Jq|3G5CiI23u>LSH%>a0N7`l-`9HUhNyI!cah4 z`n8BZow0-NExA_r{pXHQ8#n#8%lPrTP;e#^*Wahd7$jLG8IvL<;j&YAy3aj`K@VB3 zc%Ss65hwCrWl#a{3s?^ymIFg7`y%8Ifb@HbmZxL=noOrZHq^$su>%^kEq&{37BV45 zoEU10gYy=F?!79RUW9=>@<6!Xn9(_pz4_V^{L&DZLpRx%at!81JkReS7jnf!KnP+G zLJhqU%2C)(;q4pF)PMvn`7eM_-r7k@IB*PJY1vzs+iT9A>N(6m+Ur5NW`j32#Dg&j zc?(nIGoN%xPzP%x>ghT*BEl@#13uzn0(5~IihSN8$+XX(#3KJ3o)Y*6Bn$4sqr|4bq5N)lXHx3Q&6@{qMGngoUcc0ta~q!AxxmNMAeL*tdw9jP zk5;nGM(SaitE8iTiw*Y(?ljGDq|C=5aG5vBL66(C|HB}6=gOWpX=Phe&v=o!C7qnx z!&<3tWr6G6=^~Pjgcu0lhZHukxM7!DGY#-1M3|d-_2(8H>ovcPa&U$Fm-@QC!PL7K zl+^9d{1R;*sCb#W7Ra*%`uUv$(|u;mRHN=+rLOglbx84PiCW|u4c(UrB_}^^4GTf57 zTEUC+7*&sltP?EpbSMfQsu0MD{wG7|ccE(2ON)$6`x@VqxiX>{Xiq2=x+$>tKr0dW z53C2zg8$1ezSOGt@W2ba%|W=QG$V$j@fBU@x|bI3v3-1f`smjA z;9H;izSZ?{>GOr_R%Mg%2=1N4XM$2%g4^wQZriaKgk4G{2 
z%HVUgvU{JxINkw1`Vi&lV~lLoXq-}f#7D`S8Ydc9t{_YM>;i_HJF2MQh5jwe9i{Yo z^6dUxjxLh1f7Sx9G?Q-+Z^ubrkH@&)qx?=j zotV*}Q8BkZt0n_pib=;VL_lo=OXMts9^ZfU+C@U`J~i|5#NB7|2R-C?9LO&gCW{QQ zPROIVh8SH!m|gtt)&5PxpU`XX5vcTL9>r{20_`?oJ5T2bq=|D@KpjfkWdx2jU2mk zK#^d!G!aY`mOnWqOIy8i*aS_u;N^fmgbNZnV8K~K@`&~!!XzX1eFWW4j)+D`naY(c zh`rQ1yTY!bF_HvO?H~ldket)fw0vwGftiqC#1o*hDuut+A^7}>*F%{-z&Uoq%(>@? zN9LSM_JYSp$YCMqCd7Q^1KZU?^VF*x&L1e>Sb@h7#JK{nMm*DC*Dqx+0@Z<~f3SjN zH?8g`az|9G`bdHP&RGacAhy_18|9{r(#OO`QsqBPwDUwfhRLxd#*a9k2uDba#oa%e zqk~5s$|I~FiSfxjI;!qfo_M}S{pld8W`vdb>$b68@NGZaC}E}+)$m?HqmktT#nG-_o( zebSiLk%$gQ&(j7zTpE48vbmB4H2)s7MoH}c5U2gy9^dW!Rt)o;!Aj}J{{@Qw!A@WT zM<@dli%g6>YnNW?yi4L4yXBj-@LxpLv;A7pG}uP`93 zXrk${T%0LPW;Si?8Cac+Y1L@yhbQ2BsHE2-*^Q7$ajFz8UITt(brL%J=g;QlsQ1O!~()1~oPM~e1RrAXhcV?j0al_4%(0U#QKGy*KwYO5y zJJNV?>@8i(W<9w&oSE$6bNVT+qO_pv5YMFt77fGKO$+Ph$vZs?cN+>;K-G--VYsr; z*&%&JKnO&baR&PjN5<@2xo9UP^mxAWR}Sds+*gQuDKJg>Db{6$hu*WYy2oO)|V6eg(*Mv)fH$%m!d zOb#ES_V73w{np$~fL(JxEdT_k7F9r0fLBqeO46CxW62Zr0WhZ`1T(uM2R`W=Z;e=M zew43k|i=-NmuGhW`~`0N$G`M8wnH`d^0et%xD>f|JzgwfXV^~(7Fn){frP+A9|U5V1M8AK6>-)7SQ+W z$SlY&>RfKa;gCxI*ol&;m|0IIhQl5b4j?!wmql)#js3Xp^wy=gVO( z;3ay(^#ixION|-4Z$>PBP;`#VckhV^_~@T-bsluiISKqtFb!USfSTVJKU{IC0_&8W zP1bMjMDzCJP60VxSLitq&G?!*?n?OW_#ccPrJ+Yn&tlBIZBu`%AkXJSJLdqEhx6+H zK{4Cz($~p{D4Ax-HsfMQEw*!Z9yUwR@`qX1ZeEvi2R_@AN={JJ5rWqdGI8xF*m9vZ z|LNjws&L}3IBmJRbYwZak7zn_d^{6QHa*x;U}=Et{CK*X{@F73vwIG0h~jma3}e29 zSG|%cm;dz*2L-t*Ts-m{jCA8qgn%H2+6A0loJgYVDy^2U?*&?SbDZyAl~*7Jaypi2yJA`&Yd6?9`lTN*$k1UFXeV+Q}yLbNJRcK zotZPHk;j5$0!p^39$Higm+1aws9Ox2*-|t^XG8L zYBLyp3G;*B?B#Flp%p=xpnLiF=KyEe(E>b*gP|PQ<6D;$U>*=XWqksh**?~;fY<9$ zDQ~<)vBqXG*r-dU%uWrNGowpx6Ep5?3O)I7DcJHQ)IgzibLUx_OW6XY;4=mO zzxVnt`7*rgm0o}I87=so$b>&9Z)Hwc%EjZgx}2`ux?pW?o6%_mNY?2i0ps{Fc9MCL zdwYAK^lJ!}RIqiNos!h|KSx*t^RsCnQ9h=d*p?~f&Fu>nNW5uSf{%43|7_P;Ye0*-Eg?&YcM={5~ z&AwPO`j+@D$NJ}Cf!2G9p^SNc)MlaKt z!eh3yi4uVzC|1|qm$VU7`#jBL`=d=Sa=m6sZKDm-X$?qeBZigJdji#WJ_)4_u6rsf zb&s=ri__|S4T}ZcE5$sS1Jk#3OaC5RueaM#AQNE)$Jq++FTJ*L(B8#jlDKpey^))s 
z0C&|JK0u(pXJqgp#gCuKwn{I;$1+}6zBYz04e>i>%T%dWZY215 zJ;t9W+B3XK1Up~NvgG)Sq9Myb0umtbV&*%f?RezTyL0EEq> zy0w#tB4#(c9!Xt_J-?R4z0`cQoc?FeD`p}$eUsQ~R?P_BgS-iAnadGKw?0)!0m7dc zx-UvBy^cjTM;G}>pH`_pN0i#ivAB{)n@S0I{u&nE@cX_bTK6sWqNJ?$B1yG-cuY~N z^>?zD^Kf`f=cnr@B5$^I<{dElBA`lS>spoVk_qrkPHeP+p+mGYt9Ayo&uRtyVWI2> z#9Pd6?Uu0SKmNtf_q*Y;CsP*PGV^c!6DbO#g(xN8Kc_$iIgOD}+NDzYyR7nTG>g8T zYgFC(oUC|Gb~Q=X){?U`Vagi;;K;4{>b)TfzLnLgs#a}*^(QUbmT+X;S@HOv6u9cQ zsIZzg{}X2uy-%J_h=m6+WeCaGlN@L$#%1L)j# z(cbPZX^SOYQj=cb=i38It`(Azw%%v!GMpBG@-0ms7ek%KX@kb;W3*ZQtJZwh%Xt3H z!9VvCg~y~EznlIHeGW`w%GWQOqwrfN2X#_V`nqMJpJ)A@oJ(07zXNDM^Tmu$iBW4? zFCKs58J{B8cwM3uelM55r%*&O%39;NT6lta+h1vYF1pn7#MyWt8{aw@|ECV0N|55c zDxp7=NItS6`0@hhR4-e+N|uZ49-`h?Fjv;SPz)lc{uWi3#8EP$YB}P_Ke9vMP#ePA zWOLeNd*Yo?Wxb#sbXP0;6Yt6iDaJkJ z<{atO1i)G{#9Tg%{l{lDGqb{;D+u;@Z{ma^tBd(|%g9T}pS+R5**742Eu^2ExfKKq zqpzip(%!Nuc1_u{i|SN%-Es_tODmucc|=L^SjA`j&|_BLv^B?JYe}zis9qHC-24so zu5pE+OTNWwtk=67;|=`Pe-jWq(Ja&=B2|1g*8X#o{o=Z2F=0w2NuCCAj}-h@*99#B;75SCNKuz6MJwQ$piTI@1y zfx@!T#~4rkK@LKnR-KAZ7DavGjXr1?<&L86u0>5R(V=4eMnG>EX!($ZI4`L+6y|E~ z>l`CZH;Y^5dU!sN9~NLt?0bVG`PjAV&_=(3D@anVS*aQFqBuDB-fEwrZH&1AROT9dq(TEo8 z?;f8=I@Yi+#4*pqP@6%!rnl$K0Mw-Ac=|Ne2A_MjF=9T>NX9t7gfR|>i)qjqeXeS` z^xe0oT&w&AV*bM>zi;&EQCtWG)mCUIO%Y^gt*3FZx*`{2DsKxE@ zgfpYL0e0V|J{{SStFNGNmoduI8<&DkXc(GEDHhosINckR`0|DPPC0fqUAn;$@pAdJ z2F4jZQ61j?S`(f>D9L=a@g$C0fQqgYYD|-~>>c^w$1{d=K_cOuh4qNN3mQ zt6D?bt9n)7_`pJzC|r)Qh&+E(jg4`dRu4D0e?60U0Y)KID z@pG3fOKb~~ z-|UyVKlK2-anAkEU#_)$8Fat%n+qNCxo_PVVd{2CGKM}6LHaVZ>)FU?OdK0~jf0+0 zRXB*4-zHnAt2JbzI0voMItQK7suTPUXz#0q_tO3PwV){$9d=lOCHf$OaC0^TglB9l z5}w=n2u^=rtA_n5@ZJ6aSR>h&pa6` z@LZfoZ0VUKt z*`yOan)aw@;DZ;pr}y>kdd;SmdP5?A0;}7JP@-6NLk3UNL$l;YdwUGOQW& z`f^Ef1H&1!77DhTjOc%8(oPxU6tZVEC-?PBCA(HjG>)OUNv2Z6Z&Qk|M>rvl`du8} z!ht}F1DUV@LCMm@-?oqFZ!6#3;CVTq)7gCt{N2AJk+geFwwx~M*gA2ikmiIqD^f_n zpm@o$6>IkjiNMa65+?LmRzxvM<9oiBBbIfVEN>OxPH%5LQc-EU?uWnWz%xaNtQF?c zMt*jvhjK#I>~V*vtaMeCI0byEeH4xVJn-u zQ3lLEFr|8zRM?(Z3!s)es}vl4h)nx)k1wkR7NMRumZUUz==F%DloGc<3ev93X|q22 
z*Xsa7;`xpHcF|!lkE(b$f7ZEU4cBarzhPy7?DfdjlRJ*A2H>1*mn%y~@tX*O7xqwX zf2>pu&Vr9_5kA#H&a0=ix9`xk7-3D$7G%M+6|dclSL;`C7)7@D89e?!dg*_pfq!LH zDzIY3m5X+wg^Ai6N8*iFzx%a&EQHBqD;EV0B(AKJ79@-aS zQo9IEm?enzUD2m(rk}F&S}cbx%16S=bIXayosCy--2nl!;9z z(UM4=_i-)!KD!7&Lr)qHx?$);V68H1#@~6vYkGXvwgGpi3TNJWWq&H5nzQ>MTJAy>`%^=>J_1-~ouuW&s6`Xg0x^x_Zl)+ywo!jfDmV$^lY*+*6} zws`WC)_W2H5Bjqp-B|-)|t)m&t1zNzza+i*uS_|zDfrjUeeoiq#vbicD zXcZF4nFP|Yn)CZZ_-ds@SH)NIxOUAQ>j#ZDe~CixE+SUoOPz=lnruv47(d~2{FJL$ z?6s`1R$VG#u>Y%o)=x{(a-2aNc^^8ZyLHB|OWKX!Gd(;XbnP zkY<;W0DxJ3e#GHuXkWkDry5Yo={);k(U7=0L>#n;{En^Q8-Y9O#=RQV`jrFID#BHMi))NBBu#?f!U$`j+PF0aYgRfa7c@urX{+-**T! ziK}`5JFQm8<_NM)+l|``ev~Ei?!QURRyqh6%O-;C!f#K1-wU6Up{CdlpZ) z{=;Hx4lNOHfc|oYhtJLkZ+gwV;4+MyydsW}>6J=c?HFt!Wl~4)aB-V?(#!%-efO;l zaRMXc^on`^?q_A?V#xM+dp`V`378almc>9j2#NkN$^XeC2M)I0=)VsdGThQwMiC+L(1L!hku5pX|r+F!V`{w~5?#|%+=Yv1wqF+hu0eH$Tk3P|eG2CZ} zP9cr6lp1%ZGS#k?GSNC&2xUD?_V5Eq2h$Two4U&svF8Hn&$4cq7QW9eFxyFDN&cIQ z^)F@8zzZNnGy+RgIYQ~aA<@VZxe)IE3#DCR@tRJAugq5Ys_imVibMxn(fOLI84gh_EokYrd$f*Ni<(VNh@v zq&LB4~O;g1c4%Ygm|7uH0( zgRbx)IN1cTQ;QQ@Zhiu8QO<9{-35dw@udCey>8uuO{>1tW%7nUVt5~&Qu3jpEx`5W zVHIYV44ZnsWxFfBViQSRtPOl8_wIr6ft2v9{PF{6$2(}>+ZPT6AT9|!;~~u$^49CU za4jk1|LZ;d2TS3`V^m|)p8d%|euuE*s2TQ2YME@}&Vj1{Kg}5mp81hi8$Xk>NpVNt zQP&kUL`ZdTy_O#a4m`|Qq((zN;hFDvIo?HR^6TrAg59BAG{V|h9-xUU&cYR#i(;?@QPh>?xt4PFlF_t#_s7TmFfWV)Xwzn@Q-Ve7s* z5rQ8ri6RDx13phdVmd#|F7d{H*zui{d_AsG@_f88gvTYI{6ZU*V1Rz}QrY?;&wFF! 
zNpr9ADK3}jA4POb+b*a;T*&gG1Gt0v03pO}6VfT_C+gPbPt7 z&|akR7|w6#=Tb%6H}Oz?w}N-jMrmU6bAy_~bnc`d-iy==gmN`L#xZUUG- zHs}MEd>t^y&(5Xcb-arNjTyo|>oN~H4}enS7~O>U73dvb_H!Tq_GllRZW0`Zg~1cN zkEnk8-kxbaqy6BK7JbLMR{k$#IfibAy znZZmbX2@+V&@V*&^R~6QZnBlVSgrHM{2hQBO4a=5j`>@U;-2t5>LVc718lr@PbG*i zj6B0j^ z3=uM(7)=kq>8xrNgV$b>-6`{>S0XH^`3GW+vJ@o_LQNQcF_iTvb`GZkfM|La zrf-FNM-6ybsVaRGIZqq2vTq)|Q^1~U8QS5GM0ibJ%X-xzVKxX~zaJ%K=t_=WU8uZ~ zQHC_e#SzMlP6Qg$@0rtxJFY`>k z`nF)PSx1~@gN*?`tsaR_1VsVAq}WR7P^7SO9ayA;xD*>#hw2^gGcgyZ2j!VJ3iyi~ zW=w42Rl1^vut&LlZ_3Zm-wQ~X>GD12Rm88mWXsdz_eLG#&Lm2ZvjU1%_o7vE_D586 z%Qh)vJ+@m~OgahVzH10>+!)@w4%w^X#>n0u0(L-^H|M-oG&I_Bbv{$b_Qc)?JRkHB z!hzB94y-kgA+2`_P_6AtWaZ>TfyS~i z3q{+esJ&~EY`0=oCW)6Lg?CG~*#AfeWizrJ4HgZYd#cKKiZbii&$4&IB0 zStT7Z%|o<*6>zMt>S|Zxq6I+|j%lYv8B0eVLg~h>jT(a8hjBxKg8A2uryn8L?E*L( z+OlMKZbBh<~(_=p-PEPLM@sTT%O>e^MV{9u%?%%iD(vx|aUm7TX!Y$TJ zOc_<#39e(3RHWPCCxp&EHo+rikJFF*H?P7aLZaQi`>Yi#k(mjH%Bn(AvH_W(SF zo|*K_rlD2oV?RocEe(uY#J6_C1X@23*4hGgCB8>e{QzquD7m_}t2mqdl^fY}isTS! z9_{j0X=BQX1?AV~e}lgSof`E|gTvMpGKTW?z}x71%xh`njiSnaowA+Jsi~=B zwi{0!1i$ZUbdxcr_HbH&7PO<|9_az>+j*79l5>SCXLDqFgw9q_V8!#4PeJ0+^{|QT zw^@#jr=eY82|;_ziaqNMXu8-()|w?}An7LDNx9qQcr!ox^^-_Xy6|Q_-Yu;8F@P%EMF^ZV(y!9@X8=H|w3IQ^Jigf`Fo1 zs_+w4$aLgqlz95QsH?pD0MT>bsr$1r*>9L~ix+UTaKa#tSCLbq;oy-&*t2rSwJKt% z#H--F*s=2kAbE=9!vMYn8gg;M8)F>S^(wZ*Zc!i6XF0aRUKvjY_ZV zSNz+{cr<)@sY*9rH=9|ga>%zR4-8}nLA~o&kX9m5$^^9^9}0%B5=H2z&$H(FvJge~ z{`5b;E+Q?ZsX}^)s3^>(Y1P*EnKU@9;?tIYYZh)c9=4)8uvdJ;PuM)q)Z`=8DJ|I{ zCcX5C?ej>NqYnwa*+uuhG^&*Lcp(QGBwx0PFCzMjleF35xZ2E)P-grlx@)Vx>8%-~ zzKW(}$W3=!YsQ%eUXYS*bsWffa%3vH{++b#e!A!Eh@-sgv9PX2oY~<@H1a&8;*ztK z-faOvDZ&13^6dF$my^1GD@a1~c6qI(6Welwf7<3f`5#kx>V3bjQz+{$<8qD9&7;+p zo2m42C69b`%3jFuW>&J2r!S&LosdZ=A^ulkpw>bh9$x8EtL_m*<^eQfTWu5z+jmPf z-nt^dfdX!!1;#X0E^iRZWpVu!3|t|=%`_@Cr`8`YMr+IGhlnlgb+=-T>}vdr;@Q34 zGN_lbW*nlsze90faA3|yog@IWK&5M+!lA|c_XQ=5Fzn&z!VU1eV?Gt`6fsnOon(slL;CINp=ZL6q=595@ysNm@ zM?|h(zwi)ZR3uYAs_y;VeW5x@5+_P6F1b<^DIf+%#37X5Np-CccLfA&4puDz*l34K 
z&83P+yYfi=#|x#47OX!uO%p^I!{b^;N=jJY@(V~%%Ul0k^By%Q*hCbFHdk2&aB8#?NZSodE zSlWh*;@2FvhWztrubOGUJi_jQ|v^1EW$3sO-hC#u(M=zPyzc;;J_~|x`Mos3A3L7ZLjA_MnF3# zSP1*WG#qgm7pH&vmGn;jlkTCxC5h-o*l5x1YcK9<@#2n@#)+9#cbD}lwgjdt?w9o{ zXMZ&fr#Ji$(HW!a0dgBhEMf$s@?0UupZ5f13W&rV3-i1OA2!VL-E>|fU^V|$@hscjpigAhS5ldBn)QzQh>*g{ zx}E7NQC66^x0q*RwE2qMi~2A`UNjOf<|IY*=j)%G6d$n_9{u<{VI%}MuE6RgoDbWN`8J@&HeJ|hk7WaVWT|*1dDa8 zT0YP6b>(4*eR^yDD3zqd9vx)^JrTvnv-r+prJ5SefQ5@ts#^GuPimIlVO9T|eu&!_ z&MQ%n0Xr8w+pFKA*YG!qB0--er|DEX!k6zcZYqX*OwOL^h8}vG`rIo!-vw9$pYpg~ zo`Fz+K9g$Vc5J2B7thyEeX79ld}#fWyIY|TvoYo_uHdNH7gqYV{YiPo(P;b8%`+Tb z-CB)vtiT{virx8oh!FG=~VpcSSHM4l$t2+;jxwBM`_$Hx?iPgtFVPLQ;~KU)Nj%qJ+wAAjQOc2N z4cI{27q3usNNKwf7OR|TI{HEU+e>exUvKphIIUmLgW(iXXR9+_}0?Z#|G?|PQQbw2p|L}f@ zlxYU%7n^1grBxH%D!iuYnv-wX8D+_*TAzR;Gko{*w)66NSF+}MY|F!k`ccS5%^Mk+p?s&6_@w% zhaMYu{TI!vPOeqN|IZ+^t@2s7bYf2-t@^!|G2Bn7$me%Yzr(f|l>ett%4^^6?!Evb zZXU;$uU;!Uj^rc%g!9=pCv4|o;iNpyyNG=L_Sn|$i~;vBai24McA*gtdoO!x?9%fz zH?iz`o&2I(VCPwO%T*KdwDp!V**JynGJ(`M_ZOWk#=Y){kDBbesLR+vQCPmyO9*Ps zHXr8sq}pC*Zt!<0GTu7vxTpUrvjsnbAOi}Apf>U3g1(9vay%?*icq`yIkoqqHq2$t zGz$^heMx7?_&%?Hj3zkj#J)!=#NaU)65k$WawV3LgjEkGFo9F8F{QAYLtDTC9(phJ zIvvT$#$kzwI7lP+mrpcuR^U`3%!7!%>y!~@?xC22i0##17So3sx>eg)jt=zCu*9z= z)heP%xQZmHdsifW7Wq$E(@_IOMW|MVqkIQHZu}(KvFc0Px-XML+KoV!#<8@FmII#7AlZyb21E%j&1Ok% z2On|Bfy@!~C#z}Wg&>si(eMwCxxGma|LCNb(mhaPE>b|~=_a0pVAezY+{t$5QrBIC z8)b5@5)vT8s#Y|3hWeb3vI2F`jY;Q~6TY!XVcdjT7$`FKhUXFghs7(oLF|o-X-?QP z9OC%2zxn(b772&Zl#U(RUp>!Zt;O;#3626i-adDRqe6mWM8j|m)3%=3L z5H+(Y=H^tFe`%=k{o{v7KX0*4?jEl}NLw#e&^y#K8am{rs$)@F6g9e#GgRq^DaFLA ztlL_Q;KJjBEK24?kLTjSDu0g3^oF-w4(J>JU}iC8!j=lshSuHg8q#T$M&n(h2Lc$TAQV8C)vD z!<>89AnmDuP$3a9L>_w)G1O3xW08*P?qpR0DFUGt^<)d7dR5pf6A~e{!1*8t^p4XP zU6WgtS6~s>o+m|=?(!wcL*B|ix93I~^5TtHI#bWJoDRIMq%)7>W;?aA^o8{d(hvu{FzlrFZORfqf zg9+(J#_acRy}ptytIs;!uIN=wuN_jO(1V4znZ^*LOZGa*n2F94)3@zeu&2PezTKd( z;j6Z1g*~CdJI4Lj^y>95PmQ^0mBx26bEb-H6+nwU7WVCZXLfJt|A(86dtZQBMzzrP z3eW?cg8ykUQvC1{j-{eslD)d?PauC0`nA>{7OlO1wz-Lzfbe&5HxITdJg`>)K}5?O 
zT7?i0hJGkp@AKC4VMuYm>@FBQ6}QhMB0i`dtLdT%iPdTnLm{2168|r&TgL!;si(KR zVhXALRIdt&1StxMX2WpLI+FWEPQ;&UDsq$QNjs2oDY4>BClN^d2uryc`JLk64;|*@C{kAE45}GA z1k=IPuwCW#{61VDY+>CMZ!7d_qPQLOhsP1XDVQFso;QLYEps|K_|LGw|S#FFIjNxsukes zek#aL;`CC#QFUJ&I)p$#W%4`tJjaG?V}%Xt7J(HKrKt#p*XR{Q&{&o8GbNlp#(RoD zbug+R$F_v9sFG@dr@%lDkw;$emNO0tdo;w{Ex=VB{2{2zD4JHH=4hJv=Cl%XnptJv zhPUHa?xx8!x3~mf{@BFi33V>WTHHZ0-UJVPEa0#3DtKQK1HOWcz@NRq~s5+-SI}IP!=-% zD40Y`xugXN3&jm7FXUEJa0w{w=?h37fP8A@NEhhRq0NYg3Uv_y=u2-XdtE_Y#^0-k zopq;=-n`37_gBW}^%z~DkNyGE8p)46N_E39yL|Ok*3)FK=jYf%-ikIctk;Z~96U@7nh7DK$1wld zx!dLZMADHqadrYa%7sIm5C7UthHO1^;esdW0Z9+2PRPTK<{e472mo}u~rop7~xol`M0Ino^gqO#tw-TlAW z9SVgVHnrmc>zhCc*wWxqvDo$g%MD&n%xSe8K-d%6&=MUUS2*FJ8NyZ(i0|j2K(w`; zT2Yaa+&Gdh@a0-jq4XD_l^n9IsN2Ol_j_EW=(?5gG*_dwSk6^?*=PyN$yVN%`b9c5 zR_#O`?6|SaLc)Z-&XSBZdf)r;VBWllZgPA)eevgP>kEVz1rp8U3^_w@-zQUf$@|;dz|y~D@+?hAj(0;Pmzlfy3)VbPD6p4`;w2ck5ASG&bzM&3cwWw;J%-z?nRwL? z&)4H;ki*)dT(Pkqtc_@i|=BGo2DWbz6z zz)L8Cjy?-Zj&2?USDo1!i6*==@#rj>=%4Tq&$|)P%2h_$8!WXXkWy$3I$y=H4>yO-u085|%ZLv!zEfVl6DR%_Cx4E#P}m|67g^k$NS6 z1_vn6y0+y=Fy6cg?e?fv>WEpV?QsFVZpBZ;QjR&2Ny%*{4X?ne|Ka-YpF}MnI+UE- zd}>6A`Xa(5GU-o(+c&Q-OfVaBM!^Q<;H&A22Xkp}OT07BXt#MUF*^nJ+PWYkry&05 zb6X_(h{DT@m)B2K`2aHK02dfi_t0zR8L)d`0OM6n9j~9>k(?os%VbaKD zVk#Df;!0BPn~5RLVdIs_C>iGD~+h!O|8D2jjS&FYHPu+pjTrY zz@ta5Mx9p}}D;~WaVbnlle}DV}vHorsfJ^7Onl%Z>Y}ZjJIjH`?ame5D z$vqlC+)>kKo9~ek_8ox3~D62IpQpdg1sibMDqlMxwNq<|Nv@abt2AIb!>|!Iv zC_9$?dDKHMq5jU(JVO&fxc)n;-t5{uOZSNbshIZ8w=I;W@k^>HHLb~)U2rETID($_Z z7y~s1V>`Ljz@hJpl~7@Z<;XO%6ztUMDl7-n+T@ciqFxa({E_5W(ilJS>7tW)lE$y^ ziGVto?=s8rv|b-_|2^}|4{x`8d=8uRi##SKKD;**Q>!6-mrp^%cHsD=-aNjut=9Kb z8O`ZlH`BPZaigel#r$xU^jQZNrQ~gzUgn-p9JQwKs4*+?yIfL6tcfs|FE@NpHe+L^ zc&Il6@R&Pvgij)QEOxCD&L*$E|5#8Af6su$t4Qv9g*2dmNBHZ5v)lHquakk^&J&q? 
z6Ud4{i_+mM4Q@8q z$GKKpnR=5hl=}jn5_esi^Q2I9ACqsjVG&O9ye=EGg}-K~r4Fo6hfVBognic&q>n|` zXkqpfF-71&?Iw^8;@=gvFcH79+gB-xjsThv50#=_XXU%rD`7Z*8kSiIvp0kqJRvr@jmduK^dXGI^UtSl<^l7Gf4D)5@u!HET!2*8z1_-+4b7 ztyu>)DwZAJn4c7Ud4)kMVgqOYQU}hD>a#3u=j+UTE`iqVoQT%n(d*yj8#N9+Cp2pB zGUgWQB$A!~CkvoNOIj?S^ltO0{}6vSeTj)EQA!S3+ai)O4r;qCMvX%2$Z#87t9Dd#F=0YU#Y&J)9S`nmL zIn_h1eS@QnWXHLk=~OfAHI}t>BaH1%aR5iQWOg~k?Ftyep;LBmnCvgaQ&^y-oz^{d zJDmUI4A`e#n&gb#(D6 zZibavx;EDZGLJpk?Uf06{hDo4N0Oy{F-D1kvXJ+4&S-6Q`D$7flDt=|XxN=qe*5+7 zwYH|5MW);znN)Ce3-SQvNU?&YxOT5?Gz36p;+o~(icTTg)OcXtml+?H$o(Ck;JuJ! zn25sc`}Jp{N$mC&T;^kRCn9*(6y!mp>~U%c*f!Ycn6sdt+8#L>~%+OKVQ*tm%( zZMI9*f@EKejgjTRHnj4f)8?9$yTa+46ylHU$Eqhk0VPU((drnqiOW}MQ0*|c=Kk?| zHBvBddd6ZjwC`{8TO+_NxijDze?5*}LA*tvc^{Y!4|{}GNYb%!{F<@9*VQ-JkE_kud(vOBsl7)canclg+M(ao!>hltr&hzP zM(szqcw6Mj$)|U%9Syzh{CmqyyIHjp&UdLN$7{lAtKe^FxM6W=%9CBL;rqrxgzu_P zI$`^e`#k4l5J7f%(U%`@O?z*uc=n|TC0gtn>gS%K9HAv9>Gx8h2jp|{UG0~~h?_E6 z{m$JC8rQ|%^~refd3Qf%V)A3kZNmu8eL9uBbXHW}Ia~$ankdR9)s)4cFwo=o^56!a zD4oxy6)c<#1jXnYY*@HA=6_|tLas~;$t_O=GkM_Z86$$l)Iau*+PE} z`1!>Na;Dz|Y*a022&|G4tLlGzP=c38h*cT=G>F$(KR}*7ywf#sYHVLIw<-6~zmr8K zJ7phx@3`vKht2Fk!lj8F-Dgtp+-%~==&boyYF!NudT=7mn5%D$3U-}dHm1R{;rpIC zmegf*;7O344svMnZ(V@JZ=N!;aWw0K4g^o}QEAfL#As$;(nc@damk-@%#;RtxRE_i z?IGD`jAu8(HM>8Y0+*guk?E0XqwQJA_{ud=^;1BzHQNtnUUc#(oyn~QyalnquY&6$ z2hiG_ojd?~K$<;&3K9!n-dEdCb;f@YFa>{N{*K^J>dvX{cE0k!siVJ)V@pfA9o>Uh zvuGxaDov$FSP7cmJH#L4BeEBS(~T?^Rj?0@6DX5D)=9*k*@3Zz=YoDFgGq+29>SV+eqxbgnwHHxOL5+~9?^(OI zY5L(+(aP>oOLz~dFid^!ZIq}DNZ)1in+Mu4Ioq=+gXoJd`9gg@i{$IG|Cp#SUqJi< zfz`e4P4tL*$g#R$XD>9Yvi?*c0W^kw;b>Lx5!99%EzuDql~u(p9gRA@M<%FRu*j+? 
zxX;fUqhdQvFAb_WRR_nV@8UaIwaY`YC-$#QtyZjEfMB=m_Xlb_WZ=XkMt2>am_Emr zlTn^BL#4iVCV-XC(A3?^pf+G|Bp02X53r!sD}y9)7MoSSn^ImswP1^hj?eF43#;=S zF6L4bK=x_=NH7Q`X*~X1hmo(ST(Y<&KD4kC+%>lr`vLTtC?XSs+b-8UXtP@XjIdmyY z&&nl)GwMcys_UZ;wMj7hy*1>PZnas}GbP~>C+0E;*>mGHZMU*Wa+SufCCrOa9{c;a ztv6SO!0W=WNPXycg&L;m{k~@a%k@3DBHd;{La#I6@40pzrYx4(v~si0;khWo`u3^d z58Nd6fm`w7gDswU2!8lG`jXL@`@8o(eqNnUx;`Qv1m{{EXTk8%cl|Wi>}AKfsBC6F zoE@2yrU*R=xIVIdKQnI#z!xp@ZrC6}SIr|RA}MLRXUl0)HkBaX z?!!wvgB}ahV@jh=7Lt0?x@UUFOD+H5k_c9xcY{-4RY*VVF(-QW89$63>D4grryR`> zgFEYyzgCn@k=FXpDg3JArXfQ|g%%H@W6%FqidD!&!ZKuUAKKBW1W68hgF^1a;*ILK z#hEp{U$U!VY6!F|nB8}K1~9jC@S%tyqk2kb7eL4QV$-Ynq?_xrgl@T@D@WEvh+}L< zsawU&9{Un+-rOv20xYh`DA~73+#9{KWgO|)YUr@xooH4vbh{ym-6(cz0lcI0+wl^Z z`L+s1WL$ApqDHX^IA+6j(0oHa?f$fAx7*LEBBF>dv_d}4S@BUN`Jv${RxSOd)8``v z0A|$@XNzZw8Vrx^5ORqtBF28ic~?%G<*Uf@a|ZKeCWoV6>|&Qa@S7iq7%uye=H4Fn z#FkEZ9G+8IOQVwea9-0RDMoIf}f|*2EDX63$)-azd<(CZ9mU~rdX6qUl{e|pk zh<{E^lEqVAjXY#$6@K6^xMGOYIFW9Gx5iiNkl;R7!eC&P>GnjXaMsYHx5ZF( z=bEN?J3r43)9nF${j8y0EF^E`gN6DL^J@X)Ko3v}DT_K4P0vfj=pewuASc{5!>l7( zPktqFx3Wc*e7gG5vG+hrFByM}hS%@gn_+dg+EV1ak*GL&n}NRKP=ve9jNj*E^1fQP zVwVnuLIN}4eH=|Pyg%}CWO&i(93)hdGbCcM;wJ;Ry{=fs6GKBBG;28}4RdQj$no>z zm$JG!xP*1PyF=9oSK>ez=k5(}D~f=J5mbOwU;4@hvl||*w_k}rT43SO$tYIfR#>KG zo<|?k02cQxyD~Sw-Hp%)fp){Peyy0H3!*s3N$9tSjPamRJ=#!f;KN|M={lk*g2qNi_TjMq^fL^$(7T zDNUZ?kG7F>YaQuPiAl(-{Bu#5lhA700)C@@Cmb>~WzPPhfnXIqYE0UP&M{bL066&Q ze=KxPG=|qdfad2-LrD6Ve!dPxw`9YQ!i%#m6BCfs|D+^>S3f7b*{M?c{LrT?dUQ$?#&1N-AS2j)z`FUB@B1Mi4YYG8_t>hap+30(Y`^kuD+RYSa2_Jj-Hcau z?-BmiyiwWLZq0XQRAg~?_X`5Ety;7xW0FQlgac|7xfwwXSL51-TE2qNqr>BOB>Q$N zXmi-SPNjkd(-S-U?>1QSPMDf`RgoZJ-3a{k^>ezar3gg4-q&{DFoDhSxhn$a7O9a! 
z$k^rZaG9iQB z=Akyh{vI=Vj*cz0&W4*HuanWS8g4`1iG0&9l_!-#rtAcygGjf1;gjj#FaEpa0iSvkzQxO+C@Tmlb}6Q;TYS`oF|z}zsA*+|)C>gL zvHa(yEM&q9=>60$4zxI}$vu@gKX3?ca-6krynAz2T4L}mrqMIcjdAU!53dtlPV>h- z@K^)*ORG=KqFjAAdC?MGjlR!kI<+Un2{K8_$#)S5agMGf+X@xN{jmLqjDV@&vko`O zA1EWvp!n6VP_shDWk5NYW`~-LoE_(kz-*e9Y2zpBCZ&1|i8GAJ)kyrJhqBkzMGm;> z*u$p3RY31ZJu-I%Fkjo6+BbIb^8a-fyXWwbq41N*^R;GVc5EVpxQyCYGnE&ubw6A< zDk=l6Ez#$Q(D9(xZlWw|MPQ)-p@#k`sAA7n^75-i_M6|gMR~Ys^dNTGlIbdEHz}}l zc0{!|MOQu41Mq@gI_LJmMb%w@P^#)(+4B}JbrkRX92|2=fbUV4EzhBM)4=H>HEZ&jDA3hK%X9+)xttWs z1aNPn4XoN0Z~TU@`fpd0taFzyF=NOW^684L_?O9}OPd}-Z`PkB$N!?y#UW3NL66OD~q zeXr>9c*~wK#>^X)Qlc@|EEHrL!%a!7!SvX{x4BIdxH;>J(a)lRqk;pZ6Uu7uYE!%Nr9JGs9yzS| zad%J)Hj(h7fmN@(tMMbqtjv~|soYP=%diG7L2L(eZ3kJ;;K8AR>PNn^hbGlKp`HKv zZ`l}+u0ZDEkqFy{ul?w2HNklA;sYEtL0W2BFEe;JSk*G1)6w&* zpObrD-JU+xFQYiF1UYCwT$zf`(zi)Z69%J zJbpg6OW)oMxvTD2+-wq_hcC783kG>Se+jXAe*ULUx6Go;XAH&0i1yut5Q>I+_g^t> zB`yy;OrWMoNvqMaTxcA*r*W3_vWoHY&j3^3zWN(FY_jH~gTf{kZDCPH-u$91})lB+`r>TogfIh=F`h$^vY%2IkPSf!v~hnG0ZU@GZ$`> z28piMtmhEF64vOc%{%iKxQY<^25*P>HvL5R>JO`(U4Q9rArXs?Qbez7TGhlT!a05!c!QM?u#}%IBjMvOg zrBESfrq1X=P%p=a_Mul@5ML+a;m`RYE}-*4%M#zqfYVxH%jwXMQYj$N1$;3&BeHo| zayY_aLCry|-}6)YL1u+6N{0uqkpc_(FVawG=bdhmNZg$kIqo2p&Dh|MNCI|gEh%RM z!A(5&q-~~GvI$RBs;Zo3R)ll?=jT`snNJ=W*nV_$n7u-D^zj?WD?Jn;ly8vSzdge* zhlIvA!8FNLxiB=*hzNH!aT&1vBuYnEM6dLPj0PG0C+lFFc}$B_PUc(h(sj#^HfH!3+8c#XcLSi9MC`M$d5F~}c3am)?q;x9k1#- z!dQ%(ad<}hVhDqNHA{^GXYYNQtMonC5`n4tw4)6BVlC+HcpBX9mSexi1&8EwJjxAl zBiQ1f9cBo#&ku1UtZKkrbFCOzFE8UZDq*}##eS^w9$WBiXBp(-5S8d<<=|IU@X_=M zpWyi)+EO+D{wqL5b`H>fMk_)P(FA+I7Y0hXG7EzKi``L0<7F)u^=Lj3cIwaRahgW6 zwM)Ym)DA^Z(b>pX)#GI+8qDh8Yv({doTw_I>rsRD~ZR-YEC_hj+wl;anhf z_Hr0%pXLT(JT3^S?W{lsgoIm3z37La53YmH59EhvpYWVq*iqz%g}p5@^D)0u#!x6t zDy2D^qRw7dIqh|0drH^M2hbctY@WeBTA%tI%!Cjx|HiJ+HJ`MShEapV!@z7WNMH;U zy|SDy0ZHQxOJm}^Up~fva*Y1{yz{Ql!W9%F|8w$n_mA^j7e|g*{A$4m)f!Ts`*InH z5U*8qeT&f}7s#6kBpM4|*aHN6YDMSvlsE4_-BdRtO&mF0y%5gS@_2bGhhSZHSf2lj zb-b1gUrVAAJg$+C|EaN*W75`*a?KjtpA(55e0^<~#N=LT9GzbFV$FI_C(~DA 
zNf(tLJH!}u$i(U|Dup@i3SugT?>U!p851WxaTbuag0S9^60_K0vW}*hzC{haFvus~ zjfN)8B(Mq6v4ECFubumZOVZGDW19;ugTPx|)g6K;^e03lDnDDc&ddFgXU3^0(CCn1 z!|}9S!iN(xY}+LdCP5U+l#|I4GzQ$*$+n{XgDdqyE+j=tR|N+VxwNw<=Dqr`xw#O& z>Y1(8^16Jwba|IKUfSU7KO-L0>Z$JAxmO`ETxkkZA#vedpRvGl0&@Q753_VQg1j5t zBWey>lGg~G_)iBE9J-yNOUz68`Ix?cST?U>Ul|>4x{?5e=xxWs0+miRRgUD! z)1)r*Gu|fhOl74wA}qq5j+{%d=0=Y+8^2)!Mx%(<^)^2<-lKXF41BL#m{)4T6H&jz zd4C7`W5^4M96#AbBmFESL!(*?54b>2yPAfF!?s&GyN(e|g?6X6u4WIf+%uWK7eJm< zEmT!@Mj>8(a?vCaJjW0xwz$~3lmbGAPGs=Q;oM2RN_-l{ZRs{KoZza!QgePqdcDkMM0t^sp>k2)%}04w0J%@c{4l{#dJe2;gPN*(Rzrz8BHC$nBw z8waJm8mTR{pyx(TyW;LfFFEH?DS+oxZ!S)_v}I-bR7x`5zwrTrLAU$>*fBb@=4ADF z+!Se#uxc4Mh6JQT&LPPAXaqDoGp00f30dDRIq=os`!T5|LbDVbLr-@d> z9IrJrWt+&#!Er#@52Lp8G_K zO-99nMkOK!-??RU%GHxFp3~f_nH`qCJSt%b zy<+}~qd2*CaMn>%uns_ug9eV#+u5x?4v$;F1DK-LYNSn!?>rQ8iif%ZK!%pi@V}O4 zMcdQJ5b_Lic;)C0-;5P(@nfnkMn(tsW^mw=yrA{=AKJ}^D#ptig@M2hVUIs$ookIq zMKbm_AHZT^^Dux*@$a`?=d`$V$*3^(nv0-}RDA<#df#vllEHYuzuRNc-a2r3t{ym* z_K6s)3gIQ^lf)f-f>6$lwX zsl#LZ2BQwadg3WaWBv;_66iWCS;?gURju&amb|L*^!rbO`?30*K2W+jNbWOYyk;9`%jfw>6xI8h8djf-A@*G~hmXjtZVKt! 
zDN(G4G1Qr9M^9Ef!Jsd?>whx!^HF1pl7~-bm10Ub%eZHsnsgOgbJmmj3vI#!GF>}*BKtYNit=!Ia5imS)aV1pXyQ+0 z>wM{=KWEp9K!#w#^SjNV_i!W2BqVzpI}WWq`a0e)dt^@M122GI%Y*prZM8GBvRBWDigS-jF}dIp0}%fkBT3$ zw?_N2OIGA}tcA_(E))!U5rZQqKZ+2wriNvyqWlCC&gnOV6psi1>bLrx(hlf$M!0U* zX}2b7V_bR^T;UdWI(vRWky#1c-4jCJ6n1nK0yoK-kP|*@cPk{LA-OH%Yrh)+lEY3i zpakI^nm_%jW@|Spxv3q;wEPhNoJk`GWf@iA(1)*yaf@)(B2OOBcg5FZeUBU*`P;la zr%>A^0^V=crhJ?C042c}I*EXr2F^A~$WS)K=NFrM!p2G|JL*??=SVh2;wilO?PC?@ z_#vceAg{dUe38EZMk)jPAWTD}a?n*?Y;D`v*DvdxMA~sDkX+=;VP1`_m`z*Em1OO9 z$T|ek(_5mpDxBi6NBato&(>%jP9FD%?Oz7-Sy`@Q2DwQC-JfZ?zUWx1p3>-R+4!LG zX5=YA8SByrT(rLDfyrG;3~&73X3L-`51T~GJHrQ2 z_*2?Bkj9>4WJApfTpN^$QF{C?a$U^y9}vZ)C4)u%c4%6|Tt5<*Z=1Arre8^lp`3@s z>PgP(Zng72nW@|ZanxfDnh z(r~+FTG!y9>%mdJ#T`LpnV{8QZE;B_=5kh(2o@L5T9U-%eDDSI{B52Zs|kyxd%7Fi zvt=B7F4h{dj?agRBVx%E*6een+r0_mykb(s+7#|i(hzIAxMIM*kP5H<2*G$f|01Wm zGK|l!V~6-N2<^kD<`QFi(PL7ZOTvufW8tZSPG>-S^(o_u`OwC$4ACf(bLLBgz7yUV zEADw#>3=iv=pmz|MzIPSb$C-PW9H7qAM_Z+Gj*TFY$Z73JvbYBqKRa}@66T7aC2_D zOhhO*!AN$p21Jt$r~xA7OXb|kUeU(1JclUxo!p4a z!V5dI#}R$m!RNE*{_WvEIf97%#lHWztRnvVi9$LP%c5m?FbpUi>^Ca~Wx{$&*fi!A zYpY*_-{dgI;N$9(NbSz3AkYSPCRd5bD?#QNtXfNSLOw)g|BE+R_h1S&clvoyF@uUl zsuA?n;CF%S74CQ^y4AG9*sYk0nJ~)oLi=h3%2E7bT=hM3I@g?$SnJ80p`_bk<1IKO zes|N3(yd5;nsHH9P2VjmM^bdCNgYFwc^q3R7$7KQa;zVmC^GV!?# z+hn$pZGSdoPT0YVk$9P{M`1IE0|0L7V02Y*Fb$ehVgJAbnqe=cvQmlE zUo9hlOWr;nGfKVm4QLQ($^} zGC4l&tts3|1js_%?xH3^U&+o8gO7o`<1c%qv^0(;P*)c1^s1g^+LGF=lVw(7O`!d( z3l6x{quN=;f-n?y4&woHG6ugU+GTgFciy}wLg%dBx0%Z zX;2^RnuX0ADN@={3Wyk$u04aMBTCgGl>y@RG@KLps3S!ciatD5>u{IMV<3zYiX7s; z3m3Jvzhw^P)cHm&ar1{VQ+ZUpgB6-mQ%2^H34Lxu0MBC$0N&OBBT~_@Y`Kw_wk$-d zL7!KOK?15LM><27`>V^c$!OnKMS4Rc@P$%j?4F^_?GCgz7IcNDLY}3E+nXjpIIVgw z7*0VcFK@evW~=Ias_U4JPyWZ<|F7?>7d|V3;>v#7=VXpSv_TGN8y5Qu>&(er8@_@M zIFVL0;|RE7bphbhb(f92RN!IkUrS&P|J=LPXiLfuKQ#sdHY<}sWCvSjQ$?b(sHM2U z7;V|_r?E?ViO=i>>WVr78>YAQ3vbI2OEpNkdobS^0CE{J1igg;G&$$k-}7S^v*q2h z*@Q|Z%j(O#<&ygP6w8O?rOC&wy+UbRggdbIGkQyg6v(;U`MC0=`2*}-=LPN}Y$d!e 
z$nwoSR?^h}4i;v~6GT2AlaBW1)rcb7(x&oY(&N$jByc}{+u-_(U`qn~P^Vz((^2{3 z1UBO+lic&Hw3ptRz2}II0prLrJ$_v#MtOqjhud66PnDeBaT_Lb1-JoOCPL&w7I$ktesmms&oYhSA&>AWX>tlf}@=1eWuNqg>Mq}bna(`3P9AeDe-T3U~)}X z>!pR#gPiD!>|6=acXRNWxg^rciuW}}HDoF=;4VWX!!Tyg7yW%^q5pD5Ar#1k-U|#0MB2Q;+ddJg2ZJrnQWMIIWr4?ZDO4mrr2eav&D8Wh4l}@ z0Ya#P0YDA-=Jc9zvj)A9D1E&pVq;y8>ot!Sd#4_D*3YzKj2^x|t#*32pxV}cqNtZQ zf+(zOQ-E5aDreYWyp+y$=eS;zsHI51`1#!%*7iM!gvI(MKNDuTE@J6$TWk-Cu|Rrs zy^$awS!5{eY?JfG9JCq!<9yepzB6agD?teHb&`(1W1qR7v>E@R`rad!6bf?ThKQ!7 zC*^$4>S`N54i$174>I>2^wrN&3jy2oewWgao{FKS5`P?9Hc)n1)632?EN|5qK9#vb z*&+5or&uz#Q;(j8B2%uMD7@LPE#1c_%r78^W&-!e>sOU?Uz6sy7b3Kuj+N8#3RA#W z_6DL*oT=FskKrIY9DJWl!x8PMOuLcns6s=IuJ=bWWd09+fIR>n(*r4e+i_TQ9MR!Kfv|Do+tGO(}hsj)Oe)P(D*g^JM z_g3WG&j6gTMq=h91W}s3F&wO?fT!7}hv7x^xo9SAYAaCZ*M$l0*(+5T4puf%mH=if z<_2;=1?qgGUUkW8S@9qBCFqOL~tcB6B!zWj~>&6JQ7x zGp0TfF-8LM@L`O#`w| z!K?W6ew(K~sROY$Qpfu~%Cyk0i!Oqz$sr^9W&g9CT%0c#y%u)E?a(m#*tkn>j(LUh z5QuD843}y@IZL9eh2@a~X3F&lKWaw*$7T;@;cYfx2Y`gs2nU6aW_2W@Nm+YYm6dNw zwPl9+!#3E2r@LLIHa!uI@67WwYyvrbrVf}Ps(X`M?bm$~hzx#rC*TJ{#~`jjHo?p9 zxv2gRuMx1q(04n9AGh?;8+nMl@gHM|uBD$h*^FgnF)HUhtab>PEM}zcb$sSX(mXRG zJkSdrQZ=Cc11%nR0zqvTo;7({K0WG}N`dgw=jC@`2$kLkDPtdS7jRXwv^IAdjchj- zMl#)Z=G_#5x2&l?XX%2#{b?wy1$eid-_htleJV>R66-{VKxh|i(V$nR844K%Go8F_ zK|^$nW5_Z~8P5+*E$v9+4+`qv6WSXzxPO~5Iy z?^yo9HL1lBw*zEgM|P*!vIE;D1-oLkuU13uM;sU%^%O(RG%cg;ZyhovtnL!Sy>3{0 zhHL^hhDF_WB5f9jPw94^9CwZ`&ht$lUlHh3GdX;nE0e3DyiDiOhFpsuqk5(>z7)p| zlO|z+U1nu~(YLs?1yqpsMImv1Z!(S+8eNsoHzZ#ia@Xq9QSnc>KKB-f~j*i5PATng6C8LwS?`rdHeWD zvJg~0Bw;pk#O@V+-?xe~(e^7p<>Yd($D;t10f*O&?Uv7NwqhTOE?h`4y8aF)QiFN- z*=wrS7T6}fB8u8MGzM;b=zj6OgC*Q#>pyg`{56KRZ}XPk$IheVy|1E?!NxH;%BT3A zD}pwrnN*0%(L+r06Yir-wwfuZ#V+6E45~~nUQa$-?6Y^$s4kln+3<+AMB*P;O$|TbLlWEL{I!j(o|*o*5nLo+Q)hC5?UOhG(p{%P z4EBHJL4TJDv7OYU2j`uIA@5`M-O>nQX6ZQE8hTtx&ZDH2Xy4Kv)^SF*qh9w)&$oIQ zXf(n=ug%$hFoR02V_8E|a7y_4CbaTr4Jl$;^UF&6g*}{ZxI$WLj;?B`y zs7NX1H0nVY*ALEHwC*m(xORN=HciDn2b(UrnfG(-E53M3^7g5UH0@#MKu79&SRZu}|2huk!1pO5X7#ox&{>HYrN-!Cq*+R6@@eRjUdB%@VgwQ)Oz 
zPpFY(Bmuv{jlmSwNE%|6aoLyMm@4b0tuW**6h4v{c9mEje7Ev8_oAG6BB9O*93&$( z$_YI1k-6P!I>V#v9oYsnQMK*ro39~|EBS3486?fAPxG!P_74jM|G#fd)}@t9cFTbzTyF3 zMLr}vIC#c>KZ7l>I~qxaQ7xl3Dey3sr#JJ;+5!PzT!!??a3}IV*|159C7CBzX zW$wt{ierGz*E(KygLk|Ppl#+DTmShpx6;RSeCJ)UOgp@k;-0~{Oy1M?qbnG#~LXdVPf zb+X%r#pH@T<8s)P6{B7@84gFKC(8Dy78J1Z9SOUt!*R}dlPYs|ZFvT7R6k}5ka0@B zkfK_ZL^THZylA}C6Ged;c5JpP*dB?gPP_oj{RnP1a-s57Ub7CAX)a*N^~fQTz*z<$ zMo0h30(e%T$QSp9U_v!N&WJcail6EX{KppMV|dUwddV2l2e?rOrR#eYDzU9kP)sAH zFytGCt8*@mf0QR@GiEGTsUa~WLmR+*%+7^P#Y(=A@YXrodHYGEnv(H-vlTHH#Rtk* z#nRAo&=&-dp`mTAX!%H=E7stfVr{1K6MXnf=s8m$a4I<%3HC#P3U9G#pdT;@>NGtS zL0gDuz)S)_9%`R2jsxFom5&N@mfc994+-D{>iyhQn)RK|_!+`yv0rf6KQNLGa~cEu z(wWWjBonOfWz1T#Z%g%@AHV09Z(nu_C{dQx`T~AB1d9`xAdN+|{N_}f+`Tk0-zU4K z1goi(l0R|{PlZn$U6@+#Gpp&=P+^i2P8QSBs?0v-S|?ezr$T-nBTvmKi{ogZmrqdb5cqHyqF?r7Ubo#^>uf6ctLu>Q6^fw#j@^Nx zsux&JC0ge#w-PySROzlTg!FVx%M_;9y3y@GFIOO@%eBu_fyjAydExDYxq0Kq1y8o4 zWbd$#k%OGv@71mpt`spU+egq2BHBvK#;@e{N;0?cQzGzF$(EOMsQ1uw%&&=Ol5f6F zih>*8bRCBgSG^D>cz7a)-mW<@MmM0ttZ5RS@zEs(PhAha^AWYzZ$^Y&8cO;(%H_8+ zp*_OZH>8abqc@%zaa%W@zOOr1E|UX*hsP*O)EfH2Euz?h@0=HM)&hqCP_q}`e-NhR zn!ua9jZdA-2-v@(&SQrL&udSc>S`?%&uH49%_EL#>MBm(wj>v+Pw|^S*Y#HXqPThv zFxYmjUqhYzSDUckSd9Yz)q6@dL7)74X}|!Q~mJibtF-w(pguVsnc}#?2UX z7VAe0iz#{T`@YRlV>fBu2}PyelO?2l-Rdg({aHm1s?7aYwZ=Ha zty7Ou{gGW2xjv?2*;TP?!a?^X?iuuL&7&dR0h5Nltv4~ZI+Ge4ZJ4*D$71dNdFVzZ zMvo=z=NV^KaYVXf?$=Ygx<584tHK?ngOgD^k0IK4063C_6Id>1Rc!ZFZ4bNw82JDbu_ML` zF%j!ZU`%7bUR6?aJDD`YKlPamrjN3N^92A;t>o=JaO)#;-|ptek&5om?ALMt&CxE! 
zBN9U~Z>-(+wFUtK;wbti_s3|)OoWZrlbjiOzIV$*G!)bE3mH?_u5RfZDAtOo2r4+JV88WAU;S6~YyKB25CVGht1&x=HTnSvpJO)ElLtvtqT5FL#6v|^J zaUpm*o}c_%-T!z2>U~So6X4ev|M2wD(#M>62)E|T8N>lUcPmjkMC8R@Wmg-kn49Y` z>GF_oWJfRQle@Hg=W&h&GI8 zjTc4m^Qfrya{9-nR%~2dM1{q=zn%4J>9Tr~w*6oRbcVstU^QQkpAdXXTbe8uSgI1O zGD-}r0RIptEI%ON+V}0p7fUrtQFtJ#?BPmIe%D3@^ ze^zmUuI^?>>hy)X}TQnSDnCn^M?FCZOqvJ6srbri6$uzbQ$Oz~7V&(q4D`@aedF@sTKnTB1W|*|!-Db@`lD)6G~d&J^ptgy&%L;Y0SOy1KKn;W*<_s_y%8A| ztviec#I^K%=}>eo;k+j#E~-#no=2Idp2p{81fDjpPIB!} zAHQcJdl8#P8`KdZH=H|pPa`lh$rVuZuIRsufgarbk{%PX>_d!h1k4i=Zi4}`9OU8M zd&|734yX4NUwo$^Q8$c#{Ui@_wY*_`M|C!TRaJQGtr(zUdQPQ<=mB zIr)iDk7!6iFVnvt(>L6ktAOl>Ysk5no0&ZiR32q<_v`1X?ZEb_IWI#hscVi{F<;FG zw-#CujD1e94-lQh_s)J!93NqJYHWzM*Xd{Cb}fOEJv5=p78MRkBC~yY#$Ap4j_Q4} zSBQN3-D>PXoGj+*++ot)Nb(ZkWtQdU>ke9xsDX;0c?j+)ckwL^AKKP%@!6o!zjo07 z_BsH-6^EF~!@$7?e1iN5Vq6kp1ICYEwaIYIwD!2FT42Ff*jl?)3$L<=nDqL~#N5xt z(?gj9WZKf5)^Rv>2L81q?_N8>P(54pLVa`Pr|I1ZKC{MU#)PL2%TKH?ZrL5;f5?HaV+^o^Wsk+Ch&#U8YNGcxqndSxoOe))A|bjIP;U*c#@b#z`pU$rZsrUDfs)g(p8|LCPB-#w6J{1 zNBB=iiR`0h*I}hL^bb#Hl2(OgMO-EeEHA~PyDPX%}3aB1H zm9&5|6G*r?sG&baM^|~4C+=ZUZoKB~apsftCud@g5AfF>ZJVt#_1<{>`J?TNDIads z8QE~*X0-cfY|#{T*rj`~`0VQ>mT%{1#tjH;Mz+vv3hsqapjR}vLU2Q_5+yV94GqgG zShZ6h&nONQIW4&c{37vUfD*nWfPOIjU*qN95!CO}WJVGoHQP+~3ChoX_N>b=q<`x4 zVRXoC-FH^f8c9jJ#|&=%#nc}ipQ_^t6T3htzDbtd=KXs~-o02dt~++>VjIQ8z;cXH zvS@jCg2yaE|D9Y7V1r$d$#3J{lDI|}_U!MyAHOn57u`z3*#qp33}2q_dh_a;c6@VV z7|Wdg`?^s}S5f_LHQ5f|rRfi~2;730*VuL6jImHxfA=@vn5Kddk~xiQ84CB0)`r=$ zdUT?F_ipC{D$GlNya}n&G94Sor>JVagEZ|;yH4)L-)HBO7&|rWdqMrJ!F^Y(!t<9=6U4+Mchcy1vwTDVISy0z_S9 zF+AFOkg|C+>%#(f5H?bQ<4@VWfM=CTZkdIb-*_!v#$H(WB*)9b=7E=T@h_|h-VXF_ zyl*Ho{e~OTB+)4%T=(q+8lU#Rq^$o*VoYL`=s)>pLhHD$jh;^?CJ899_hP}e~*LVY=~{=Yg5Tm<%(6F zjVQK5(E?AzJ93h@Qb?X7raJE#-&3e@V5g3Th@-T4bp!eiXXHn^G5bJcLa~X(joPsQF{W&7Xj;e z!`m`y1#@ogJ=?J!8@gRj!Ca1oD|SNrI_7uuJqPA+Ka-?&d6z7>pG$P-G^rA_Ku8Wk z>TusdizZ2jE?jKr-jia*+_UPiMAGWdCxQk}WhwAZeZ=*veWuJlg6AW7kB(w7o!)=n z#~`3(CSpFa=!Jd{>9%@{nBG=iPUO(PH^jg4>HjZ>dEF{b+I|sMI1t4q)vB52G>B*5 
z9Mdu))9bf9s2wg)`DNRt53*X#U3|kI-_ZZ+KyEau)wbgqiZzf67Pj?ITp+*~cPwNf zKJUW_Mu;Q}9v*7td6ui|HvTLk8QZW`Wxi(w|4JDU{;X=duhZ2H>)I{lTdlY+XITXY zd_1@;@^Ns*;6DLz{hILE#ci|*^Hil``)tu{0x5Y9L)DW;{OByhfx}k@)2J|HjjHrR z{|eE^vq_5*<9iIBA0)3bIS)3~s7+l|e3&WUGD}1(wG@AvcA54p^_nNqGQ}Sfs%-zi z&iDjDs`$bxd!pxJo?Z5M_5vma%8guzw{E zGHY&?yDDXJW%+Gp9`%%n9zGpRN&j`8M7q5GHrOlZ_9kiL&!Fh+*{ z!-Q-(SnCLVJ@Y@I^8ZE=2nnp1wY9ew4;R9YMA?$E0$+qbQP`fXt8@FJvd?~LywI;8 zBhX6X>!g36!t>*kcg@*gKvKhnsiCvmLFT{KGw$Ip!i+%vuwm40{l+=x|M2zQVNGVu z+b9CAG+mWmR2Bj0f^>+=3Mxnu=_M5DolrtcKmg`_rCiF&mX{bCC`~N_uO;O%sIJQDZM(?9Fts~&6C#gpEdoZX)S)xxG*$T zQCYl~QK??s?DBxmIl&(Xic;8XWSp>W-&0JW;O+qLr#VC`@> zV0L8Rqz<_HE&D1My@YPH3bHf(=-XCPxo2!(n`U6`xK*u{TEH@8;~%Xif3A2Qc0;WD z;5sU+sO+9teOMa8-Ts07V*SB+UM61^Frj+w5NUN*_8RF85M%rU{tam)!egu~S?LYU zdpa^Xq3M9@v)wd;O^+0#R`(8KQTvRfWf%!YSb>plkk_C`_GUF6v&jcOJe+vK{|qY5 zWA2p`F)?wEFVV{7Vz<#<()F3tGFHo`x7GcY# zbxOOF9XfJ1drI@CI|AiV$`I$2l=4d>PLfJWngv6)U~uZY7p>x*Gur|Dw)gXe?uuyL zqEbX&DUE!$m5~Rdqq8u1A3SIp-_}QKRO0`aw3*aPJ+#M|MMSnaDx&4V z*%!nYF*I=(2O;8bH!?PzUKuQxw+i~Xu;@EJmKS-i^+X$zURv^2^5b8t&JO5(4P-WO ztseJ|sPm3cU|{xf^c!bWgE4gX?g({xe`0mKeb2mVsep)1(oP?S-*BiX+(k#4MM@iS zY7Yg-tR}@DyA+{vHO1BvlB*75#oC#avZ$)6ei>f}yCTZp-FXbeDw~rV1Fp2`;jAs} zvfWygnB1t9e&|2wQ#G@RGDWr$Q6@X3pGIl@#+MB{mqg!e2NM4yi0`y4?`Z@yLX@fm z*;ahnR+tz&06d`0C;g$L1s6sZS*B(?I)K76^?UL@E0@F@w?!=uN4greI=eFEg9BQB zKyMx1>us=T2#`${tNKI#b%rDUr9YpoO=J0Z$>iZqA%tx@=#p#ezW@&03_j%~rJ}+E z&uwc~wY2h07&Ui{{i5~B>xy`Eq1U=4G7YzyK(~6+oeEiRGY5T|s^celh zUtpQQhyN!p+Wan-Y0>HeXCcxz5^JxZG5Vq%9*y6kvdJ zMD2ti)48L{peu6Ef1k{n!8PhGhw2w^If?n4wpq5;csz{hO};*MWiJQ0TK!V)!dD%? 
zvs{N)`%Tw$cRT-c&424RT^ceK!RsE zVvPH^nGo@J7IXkL|9Ag1*5@~X+C&W&flD%o*YkF%?d&_x7Spz^<-^*)&_vLkwF9ph z-54xCU;P&`T~5g#0xsz$OOg!=dPbMds;ah9Flo@O`c%PMQum@er5av;2h{)4)I6Qg zZMR#@s4t&RzVhfEy_~;?`sB#@6nTx-+UH;TX3QpBsGIV=^-2`Z z@?AEMqlEo2ZZ5myZI3q*k&Zl6s`qm-^peM~eg8x{&fri^mtz>>``I}j8q*!p2OlybLdL&!p-mL{RqeJg1Q74*okPz~%>Z z=U_SToU_tbh6Y1zYyD99VXF}%Uhs!8n2F@C~=`Vc~obsiUV0mco`0N3rDXqx!~1sL2pSurL`R+ zmEaSkuJ8Hw?X2pQ813`l(Rj_ZRT0uJ-kq`@xzET?Pv5w7k++SDo&9<>94#ph%Vr6` zCedGnfmWIpG+w|VAMV@n^m!^^vY}#}?@7f2;tnFiIkAjQ1(QW*4lrm2v7Uv4`25;o zPMH3ey0~CikOuO%`h3zuK;QVuD-#VwF2bww6|I2@uq&r7A@i z4qkh2?>CP!5X;_*Aw$?C%|5X~?4C_o;iJ;=EhR5`7P)1OlsYXOTQO6VxFeM>tIi)NF!9qMdvrf0*vR{M%|&vuKue!vHG-p2E1z{&;2gx$9BGE) zNR#BJmSOCSVh6^y&I`y<+J%BI6hh)wxj)AWeGVr|0a{U{2|%lM@IjYL+GtQ0;a~Ur zA#tPsylJ&-zMwUP2>CvlH1?)2pBE-1?uHj99l=c{wx}SaJrb>C*z<};`8TSuU8>c^w zGMm*lG~b24ul)gW%8V_#;Hwe#jDyiHJD(*++6p@Ltsr6|TFsP)0}*-d%?Y%u?;V8} zrunm=Sx*%VZAi$dX!$)&y@7n9*0(ycui1n~sH-x}-L77U?AQUY=AM#tVwFQqf1OVPM3B* zCy3$Fw1aa>8&oZhJcs6Fg_g!(7xoixoSD4819XLOHUzxn&p!~;&@#*~Yo?qxkH?!O zgkIVaGKz;3Of2$}=YfSoW=TUmJBD?n|C&J^a}>hr-o!eSsjNFi5D2*=NDZn`Mq^mn zd19$;!|u$oaL3zqC2ridI^$;|mwSJw5R-XPcw+WNKfixccgsR(Gp!ny0 zf))p3>Cosvk?s`FnYhaoRx+j})4TNTtj+v}XvSLr`W;z+Xn@{_ z8?K@YQyndR%y*w;6D)#X_#5*K%xc*@SL2u+@IDEi>j$=HySsN%vsVXJ742xIa<6tE z5NzBWTE_R_o#j;8d}k7qO+r(@Cb~1CUh__*$w>@_8r-N1%v_%+DGS#(E?YmA8>&rR zkDZ5|Gk@4gH<^F&Zcg|CO=tUS--4ciziHwsF0fGrzicYSm^XaP9Vp-3VR#q4D~kXe zcs|R=3Fv!scM6SigGJAY3%yYiRJ1XP9q&Pb;%>gVhTB2JtX{(IgsJng2nl zRK6!H_=-7bij&6qdh&mP`M>Z8Fq0uAee>%=T-GHPx@S89{6nnvxbuwn z*9cKlcOzJ<>|Ovry&ir$&@%#= zo6BTcb#e z>YPy=NNDT4rPh{Zol(y_1wm!Lu!}`_o%N!XK?gey%7V0~?+6_JT#(?bX(;3@d}#f% zEuKY9U46b)#n#d{Aff%BT>-A9<}CSlKv)et+ubK}6L4`tRXC>S;zdoD>1PvuanP;T z0E8cm#}2muN~Bn0(N95?CvaPrv_ZMAFo%2qoWYRr@UAfC;Gh#SDYW567@mo}JN7#D zvsk$y)X@Oz98?rvtg6YWsax9lzxjF>?{=hm#As!pK(d0&5xuflPS({k9>T@mq77o< zi3-n@dgh!9OMC==zYS6{o@L@u)R~I*j+srS%l1!C>YMldD=1KQ7LQ&EdP#KmlszSP zU7x6ul$Jh2JC!RoQRQy$;K1gAW(~+x0d!4T^Gb*;j2$v6Rl@CRH>qmm4^+j}2@bKR 
z#%dux#CazZT*HRYNn!)-ImAbY;#2I>vg4?)Mb>dwX}Z|DxRlh9kp37Sfj>}^FZ(s; zeJ;PoE(9r+7O=N9-q7_CWB7zVtDEPwV!J+dTyGe1lq$ECnxCJqQH23-2HIwijU9%* zuH^Y!j@t=qU0am5`+$B2WV@IlkeZtMjSf`0kJ#*JniMP8w~xx{8gYRETnPNo`4iH9 zGcBfrH^nB&_3}7XhG`7{QAX&>%(cI^`hW2PXg@DEyDc`mk;1I00`S3NSs7nFGv6sR zeR)GSL+ejFZ;+HYYw#q8dS>{IjarlyO^xC`-8u9b29Sh%`)9*&EpFH^;6Jw?O&|P0 ztalK&6xy=+O^|Yj$r{#LV!(tx<(#o1O)2jZcL!Q(EqA&93HKCdrUR<0tB+N? zi`&sycK3*SkQQu}6kdbj8=W_S)%KqNRECXIdoCCMUm(tak@|g!`Uc|z^s#uDCM=m~n zFi?CLDD_~zX313QpxL=%k`qh92M}{N&L=8fAN#_uqVrfR*{n&TOY<<;_@AXWLs4s* zXWAYPO3zSsVU=sfQ?H3#W)?ErjF82(4ruZO%Jc&CJIVtR^tOGd<$zW3lzu?9!cs|< z=z%8QW8NfM)Ld5mC++^PU_u%u?eDYsP5ZN!8k|Cl^zO(Xk?wWg^!uWh{rYNl=4bAHO9%?*_{@Z z+YtR|K#dVc+w6=TIKeuksPu#mR4(^*CuJ%~wTBn(hcr{tT}8ijJA87fP|h8|rKoDV z>4Cik&hsKSM>e_RZKXebDbO6-=6=R`<}43+@en!uai)kQKjs9d5KmISM~b@Lzu>p~i1`n1Iv;n|Dak65 zUc+&Oub&rvt`9go4>z#4VErJrFf$j6e3Bh%9)>6}1k`}m8uPgPkIuR``8=;i1!PQr zofF0XE8$_r#K*eNmDO)hlp;$WI<%P*D11z#;tl#tv7#EIN0-0pn$nHtyOY547zr`a z4f;aR5M#Fp#P_VeF-4l}uGo~Dx47c_HsY5DkdqH{k(ka!`7*Q2+%+18QR${AiYacY z$j~;AjtU0uZdeVCUS}{eTLP`DNelG$eePd1qz<~aSQEZq?gV9&?c!`!uI!PHA z-q80izNAlVnY)zd7>9o0G;u?{0}6I6iYfCy>OrnF_ahwbg9aYYY0T7l3ovW2-B@{@6Mc4Wix; zYwd;bjd}K&H?{F*S3X{WlkL-Gs~to6x~4FpefT7 zx4xRp$I|<$|D9XAax(w>yE-YgH!{M)Q^g(;*^ARxQ;?kNP{K@X!*P^{uJ2b=L^m9f zYBzo?=kValgI{g6p~ydjg#`ZAR5dc zm2n}nvxG;cu&71`OOYhp@3rjj%LVSP*?M_#QbmJ;#j?|^R|pITevYLbfBR5yXLI$?#jueFhn)wxXc^A*YGM4wcRQdp9Lqai`2;6dp?}{HF z;UpM%c`=1z1j9TaKDD+C0U6ekA>w*9{|q|7NZ6 zoYc|IFlMd~`fXxaX^XK!apiu0mr?JV6w63{YyY~b0Nz?HQ>e5B=PYTDTk zH?M7U<;x(Z1U*FBrc3HZf5~pB$56L`>_n`&rCY$Mkf^Ah$>6;ym+VvrmRC_d7bk;} z<}vPL400)AeOu!RAhSAY)O}2CKhlk?%$WaioNUMGM*Ur{5>~!oLoNT|L>sC^Yd;d- zMAx&*`S{X?4B#Z0xN5?<6BS`wtHa&r z9dQLzy}-bZEQ9)d1O~gwHWM*Bhk1EE=JmeR67WyWV|#s|1HzB=_K_iB4~$x6wBTu)g6%LzFvbLkV$}`MPUHK5JJN2 z=K~LEAed^X^G^*lY2bDT@+?&bSZnDwUO{u4U`MlnSR>HX;jnhOr^o2+e^HvD4$z~j z(xADE@%LuL~W?e z8R}y6Jhiy>+^gQO0v@rSk)E8q&!oqSkvrHmwi}}6H@#lvg+i!wjFg%>h+Q&`d0M>DGg%5#AgFr3DO8WSLt+qf8)VVDrm9=er(ya1wGADN@$>LEy%S#b 
z0=r>W^SRksz1M52`w|s}Q_b}%V$oF-zOqmI0_x&SJI6h$HQz_x{YX%{A_dv?{_~+} ztV2Z*OOcbm4MG2K@O7%Xi_1st;qen#+aHzloAP0$z;25~Kt*sbcYXI*qTciKBW{1e z8ZZ)G(~sF6z-KB$eq*3i{U{+>fbxT+i-9&biYZE*Cr=XgS6TLPk#`|i=ptYBk|REn zZ*23Sbnq}RCD_;m_-*lWu-kv16LQo)-xmdIEYU!Hw;#LNl)Lgjz>ESw$?*a zxoNwrxll?^i9GHoW|oH7y1lkwlk1e&6D|Gm8mU5VP>?^)Ha@)4tVfL~`T>-uOH(9x zrQn1H+?oSsr(=?2`?fZYQkM1}DwMJ)`mN2)w1l~!n!>If9Ec&5nBJFwq5|)>RzHcA z`#4l=TJ&}PZ7(u7`jvoJcWZC|!B1??oG7xQrPJRoNZ+HC{|o%wpPX7*UA+D`n)Vp^AbFt8Op zT>RtmHyXr>YJ+j&jh2~}fZf%e*qm+A?SG{Ut9v~$6TYZk`|Lx8EG;2;_#I3IF35EQCj-?x=<=T^#Wql z66HR^kxblnc6n>!M9zCH_#YpU4=EZ^n@M5eKC8G&H zMNwoHH)mV9Mxj(nl^4o?`$&)jS5(baHp##sv8~h3hW)K5S3YDr%8TFQfevp5Q_j1o zJJc5rLP)(%;CDTz5%lSBCf`x^6F^e3g*uXVW}|-NjHMMMp#mZ1aY&>W5!*I$I|ACn zCFglhwjF#o$nO~b_cQF}{BEk~j4k`hc`u#XsK-Kz-gy#neGd^p*%9}%wOpJ%6*sNYB(<|&JX`0Dss-p$Hsy7H>oFm zkmg3kG)yVIkMM`G(|p_1r}!@IXxeJ6DDnF;&_ss~ctzS9*dTRD3V{W-mF7NIX3u9T z)s!@-k$*|lP$}YaG^l|%PVt!T`3*{T0lvHwRjxFVwafm~0Ivus zEYJ+}ceZZ^M`1y8-;Cqn!L*a*?`C$VszGF8>9Aa;sye-V-o-7 zL2}T&4Rb<2Gx0p78WbXou?O#PV*6M3BtYl1rIIexKLlYMLJNB3HEg4?<#|Hq{QGs4 z&kOT#d32w9d3w@@)v;TchgDHS;}^1hILN2_Af}0*)xHef!FVUNwgBG_?YKa2r`6Ng+f;}x@TsUl6(8yi3%{3XV|;0;f0XEKT>aaQV{Jfr=+2Ear0932n&V_> zgc6wpHcxFd9q~}1?E<)B$JcjK}b~WxzcPZ0htyZgCDV|b1z#G(DfK&LC{;bswoJt>lE4p7$ zhcs_m*Vrku_(>~R%4T)M)g>qMsn^jT6V=*-PL*zH@WFq#9Wp8X( zDVSF2n-I^svTb8;FX18VkvUk}S=uHcJY=t3mLTv4J~Y&bNAbY})jlO8f6tb^<)t&t zlM?Rw2fv!EWXE$FZ}tVka?q1e(c6Jndp83n)|C#w&99|SUcg83Yx+MHoe=QbT$+#f zI9jjPyVOCj{+@v~U#Ml(@*oYa-5z3?F^KJTSkvHq*-`5uAzpN+%1v#@l`2;s$(ol< z1cY!}xKdPSiXkgW`NH%;s}uJA2R%~VJ>riq(Ih4+f;U4WN2{@M{7AAlkx|lApfARZ zuc^(i&k>|z!T17S1dcf6`59MJu{lY5uRzm3k}CK4t13Eef#Mst@xiW>6fF}v)Mm%T z4G9SZ1iUgRel#Xt>ur~w=bkw`S12sBzVaXZoCuluF;Kxp+K#MJ1o%b? 
zTYjo@Ipe^z{Z-0c00~z~vj74mP0B}tai%aFYhPotVbfOhiABX7o?6BtYgS_YiuV>& ziv;vJ!W~Kws&oM0<&b}qlgVX};;A`4Kgjbw$Zo|KMOP|H!Za5SYrF$DJnhL+9w0Lp zVw@=oYxZ<+H!+&zwik_dq#rD?3(2s`9sYPdDm%D2oFA-q#C%;_p`hH5^Bitcg zz1G`}R@UC@$d@O0TXz?WQm>yPm94wi4_jligOzrd@-#P6&&+HW3XwMZP-x=N5cLx- z2i+f>MSieqqX)OFW%BG&BkM@RZ(TIlF-nZxlGR1T6FxY+l2b3yo)e5$wtyaSF}aN0 zYzbpl`bOE|xf8C2V=O%O)bY04()%|Tb2wf$z&vP!GRwzFmd~ikpB~S1C$3GQ-y};mU z|HVt#prbQXS#BAsd)(;RNM1XNW?pmZBzSEhD}mBq+Z6Jg$}{mZB(>Q&nPPf~{`iw- zHOn#Sp9=g>+3f!(=;R$zQ{&NoIq#8VL|yqsFE_VPKcGTOIZ{hE;`Wsur3gLCw#w8y zS9~P=HRt8*lAlIb9Nk^9@#5m^b7bg$_afh8Fa|=?5^`Q*Zq4DB&Jeq8Ux1O{J!IG7 zk?;CCd99}04{Oo1ca&|tw=t#94mUjtoN%qiHKO(|1-hZ=Y4s}T8whCm>_D6xt5_c* zuSXbwxjQU@Ak4j8d?sxYCP!3uY*5RJI|cnDLTTyTpPUps=6c0l@SBz2=wVrl$H8lP zONlBmv3{R2d!1g7t)-6=L6p4oB)4`u+t0~_yV<|LPRQ)&XiNB6RBu)aX&F4MMg z(zuBT-TzYTXO@(rFUzlz`sCjfQ{ZZRTEx>>n8`L98FH*wXpKcRS(qz zt2!~r2;cZ28K0G3+xlbB#m+54e=$1io!TUimw%{187$0r&FI!IQ5J5c(i~1nxd>|$ znE((?h^uM!);9r_mSd!*^e~JxIUgxfnj;$bg5Id}9Gsb`a-*JE!f6nCp#h0`0%E^& zj=xh8v$yl4oD0?1v6AJeQ^om1stSo~ZhrAzEw`i-@Wi<8M7dts($1>?%1aoVXdra3 zQ&DDzXF^+(U#us%peTm~-KGw1+WrR>79eCkl{3mx5k4I{gQWs|* z@8alcfq~E8d2T$)bPeA@j{vWR=Y|B^L0<)wtU`rG2T)EAlp=huP?ryc(B2;kMqgl& zbBhaCB8uh4XFmU?c!b37UEhN-KUvj7lk_!qEqY-x>)1!lhKIU zAX^>@-}UkD1%M-V6<<_v+bWdnej4VU@0C=;rh%aTD~vCS2}Uf;I#2Qx`Ep>ZEnM_% zX^EWc+HN+VdHbGSaP9H-i3-*CIj;45=S~blOKDTkjoF5lXz_h) zPaHp@fshX*3>L-yqZAQudd!s$aUGy_KZy67>ZMeuw3QD2$T|FRQC6B{yu7bLQrMHy zrgl;Zy?OqC{bahs@NRdj>y-~K4`nsM+C(8FHl(@#UA|xD2R(HpBI@^1MMNxR#VOyTk)+V2(%lPjgsk79E<6xq@*+B6v z0cG#=zs5tBe>dytFuNtOLrkH`yWiI&O}tk5K!|$sDws=xBVJ}S z8dkNsV=Ukc6YiOdC=zM40lMX-^zAOCY`P^IVlZy_SQc$DT(MPpAJbV}*D>2IAKPxz z^Q~1{M2B(XxcPyQwA*KvI=ct=eGUiD%HgAf&k2uK*fRq*gHZ_L9`9FCATH}`A>BJ; z=}d5AypZ*5(TCRGbISJbQOUyGqPrV($t_>(;@DB`eYJm}^y-S%zx%k91e$l41!yQX zJaiszDte0?AU)Zo-S*Sc+WMiVjr*1tPnmN8FPka&gd64rN9MU_oA~^bfI&UW5eb-h z+)9Oeg#`R8ZAM4O{&Yc8jsV4&*E;}`r4^5lQF6(U`Hj*1Ex4%pq}PbFM$&DV3?jwu zy1`|V;)T^7@vdfG(g5$4DSQXJzm7U|g9$ljD@AiAK6I^%}G><*LYgrkcW={rH7%2#! 
zWlJ7pv0#4m`X{eRSrzZ@#_tLX_%DuC*93XePL6?&)|K!F1@b}r60NPR#zqE5iqg{E zv#pd9m6i*}>g}0nGnEv3u%v+tO{7>GP3<1l7dr)#AK&qm4-jNRjL7=~QTp2)0?geG zEM?Out{G=};7LF=?9Rzv#uGFI_Ok#gUrp8fs6SDlpbvr;;t5f!Lv`*!JK94)bb$++j{IGfNkRffOEk@}n*U`=RvgSkzJG!(vYg_zs+xkMrj& zx5)a9x6nH?n_t_Ua-McYmT074dxDO;@ z$c`t1`!F>>bm}p+1zp65<^@2i_|)|M1b1uN8x>l_&KJw`G#|51Sx)_18{rUvnr;_g z0maylRojc(MJQ6vrc#2Fp^(I}WqFTi?srUX1BHh%99Vv>&{O5q!#$hQgWZ#J0PD!d zy^(LGqbl+)pP95_znk={Hx#Y}-)^S3CVmnM4#y+Jt(a7&t}-cmq!#2HoTk)=a%)AZ ziBlDR%za9DhDcs;$j0V*OD=zm68R9-)Yhfms~f<+YIN&c$k8oz{nWt8NPXa$+rvc# z$KxwB%|(aIA0CAPV(Q9Ic)saU%0@%*+BNbXoGjDV=y94WqRxCn9xM*nSIwEv4pgBX zzTl{n9%cjn2#P&A8F5*NnB@8M)_Nb3+;Rh91!6CTVR?Tb5|Iv^E50RV!TPTwfS4!b z`(>3bZAX2pJJqf^3l-YR?H9F`S|}yBIM5A1T^~1VFj3nzV3M zT=_^w&PewY2q~sQ-X2mS?F+a&9`hIE&VI{jd&F)$)JT0~6DAhMk_AkCM9=~J6kqxYcBd+YJZ?)%hlaxE)&91Iqh zV_%#2n1$+6<&K}|vRjK#3G&w{4W&Jbdv8w&P`ZFTr8ZQ+vVa{uKs!PuK>VI=(8t~C zw=;QrkT*G`Ep5=|MtzH=RXHWSj>E$s_!zoyH-oa;vC#i7x3%7Agn<tkKNs83FJ&F5Gvnh1?e!qT##`17Z z1hpRz+uEwY>WC%Bs=9nylJkVk^4L#wrPH+bG+XnkZzxs1vX=$DZ7$iU@o5&d3<+Et ztql1|u*ji=E>`>f5hQAE(H>>B(y7F@HeDHQpY0zZkOO{3=}U{%EQ1rLl*-ezm2r+u zo87FG{ufrJ%z8NlmVmoJT89QD>S65PCD8dveM;XQ(>F#$D8mCAOLfU|@A#7Qrl*aV z@A;69X%^XFA0ys_3j{(lp!h&_xqy{(l+naOn$<}pDX|{5CTVG8V$9#C@_?y zhs~mUJH$PA9KGU6OYs$cWRfYzak}TzsFM5ka6H2M?Xf~kY0Mtfo=w+)`0`^dI9LXQ z6a))8cL(1i_ZMmnR{LRo{yWibv6*gF|E_ffktrp?IV^b)g~<-16QWFCV7775gccZ} ztDLU@H&~o$T^loK6PaM#UH0)|;-Ja>tdDU8+5Di*XB}^4=(M@3YgH^}!0DW0Hit#Y zzj#k2@nFn#repSL!JV}l+d(U=mtKzIr+T*H1n&v1Z zu=Vc|oT;s>ctdqgOY|~}U=<*_xXujqxPH-ReBbiHLs#iIr|gbH{LhY?xGcqe!cYz- zQibea|j_1j^m zawrx;;h#AR-vx5&c21NPgr}%E)!n5BFoj2g+wHK*SGtRPHylvk7IBxmNdzCxCJlywdGnwLQ{e6?^ZRz`_wa(4yfBUUU@g4=k)2S~3!anI}f^wIo% z@POA)611@ynXSlSzhCu?LRT`xp_CJY4xD1S3Vz6tdYo_f?y$5}(-%+7{J zAyJgtsY#ab3M!w3TrH@Om@%E-9EbI4`INahzSj3WcFC{jHb$XzK$0mmnwQYhH?ZOD z5s9}i+l(debBGgg8QpziE~)!JRSno3l*X}qNV%FKft9p>b|WrLl2Q~(fo!?s<^d#r zg>~!pxtydpcv)W`%_gG>d3dq9>2MeZ8BQjYOL#~Oi>t`REqaAv?!J#%H$hA^2jEpc 
zh#?fV89N~}^Sop^ZIuJ|?4Yd&*@tU-F>&#(G5<0T_zc|a-qN>GC4EBIbC{2L~R#FZhd&9GpRlKIB(@QrMH zsbj038&oliDRH8Dh<#=VlCWQD?1w<*JwIg5qPoGK$r&HjJ1=V9OZ497y)a0Zv#JCe^JJW8cjnjs#%R?~CsHawxqQqpNB>+Gi z0(4PWdyb_S;;HST?^~I4mDKjYw3wWPOvbaQ5aZTK;0P-EWdi`0^h%iQf| zaFZq%f-8a)kceaLGuq&|zJAw@G6Gq;`j?rN zQQT#jIkVcN9{M0+HN1Up@)}j1%{&D=h-~o9XlQ(J;9kcT8>Pq^niLB!LjcXz@C86J z2asF=pm{sth;O@?vlu0iJ$Zfy7zI zE~5;;XE^uR5)VY3apVd=l&9B^op}T>XAGijc=7SKeA)YVPluj&dNrM~echTM^!C}y z5RTjV#WRL4PoKM=Fnxg`>-=4ftBvLXwHjW*Shubq2vl=yEQ|h~U@fICf~%RB63W3~ zq$nqW9aSjiq>Z`5gVPjtbUtWhbzY6+x(k6o;)US*bJ5~W|Kma4IiNW>SX1%1($mu; zVpJ32eIs1Ud8PWxr^c#;7MsHhQQ`gPG_O#(6?H;v!J1g-A<86(e>Fi<=yE5he>lyq z=T?%dV;d_l2-nWa8-6WI#<&7Z@2+$+<1z@DsQI312?giH7c?~!wn>01z>>=|u z_a?iJ0z&U;BbEt8t}Et^`tze8>>5YZC$}CG8TYZ6k#w*gCu61P!sR~&vB+r4=9SPj zCn7K9^(kvmp98D>klz%WCf>U^g+ULD3^Lu8=2+}kE$~|26SH5d!6`P9ho%U(o8Lmo zBreZ3-AvDpu}sKenr508RpUdAz&{V_-?``L%%!0pU9m93VP5=v!uDH{RssP1AeIRDGoLQ-KwbyWbnOfzdSVe2DCSn*_?2UcC|W^Aq>X0BKqk z0WGu^M-wi|z2>XgnvX4XdkYIPJNZ#08EQJFFZj@mKJq<+=#3liDEEp;%gR&~*(lcVf1vYWMK3R-7eSr~}g(=whUjB;u&XJ(qh zq==`FSCv4X)c!EoZT>k;kvFwx2qyj^zmht{g1>RU>3 z?4Re0&Hd7?CJcA$I)#Kgds3E@0eV`tDJXW4J^f(!1MaTP>m|3w69+a3L|FZyzrX+X zSShn$d(ma#LKqAoeXBIqv&zlsdOGX7cR=wN9gmb1@{1GyK`{oGI*hVl8Zns7UMc(w z*>>IAa+j>E24xPxs4JG(mB6`l&JMljiK&Rl(yoV=U5<#!Jt z5q!A4yuscwBVVDsm+MjYyc069Y^`-CQ>1R$|ASSqN7l~CKKkDe@(RTGo_wBqto6a? 
z#S-%hrXgubz4TH}j*sSseIXYbzYE8_oebh|6VtI24F9TdWtBnX@eEI@{_bFJs1dcR zc%Fp5U4KSyvveu)`Z-nSFA3%{KeujRc8=zIBT08^7(gi_36D&8`2NlvM@J!nZ1C?X zy&oj9@#Y4^tZJs+Q2jEu2AN-Q3k267g@`FLMmzS1eEgT)joJr z%t(#T%?G-k7b3ZR>5ib%JOkYF$n0Y0z+@qH+C zbab$th)}zmaHa7m`8)&7dV?>P6#o+s$WdQkJN|#D&#^_t3+nC5pB4;#Z#5PhT3v0pn3|Pcz?q)(VC~W`U_3#` z|MDQa@4qbSe5e-7(>`}E+6WGzHpxi+@L(!}c0{x|zpwe`%g!y`PLA#mI;l^YdKDne z9c$6ar$*Rry-A{5J!ju3R#2&P+Y~PyVznqsHPT|1Re$x(&$m$i<{mF#B z%D|XqucrKXU#L|C9h@^s&nQiN$T9C?QNcL${4<3A&#k+Hb?#*K<81~cY+wpB zdy*?B_Az`FX$LEqKgJ)2Tw`O9wCiSrY_|%_FTdB*iFoI{4Q*p37axT#Q;wY66MIG6 z+_|LuU2dvPGorzH0m4;UA}ni@7B0nW@Y7}fr}UJ?wygHDHZ_QICMa@Qn5Y5ozN zQ|ElkE^j>f`FVc9@WQ6pcp$JDzwYiTSSnp>Ti7oN;TgFCFh#(@u#}S!D@G z;8e7JwwXg;U3M|XZgF^-?`eV1F>8SHK?7BV`w=hF7^@J-!O z$G~@C^MZ?Ni9Z4Oq0r0dXttYA&*tj9x-4usFgp5TTJZ<7*5$ZVQA20irRec4qw0R5nwKwoU?yo$v)B* zd2WE>a*0ZouZq=%@nl-sw758Uq{~}!N;)gji{#Hv)d=7CK}qyu!s~Vl43{iCIx}HQ(ng%e|4yq%SvhcXQ48 z1X#yaQf=#oDt9ajyS+FjL(Lq1*;5zjoY>=BjqAwc#ZTiwjvrk+{GYr@0d=t3d$eCR zlyDh6%fOcynRO9*d2w+uhE4w2Zk(@^laqt1E6v-c+}J)AH@mJ8yP~5c9fS19+?xmb z8*q{4tt8*-w9m?!#JtP9O9kB==Q2an@F{iQiMG8qNQJ-FR-0`j)mt9Wq z?rc$xMXM(w>vprmr;63@m;1vHkBy5xJTb1JaaNV)Tz21UqDAf-sVOlkaK%!|VpET| zR3mr$8lr3|V|-y=Tg7T@%eKV0G1 zQtvYw`uq8hss0#GEHQ3~+Uar+lL^@M2S%NNVNlUGUc)V={?Y`*d7d z3{JS|%WD)naBoihp{{{RM|R-a1IjWC7bQvGk~wwWU3=?b%LFYlepeUryeT;PMTDwB z=R@1AmBY>)YIoXAH|e2l`=?1LBJ5jQ(RwKFNt6Y2xw}#hBVW5kK*wY8qxc*>L-#DU zf=f^IchKEZz39O%$hVqun>#{pX5S|`9t^YYpL2fuLF$o^fT2Oc7gNn}@{O6^&*$>j zd_0Gx8|5#JDcp`=MTy z>4n^ePZEI zyC0Y+V&1N5GK&?5VY>C?hX~B;yKrT5b#0Srv5$JMG}G&oQPU&6lsg@bzBovpd(mMX zMEG71!*lDM`kVi2)BnJ0tOCT+U>}k^lGaeC9$4yK`L&Mft(B<#BYoef zpEkx;K}_`OA&M0ihneyABJVs+i!JvS$WU))-?NV-{xSJfDxQ?|uJ%_i;bp`+lB(hT|ABKFfP~ymK-a8(>FQ*oL#lirvlbyQ(rO zM`6WpxAHIh?|)ds*Amtf$uYCB79frIgXGVH-YtDck)bKui*Cjn!Yvd8gk1Q3yF~o` zd9z=`e0?=A4a{Q@2ailpYsTb~jD|zEe^H56yFpvK_jd37x96$nR3+za{?le>;k52m z`u|>zP`Xc{`C8S5Xe?mz+do7N78Yc8yQClwoSM(~X04#?M@?VHcLpJ$x4P0dyRG}A 
z%B7*6eoZ*Ueg?oZI?aYMADhMaeQ&G2l%K@CyOU@&?X=9Z^*g)GPJk9`5Cx9^B+o2ZA0sP`*GY=zhC)!x}kJrzlzZ_pV`;dX z&axF-cor}HN7TV!byll|LvT$v3blAxw$m2$rncM5fxva>NkiH(Dsrd4r|qhh)h7Qa zH$}?E!wLm4_x3-zOR&G=d$a9{iXy)EJ-zLBBH11C6>Ja9YJ7PFy=U7=XUQ#7ncw?1 zOqiOoc0OOfbyrAt1E|T|X=-rM&@i_ycKek?3vsodWj~E|@0u!dlh^3s|Bp%U`zq#{ z>@!U*m?^tGXrxPww~#P?tS+>1-r#RgfTouT{j52xOr5%z%Uyy#l*?i~r&_^~F=cJ? zCU7tluBf_jP(hF`V+NQ063jQn8jSL95~Ve@eNALWVl(bmIA^JjSi4?zRgPE`J7vg9 z*`pU@|IIZ2DsS18eYM70Nnw*Ya%GM`_Evi_3XROZp;VPlhS zreNH6^OC(1rpuvuUWc-2S=jE5DHXwD`QHN>kY#zRr)yJ3><053VS<1!o=dh_Bs}?q9l; zQT67uR8Ez?YIo=4sLjQP>OigQe(ZKRX=5wcQ0(_F|6vs7>zvtA;hD;xTW6s%cGG@f zfbs0f0ppQi%+ZD=?%u#jGcgHqKN5%3-~X9{!*?V6pQ4@cuLDF9sp3t2y=rjWXbi(t zrb8`4B)8nMQL9<%=ZmB_S!Kt{Fs6S~_Z(dC7HQ1k=^e2gyaxawl(J~nuiX;p%Gxax z6l6dPv1(@J&$ATvZqW!@lCfJVi2@N)!eQK-P9vqhY8N8Pn`_beBHr;vYvr)U^ynJ-XWU7$JisjqACF4C@U*l8!L#t ze)}Q1K}5RSK~T7$-t>9aWa;RCUOQ8(@MffNM~yl~mcililQexi{VdJ5T-g{J#Qq7~ zbU48NL|0)$743>(}mpXR4uIC-MH zwvG>skR{=<0`L<(=Ve}2<%!Q6 zWOpAB6X^}OupuFKZRUJs-HpffGyL3{NcmHta99iD|8@ewUT&cKpk?Q!bfz`!|{B zcxW05Chyq1=e@vZ>1HYM#XO!#yQ67!09nFot+(YWw}#i_K2QAmfyu1ru(o7}1pei_ z%O}4?|HS|-*`D(@027Ovl{?z_n-slUKiyIF!CJfcO!r zupuyT!`{1JUV;zRwaFp)-Tm1c-&mXFWuc}wJ-Lm$ z@O@{3zOnwt5CNY_9vo~fwyt>3XORLYVx;cflRNpaH@mrUqnOx~S`hh^@$q}>AGa|N z?AToUT=A*j4-36Q#g8Axb6zGX74#f$o~;-7ovF0bBhSpnN%8W>3B3-Rj@izKPL=V;tJSq5V#qEJ0oZ-Nd0!JUf$XuMUTb;~R3b z0)1!bANg#?{8oZ_P2QlxmoZ6Jn4)rQ8S?odz!3@}8CNg6Sg$>IGO!xopi(>{kx*bvKa$r4{<%aqY1sC7*`ngPQ3e6@hgK6}IG0 z&iw%zwucgrd)93ZJUE%}Gh>lEqH;{^1OKBIAOTQoOaBY%9PPlkd;jdx9-HpgTbOKMljUC+TQSz0o&Mpo&+%JP z`QK*TYZ^Dmr-b$nXEeo!P>fu+u=gdk2I5rTiO4AxKW=HyDAnI@qEvLDJ88@1r@^Y7 z^ojy&l9IXPKfD5x5vH4zA9iVL(XL|GpXarTGize=y-VE8NsU>lS%tk0dBA~e523t@ zN~~4aC zMRU-!{ri7DND}mZUskmz^P14eiwWz-hXLFfkrQj8hy>KbFI0^B#)so`Qx$3%lEH8XXw@O37mExhMBG}^9D(q zx@dd$N>(MLL!*m$bR^=6xNE{+szn=fabng1O0)fx)AWKvTxn_PBMz0RV94l`L)ef< zTAqW`A&T>+k&k$CrAx0YU?5|SpxwTIXpM9$Q2C+t*UD?owbn+mv^|Ce+ znSogrcKZ_5@_x>BXt57#H+rC*EFQ`gkw?>=4$aesr%V>>DL1YExt0Gb`uwkEC7@I` zu5J1D;1SR?|F&^;gLS8406y#^#x`BT73y 
z2;xR=SVuh1di;6p)17Xml9Op$yl;w}7D)Y3+xzl)Z+JuT1{cwdV*dcCDBeArofvoL zU~di5;82Bo*Res|NPE7xZ6TR>p@H(#;sB^yjM-d_kSiHRU)Uw ze_&)5ZBq(fOZD19khZ={QY$GglFHCK0u8auw%ztG(NFKbJ-vJ3b@xzLXidVOK<4Xu zrDWWX*#hh|y-n72wmk2(d78l`p9ZjAF;-+x!H3K+!?(Zm%Q7S3mPCuBD^^}DW7+-4 z&efx&B{#Z%>v{3o-E*too;}0mK6)7z1LZz}(tZ?*JBIqZ3MqXhw*2k{Ys__rgx&q^ z5wc;LE_n?>#z;ZYvb6L?8ED=KhB?k|8(!Q_#AN#&bPZ$7|1mwTCJH@&4;=7E-^%yP zE<*{x#UXUL|3!A{lCdC*TIC@gXTXv)L$sJ?Q5Dl{YJi)Zi$-t!NSUm zx>*Oaj6kCJ;vsW6J>&T2>!154oH;KjU9y=-jIXSz+=-dZsd|?+y<%aro9ITAIn*=w z(A@(qK#LI_o2joq(HbX9&^AR5wn+!<YaT7wJbCVgOSFQuc9- zSyx{`oBGYnv=T-lwyP?%M_sYbHobm!t$3c9Q4~W#Io8ZdEdTw!bEm?d@vKtot^T%> zJEOTw!W%X{6*~QL&0hyjD!f*GwDwW>LnkYDLE$Um4_}HIS<}T%9(HLl-4Kr1 zdapXqzw+Yz=0+;+6rWNuI{W~bWCXopQBiQpkQg>-$Ca_q9Pd4lHU-oux5 zz2l8i<$B$9;}1SR6UjOg85=9-?d?t^?yUT5Uio>aW>sZhe`d~0weG=ik-51EuR_mT zlxmCJ>3SCmN)iZ-#@s#6TO@UEh^;2-4()|@fSltY$t~~IwqzCw>uul4_4sU|Q&qql zyPyA2dUAN!f)E3ZP50Ls2P_JNmlu#kn|GJJz2;g~LTOutYkjY)OOtEHf->4S9Q@Bv{gH!t?_ifv>ZIj!k^_#xCDod?Z63G?f1{b;Ls#&hh@A2j0R{;i_ z+2iO@B%6DA!)vw6T#*CrJ4;Swxc9opq=nzp6?VU@p44ibcTv7i&fL9M+CAPMr8toN zT0dKEyXm&Ln!d64oV<5ggRRHXDHD}mn3q+xkFP0a^;PS!k9`eO__%5PFEX^|yo{fa zYgXVf=QiqDxgGh3SgVPit)b?g)UMC5vdGtp}MH!3za+N4kaZyJ{`i}C*X7?e3o zzH8&ODdDcz+6ods>`mWTe4t%VJu`FCq)Vrtw0vW@BC)Bq^JO{m4+DiTIWHamemF3& z7n*;UdAGfAUdR8)@^WC^y;a|9d!n1S%vde63RMcbjv zp#g8Q+J$h2)|B1hr-u(2-UIbn34B)UdL_#vGkGZvlWDHmQw+ik$PkI#BM|;oDuSe8 zBqQVFudb$cGziBt9*Y>HE=C%87Rr1_wxf9}Q*{9f%i6L&=c=+^jOgqx?>^q0o}JVL zEp7YbYyRyELSa@F7UIa=T-nY^<`8;jG=S{%H9qh4j*FQBSGMR~BHEY_dveDPi@wq5 zYv>3*8uSagAL%Y_y6YbwhM!du+i>=2w3L4MV^Q5yOvr?4D^)9deEf#qR>g$wqcTWi zBu-0HOH0kQ+qE;5ZaNmkmELvK>ZG}Ef}hVViP2R&3!8vvah-B7Fbo%-cyL%yjrpxP zRrSt(m#L&dWzG5ZLVd^XnEd_aw8rc(EmJ9ZkM5p(WTJ}t6rs}B$YqeW(AX>0Y$g

C+jVX?hVQRkmtsbqh29oCXj1VtGIZ8t5rTS195upzxP0x*5`@7#;Q2bS#OVFYG6L#SZ(NdAXsQQyiL?0IkYTVE~mX;RD@U{1{IDJ{6z-NE6BJ{d+q@ge}TP2R(1WvTSzxW2DwX5>8ZH0;ELxc*m z4-W360GXm4K_5PQWLIwa+sL^&k8lyG)@LtrL3!vqkfMxVEWf($;e(9g?(K;i`#jQ1 zlqBoAVwg?AOsA4ldvc6NKDm!YSnM`bx?2;wDCRlDXyBf9R(PsjmaVsWL)~dWM>jst z&#QeppjDKrhcVc?>!{;wuY9pxZM^WFW8X%;pCM}MOI69#d2s(^n#fsv$K$8nt)&L) zXx%?PHA|}9Snte|Z2r{~+*^Z@UH9;c!~*n81u#2Fg_pBPml?(5Ut?DueN9_#^ugA& z4Ho{O9gO^UEaE?kPHy1e-2Z9tgSQv4{yCAT3+TjF)(Ta#f2eO{5792d6t_%pA7S-z;4b}-E--l*VWLGkxAlLEEN zG4AV8yRKSYyMP;aNL-pb8yKh~o|JU$ z`n8@SOO>XcH`kBV@$cAd_tqA%dHC>Lh6S9-YI4sjMh15q9sV|4BD}Dp{qiVjByHhj zMfSAkw=I2Zn%5u5p{9Cr+iluQ4iQZt!MKIJP*6Ee957<(=^1ZIMw^nAtGr5zoJlP2 z{vqw&{{9QTmo&OAcU87bd?pc}wN2cs1b(W=$^1?X(Lg08Q10-UJ z#?muz;_Wm=8nfXyd|hHOx1Ub7*5G+^CKT~T=>e)q;tCpR6)*|gDc{o}P1 zuBYl5)x8JzZrK@g=Ghr=kAagdH_k9__#l8;{rgqZj&`F=j#4Xjnn@u+W+}(z*e;LsQjnO?6t;y zHlJh;EF5+I{`~%V_dpZxFHP4h3^YDm&7pom=a(})2m{?A)Ibc%(1Gk1D!#qpoWet+_0>V*<2HJ6-W(4vo|1|L5cn z_~QR#OT$cmS5+T$FHX2v>3d`I;2Nu_GMuK@gUt%wc{y4N!DT5;yN_(lxyQ1XsJB$M zAO@AxGw!zMpP)S8WN}$L+8dQp%W=KM#n*&g_RK}Gk=TRG#Fal^PaX&@DXF%fcs^WB zt0!a%5qFM*J<&o?X-0n=%TdL}Kiz(ITlKqhJ9!{Ppo|bdlqS~jTn7=)9_)WnVf(YQ z%VgxK2F>rFJuTCIK-(|9<&uP2z?j{1cNSrWG2JsZHt766=QjTDT{d#`n=EsLue3KZ zo~}#uJAK+>qqtjN-rSh({;^VpQYx%uDb>@+ z2;YJCML)jkW8>y+DeO)2?--i9G!MDbvu%(+!EN1p+UMP#Wz}c(0#29>3MpZ9Cukq& zrXlozM)KgU8B1-u^X~3z^ISH}SKI2~@%%v<^VQUjw2bJJ<{*+Qd$?n|+kd@M@EBYf zFgmYVR<`Mu{+k{S&?IN&A}inkRhc)?ZY2BMsN8 z@8%V|y{Xpd;Jfk6$;&|!jIMdVEQS96f0y;qQbv?2!nvy8=`bxFzS+iI+d5BE*Eh|P z8`;#JRghR!Uma?gAoZ+Z#LlKqBk$y_v01%)B#oU!^~@SZ+Aj~^`EZ;6Cv25KE^&>9 z)tY<|abkiaArb<>(mb?z`GLows_g6V(doKFi3$Xy_*_4uY$4=TCVytKePfm0aNmdE ztCrRYzR&H4YKE>13cCh5EdN_3$&;>1LLkMP_dBhbh}w_7@n zUN?U1Pf)$1GKalrMkW4{u5_G$q^@sdnnh04ux~qgLnxcufe*U5hA{lVS{OI_U~XhK zTRxNf`lk3T38sCsje~nk@AS}2z~V16(!T|(|NRa8L9(<2Wh7hffuEmKbGoad<=Na4 zwhY>{EF3(fTuXgc$u-{J?rC>)P}NLPXO7wt(Y-^T;dL2XUa(#gtr|K8}TFqZ2 zn9Rf(dt@K833LePJvno1+6S|HzVG)bmt7>6(4^4nSolL|zzdlV{Qq_h|9vn2`V+)+ zF=njQQ!1Iv20ypsHM=1M>ElZy!6^N=4F+NN0E9s<8@Et5xvJvLVZTI 
z&&h?fk)+SV$lyLb)5|^Yx#Y#VTmLC$9K{Kx%8b{n&VcflUTEs8KUb8G${^y#G|H`TbKZ;vdqT!pQKO9ujlYiPXMkD%q@YRpk+?u-|^}2a0C;9!n zwZZ5xJ65Wj*v>2oG?^}Mceir?&=G~5D(Vi|keVZ4B2%n16dXD=jOf+dKr51jIq$F=D~pT zv$XrC{+HIYtN%dD`< zoHYtG4wk3f_&knWl2sM*OwCn4G3z86x-?Z7)&DP9Lj=&y>54mx#1}5g6Gi6I{u(yP zq<@aLKMZyT4^3v=N+*8MtsWeBRn?>_q^PJ^Sp4?aztX0`VIjBjpn?6a+CQ?CDVHNU zJ!gtf6BPfy7~}tRKBL0{W1fAxGpCLeX{F}7>S)V*+XsIC{6SY$(Q}{Gt-n*MhpsE+ zeX0xz_>tEAx=377cOn>PG_UfUqqmr}uDNW`Qn4agUGr_!f9MXPf%be_NaakVEB!v zHWX~TXC~A4=<-tlAQiDrR+RI(W~H?((7P~p+J}EB-~0r3q(W$qK|fQ7BqF)-!1?IO z3W5Qp60WMQyxp&Mey2#p2p=aT;IjTp)D)dW9_t39gF~4rFD~3AhqBG=^%W{mqe0uo z9~?GM)yBv)uHat!kSs)Wf8`YOY5jN}Z=DQ>f_8I{;m$$9=LKshWQaGsuhdV)Pt@!I?_c?MUa+)W7c*Af>OUw{MV z&~iJbnzpLkSZ)`ul~w626w`3+zKLyMch5sLu++6{*M1WqIHB7BQKnnuhq4i~YgUsB zkSqT4jX=9QElI`pbq3~r-r8x#zk$T6`;6QVK=eool0xbplL}*jTq;;M2Xwz=`b}Fu zIqd}AWUAnE9_;aX|4_NADm&B0?fm&i5im>j!1%+hL@G?zNmkK>z#pK>x&gWfYlHMO;G9edjl21xUOv1P%xg2L?{P~yu?=bQHWcD@T)`FhA_ zIg?pgXkC5pFrcPhgU=e7UA@}$Sk&Om{-pgid~a{>sg*rGWhDIrMete14baCje8lek z#{_fFhHHvnBPwqBf`V!eQMOt2m;LpY1XEN}6_T|w)H~9dWa)6~;ze9B>-^!z_4Vr- z*$uRc8!bj?-bJ~1-o$|Acg$kgFf~qn)5b@u1*H-W_y6)l^oq2P9lP_jlC~}OOa)Ok zPi(`}r@VPr%ZfV_2kWu!B{>ZZS#1b>e4;f5%q2;@`oz$QMZ%zN@qEt>!Zql!TK}yx z*}<`Y_F-rDB}RMfjA5u@^5)3<^)aYXu4{OV^J{$$cMxZ$EIK_wKwN719Koo)Wj*_WVgG9aom~Ox6xI94xhus^Nbs6(ksN_i|0r zC!?UBx1WTv#v*3W_s51{w~}G-=nrxh%O1_mO+000wtWcZ&3KU=;%fM7&(=4@M92g^ zQwxr!x8U96n)<=Sn$~Ek$!e6lM%SWC$8q)}MkqM_b%}|IxBD(@$6q*DfpS!^u^C^T z?de`&XJNDlHxVDiau_Zzh6?P&{A3k17jXS25 z;pw>y8u(Uk%jwFsYn`GH=Rmc0$I`{jl_7xcK}(z?kJAAD+3uLT-yRCmd!tlpx#VF3 zu<`aJrDd4iIO7Qk8Jb<~jVi)5>K4dY0sbQse}ZGVQB(#j<(*Y@E|89Zu`uld_@sK; z&9VBxOlYhY=16S1GEgUqhX-(fcAa7So>CfIY;e(p*U>?kNCe$I-I;`uMIDO#&WpD~ zf3H21eJ@@G-thQTxR6Yfbil~)EITeg|DaDD-@obSGZlDBV%Kf(ME3IDS;4?L^!Z>t z4S0E@d4UNK4}dzp>$876#Gm;4aF#uyd8IBdd9}V~2v%^pVrA7SpJI2~e$)cZ^Yrwb zX_0Vnq&2WvE%*y^O^sY?hqQU{KB~{= zlxKYs9z{Ri94^dg!CPszzKoC|y7Mj-#2>M$d~oS#?dPFC)faCc72#?R0t3Uu<>e*e zVdg2tw~C>36_s5Rsm`%KL8i9dT6D+P^|BC84K&w$?>U zh*$Y}1u)%`!7k8;eX 
zj(TiWvR~!o{9QrNk05@q0lURuW&mO9j1N5o6jEXD4cz8n`uAMW0)w{6D7M9Cqj@}u zMpoO~;In{-s<}AKKZD_eBbrzSNA$Yy5WOGorrDmzXU`;AqJd`I(`^CDP&?n-5kie~ zTCbp@G$V-OISs-A4mCuoLz21Z38u=fn7iNmn^UzscZ~_Zt^&l7Iy7e2t&iA&Wv@3B z`ZW|c+XtmpA@&tjF%bw$o6CKKf~9?zdO&Xe$p|OKST2p<;}xsVvH~A?dRj84ZDgI7vLI6?6HIShqV8Jt3)(#|CV`)KX%_>7J8mm{N!4%t{o^sj zY(Z&wm%I0!3u9!Ef!)npH#s%6TBi@ltO{+>zkj*SFo6>?%Z)cm?EH^ffO9M3=;J$X zjemabFMC1{&NFwnC@G!Qgry#?k#8`nos6gbO@i0lyI0BRy`*9`P*s2QUnONp1oEMQ+E>Z-ydH z^j9r9%tJd76N6bfu;(X_R*MHgJ*BY^H+Pj6XQbeH13;9U)#FYqDtOd*-RYQlUw92n zTVHtwNd7l{9PJyN+b|Ez?;6JrrhoXL>w{1+z+PS5_`)r5!t}UIpK-9SN7`38S_9Vr z7&dT~j;dd>gQLMDitKJL={h)V_pxF$fq-jh1!Amt(#dH%a11iFRMC^_N#mg8VYRFX zEN#PM?l<)d_{m3_J~U??8zC-9YD3Q=h^C{CaC9>;$dM%#4z+b2O1b6>DP z8u8{reIP4fah~I<@c7~5x3!do;1Ig00FVJ5JuYCGC|i`dRzz=J)!6*cwoOs|ME%zJ zo>?{?4ELe;Cz$%L@Nw1Ev6Rxf(mswvOzUxKM!`&2{S{VN7W0!J==3S*Bcs})rJ6~Y zh5_&~CPn);_(68(DJd#W*OB=SgD`uAR?j&4>Es#=cY?E%mHv@OK+%@igzHL{Hlgv3 zA1x6@C8gzMm9UGt-YdW=sedvzMV(dHyV!!t6H8!|uz=4V4?PZ#pWV9Lx;bX9lRjF+**+EGp zZ?D_k{(1KED>F2|=uPqX9M<0O(xnkgM0bbu2}Y;J(%kjAD5zZ(TrHHWVjEj{8O^t? zblVOP)uRSf1sBcdivFl3`Q&QJuqXRZXQZbuGx+SDqRJE%_;B8FP&raqG@5P>6|ZQ! 
zcJCEQReyR#Zju5COX)@nEpM{7n(F4OJi$ zJ?tDkYU0Yqw?@k}YY{JwVR=&lKM~x~hDq>D*R)ms6 zXY@A;N&Z;HE%RDoCom2CUC+-n4IwtQH{{97-8F|56Xhhu8jj7?AeSGyaYE`Hb&{uv@Af~M9gFTq~uZ%s{@?qdf z=qdOUC&`VWKoMUG0i?ap!(!kj6>{aS=9!vBQJM8tokPfO-J7bb6_&!nFd0Ren*&v8 zh0|v07aFTN6I?T4^XV^Ot^hLJ)gD`P=w!32Y{X?9K1aKI8EbXenMJ_^AKMCQu5ug z+8ML)N_6%LnC}LG=g4e4=1q+L1HMEJINJ;(gy?Ok10PQb%6zF`X6VI0paNy)$xahF z+((Fq`TgS)502G)Q$&hfMP8%TB_qobdWABY--ciyZ?UL_^}CMRUGU)w{HC9mz4QAo zK@i%6(lL|IUhtSW=;h0oJyn@AK&$RF3u}MFBWut=TaQlH`x2bK_U3Vb0jy`k!N!@# zdsZE?<9xPr;L5P8TTBhWmtAuMc7aoIv!iu!D)9Ad*KW_|kzJ?SlR&sqjRAag3OY=O zQRisF&v;K5>r7R9e63d&0i_vD@+lz^<>l&V9)(n+GGyng>^WG{#lH{oCZBVGN0)JK z1(IQS(m&?hV@SVzj$G|+TgzRjlTy3!Ln#pxA;q5WFv3^x5Nae8tw!d0qGTzT7XGm$f{jCCvi%a-| zgtIDUftR@=kngVUIe0|G;MYEch7JD&Mvr|3oqtb~lczh+RSlsVf`l#6p~^rl0BTuW zyxXYZy*o-m`Ci8iyfE|ru2k)#JHwFQJa(~ii;NFg)8#6nO;xi-L6~V4Y z1GK|<6S4s-s{myuC|J1&#x>AP;*iT@81$`rodX~iVPblk&f4Aoxj{^VJ3(Hkv-IUN zhbnxiL1-Qr0ItU-vt3yuzNTni)Y;sp6}9<;?cF{Zl) z*I4Qm8M%AZ^xq-q2fQ2N^qe^mX=Szjx#XE#z#yE>3~JvDWWLTqhZ^FMS3;Zgow-Xa zJhU@r_nB8~Uu}0mq(JIDx{kJdASNcpid>pxA1AiwnD^u$$>&D@mrrN*5*%twT~v;~ z|6ymqpL9+v$0JIfHTUG=>ya_i-hif_Gy5amimmdqE&(9=Pg7F1zDw z!4)X;_LPPiX$Gw^dni%R@`eq2{rb&1?@tJ$xz4~XZ8exFuIOe}q1sj7$Y`dLFo-gf znP%|35B44|Pa`8R8)UmjBoHH`AbJ-jtWP>>JO+{m_&wzJL#FLb6Rxvl z3-s>+D1f{>a2=IfS=ldvynXxj{d{=`AQ~=$EeuLxU9o}50;0O#B#`|tHgTzSDyI6( zdj)L>a4EkVXb^cNEORH~Pa&m%=p&s;84A4XQDTV?)f3g{sO~&uvIGLn`V%4wqb9tNV(nvD#xDlSg2=+`Un3ljer>&oys*Mo#608=s3%ZhXI=x^2@v}-8{gP2fhwgL8#W>?wOiR?R z7kdyJ8{0s7UdnM$D-Xgau`L>5O%8+amQ7-j5^~u?rLcSQFD`UStfH^?i|tC*b^ci@ zt4@~z=JsV>OgIRRUUSzJAp0gkxVO}X@-BayAMW0l?*~j6Z7h^C<<~b{9cp`sj(f;m z;A%LZi=&4k9R$M6uiUulF6~>K74p}6k;_<>A0xiqQ~>x4Lj5OzLLPc?;fFVLr7V=| zzwmy-WTVK4G=}ROW3Mp21S}Kq(V=`ANScefc8$N@Y1+<%y%kQMI(vMt&c0~MC5W}uc>bAf4=$i^I_Wy@N`)P?Ze)Fzw7n` z5nP={0(ZnSRDuRx{VazH-g4*OIyAhk8_h!@6_5j-#zs6>wD8lG*(-SLN)p}}&&Mto zFbj|+#D5Nf7tzZdD;VT6lt*5)g?mj{YAFDghKk^3EFfyjjjm}s;Vu_B1TcuQz0 zj_+Ck6YN)f5jM8Z6j{RO$ntV#1yPc1f+)_}%P7vv*@CdSL(E`^hJS-yPQ!kcmy7u{ 
zkAmq)0g4xagz|x+)wJB{w9-a_rg2MNx$K-Qd_JrK&Op)8m^Rd?F@L5GCLj~g5QSeH-1BkWn+XC32;clvg{aS%^}N#l`_WH=c=i^*(5p|+EzQjkdOHw@Ps ze;sKCzKJk_eFqlB_C?{4WoQ`-mOp{~^5-#{WmCgAJ84Uv5EVBl%X^GW`|}do&|sP9 zZ?YT#Bs`3XgDErRfjnjcYWV=)9GOJDL@^8Cp;|tJL&3pS*gh;T0m%xZVED=;Izbe2 zTxmhkP4Ejt{H_&4WDH;D4X$P!&3OkWK}__Z%CbD(%?F9bV|ia?X9IW)BoppLn6A?- z0*6}!F=v+0JUU{BU)JUe_;Ou&{9h@&)BI70K+Z!Z<9Kbh+&*s}%+^3LkhuVUDY@1g zd>5MD0lf=j)8MR?VJttbmahTBL;^n({vx{^w?u)O$p$r0D>*J~X)7}W>cjLy%WVi} z1on4BO6Q;xa2FLlfop|EkvTfX2iXmHCTxfT`Lkd#6?5xo4N>$$#iwgahh0jsV`8(M zpj~8@qxsQ!BXV_YUyXK4_qN37Eqcon?JuRYY^}8@m0PY+Xub6x?iQ6Pxh9KL3_?eo z8zD0yug~+`@|R|b3S8qsMyOhbyRm*Qih}lkV8dLwixs6)*#np~YM;J_VuV2Qhz)tV66>qI?UQ zadiw-SL7o~wu|NbB7!Sv1@Co9lXBWWw`SuCH{667az(`u^XouwD`)_Q;~F=l8L|Ipt0=-qS&AK*x67%`5DC*8CoD> zYZ2=7ojrCeD7)lBam@}$YM(X1mlp#=Ref5l3V!V`WIudY!eLpN+>k}YS|iOY@({z1 z*98*@RDvfTy?&*OUgO{nmGe=hkLQtlBqU~Y$zDr0@A5h-n%y%uq~-gL@5*zr4eCK+ zCQ#<)crU1v@!e6BV-{``?~GWE({6wnNjV~PFexKusQBl!9UFSlcNuX__U4YksqU31 zZF+zgVoI0bBZWu?HBQ!o%z>SjLg%FgA}ii<7A+0%O*Y>UuuY; zCoo|MTfOM!$R#c`ha3?JAn?iF+yfUqrnop0dH@fuYs1n8j^gGK?nrk8Cz#1246tOG znBbo^er2)D1uj0U2kBzXyh12d-nvz&Dt^N+r2T8FFVE$2Hy?Y}dhU5grguPFb_x~k z4&Bi6w;YbU@yYt5FIS`eN9{Tl5;l$V{;xJgg**3RQFtnHj*lba{noKw&AprVGLNOk zmGhtXRmujz?$x#tTjcKa!Tp?~J7wdzvqGgIs5b~>Zm;DdihXupEd`k)7SUY=@jg!4 zF?&=?BuAX(W2(k$cG8BEy^2)uh}2y>EZPtslcPI2lyv)_aLB;~&{`JUlZ4 zR$=4oCUAi>rM|o_1~x)^{672=-GUxPM65emhvG8g=raMKj;-Y26?_yJ#0sl`#d8Bd zJAR|VPhPVH62(b48i#kFB1Gc3EWk0`_H1*8d_HTC*X2E#NLDUz={j?DUp!hND# zEQ>>tX_F-d>3v3*%q+E~f92D}AdRC71XR60?>ZhD(yk#RH;4<0yvZY$cpzHNpnNSatjko(%irDJWPQB%}pG744Fa*4-NCeB33!{tXJYUqY zT<0mcOeEU>W`jW)vZq3Yi@zf{z$GI$J%QPzvAF0$WOSL$3m%2PXIqax37`_*oO_NB zMF=z$P@FJcpc^s`2aw^+YJ|=+JZx^yE&x`Ww(^UC?HCH=a&?hlamZmOfHEiU{%)Sz z!|(T;k)2+rJtJ$ug}SvTXm9yaSes$XNLjKIj({)qQ*G{Oil!YJqR&k6nr&{&ApHzT zyN&G0Yj)ydloqCWkK-nA$M{DB>kz&Qi{!V0Il>^nFH??tEZ$s>pdH-U=yAt_V>2#0liC`<-cg04+M z7Q1HO@{5KOHEDXVL5N}6)=?&OG>)O*!o_Vk0Q-a?6zi5yL!qw*?~o1_{`hu?!E2jZ zE*OAa`fM#eE}>YwnPc$GemK1{3mP8B^&{Y^LB}}n`9thD1}DWCfdb&!MK<0IIzcWf 
zo?hWnoZeB{>2#`nVHDpU=hz$)}X7P&+35Z{XuWQ5I-z4V5UExfeo zU2JD9;A;%tpy0A08zz~HKjJCv5)h4_AWG7>!LfKx-oONQiK_(cqDrqW{MF9)MITFr ziykc83#APQatj6+b*;;sHeaNJiS;~WiMBjJZS9~Ftjb(IsdpcpAc&Gs{NbD=zo1{p zT|S1MvY{>th2^)}5$rvZ&%cnM68eYOcxAYK1&w=#w!B3zaqis)zy{Ew4Yz&>btupjB19i zG2R~%#Pkcq>g@Y<O1$wHN~C(Bgks7bxX-(Fos{j zq_CjGhA!{Uq ztAHQorJ*#KPP#sqre%yDM83h37hg=UP+L-{I0#!?U!H}hxWklaVjBnf(|^6HRR>hW!(mlZ-LRq$=I`hOUR;SMs3Og)7at! z@uCxX{LQ#lbT!J7u1$e@;9{mrA0GJWQ6#NIphFqHWiUifS+GZd(td|SJrGwj2vvhh z?$De)s|_NQW1C!+>(|XK2&R_6Rfyho#}@K0qyqMxE#aq8?nj}N&~naU)hHVZ@BLUG zfm7t$#`;ymNAK|D(}sPx=&u#6qkDM`E#wv!w&SyhDyp<`h>L&I?842B_WOF2_u|LM zgUJyRtg>a{&PU$?_$C)`l(mP$gb5C_$Sq|RB1(ngIprFa>itOu3hbG8K?=N}qvNKAZ;)?N!KtuyC=K9$sLzw|aapu5e!WO5BmxHibW!=-Ik<*FWP-w#-a*^Fet~EJIl=Y?(j05M8ZmOv_1U&umufFp?ld<~YUZE{nwzO|BxsBX- zx0;Wev`(u7O7>&2n)hc~aGz%g$3v)ctp_Ih|XKt<^&wi_U4YE7+t`9P4_9F_W4X_SXfAT|I>DbX$ zEKaMc1sDdr5A z2Hp%Cf+yjC5yX|iY@|VLk(E^?aa7al3K%3f!7tg6?|@pCY%gM;1!X41u>xeE(t(;b zEQ?hDH$EtxO_A z1Y`^(T7}3MWeN}=N*IJdLV%F@oO{&%?{n|{QttVfJWsOsUVFXkUGG|ZA0{-A5`<2n zaY6cEv#Y#&1GpEgWFV)Kp2?%-Ifa3rJeVuzcb<~B#DdXn!!XJTMab5s?#sv8(4Mz% z3(qiU2o7eDEi$g3&=bM@!ZjaCoD!D}Ecnb}rXDJ~C8ADojZAFLgEEG7_?IDH0*%Wd zMamoP8YeJgrgJ#ZP(k29c5)vO%$k9=$VPtJ^6Fk7>0NeoGR+He6-A1{=*v)ZnEgU0 z0j`0;!iE%$hs@hZdjh+pv?%az_n)KNK^n1?z5tDX!giVdSJM(U1tf0=w*!7(wT(%z z43OcmhnvvBVZgCkN{7`EDOW1)43!?UvQOdN2J`b)4Z>s&-*FhRl5SpQeDD}yb3y)++|^xU*92>sUjs4~>?%dZ zoQU=SNT!`I3styr(b;8Sx8$~H4y4 zu{{96q{DAdpewNgtKKuP089n7Ovl~!=m9D#>w>qW_8*0^*Sn!%i1H6z46LJZN5>kW zPBx-WsNOer>3`%IuTc)>s+OsOJ+O*=&Gv!n4H_bVC`ipv1oCUyP)B;zlCb)8`noez zsGE;;@k~5KgM5SlM`fCs1W+wN_b{TQC6Yu*0?0cE<#V71oO$s%VuEWGj|vTl;on@9 zC3<#meM{a0(8v&F4`d*sm5FO?MwBGC)Mf236NybAzj!A)3d$EjOg@mybC+!UUQfjcmX2-NUh>s2sHC`S(0ZKNAd|7-a>mAkU9rr4n?KHzQ}ndpM&%! 
z7$NdKzMhb?1>`>r=1Y;JaCR4SdFXw^qru)WDb zt`mCFIi1s;j#1I_N7A^x|Ag32tD7)i#mLhb_&w010nAkjgV?q8&~$NSzM~vGhJ(&8 zKq|~{tp+UsH9AEPh4Q`;(9gGy#-y`3GIF<&Ho>U4O{+%=PbkbT zwvlGcMpG8hxKntMxNlUQ4wCbsvAVu@$bMQS1jlCCZ4M)4fdH)LoiiXosFp1|hw|Tn zxqrJZY4_bri|UHsfF$o;wmYd}xY1*%mR%H?5U@~NrW^TZwZd*|60WRvScSOZrGd6N zlWli{^Yt$s`zP7?WKb$o20WP(Y8#q#F>weiGt3oWQ71fD z$mpdVw_hRK&}L`PP3U>#jm<;xZjqKxo~(DT2y#ppnEqZ@p{y-q3*6+3U93^CmO71( z{T?guD5*#u`H=oBV8>vnQm8DNqVP*1kh0-?wJj-e7z_(uB3XjX!{v~`c$J1Kju6H4 z7Ml2U)`vr70#owoL8>D3(ei#MRDhia99&4Io(*@9w&RYhRgzoH(a3u2u4SitDhq_| zvM}>{p@_{K43!T{$WM{M-m@c8b5B~uP&vS!HVs3;F=&9g}Aj&AH z&w_6Z$)CWmGFCy%a~gAj(e(iyK#Ytorw*=;2j06}E?@@fZz01msUBHGEm^hJ*wUk5 zTYzl&^g`AR_3r02s_4&DaRBHdK4oFL=RK=sAp3N-BM5iC-Mpnb$-@}i7+m76YSImFB3 zCh28v4w3Tkk|z5Ep*YbgqHGLCO5;U1FenuK`>Q1@5OD^vBwoHGBb&lQ5d;~6t5ODO ziyfzlj*bw=iwYi|7tuIx`ci+9>{@V}!2Esh5Kh{|etGtHSIG3aV%E`w@8=YWP$u<>Cstg`ujJH^_7wLjmjv=%2ErqxqIl`~Xw| zx4XYDTjfgwSQo}NS0?O^+XFO-L9K)7G$vmu52pGUm3ek6TAKv3f`edC$D*VD$LS`; za5~cJInHcpnvt$( zSr(l{8in^e8U`9OkjDUe+{@b9l!0NvLl$We&WYhDdXLT~{*bukhce=^AlF?j~ zR0P-osGCDd;qZ;YsZp?eRg@$z8OsM~>d-y1GM(hXK}X47F7SbAA{%=AD-o8Ufhe6n zyCD(VAw(`r0g*CfA7vQIm(d2UcY|6pbXQ89*6{ss!+OXOs=qNc3Jt>m=6fz`E1*1s zz##zhtf51U+>6nHAlNLT;jJ2_t76vj`QMhsWZSuXl{daCfIyRu4w~;QoDPTXYIrz% zLNz+KIpnjD7^ebwa%_lnWT3jJ-QqQgl}zg*?*=?4W5R%+a!p?zl}5`Yho4_MSv}ZS9CIi&V80!{I*$V-!rdXvCir^Wit8TYzn7Y9nFmW{q@T zeA-p*m>cw4+(PEE)$|?0e?L$gj$l+Aek}-_rj~zD7hJA#XgQ*#h!}}D&lQ-J1dR(P z$kNn|ZsdJHkpfvEdq-49nT`osr7bm4hHrH^yZXH`b_t?{Au%0(fLBRW(E}^pw&<1v zBx;pZsr&c}VF>!b;UJ5U%FB(5;8NecQ;E&X6&kCDeei(bg&jH`{`Nl~nkU=6Zuz_I zCdJ-rO5fCI!VR)H2|Za1ia+zE{~^-y-rM?T!sH9VJJ?0>7wxmn_*NDvPpeWlLF4s{ z^nhO_=bFYf_?6(^mN<*l5l1WjWdLV+R{rU4X%_*f+IP2US^nKCo4QWZ#*9dl%R>r; zl@(|x6}%r9d6;nU-kyLdB+M|{BFijYh#0)WHe6B#H9UxR z#{5WJXl7GR7mDW+TIq0fJblVbdBlC(vo}A0IQ_zd-RqxJq+5q$F8u%%;E3E)$&`}} zb2De?xk5N8__VK}Y(;f>*L{^Oxl4Pu0lA;Y^_~2UpUy*TN@Cvb<-s6xg<9!j1D0`; zsmD@$Exr z>!*Abi@<0b7p>o~|F??}pbJTamls+H5}t13?5Rwl*J+}Crp|_pgI0vSQf=x#d``%H 
z$z-Y*peHoVX|k;#bP14M_4?`3PtX6%mY^2JOG|v43066NC%h5<>ycZpr*|Dt)l&aaBewzu)uauL#_U(t`HIL+0PDZ2>o*xcw3qr zA0HINAHm#V8|$j>g726R!y}^8AtN@b+C-YBtf(7ST^S2=@aPfIc@S(!1lyERFocsh zh&pB-ssbXUN~ichny>B6HLWI9BpkMc5Lh!}+qd6GwdYQM`@Pr0X58xi@SWS0NlL** zk)jV?`ED>hcDM;80lN8t-s@8;vZvINk)M`9&Jg(sp6fFk{(3t&-<7ySD3HRR%jv7$ z`ltVhF5Np|RP2&@ulZV1#FOs24%Af2S_XB7YVUsfa!#v!g+tD9j!hSi#K-HBQtlgZ z7v8+Cox9*Z&3X!y>Vq4lW|sV)7u2V669kM~-T^@N7vTF5h&4A=IM@HXK4T)z#f2{E z)9wNN4A$zECw=np!SLcWAcmMQBf}XMW0?(^xAwS#x|luP*H$$3{g*^_?zfn zR`v6+p4KJil}c`*myGTQIJyuMUP0SwjsDXiCd$tOqmTR*VCC_5)c-QAzkrDK;_`Q2 z58s~g`#K?Z$+`bC9_J$~0g}gPK;CrXwpk(B-ivuAPqz~6{N7)4ICH%A?OMjFG|jNE z01Gjln20$OJO8D~zkiD|*^pI=Q}5ht5E>j2{nm4^=XrkNMZ9F%if@w&%~E?!r*2DL zXRZXHEQ3GeFESZpqC(Ij41e60Tqn)^cl{Uyt$VoX&Z78l!k9HqB;DSL{|pgujCsoM zZP30#Y(nw;j+RmSyZwEZrmIzd&O4RcN$+!LvK_Sk+O)!a7oB^)ylgHWS8ek?;{~fj z35sFs8163ZYJin3ekgNU!yq}x2S^`+jzcElCCakW?y_#Y8g_HfyyAU*r1ym6 z_sv=U@uI85xt77@;GH60Ho$yDV#Y$?zeuu-+r%!6e|h05!W-gb zj*3>Ht+LOKu)-;IzE#i3b8GmXtTFd;@i(rv8r$_|>RSBZFKQ8@)2p%Q%9j5{OB%RW znw!!zEUT-Zd+FPQ5%H0BqlrWAyYgb{ve=j2mvz=Be*o?vPJoxr5@hBxIBm!(`d_2c z9Mc&Q$xd3?&j*4mA|M!_AscEVGjQ=0`nz`Pi}Oe6MFO22ln2zQMO(f90=G|2Uq78l zY$8D2Ma_TgkDjCMu8DkkLxn7SxL|PVC7#Uez#K-Bo_Mo1?^o#b#Fr>hpvZY-TYY&s(JAGZ@$Q*E z+e`b!>qP+Ul!Qss`b03Y#+=EEU{Z%Z|K5B?6<&K5bMkc9%M2w%pw|c6a9J zaM$>L(UKVmhg|!R5#D4(##4t(ROs7Xpq9*_bZRvs5<6R*LkO%?Q*8f2S*8{9+PD?{ z;&;e5?g$=!^Y?HES>fmKOG)Zf(vGyw}k$6HI+mNR9@l%*(*J0EN2inV=+y{&MI+YrC2g$~-JXIbm zYEI=Jv0e*8GbWhpeAFYbrP%gBCoA1EfBHwdK>dael(hS{~7CdzpiM0P0 zdeD8V52sy#vlcC{t!FOwT6rbvC~Q7^IZMc*&Nr;$L;n}Z^@#4F?VHG&U1^ zeYUU3(zf!*m)HBd26Edktut)zn&8`LEgyJ9P}`n(uPyK(eZ_544*BW%W_{a=(9waT zDS^6r^@l2h`>=AtQnv;-lals4V!0*Zp8h|xo}5p+3OXwvzBx3GnGmnUt!Y?GPdWymCrH@kwp}ni|+5P4Y~&$oF+Q zrm+4@``G!=OO<=&T_{}&Mim`uJa(UfPW78oN?J<_hs;IhU#bH6X3x6(c5X81O*bt| z6&3{L7spthaMCHY*gzWaI7~3^UKi>OiV+L;eri-h55uhXsqX2&Ub=3!^!#GpO#X;$ zmD_N-%dzdi;oa^i9}yqq*-XQ z=uI)|cbDq&SAL?|k&S7?c4nPnoYDiriH*U{m|BZS;d)R@teImA)9hr-zU%n)|K=UH zF6~<1f?7VP5%V(Pm*2YMWUA}i3>4X?q=z?RY*GvJRvX 
zj~OFU0z7MW=kM(Q$9wz1j>(!>F^NN_zdhNs^K>0M{+K-Vt{~yYm!j@o^ z%P#5WCFw~1`hp;f-Ig_m6*O7r*v$R6R%*2wxb1sYT6~i_xn3{+!=|6U1`^o#AAaQe zu(sLFb?zn4c}oMLOxqKjVq>zrM(c+5HRkm<6`J4Yi4eXOsm)8 z1kMYx2Nmaitw3sneS~2~XWLiLGROCF&HmSIJl^0)b4Xq(IMO&TxbGLLbmB_MO{j|6 zIXY?b$GTkGL5FTTv&UhsiN#-5?hI14LbS4K*nam?{%X?4FUc8Z4CDRF5*a?mm#q;k zY2540N__VqL4vKjv8KWC68$e^mjA%sfpxFf$DkZ0{Em1kzZ#6{z8xH&@mrkb+(fHs zy(iJjA;qGM6woQ<0Ofq`@N9aKE=jWDvSE0+^)sX2CyOY+@VosDg!4ey{VDy=blLO+ zMTUFaYZGTv)#Cdo{!s+g(zH^tmW_r00zOtZYWcz>Yalx!w!+??72;lA+Ei@+hf<% zuN=IL-2b8S;g%e$^Iz8N2_}$JdTtncp0VQjWn6ujeUafk&D%S+gpV8&BrkwP6BB4M_iRTbaTiu4y&13dI8#V zSDbn&pA=i$Zd9@G>ZbasnHTP)R#RK6xoyGp-b~8k5;oX`2-(7samaH6;yuUPb^R8u_*R4+anv6CtCHIVWphBXCDR=f|&D(x)mcJD?ePe#t8f-P=5jB+4 zNWtdM%PQiVy6#`i$-?P++3nagX-?Xtj`juj*y8eP_0}Sr!8VVsGRe$oZT+??RRiX{ zG3{-K(^HVT=bkNg8(GPCShdES{gU&VGpohBXAUiAge0{=m%!$2N#dPO5!|1CN__kU zpD!avVsqM!He}_Y@^8LU#D8`gKHF6E>ZU!eIB_c4DU~K8r3B#aohBE)TtiHwqcOPV z?(fEm#{HVeFE1Wy3|5XA#WzS}KLqUx-RB%0KOa9Gc1qcI?iSPt#w|IF|Lvtb>liW} z(yeL0zZ7oB6($tILjzg?V~_wKo`GTv8*3U*>_=Ii&P+s?(+vphPm&b?fR|Ie{=XXE zfi-J<1O%1-jaoEr`#xlcL-eA@^q_HeOvSOTfV~%`r|x|c{ft>?3jQTDbp#qerw}0iVNZ}z4ws`mXNSh}i$1X{ zh)O_mzSEd-LEYoZquRq0kM;$FCR`KOH6S)lPH}(}U0ooe<*H-4{J++zD7oHT-eKsGdthMZh)v(fS<2=|GNxKG zfRoGTWAg2Jx_Jc=ddl5VCtYs*y zIzXtTl_J;-BuU5U62G;)uO#L9nj#ak2BjOL?R?Fg7&?~M+@U9 z`x0~wm-R{O-|FavJiBj&7NdLkCcItYb#h$D-JI#4UojF|uEU#|G1y%cf7Y$I6jQl+ zH@Ag&w-IW&CFLNgd>8?L3f>}D_kor_C(dXFhUIb}xmrCn=KjX(hU@zHhF+&SW$Gd0 zHK;jOV7sZUGWIvdrp;}8QWXzNw*iukGygW>)W*NxEmh&gs!c*e{&m(w|WO<`4W zTmY}|Msedru*PkpS&KM12T4X;M2^s`tR_$VVu8J8)k`7ZAB!3`Q3!w6j1{T#4byOe z*f)k&2zihXBu3M*+NY}Y>pfo=oq`Hx=02$uv~+&BpWlPc7_H%)p*&Uf}2l^XC&nj?XxdKdMQDo=w!ELJKh|KPyHb~dDqTlTM zOJ|{UXbP{lbyCxmnKJluywc=$J%TbMcAljeZRkuu$uGJPuJQ4k8)K$!{ZhB2pC@}W z^b)11M)Hd%e8Y>ZvSsR=AUqnuOL%Dbo*`i>3$CV7z?nyc6r0=pxp#oYsn1D>8!npo zfAq>)pPi;N4X5?IcD**ncYet7UCTydOj5$Ogr#5fi?SW}!KYqx-{M_coyuw|tHOz* z|Gb)dDo@g(QB#Z4eV5RqzW#Y={vnJ2$!09@HESP47%CPkPJLxt9rw_7izaSCsLVk? 
zjX1r9;V_o(CDPK+R%aP|9DH9!{q4X|4RguF%d2Jx3Jjjl@_6Ze9yS+S z^x(+p7_u0Pl{DlY`YrA?iRpBbF!0&c_t0tujgQDrDR1y5PF&+WQiZ-xbQ_5kI_th= zEfW+Yc%}z8+EGF`8>PJr67pVg?kZ;r7xCLQT(2%0kB@k?I3%ZRYwMcMl<@cL9u1@#q9Vs?g}n*IlM`-Img559Mh{v{wlZ>cle zy_dV9ut$uso{Y@Z+7sJ_z}mqUHW=n3tQ zB%7EtYJQ{)NkQtK?scpECE@)u|hb9x0QU^!>Ca%TP$kRV&hmTa3bxVxZz zFDdv}))I_`Q^LANAqG`pt%U;$;qFB!N7?U9<{CqK-!4CSLNf1up4WuSv@G6f86@l}$M|)OVKAbz;WG^{9 zy*5c#jCfCJy`|xoEfb@8QT3kLIFd@9gV(P0^sud!F!Iy)NDK)PuS^L~>lbT&pRPXO z%sTz|sSlgsE0KLosP|bTrYS@E?O{LU5bYW)e}e-+K9W_vwO)N2#ok9#7$!xHJK}j* zbOcnBW747Qk=pP`1W8V2xv$bL^(gd~{ozC1C}e(zp$vhDWXa+}+G|!m3BaV8PSl`I zo?$!QV$3PCRk(k-*7yWHo41j=-Y^PmzsA+g{Qa9dv zJtHt*Bg^a#OCwL^@avgHpXBSI|H{MeKSGqnWh-VjR9d@r6DPf|s@b+$@paY-!FU8b zw|huks;YqPQ-kls?-SMNhqpq+5vYv{pX%@o^9CuIF`avKR*_lL7d}*U&>5yeh{{BQ z_1u}HKP653MUltbQ}U}mle+ClvS*D=|MaaloS`Zm^JBvmzm%ut5Yo>_*D)OM-d2S+duvf D5sW7; diff --git a/install.sh b/install.sh index 4bd90a9b..6b675d6c 100755 --- a/install.sh +++ b/install.sh @@ -1,10 +1,11 @@ # Change current directory into project root original_dir=$(pwd) -script_dir=$(dirname "$0") +script_dir=$(realpath "$(dirname "$0")") cd "$script_dir" # Remove old dist file, build, and install -rm -rf dist +rm -rf build dist +rm -rf *.egg-info python setup.py bdist_wheel pip install dist/*.whl diff --git a/setup.py b/setup.py index 52add160..72db0500 100644 --- a/setup.py +++ b/setup.py @@ -4,32 +4,26 @@ import subprocess from setuptools import find_packages from setuptools.command.build_py import build_py -from setuptools.command.develop import develop +from torch.utils.cpp_extension import CppExtension, CUDA_HOME current_dir = os.path.dirname(os.path.realpath(__file__)) -jit_include_dirs = ('deep_gemm/include/deep_gemm', ) -third_party_include_dirs = ( +cxx_flags = ['-std=c++20', '-O3', '-fPIC', '-Wno-psabi'] +sources = ['csrc/python_api.cpp'] +build_include_dirs = [ + f'{CUDA_HOME}/include', + 'deep_gemm/include', + 'third-party/cutlass/include', + 'third-party/fmt/include', +] +build_libraries = 
['cuda', 'cudart'] +build_library_dirs = [ + f'{CUDA_HOME}/lib64', + f'{CUDA_HOME}/lib64/stub' +] +third_party_include_dirs = [ 'third-party/cutlass/include/cute', 'third-party/cutlass/include/cutlass', -) - - -class PostDevelopCommand(develop): - def run(self): - self.make_jit_include_symlinks() - - @staticmethod - def make_jit_include_symlinks(): - # Make symbolic links of third-party include directories - for d in third_party_include_dirs: - dirname = d.split('/')[-1] - src_dir = f'{current_dir}/{d}' - dst_dir = f'{current_dir}/deep_gemm/include/{dirname}' - assert os.path.exists(src_dir) - if os.path.exists(dst_dir): - assert os.path.islink(dst_dir) - os.unlink(dst_dir) - os.symlink(src_dir, dst_dir, target_is_directory=True) +] class CustomBuildPy(build_py): @@ -46,7 +40,7 @@ def run(self): def generate_default_envs(self): code = '# Pre-installed environment variables\n' code += 'persistent_envs = dict()\n' - for name in ('DG_JIT_CACHE_HOME_DIR', 'DG_JIT_CACHE_SHARED_USERS'): + for name in ('DG_JIT_CACHE_DIR', 'DG_JIT_PRINT_COMPILER_COMMAND', 'DG_JIT_DISABLE_SHORTCUT_CACHE'): code += f"persistent_envs['{name}'] = '{os.environ[name]}'\n" if name in os.environ else '' with open(os.path.join(self.build_lib, 'deep_gemm', 'envs.py'), 'w') as f: @@ -79,9 +73,10 @@ def prepare_includes(self): except: revision = '' + # noinspection PyTypeChecker setuptools.setup( name='deep_gemm', - version='1.1.0' + revision, + version='2.0.0' + revision, packages=find_packages('.'), package_data={ 'deep_gemm': [ @@ -90,9 +85,16 @@ def prepare_includes(self): 'include/cutlass/**/*', ] }, + ext_modules=[ + CppExtension(name='deep_gemm_cpp', + sources=sources, + include_dirs=build_include_dirs, + libraries=build_libraries, + library_dirs=build_library_dirs, + extra_compile_args=cxx_flags) + ], zip_safe=False, cmdclass={ - 'develop': PostDevelopCommand, 'build_py': CustomBuildPy, }, ) diff --git a/tests/generators.py b/tests/generators.py index 8f0484ac..a0597ad0 100644 --- 
a/tests/generators.py +++ b/tests/generators.py @@ -1,52 +1,148 @@ +import enum import random import torch -from typing import Tuple +from typing import Generator, Tuple, List -from deep_gemm.utils.math import align, ceil_div, per_token_cast_to_fp8, per_block_cast_to_fp8 -from deep_gemm.utils.layout import MajorTypeAB, get_m_alignment_for_contiguous_layout +from deep_gemm.utils import ( + align, ceil_div, + per_token_cast_to_fp8, per_channel_cast_to_fp8, per_block_cast_to_fp8, + get_mk_alignment_for_contiguous_layout +) -def enumerate_normal(): - for m in (128, 4096): - for k, n in [(7168, 2112), (1536, 24576), (512, 32768), (16384, 7168), (7168, 4096), (2048, 7168)]: - for major_a, major_b in ((MajorTypeAB.KMajor, MajorTypeAB.KMajor), (MajorTypeAB.KMajor, MajorTypeAB.MNMajor), - (MajorTypeAB.MNMajor, MajorTypeAB.KMajor), (MajorTypeAB.MNMajor, MajorTypeAB.MNMajor)): - for out_dtype in (torch.bfloat16, torch.float): - for accumulate in (False, ) if out_dtype == torch.bfloat16 else (False, True): - yield m, k, n, major_a, major_b, accumulate, out_dtype +class KernelType(enum.Enum): + # For SM100 GEMMs + Kernel1D1D = 0 + Kernel1D2D = 1 + def is_1d1d(self): + return self.value == 0 -def enumerate_grouped_contiguous(): - for num_groups, expected_m_per_group, k, n in ((4, 8192, 7168, 4096), (4, 8192, 2048, 7168), (8, 4096, 7168, 4096), (8, 4096, 2048, 7168)): - for major_a, major_b in ((MajorTypeAB.KMajor, MajorTypeAB.KMajor), (MajorTypeAB.KMajor, MajorTypeAB.MNMajor)): - yield num_groups, expected_m_per_group, k, n, major_a, major_b + def is_1d2d(self): + return self.value == 1 -def enumerate_grouped_masked(): - for num_groups, m in ((1, 1024), (2, 512), (4, 256)): - for k, n in ((7168, 4096), (2048, 7168), ): - yield num_groups, m, k, n +class MajorTypeAB(enum.Enum): + KMajor = 0 + MNMajor = 1 + def is_k_major(self): + return self.value == 0 -def generate_normal(m: int, k: int, n: int, + def is_mn_major(self): + return self.value == 1 + + +def get_arch_major() -> int: 
+ major, minor = torch.cuda.get_device_capability() + return major + + +def get_ue8m0_usage(kernel_type: KernelType) -> bool: + if get_arch_major() == 9: + return False + return kernel_type.is_1d1d() + + +def get_kernel_types() -> tuple: + return (KernelType.Kernel1D2D, ) if get_arch_major() == 9 else (KernelType.Kernel1D1D, KernelType.Kernel1D2D) + + +def get_out_dtype() -> tuple: + return (torch.bfloat16, ) if get_arch_major() == 9 else (torch.bfloat16, torch.float) + + +def get_major_ab(freeze_a: bool) -> tuple: + if get_arch_major() == 9: + return ((MajorTypeAB.KMajor, MajorTypeAB.KMajor), ) + if freeze_a: + return (MajorTypeAB.KMajor, MajorTypeAB.KMajor), (MajorTypeAB.KMajor, MajorTypeAB.MNMajor) + return (MajorTypeAB.KMajor, MajorTypeAB.KMajor), (MajorTypeAB.KMajor, MajorTypeAB.MNMajor), \ + (MajorTypeAB.MNMajor, MajorTypeAB.KMajor), (MajorTypeAB.MNMajor, MajorTypeAB.MNMajor) + + +def enumerate_normal() -> Generator: + for kernel_type in get_kernel_types(): + for m in (128, 4096): + for n, k in [(2112, 7168), (24576, 1536), (32768, 512), (7168, 16384), (4096, 7168), (7168, 2048)]: + for major_a, major_b in get_major_ab(False): + for out_dtype in get_out_dtype(): + for accumulate in (False, ) if out_dtype == torch.bfloat16 or kernel_type.is_1d2d() else (False, True): + yield kernel_type, m, n, k, major_a, major_b, accumulate, out_dtype + + +def enumerate_m_grouped_contiguous() -> Generator: + for kernel_type in get_kernel_types(): + for num_groups, expected_m_per_group, n, k in ((4, 8192, 4096, 7168), (4, 8192, 7168, 2048), (8, 4096, 4096, 7168), (8, 4096, 7168, 2048)): + for major_a, major_b in get_major_ab(True): + yield kernel_type, num_groups, expected_m_per_group, n, k, major_a, major_b + + +def enumerate_m_grouped_masked() -> Generator: + max_m = 4096 + for kernel_type in get_kernel_types(): + for num_groups, m in ((1, 1024), (2, 512), (4, 256)): + for n, k in ((4096, 7168), (7168, 2048), ): + yield kernel_type, num_groups, max_m, m, n, k + + +def 
enumerate_k_grouped_contiguous(): + # TODO: support SM90 kernels + if get_arch_major() == 9: + return [] + + # Must with FP32 accumulation and 1D1D kernels + for num_groups, m, n, expected_k_per_group in (( 4, 4096, 7168, 8192), ( 4, 7168, 2048, 8192), # EP64 + ( 8, 4096, 7168, 4096), ( 8, 7168, 2048, 4096), # EP32 + (16, 4096, 7168, 2048), (16, 7168, 2048, 2048)): # EP16 + ks = [align(int(expected_k_per_group * random.uniform(0.7, 1.3)), get_mk_alignment_for_contiguous_layout()) for _ in range(num_groups)] + yield num_groups, m, n, ks, expected_k_per_group + + +def enumerate_sf_layout(): + for with_transpose in (True, False): + for mn in (4096, 4097, 8192): + for k in (128, 7168, 7296): + for num_groups in (1, 2, 4) if with_transpose else (1, ): + if num_groups > 1 and (mn * ceil_div(k, 128)) % 4 != 0: + continue + if not with_transpose and mn % 4 != 0: + continue + yield mn, k, with_transpose, num_groups + + +def enumerate_k_grouped_sf_layout(): + alignment = get_mk_alignment_for_contiguous_layout() + assert alignment % 128 == 0 + for mn in (4096, 7168): + for num_groups, avg_k in ((16, 2048), (8, 4096), (72, 384), (128, 256)): + ks = [align(int(random.uniform(0.7, 1.3) * avg_k), alignment) for _ in range(num_groups)] + yield mn, ks, num_groups + + +def generate_normal(m: int, n: int, k: int, major_a: MajorTypeAB, major_b: MajorTypeAB, - accumulate: bool, out_dtype: torch.dtype): + accumulate: bool, out_dtype: torch.dtype, + use_ue8m0: bool): a = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) b = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) - c = torch.randn((m, n), device='cuda', dtype=out_dtype) * 64 if accumulate else None - d = torch.empty((m, n), device='cuda', dtype=out_dtype) + d = torch.randn((m, n), device='cuda', dtype=out_dtype) * 32 if accumulate else \ + torch.empty((m, n), device='cuda', dtype=out_dtype) + c = d if accumulate else None ref_d = (a.float() @ b.float().t() + (c if accumulate else 0)).to(out_dtype) - a_fp8, b_fp8 = 
per_token_cast_to_fp8(a), per_block_cast_to_fp8(b) - a_fp8 = a_fp8 if major_a == MajorTypeAB.KMajor else (a_fp8[0].T.contiguous().T, a_fp8[1]) - b_fp8 = b_fp8 if major_b == MajorTypeAB.KMajor else (b_fp8[0].T.contiguous().T, b_fp8[1]) + a_fp8, b_fp8 = per_token_cast_to_fp8(a, use_ue8m0=use_ue8m0), per_block_cast_to_fp8(b, use_ue8m0=use_ue8m0) + a_fp8 = a_fp8 if major_a.is_k_major() else (a_fp8[0].T.contiguous().T, a_fp8[1]) + b_fp8 = b_fp8 if major_b.is_k_major() else (b_fp8[0].T.contiguous().T, b_fp8[1]) return a_fp8, b_fp8, c, d, ref_d -def generate_grouped_contiguous(num_groups: int, expected_m_per_group: int, k: int, n: int, major_a: MajorTypeAB, major_b: MajorTypeAB) -> \ +def generate_m_grouped_contiguous(num_groups: int, expected_m_per_group: int, n: int, k: int, + major_a: MajorTypeAB, major_b: MajorTypeAB, use_ue8m0: bool) -> \ Tuple[int, Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: - group_ms = [align(int(expected_m_per_group * random.uniform(0.7, 1.3)), get_m_alignment_for_contiguous_layout()) for _ in range(num_groups)] - m = sum(group_ms) + actual_ms = [int(expected_m_per_group * random.uniform(0.7, 1.3)) for _ in range(num_groups)] + aligned_ms = [align(actual_m, get_mk_alignment_for_contiguous_layout()) for actual_m in actual_ms] + m = sum(aligned_ms) a = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) b = torch.randn((num_groups, n, k), device='cuda', dtype=torch.bfloat16) @@ -55,33 +151,62 @@ def generate_grouped_contiguous(num_groups: int, expected_m_per_group: int, k: i ref_d = torch.randn((m, n), device='cuda', dtype=torch.bfloat16) start = 0 - for i, group_m in enumerate(group_ms): - end = start + group_m - m_indices[start:end] = i - ref_d[start:end] = a[start:end] @ b[i].t() - start = end - - assert major_a == MajorTypeAB.KMajor - a_fp8 = per_token_cast_to_fp8(a) + for i, (actual_m, aligned_m) in enumerate(zip(actual_ms, aligned_ms)): + actual_end = start + actual_m + 
aligned_end = start + aligned_m + m_indices[start:actual_end] = i + m_indices[actual_end:aligned_end] = -1 + ref_d[start:aligned_end] = a[start:aligned_end] @ b[i].t() + start = aligned_end + ref_d = torch.where((m_indices == -1).unsqueeze(1), torch.zeros_like(ref_d), ref_d) + + assert major_a.is_k_major() + a_fp8 = per_token_cast_to_fp8(a, use_ue8m0=use_ue8m0) b_fp8 = (torch.empty_like(b, dtype=torch.float8_e4m3fn), torch.empty((num_groups, ceil_div(n, 128), ceil_div(k, 128)), device='cuda', dtype=torch.float)) for i in range(num_groups): - b_fp8[0][i], b_fp8[1][i] = per_block_cast_to_fp8(b[i]) - b_fp8 = b_fp8 if major_b == MajorTypeAB.KMajor else (b_fp8[0].mT.contiguous().mT, b_fp8[1]) + b_fp8[0][i], b_fp8[1][i] = per_block_cast_to_fp8(b[i], use_ue8m0=use_ue8m0) + b_fp8 = b_fp8 if major_b.is_k_major() else (b_fp8[0].mT.contiguous().mT, b_fp8[1]) return m, a_fp8, b_fp8, m_indices, d, ref_d -def generate_grouped_masked(num_groups: int, m: int, k: int, n: int) -> \ - Tuple[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor]: - a = torch.randn((num_groups, m, k), device='cuda', dtype=torch.bfloat16) +def generate_m_grouped_masked(num_groups: int, max_m: int, expected_m_per_group: int, n: int, k: int, use_ue8m0: bool) -> \ + Tuple[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor], torch.Tensor, torch.Tensor, torch.Tensor]: + a = torch.randn((num_groups, max_m, k), device='cuda', dtype=torch.bfloat16) b = torch.randn((num_groups, n, k), device='cuda', dtype=torch.bfloat16) - d = torch.empty((num_groups, m, n), device='cuda', dtype=torch.bfloat16) + d = torch.empty((num_groups, max_m, n), device='cuda', dtype=torch.bfloat16) ref_d = torch.einsum('gmk,gnk->gmn', a, b) - a_fp8 = (torch.empty_like(a, dtype=torch.float8_e4m3fn), torch.empty((num_groups, m, ceil_div(k, 128)), device='cuda', dtype=torch.float)) + a_fp8 = (torch.empty_like(a, dtype=torch.float8_e4m3fn), torch.empty((num_groups, max_m, ceil_div(k, 
128)), device='cuda', dtype=torch.float)) b_fp8 = (torch.empty_like(b, dtype=torch.float8_e4m3fn), torch.empty((num_groups, ceil_div(n, 128), ceil_div(k, 128)), device='cuda', dtype=torch.float)) for i in range(num_groups): - a_fp8[0][i], a_fp8[1][i] = per_token_cast_to_fp8(a[i]) - b_fp8[0][i], b_fp8[1][i] = per_block_cast_to_fp8(b[i]) + a_fp8[0][i], a_fp8[1][i] = per_token_cast_to_fp8(a[i], use_ue8m0=use_ue8m0) + b_fp8[0][i], b_fp8[1][i] = per_block_cast_to_fp8(b[i], use_ue8m0=use_ue8m0) + + masked_m = torch.empty((num_groups, ), device='cuda', dtype=torch.int) + for j in range(num_groups): + masked_m[j] = int(expected_m_per_group * random.uniform(0.7, 1.3)) + assert masked_m.amax().item() <= max_m + + return a_fp8, b_fp8, masked_m, d, ref_d + + +def generate_k_grouped_contiguous(num_groups: int, m: int, n: int, ks: List[int], use_ue8m0: bool): + assert get_mk_alignment_for_contiguous_layout() % 128 == 0 + k = sum(ks) + + a = torch.randn((k, m), device='cuda', dtype=torch.bfloat16) + b = torch.randn((k, n), device='cuda', dtype=torch.bfloat16) + c = torch.randn((num_groups, m, n), device='cuda', dtype=torch.float) * 32 + d = c + ref_d = torch.empty_like(c) + + start = 0 + for i, group_k in enumerate(ks): + end = start + group_k + ref_d[i] = c[i] + (a[start:end].T @ b[start:end]) + start = end - return a_fp8, b_fp8, d, ref_d + a_fp8 = per_channel_cast_to_fp8(a, use_ue8m0=use_ue8m0) + b_fp8 = per_channel_cast_to_fp8(b, use_ue8m0=use_ue8m0) + return k, a_fp8, b_fp8, c, d, ref_d diff --git a/tests/test_core.py b/tests/test_core.py index c3f4a29d..d9ddc75d 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -1,104 +1,161 @@ -# PyTorch has its own NVRTC, which may have a lower version than the system -# So try to disable PyTorch's NVRTC, or import NVRTC before PyTorch -import cuda.bindings.nvrtc as nvrtc -print(f'NVRTC version: {nvrtc.nvrtcVersion()[1:]}') - +import copy import random +import time import torch import deep_gemm -from deep_gemm.utils.layout import 
MajorTypeAB -from deep_gemm.testing.bench import bench_kineto -from deep_gemm.testing.numeric import calc_diff, count_bytes +from deep_gemm.testing import ( + bench, bench_kineto, + calc_diff, count_bytes +) from generators import ( - enumerate_normal, enumerate_grouped_contiguous, enumerate_grouped_masked, - generate_normal, generate_grouped_contiguous, generate_grouped_masked, + KernelType, get_ue8m0_usage, + enumerate_normal, enumerate_m_grouped_contiguous, enumerate_m_grouped_masked, enumerate_k_grouped_contiguous, + generate_normal, generate_m_grouped_contiguous, generate_m_grouped_masked, generate_k_grouped_contiguous ) def test_gemm() -> None: print('Testing GEMM:') - for m, k, n, major_a, major_b, accumulate, out_dtype in enumerate_normal(): - major_opt = 'N' if major_a == MajorTypeAB.KMajor else 'T' - major_opt += 'T' if major_b == MajorTypeAB.KMajor else 'N' + for kernel_type, m, n, k, major_a, major_b, accumulate, out_dtype in enumerate_normal(): + major_opt = 'N' if major_a.is_k_major() else 'T' + major_opt += 'T' if major_b.is_k_major() else 'N' out_opt = 'FP32' if out_dtype == torch.float else 'BF16' - acc_opt = f'accumulate={int(accumulate)}' - - a, b, c, d, ref_d = generate_normal(m, k, n, major_a, major_b, accumulate, out_dtype) - deep_gemm.fp8_gemm_nt(a, b, d, c=c) - diff = calc_diff(d, ref_d) - assert diff < 0.001, f'{m=}, {k=}, {n=}, {major_opt=}, {diff:.5f}' + acc_opt = f'acc={int(accumulate)}' + kernel_opt = f'1D1D' if kernel_type.is_1d1d() else '1D2D' + use_ue8m0 = get_ue8m0_usage(kernel_type) + disable_ue8m0_cast = not use_ue8m0 + + for test_alias in (False, True): + a, b, c, d, ref_d = generate_normal(m, n, k, major_a, major_b, accumulate, out_dtype, use_ue8m0=use_ue8m0) + func_name = f'fp8_gemm_{major_opt.lower() if test_alias else "nt"}' + if test_alias: + a = a if major_a.is_k_major() else (a[0].T, a[1].T) + b = b if major_b.is_k_major() else (b[0].T, b[1].T) + assert a[0].is_contiguous() and b[0].is_contiguous() + getattr(deep_gemm, 
func_name)(a, b, d, c=c, disable_ue8m0_cast=disable_ue8m0_cast) + diff = calc_diff(d, ref_d) + assert diff < 0.001, (f'{m=}, {n=}, {k=}, {kernel_opt}, {major_opt=}, {accumulate=}, {out_dtype=}, ' + f'{diff:.5f}, alias={test_alias}') + a, b, c, d, ref_d = generate_normal(m, n, k, major_a, major_b, accumulate, out_dtype, use_ue8m0=use_ue8m0) + + # Test launch overhead + launch_start_t = time.time_ns() + deep_gemm.fp8_gemm_nt(a, b, d, c=c, disable_ue8m0_cast=disable_ue8m0_cast) + launch_end_t = time.time_ns() + torch.cuda.synchronize() # noinspection PyShadowingNames def test_func(): - deep_gemm.fp8_gemm_nt(a, b, d) + deep_gemm.fp8_gemm_nt(a, b, d, c=c, disable_ue8m0_cast=disable_ue8m0_cast) t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) - print(f' > Perf (m={m:5}, n={n:5}, k={k:5}, MemLayout={major_opt}, {out_opt}, {acc_opt}):' - f'{t * 1e6:4.0f} us | ' + print(f' > Perf (m={m:5}, n={n:5}, k={k:5}, {kernel_opt}, layout={major_opt}, {out_opt}, {acc_opt}):' + f' launch {(launch_end_t - launch_start_t) / 1e3:4.0f} us | {t * 1e6:4.0f} us | ' f'{2 * m * n * k / t / 1e12:4.0f} TFLOPS | ' - f'{count_bytes((a, b, c, d)) / 1e9 / t:4.0f} GB/s') + f'{count_bytes(a, b, c, d) / 1e9 / t:4.0f} GB/s') print() def test_m_grouped_gemm_contiguous() -> None: - print('Testing grouped contiguous GEMM:') - - for num_groups, expected_m_per_group, k, n, major_a, major_b in enumerate_grouped_contiguous(): - # TODO: make a stronger test - major_opt = 'N' if major_a == MajorTypeAB.KMajor else 'T' - major_opt += 'T' if major_b == MajorTypeAB.KMajor else 'N' - - m, a, b, m_indices, d, ref_d = generate_grouped_contiguous(num_groups, expected_m_per_group, k, n, major_a, major_b) - deep_gemm.m_grouped_fp8_gemm_nt_contiguous(a, b, d, m_indices) - diff = calc_diff(d, ref_d) - assert diff < 0.001, f'{m=}, {k=}, {n=}, {major_opt}, {diff:.5f}' + print('Testing m-grouped contiguous GEMM:') + + for kernel_type, num_groups, expected_m_per_group, n, k, major_a, major_b in 
enumerate_m_grouped_contiguous(): + major_opt = 'N' if major_a.is_k_major() else 'T' + major_opt += 'T' if major_b.is_k_major() else 'N' + kernel_opt = f'1D1D' if kernel_type.is_1d1d() else '1D2D' + use_ue8m0 = get_ue8m0_usage(kernel_type) + disable_ue8m0_cast = not use_ue8m0 + + for test_alias in (False, True): + m, a, b, m_indices, d, ref_d = generate_m_grouped_contiguous(num_groups, expected_m_per_group, n, k, major_a, major_b, use_ue8m0=use_ue8m0) + func_name = f"m_grouped_fp8_gemm_{(major_opt.lower() if test_alias else 'nt')}_contiguous" + if test_alias: + assert major_a.is_k_major() + b = b if major_b.is_k_major() else (b[0].mT, b[1].mT) + assert a[0].is_contiguous() and b[0].is_contiguous() + getattr(deep_gemm, func_name)(a, b, d, m_indices, disable_ue8m0_cast=disable_ue8m0_cast) + d = torch.where((m_indices == -1).unsqueeze(1), torch.zeros_like(d), d) + diff = calc_diff(d, ref_d) + assert diff < 0.001, f'{m=}, {n=}, {k=}, {major_opt}, {kernel_opt}, {diff:.5f}, alias={test_alias}' + m, a, b, m_indices, d, ref_d = generate_m_grouped_contiguous(num_groups, expected_m_per_group, n, k, major_a, major_b, use_ue8m0=use_ue8m0) # noinspection PyShadowingNames def test_func(): - deep_gemm.m_grouped_fp8_gemm_nt_contiguous(a, b, d, m_indices) + deep_gemm.m_grouped_fp8_gemm_nt_contiguous(a, b, d, m_indices, disable_ue8m0_cast=disable_ue8m0_cast) t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) - print(f' > Perf ({num_groups=}, m={m:5}, n={n:5}, k={k:5}, MemLayout={major_opt}): ' + print(f' > Perf ({num_groups=}, m={m:5}, n={n:5}, k={k:5}, {kernel_opt}, layout={major_opt}): ' f'{t * 1e6:4.0f} us | ' f'{2 * m * n * k / t / 1e12:4.0f} TFLOPS | ' - f'{count_bytes((a, b, d)) / 1e9 / t:4.0f} GB/s') + f'{count_bytes(a, b, d) / 1e9 / t:4.0f} GB/s') print() def test_m_grouped_gemm_masked() -> None: - print('Testing grouped masked GEMM:') + print('Testing m-grouped masked GEMM:') + + # TODO: when the actual `m` is greater than `expected_m_per_group`, 
efficiency may significantly decrease. + for kernel_type, num_groups, max_m, expected_m_per_group, n, k in enumerate_m_grouped_masked(): + kernel_opt = f'1D1D' if kernel_type.is_1d1d() else '1D2D' + use_ue8m0 = get_ue8m0_usage(kernel_type) + disable_ue8m0_cast = not use_ue8m0 - # TODO: merge Hopper's tests - for num_groups, m, k, n in enumerate_grouped_masked(): # Test correctness - masked_m_candidates = list(filter(lambda candidate: candidate <= m, (128, 256, 384))) for i in range(10): - a, b, d, ref_d = generate_grouped_masked(num_groups, m, k, n) - masked_m = torch.empty((num_groups, ), device='cuda', dtype=torch.int) - for j in range(num_groups): - masked_m[j] = random.choice(masked_m_candidates) - expected_m = min(int(masked_m.float().mean()) + 1, m) - deep_gemm.fp8_m_grouped_gemm_nt_masked(a, b, d, masked_m, expected_m) + a, b, masked_m, d, ref_d = generate_m_grouped_masked(num_groups, max_m, expected_m_per_group, n, k, use_ue8m0=use_ue8m0) + deep_gemm.fp8_m_grouped_gemm_nt_masked(a, b, d, masked_m, expected_m_per_group, disable_ue8m0_cast=disable_ue8m0_cast) for j in range(num_groups): diff = calc_diff(d[j, :masked_m[j].item()], ref_d[j, :masked_m[j].item()]) - assert diff < 0.001, f'{m=}, {k=}, {n=}, {j=}, masked_m={masked_m[j]}, {num_groups=}, {diff:.5f}' + assert diff < 0.001, f'{max_m=}, {n=}, {k=}, {j=}, masked_m={masked_m[j]}, {kernel_opt}, {num_groups=}, {diff:.5f}' # Construct full cases - a, b, d, ref_d = generate_grouped_masked(num_groups, m, k, n) - masked_m = torch.ones((num_groups, ), device='cuda', dtype=torch.int) * m + a, b, masked_m, d, ref_d = generate_m_grouped_masked(num_groups, max_m, expected_m_per_group, n, k, use_ue8m0=use_ue8m0) # noinspection PyShadowingNames def test_func(): - deep_gemm.fp8_m_grouped_gemm_nt_masked(a, b, d, masked_m, m) + deep_gemm.fp8_m_grouped_gemm_nt_masked(a, b, d, masked_m, expected_m_per_group, disable_ue8m0_cast=disable_ue8m0_cast) # Test performance with fixed shapes + valid_m = masked_m.sum().item() t = 
bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) - print(f' > Perf ({num_groups=}, m_per_group={m:4}, n={n:4}, k={k:4}): ' + print(f' > Perf ({num_groups=}, expected_m_per_group={expected_m_per_group:4}, n={n:4}, k={k:4}, {kernel_opt}): ' f'{t * 1e6:4.0f} us | ' - f'{2 * num_groups * m * n * k / t / 1e12:4.0f} TFLOPS | ' - f'{count_bytes((a, b, d)) / 1e9 / t:4.0f} GB/s') + f'{2 * valid_m * n * k / t / 1e12:4.0f} TFLOPS | ' + f'{(count_bytes(a, d) * valid_m / (max_m * num_groups) + count_bytes(b)) / 1e9 / t:4.0f} GB/s') + print() + + +def test_k_grouped_gemm_contiguous() -> None: + print('Testing k-grouped contiguous GEMM:') + + for num_groups, m, n, ks, expected_k_per_group in enumerate_k_grouped_contiguous(): + use_ue8m0 = get_ue8m0_usage(KernelType.Kernel1D1D) + + for test_empty_groups in (False, True): + new_ks = copy.deepcopy(ks) + if test_empty_groups: + new_ks[random.randint(0, num_groups - 1)] = 0 + k, a, b, c, d, ref_d = generate_k_grouped_contiguous(num_groups, m, n, new_ks, use_ue8m0=use_ue8m0) + new_ks_tensor = torch.tensor(new_ks, dtype=torch.int, device='cuda') + deep_gemm.k_grouped_fp8_gemm_tn_contiguous(a, b, d, new_ks, new_ks_tensor, c=c) + diff = calc_diff(d, ref_d) + assert diff < 0.001, f'{m=}, {n=}, {k=}, {test_empty_groups=}, {diff:.5f}' + + # Test performance + k, a, b, c, d, ref_d = generate_k_grouped_contiguous(num_groups, m, n, ks, use_ue8m0=use_ue8m0) + ks_tensor = torch.tensor(ks, dtype=torch.int, device='cuda') + + # noinspection PyShadowingNames + def test_func(): + deep_gemm.k_grouped_fp8_gemm_tn_contiguous(a, b, d, ks, ks_tensor, c=c) + + t = bench_kineto(test_func, 'fp8_gemm', suppress_kineto_output=True) + print(f' > Perf ({num_groups=:2}, m={m:5}, n={n:5}, k={k:5}): ' + f'{t * 1e6:4.0f} us | ' + f'{2 * m * n * k / t / 1e12:4.0f} TFLOPS | ' + f'{count_bytes(a, b, c, d) / 1e9 / t:4.0f} GB/s') print() @@ -114,3 +171,4 @@ def test_func(): test_gemm() test_m_grouped_gemm_contiguous() test_m_grouped_gemm_masked() + 
test_k_grouped_gemm_contiguous() diff --git a/tests/test_jit.py b/tests/test_jit.py deleted file mode 100644 index 26b7b36c..00000000 --- a/tests/test_jit.py +++ /dev/null @@ -1,98 +0,0 @@ -import ctypes -import os -import torch -import cuda.bindings.driver as cbd -from typing import Any, Dict - -from deep_gemm import jit - -# Essential debugging staffs -os.environ['DG_JIT_DEBUG'] = os.getenv('DG_JIT_DEBUG', '1') -os.environ['DG_JIT_DISABLE_CACHE'] = os.getenv('DG_JIT_DISABLE_CACHE', '1') - - -class VectorAddRuntime(jit.Runtime): - def __init__(self, path: str) -> None: - super().__init__(path) - - @staticmethod - def generate(kwargs: Dict[str, Any]) -> str: - return f""" -#ifdef __CUDACC_RTC__ -#include -#else -#include -#endif - -#include -#include - -template -__global__ void vector_add(T* a, T* b, T* c, uint32_t n) {{ - uint32_t i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < n) {{ - c[i] = a[i] + b[i]; - }} -}} - -static void __instantiate_kernel() {{ - auto ptr = reinterpret_cast(&vector_add<{kwargs['T']}>); -}} -""" - - # noinspection PyShadowingNames,PyMethodOverriding - @staticmethod - def launch(kernel: cbd.CUkernel, kwargs: Dict[str, Any]) -> cbd.CUresult: - assert kwargs['A'].shape == kwargs['B'].shape == kwargs['C'].shape - assert kwargs['A'].device == kwargs['B'].device == kwargs['C'].device - assert kwargs['A'].dim() == 1 - - config = cbd.CUlaunchConfig() - config.gridDimX = (kwargs['A'].numel() + 127) // 128 - config.gridDimY = 1 - config.gridDimZ = 1 - config.blockDimX = 128 - config.blockDimY = 1 - config.blockDimZ = 1 - config.hStream = kwargs['STREAM'] - - arg_values = ( - kwargs['A'].data_ptr(), - kwargs['B'].data_ptr(), - kwargs['C'].data_ptr(), - kwargs['A'].numel(), - ) - arg_types = ( - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_uint32, - ) - - return cbd.cuLaunchKernelEx(config, kernel, (arg_values, arg_types), 0)[0] - - -if __name__ == '__main__': - print('Generated code:') - kwargs = {'T': 'float'} - code = 
VectorAddRuntime.generate(kwargs) - print(code) - print() - - for compiler_name in ('NVCC', 'NVRTC'): - # Get compiler - compiler_cls = getattr(jit, f'{compiler_name}Compiler') - print(f'Compiler: {compiler_name}, version: {compiler_cls.__version__()}') - - # Build - print('Building ...') - func = compiler_cls.build('test_func', code, VectorAddRuntime, kwargs) - - # Run and check - a = torch.randn((1024, ), dtype=torch.float32, device='cuda') - b = torch.randn((1024, ), dtype=torch.float32, device='cuda') - c = torch.empty_like(a) - ret = func(A=a, B=b, C=c, STREAM=torch.cuda.current_stream().cuda_stream) - assert ret == cbd.CUresult.CUDA_SUCCESS, ret - torch.testing.assert_close(c, a + b) - print(f'JIT test for {compiler_name} passed\n') diff --git a/tests/test_layout.py b/tests/test_layout.py new file mode 100644 index 00000000..6cad6426 --- /dev/null +++ b/tests/test_layout.py @@ -0,0 +1,104 @@ +import time +import torch +import random +from deep_gemm.testing import bench_kineto, count_bytes +from deep_gemm.utils import ( + align, ceil_div, + per_token_cast_to_fp8, per_channel_cast_to_fp8, + get_tma_aligned_size, + get_mn_major_tma_aligned_packed_ue8m0_tensor, + get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor +) + +from generators import ( + enumerate_sf_layout, + enumerate_k_grouped_sf_layout +) + + +def get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl(x: torch.Tensor) -> torch.Tensor: + assert x.dtype == torch.float and x.dim() in (2, 3) + + # First, convert into UE8M0 `uint8_t` + ue8m0_tensor = (x.view(torch.int) >> 23).to(torch.uint8) + + # Second, make padded packed tensors + mn, k = x.shape[-2], x.shape[-1] + remove_dim = False + if x.dim() == 2: + x, remove_dim = x.unsqueeze(0), True + b = x.shape[0] + aligned_mn = get_tma_aligned_size(mn, 4) + aligned_k = align(k, 4) + padded = torch.zeros((b, aligned_mn, aligned_k), device=x.device, dtype=torch.uint8) + padded[:, :mn, :k] = ue8m0_tensor + padded = 
padded.view(-1).view(dtype=torch.int).view(b, aligned_mn, aligned_k // 4) + + # Finally, transpose + transposed = torch.zeros((b, aligned_k // 4, aligned_mn), device=x.device, dtype=torch.int).mT + transposed[:, :, :] = padded + aligned_x = transposed[:, :mn, :] + return aligned_x.squeeze(0) if remove_dim else aligned_x + + +def test_sf_layout_kernels() -> None: + print('Testing SF layout kernels:') + for mn, k, with_transpose, num_groups in enumerate_sf_layout(): + x = torch.randn((num_groups * mn, k), dtype=torch.bfloat16, device='cuda') + x, fp32_sf = per_token_cast_to_fp8(x, use_ue8m0=True) + fp32_sf = fp32_sf if num_groups == 1 else fp32_sf.view(num_groups, mn, -1) + fp32_sf = fp32_sf if with_transpose else fp32_sf.transpose(-1, -2).contiguous().transpose(-1, -2) + + # Correctness + packed_sf = get_mn_major_tma_aligned_packed_ue8m0_tensor(fp32_sf) + ref_packed_sf = get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl(fp32_sf) + assert torch.equal(packed_sf, ref_packed_sf), f'{mn=}, {k=}, {with_transpose=}, {num_groups=}' + assert packed_sf.shape == ref_packed_sf.shape + assert all([packed_sf.stride(i) == ref_packed_sf.stride(i) for i in range(packed_sf.dim())]) + + # Test launch overhead + launch_start_t = time.time_ns() + get_mn_major_tma_aligned_packed_ue8m0_tensor(fp32_sf) + launch_end_t = time.time_ns() + + # Performance + t = bench_kineto(lambda: get_mn_major_tma_aligned_packed_ue8m0_tensor(fp32_sf), 'pack_fp32_into_ue8m0') + print(f' > Perf ({num_groups=:2}, {mn=:5}, {k=:5}, transpose={int(with_transpose)}): ' + f'launch {(launch_end_t - launch_start_t) / 1e3:3.0f} us | {t * 1e6:4.0f} us | ' + f'{count_bytes(fp32_sf, packed_sf) / 1e9 / t:4.0f} GB/s') + print() + + +def test_k_grouped_sf_layout_kernels() -> None: + print('Testing k-grouped SF layout kernels:') + for mn, ks, num_groups in enumerate_k_grouped_sf_layout(): + sf_ks = [k // 128 for k in ks] + packed_sf_ks = [ceil_div(k, 512) for k in ks] + ks_tensor = torch.tensor(ks, dtype=torch.int, 
device='cuda') + x = torch.randn((sum(ks), mn), dtype=torch.bfloat16, device='cuda') + x, fp32_sf = per_channel_cast_to_fp8(x, use_ue8m0=True) + + # Correctness + packed_sf = get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(fp32_sf, ks_tensor, ks) + split_packed_sf = packed_sf.split(packed_sf_ks) + split_fp32_sf = fp32_sf.split(sf_ks) + for i in range(num_groups): + ref_packed_sf = get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl(split_fp32_sf[i].T).T + assert torch.equal(split_packed_sf[i], ref_packed_sf), f'{i=}' + + # Performance + t = bench_kineto(lambda: get_k_grouped_mn_major_tma_aligned_packed_ue8m0_tensor(fp32_sf, ks_tensor, ks), 'pack_fp32_into_ue8m0') + print(f' > Perf ({num_groups=:3}, {mn=:5}, sum_k={sum(ks):5}):' + f'{t * 1e6:4.0f} us | ' + f'{count_bytes(fp32_sf, packed_sf, ks_tensor) / 1e9 / t:4.0f} GB/s') + print() + + +if __name__ == '__main__': + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + torch.manual_seed(1) + random.seed(1) + + test_sf_layout_kernels() + test_k_grouped_sf_layout_kernels() From 2e937c4656963c961265fd08a0c025c93127dd1f Mon Sep 17 00:00:00 2001 From: Ray Wang Date: Thu, 17 Jul 2025 20:17:48 -0700 Subject: [PATCH 4/4] Add fmtlib submodule at commit 553ec11 --- third-party/fmt | 1 + 1 file changed, 1 insertion(+) create mode 160000 third-party/fmt diff --git a/third-party/fmt b/third-party/fmt new file mode 160000 index 00000000..553ec11e --- /dev/null +++ b/third-party/fmt @@ -0,0 +1 @@ +Subproject commit 553ec11ec06fbe0beebfbb45f9dc3c9eabd83d28