diff --git a/.github/workflows/ibm-ce-tests.yml b/.github/workflows/ibm-ce-tests.yml new file mode 100644 index 000000000..038793f28 --- /dev/null +++ b/.github/workflows/ibm-ce-tests.yml @@ -0,0 +1,91 @@ +name: IBM CE Tests + +on: + workflow_dispatch: + # this allows to run the workflow manually through the github dashboard + +env: + HAVE_LITHOPS_CONFIG: ${{ secrets.LITHOPS_CONFIG != '' }} + LITHOPS_CONFIG_FILE: /tmp/lithops_config.yaml + +jobs: + + determine_runnable_test_jobs: + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' + + outputs: + localhost: ${{ steps.script.outputs.localhost }} + code_engine: ${{ steps.script.outputs.code_engine }} + + steps: + - name: Set jobs to run + id: script + run: | + echo "localhost=true" >> $GITHUB_OUTPUT + echo "code_engine=false" >> $GITHUB_OUTPUT + + + localhost_tests: + runs-on: ubuntu-latest + needs: determine_runnable_test_jobs + if: needs.determine_runnable_test_jobs.outputs.localhost == 'true' + + steps: + - name: Clone Lithops repository + uses: actions/checkout@v4 + + - name: Install Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install Lithops + run: | + pip3 install -U .[tests] + + - name: Run Lithops tests + run: | + cd lithops/tests + pytest -v --backend localhost --storage localhost + + + ibm_ce_cos_tests: + runs-on: ubuntu-latest + needs: determine_runnable_test_jobs + if: needs.determine_runnable_test_jobs.outputs.code_engine == 'true' + + steps: + - name: Clone Lithops repository + uses: actions/checkout@v4 + + - name: Install Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install Lithops + run: | + pip3 install -U .[tests] + + - name: Install Lithops config + id: config + run: | + echo -n -e "${{ secrets.LITHOPS_CONFIG }}" > $LITHOPS_CONFIG_FILE + + - name: Build new runtime + run: | + docker login -u ${{ secrets.DOCKER_USER }} -p ${{ secrets.DOCKER_TOKEN }} + cd runtime/code_engine + lithops runtime build -f Dockerfile.githubci ${{ secrets.DOCKER_USER }}/lithops-ce-gihub-ci:${{ github.run_id }} -b code_engine + sed -i '/runtime: lithops-ce/c\ runtime: '${{ secrets.DOCKER_USER }}'/lithops-ce-gihub-ci:'${{ github.run_id }} $LITHOPS_CONFIG_FILE + + - name: Run Lithops tests + run: | + cd lithops/tests + pytest -v --backend code_engine --storage ibm_cos + + - name: Delete Lithops CE runtime + if: needs.determine_runnable_test_jobs.outputs.code_engine == 'true' + run: | + lithops runtime delete ${{ secrets.DOCKER_USER }}/lithops-ce-gihub-ci:${{ github.run_id }} -b code_engine -s ibm_cos diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml deleted file mode 100644 index 89ce84e6b..000000000 --- a/.github/workflows/main.yml +++ /dev/null @@ -1,134 +0,0 @@ -name: "on-pull-request" - -on: - pull_request: - branches: - - master - - workflow_dispatch: - # this allows to run the workflow manually through the github dashboard - -env: - HAVE_LITHOPS_CONFIG: ${{ secrets.LITHOPS_CONFIG != '' }} - LITHOPS_CONFIG_FILE: /tmp/lithops_config.yaml - -jobs: - - determine_runnable_jobs: - runs-on: ubuntu-latest - if: github.event_name == 'pull_request' || github.event_name == 'workflow_dispatch' - - outputs: - localhost: ${{ steps.script.outputs.localhost }} - ibm_cf: ${{ steps.script.outputs.ibm_cf }} - code_engine: ${{ steps.script.outputs.code_engine }} - - steps: - - name: Set jobs to run - id: script - run: | - echo "localhost=true" >> $GITHUB_OUTPUT - echo "ibm_cf=false" >> $GITHUB_OUTPUT 
- echo "code_engine=false" >> $GITHUB_OUTPUT - - - localhost_tests: - runs-on: ubuntu-latest - needs: determine_runnable_jobs - if: needs.determine_runnable_jobs.outputs.localhost == 'true' - - steps: - - name: Clone Lithops repository - uses: actions/checkout@v3 - - - name: Install Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - - name: Install Lithops - run: | - pip3 install -U . - - - name: Run Lithops tests - run: | - lithops test -b localhost -s localhost - - - ibm_ce_cos_tests: - runs-on: ubuntu-latest - needs: determine_runnable_jobs - if: needs.determine_runnable_jobs.outputs.code_engine == 'true' - - steps: - - name: Clone Lithops repository - uses: actions/checkout@v3 - - - name: Install Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - - name: Install Lithops - run: | - pip3 install -U . - - - name: Install Lithops config - id: config - run: | - echo -n -e "${{ secrets.LITHOPS_CONFIG }}" > $LITHOPS_CONFIG_FILE - - - name: Build new runtime - run: | - docker login -u ${{ secrets.DOCKER_USER }} -p ${{ secrets.DOCKER_TOKEN }} - cd runtime/code_engine - lithops runtime build -f Dockerfile.githubci ${{ secrets.DOCKER_USER }}/lithops-ce-gihub-ci:${{ github.run_id }} -b code_engine - sed -i '/runtime: lithops-ce/c\ runtime: '${{ secrets.DOCKER_USER }}'/lithops-ce-gihub-ci:'${{ github.run_id }} $LITHOPS_CONFIG_FILE - - - name: Run Lithops tests - run: | - lithops test -b code_engine -s ibm_cos -k - - - name: Delete Lithops CE runtime - if: needs.determine_runnable_jobs.outputs.code_engine == 'true' - run: | - lithops runtime delete ${{ secrets.DOCKER_USER }}/lithops-ce-gihub-ci:${{ github.run_id }} -b code_engine -s ibm_cos - - - ibm_cf_cos_tests: - runs-on: ubuntu-latest - needs: determine_runnable_jobs - if: needs.determine_runnable_jobs.outputs.ibm_cf == 'true' - - steps: - - name: Clone Lithops repository - uses: actions/checkout@v3 - - - name: Install Python 3.10 - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - - name: Install Lithops - run: | - pip3 install -U . 
- - - name: Install Lithops config - id: config - run: | - echo -n -e "${{ secrets.LITHOPS_CONFIG }}" > $LITHOPS_CONFIG_FILE - - - name: Build new runtime - run: | - docker login -u ${{ secrets.DOCKER_USER }} -p ${{ secrets.DOCKER_TOKEN }} - cd runtime/ibm_cf - lithops runtime build -f Dockerfile.githubci ${{ secrets.DOCKER_USER }}/lithops-cf-gihub-ci:${{ github.run_id }} -b ibm_cf - sed -i '/runtime: lithops-cf/c\ runtime: '${{ secrets.DOCKER_USER }}'/lithops-cf-gihub-ci:'${{ github.run_id }} $LITHOPS_CONFIG_FILE - - - name: Run Lithops tests - run: | - lithops test -b ibm_cf -s ibm_cos -k - - - name: Delete Lithops CF runtime - run: | - lithops runtime delete ${{ secrets.DOCKER_USER }}/lithops-cf-gihub-ci:${{ github.run_id }} -b ibm_cf -s ibm_cos diff --git a/.github/workflows/python-linting.yml b/.github/workflows/python-linting.yml new file mode 100644 index 000000000..0e953752b --- /dev/null +++ b/.github/workflows/python-linting.yml @@ -0,0 +1,39 @@ +name: Python Linting + +on: + pull_request: + branches: + - master + paths: + - 'setup.py' + - 'lithops/**' + + workflow_dispatch: + # this allows to run the workflow manually through the github dashboard + +jobs: + + flake8: + runs-on: ubuntu-latest + + steps: + - name: Clone Lithops repository + uses: actions/checkout@v4 + + - name: Install Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python3 -m pip install --upgrade pip + pip3 install -U flake8 + + - name: Install Lithops + run: | + pip3 install -U . + + - name: Lint with flake8 + run: | + flake8 lithops --count --max-line-length=180 --statistics --ignore W605,W503 diff --git a/.github/workflows/tests-all-os.yml b/.github/workflows/tests-all-os.yml new file mode 100644 index 000000000..7459380f8 --- /dev/null +++ b/.github/workflows/tests-all-os.yml @@ -0,0 +1,94 @@ +name: Tests all OS + +on: + workflow_dispatch: + +jobs: + localhost_tests: + runs-on: ${{ matrix.os }} + timeout-minutes: 6 + env: + OBJC_DISABLE_INITIALIZE_FORK_SAFETY: YES + + strategy: + fail-fast: False + matrix: + include: + # Linux + - os: ubuntu-latest + python-version: "3.10" + - os: ubuntu-latest + python-version: "3.11" + - os: ubuntu-latest + python-version: "3.12" + - os: ubuntu-latest + python-version: "3.13" + - os: ubuntu-22.04 + python-version: "3.10" + - os: ubuntu-22.04 + python-version: "3.11" + + # macOS + - os: macos-latest + python-version: "3.10" + - os: macos-latest + python-version: "3.11" + - os: macos-latest + python-version: "3.12" + - os: macos-latest + python-version: "3.13" + - os: macos-15 + python-version: "3.10" + - os: macos-15 + python-version: "3.11" + + # Windows + - os: windows-latest + python-version: "3.10" + - os: windows-latest + python-version: "3.11" + - os: windows-latest + python-version: "3.12" + - os: windows-latest + python-version: "3.13" + + steps: + - name: Clone Lithops repository + uses: actions/checkout@v4 + + - name: Install Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Lithops + run: | + pip3 install -U .[tests] + + - name: Create Lithops config file + run: | + mkdir -p $HOME/.lithops + echo "lithops:" >> $HOME/.lithops/config + echo " monitoring_interval: 0.1" >> $HOME/.lithops/config + echo " log_level: DEBUG" >> $HOME/.lithops/config + echo " include_modules: None" >> $HOME/.lithops/config + echo "localhost:" >> $HOME/.lithops/config + echo " version: 2" >> $HOME/.lithops/config + + - name: 
Run Lithops tests + run: | + cd lithops/tests + # pytest -v --durations=0 --backend localhost --storage localhost + pytest -v --durations=0 -o log_cli=true --log-cli-level=DEBUG --backend localhost --storage localhost + + - name: Display last 500 lines of the Lithops log file + if: cancelled() || failure() + shell: bash + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + tail -n 500 /tmp/lithops-runner/localhost-runner.log + elif [ "$RUNNER_OS" == "macOS" ]; then + tail -n 500 $TMPDIR/lithops-runner/localhost-runner.log + elif [ "$RUNNER_OS" == "Windows" ]; then + tail -n 500 "C:\Users\RUNNER~1\AppData\Local\Temp\lithops-root\localhost-runner.log" + fi diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..315f74ba1 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,41 @@ +name: Tests + +on: + pull_request: + branches: + - master + paths: + - 'setup.py' + - 'lithops/**' + + workflow_dispatch: + # this allows to run the workflow manually through the github dashboard + +jobs: + + localhost_tests: + runs-on: ubuntu-22.04 + timeout-minutes: 5 + + strategy: + fail-fast: false + matrix: + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] + + steps: + - name: Clone Lithops repository + uses: actions/checkout@v4 + + - name: Install Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Lithops + run: | + pip3 install -U .[tests] + + - name: Run Lithops tests + run: | + cd lithops/tests + pytest -v --backend localhost --storage localhost diff --git a/.gitignore b/.gitignore index 252671700..1817e5ee3 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,10 @@ dist/ plots/ *.egg-info lithops_*.zip +*.log +*.txt +*.csv +*.coverage* # Virtual environments .env diff --git a/CHANGELOG.md b/CHANGELOG.md index 484adc067..de44092d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,224 @@ # Changelog -## [v3.0.1.dev0] +## [v3.6.3.dev0] + +### Added +- + +### Changed +- + +### Fixed +- + + +## [v3.6.2] + +### Fixed +- [Localhost] Fix shutil.Error caused by existing __pycache__ directory when copying files in the runner +- [Executor] Make retry count configurable in RetryingFunctionExecutor + + +## [v3.6.1] + +### Fixed +- [GCP Functions] Poll long-running operations for GCP Function deployment +- [GCP Functions] Retry function creation on failure to improve reliability +- [IBM CE] Sanitize user_key in IBM CE to be RFC 1233 compliant +- [CLI] Fix storage list error +- [K8s] Fixed bug with first execution of K8s and Singularity +- [Core] Prevent job monitor from stopping abruptly on iteration error causing hanging jobs + + +## [v3.6.0] + +### Added +- [Core] Added support for python 3.13 +- [AWS EC2] Add support for configuring EBS volumes in EC2 lithops workers +- [AWS EC2] Add support for specifying CIDR block in EC2 public subnet + +### Fixed +- [Standalone] Fixed an issue causing workers to stop prematurely in Consume mode +- [Invoker] Reduced the number of threads used in the async FaaS Invoker +- [Monitoring] Fixed token bucket issue that prevented generating the correct number of tokens +- [Code Engine] Allow to build the default runtime with Python 3.13 +- [Monitoring] Fixed race condition in RMQ Monitor +- [AWS S3] Fixed InvalidLocationConstraint error in AWS S3 handled + + +## [v3.5.1] + +### Fixed +- [Core] Fix issue in "if self._call_output in future.py" for objects with ambiguous truth values +- [Standalone] Consume execution mode failing to 
run the installation script thus failing all the jobs +- [Azure VMs] Consume execution mode failing to execute jobs + + +## [v3.5.0] + +### Added +- [Singularity] Added new singularity compute backend +- [Oracle Functions] Added support for python 3.11 +- [k8s] Added 'master_timeout' parameter to k8s backend +- [AWS Lambda] Added user_tags to the runtime deployment + +### Fixed +- [Storage] Fixed "KeyError: 'monitoring_interval'" error when instantiating Storage() class +- [k8s] Fixed bug between threads when there are multiple executions +- [OpenWhisk] Fixed issue in the list_runtimes method +- [OpenWhisk] Fixed runtime name formatting for self hosted container registries + + +## [v3.4.1] + +### Added +- [Localhost] Added error capturing and logging for job/task process failures + +### Fixed +- [Worker] Fixed potential issue that can appear during 'func_obj' loading from cache + + +## [v3.4.0] + +### Added +- [CLI] Allow to pass a name in the "lithops runtime list" command +- [Ceph] Added extra region parameter to Ceph backend + +### Changed +- [Setup] Moved IBM and AWS deps to lithops[ibm] and lithops[aws] extra +- [Setup] Moved kubernetes and knative deps to lithops[kubernetes] and lithops[knative] extra +- [Setup] Moved minio, ceph and redis deps to lithops[minio], lithops[ceph] and lithops[redis] extra +- [Setup] Moved matplotlib, seaborn, numpy and pandas dependencies to lithops[plotting] extra +- [Setup] Removed unused 'lxml', 'docker' and 'python-dateutil' packages from the setup.py +- [Core] Detached progress bar from INFO logs +- [Future] Exposed 'wait_dur_sec' and 'retries' in future.wait() and future.get_result() methods +- [Localhost] Upgraded localhost backend v2 and set it as the default localhost backend +- [Localhost] Set monitoring_interval to 0.1 in the localhost storage backend +- [AWS Batch] Updated CPU and Memory resource limits + +### Fixed +- [AWS Lambda] Fixed wrong AWS Lambda delete runtime_name match semantics +- [Worker] Fixed potential issue that can appear during 'func_obj' loading from cache +- [Monitor] Fixed potential 'keyerror' exceptions +- [Swift] Fixed OpenStack Swift parameters and authentication by adding domain information +- [AWS Batch] Fixed missing ecsTaskJobRole + + +## [v3.3.0] + +### Added +- [Core] Added a mechanism to automatically retry failed tasks +- [Azure Containers] Automatically login to the container registry if the credentials are in the config + +### Changed +- [AWS] Eliminated the need for access and secret keys in the configuration +- [Tests] Moved tests from unittest to pytest + +### Fixed +- [AWS Lambda] Fixed runtime deletion with "lithops runtime delete" +- [Localhost] Fixed issue with the job manager +- [Serializer] Fix serialization bug which triggers side effects on dynamic attributes +- [Worker] Removed "distutils" lib imports as it is deprecated in python 3.12 +- [Serverless] Allow to build container runtimes with the MacBook Mx chip +- [K8s] Fixed task granularity calculation and memory units issue (RabbitMQ version) +- [AWS Lambda] Fixed AWS Lambda function Name for SSO users +- [AWS] Fixed generated user-key for SSO users +- [Azure VMs] Fixed worker creation and communication + + +## [v3.2.0] + +### Added +- [Lithops] Added support for Python 3.12 +- [CLI] Added "--include" parameter in "lithops image build" to allow users upload local files to the VM image + +### Changed +- [Standalone] Use redis in the master VM to store all the relevant data about jobs and workers +- [Standalone] Use redis to store the work queues 
+- [Standalone] Improved resiliency and worker granularity +- [CLI] Show the timestamp in the local timezone format on "lithops job list" +- [CLI] Show worker creation timestamp and time-to-dismantle on "lithops worker list" + +### Fixed +- [SSH Cli] Fixed minor error with the "err" variable +- [Cli] Fixed job status on "lithops job list" for standalone backends +- [Standalone] Fixed issue in the "lithops image build" that appears when the vpc is already created +- [Future] Fixed issue with missing 'worker_end_tstamp' variable + + +## [v3.1.2] + +### Added +- [Plots] Allow to set the figure size of the plots +- [Stats] Added new CPU, Memory and Network statistics in the function results +- [IBM VPC] Added a new parameter to enable/disable resource existence check in the platform + +### Changed +- [Config] Renamed 'customized_runtime' to 'runtime_include_function' +- [IBM VPC] Increased the total number of available IPs in the private network +- [Standalone] Do not stop the VM immediately after a job in the Consume mode + +### Fixed +- [Standalone] Fixed issue that appears when the invocation payload is too big +- [Invoker] Fixed "runtime_include_function" function/modules path +- [AWS EC2] Reset the public IP address of the master VM on stop + + +## [v3.1.1] + +### Added +- [k8s] Added a new way of invoking functions using a RabbitMQ work queue +- [IBM VPC] Added "zone" config parameter +- [IBM Code Engine] Get and print an error message in case of container execution failure + +### Changed +- [OpenWhisk] Updated default runtimes + +### Fixed +- [Standalone] Fixed issue with a wrong value of "chunksize" +- [IBM Code Engine] Fixed missing parameter on clean +- [Executor] Fixed potential deadlock in wait() and get_result() when an exception is produced in a function activation + + +## [v3.1.0] + +### Added +- [Cli] Added new 'lithops image delete' command for standalone backends +- [Cli] Added new 'lithops job list' command for standalone backends +- [Cli] Added new 'lithops worker list' command for standalone backends +- [AWS EC2] Added delete_image() method for deleting VM images through the cli +- [IBM VPC] Added delete_image() method for deleting VM images through the cli +- [localhost] New localhost backend v2 to maximize resource utilization when multiple maps are executed from the same FunctionExecutor +- [Standalone] Automatically retrieve the CPU_COUNT from the VM in case worker_processes is not set in config +- [Standalone] Keep track of the worker and job status +- [Storage] Include "Config" parameter to download_file() and upload_file() methods for boto3 related backends +- [Cli] Include 'worker name' in the 'lithops runtime list' cmd +- [AWS Lambda] Created 'namespace' config key to virtually separate worker deployments + +### Changed +- [Standalone] Changed default mode of execution from 'consume' to 'reuse' +- [Joblib] Updated the joblib backend to make it compatible with new versions of joblib +- [Joblib] Spawn only one function when 'prefer' is set to 'threads' +- [AWS EC2] Changed default image name from "lithops-worker-default" to "lithops-ubuntu-jammy-22.04-amd64-server" +- [IBM VPC] Changed default image name from "lithops-worker-default" to "lithops-ubuntu-22-04-3-minimal-amd64-1" +- [Serializer] Improve serializer performance when include_modules is set in config +- [SSH Client] Do not raise LithopsValidationError on Authentication failed +- [AWS Lambda] Renamed function name to "lithops-worker-xxxx" + +### Fixed +- [Job] Fixed max data size in the invocation payload 
+- [Multiprocessing] Fixed cpu_count +- [Standalone] Start new workers when the VM instance type changes (in reuse mode) +- [GCP Functions] Fixed issue with "function_url" variable +- [Standalone] Fixed multiple runtime usage at the same time in master VM +- [localhost] Get the correct docker/podman path for jobs that run in a container +- [k8s] Limit the size of the "user" label as the maximum allowed is 63 chars +- [Joblib] Fix shared objects utility when multiple maps run from the same executor +- [Azure VMs] Fix wrong exception when trying to connect to the master VM for the first time +- [Partitioner] Fix partitioner + + +## [v3.0.1] ### New - [OCI Functions] Added new 'Oracle Cloud Functions' serverless backend @@ -8,7 +226,7 @@ ### Added - [Kubernetes] Added Redis server in master pod for shared data between workers -- [Kubernetes] Allow to set "conntext" and "namespace" in lithops config +- [Kubernetes] Allow to set "context" and "namespace" in lithops config ### Changed - [CodeEngine] Create the CE project only when necessary instead of creating it always @@ -26,15 +244,8 @@ - [Azure Virtual Machines] Added new 'Azure Virtual Machines' standalone backend ### Added -- [AWS Lambda] Added support for python 3.10 runtimes -- [AWS Lambda] Added support for python 3.11 runtimes -- [Azure Functions] Added support for python 3.10 runtimes -- [Azure Functions] Added support for python 3.11 runtimes -- [Google Cloud Functions] Added support for python 3.11 runtimes -- [IBM CF] Added support for python 3.11 runtimes -- [Openwhisk] Added support for python 3.11 runtimes -- [Aliyun Functions] Added support for python 3.10 runtimes -- [Executor] Allow to set all the compute backend params programatically in the FunctionExecutor() +- [Serverless] Added support for python 3.10 and 3.11 runtimes +- [Executor] Allow to set all the compute backend params programmatically in the FunctionExecutor() - [AWS EC2] Allow to automatically create the VPC and all the necessary resources - [IBM VPC & AWS EC2] General fixes and Improvements - [Executor] Allow to pass the config file location in the FunctionExecutor() @@ -44,10 +255,10 @@ - [Cli] Added new 'lithops image build' command for standalone backends - [Cli] Added new 'lithops image list' command for standalone backends - [IBM VPC] Added build_image() method for automatically building VM images -- [IBM VPC] Added list_image() method for listing the availabe VM images +- [IBM VPC] Added list_image() method for listing the available VM images - [AWS EC2] Added build_image() method for automatically building VM images -- [AWS EC2] Added list_image() method for listing the availabe VM images -- [Azure VMS] Added list_image() method for listing the availabe VM images +- [AWS EC2] Added list_image() method for listing the available VM images +- [Azure VMS] Added list_image() method for listing the available VM images - [IBM CF] Automatically create a CF namespace if not provided in config - [IBM VPC] Added Madrid (Spain) region - [Code Engine] Automatically create a new project if not provided in config @@ -77,6 +288,7 @@ - [Multiprocessing] Check redis connection before starting to submit jobs - [Redis] Fixed redis backend exception regarding storage_bucket + ## [v2.9.0] ### Added @@ -142,7 +354,7 @@ - [Code Engine] Add CE conflict exception to retriables - [Core] Show logs from module_dependency.py - [GCP Functions] Fix runtime_build command -- [Infinispan] Fix infinispan storage backend +- [Infinispan] Fix Infinispan storage backend - [Core] Detect a Class 
if passed as a lithops input function @@ -221,7 +433,7 @@ - [Standalone] Fix cloudinit initialization script - [Future] Fix host_status_query_count stat - [Google Cloud Run] Fixed wrong variable name 'runtime_cpus' -- [Google Cloud] Changed docs for Google cloud backend refering to id instead of name +- [Google Cloud] Changed docs for Google cloud backend regarding to id instead of name ## [v2.5.8] @@ -238,7 +450,7 @@ ### Added - [AWS Batch] Added AWS Batch backend - [Standalone] Allow to start workers using a public key instead of using a password -- [Standalone] Added diferent levels of worker verification +- [Standalone] Added different levels of worker verification - [Infinispan] Added new Infinispan Hot Rod storage backend ### Fixed @@ -256,7 +468,7 @@ ### Added - [AWS_EC2] Added AWS EC2 Standalone backend - [AWS_EC2] Allow to start workers using Spot instances in AWS EC2 Standalone backend -- [Standalone] Added the logic to create the missing deleta of workers in reuse mode +- [Standalone] Added the logic to create the missing delta of workers in reuse mode - [Standalone] Cancel running job tasks on ctrl-c - [Standalone] New logic to verify that the master VM is correctly setup - [Standalone] Added new command "lithops attach" that allows to create live ssh connections to the master VM @@ -269,7 +481,7 @@ - [Standalone] Fixed VM initial installation script - [Standalone] Fixed get_workers method on master - [Standalone] Deleted unnecessary extra worker -- [Standalone] Ensure all workers are proppery started on reuse mode +- [Standalone] Ensure all workers are properly started on reuse mode - [Localhost] Fixed storage delete_objects method that was deleting the entire folder of a file - [IBM VPC] General fixes in IBM VPC backend @@ -277,24 +489,24 @@ ## [v2.5.5] ### Added -- [CLI] Allow to pass all availbe 'docker' parameter to 'lithops runtime build' command +- [CLI] Allow to pass all available 'docker' parameter to 'lithops runtime build' command - [Multiprocessing] Add example file with different argument passing examples for Pool and Process ### Fixed -- [Localhost] Fixed minnor issue when deleting completed jobs +- [Localhost] Fixed minor issue when deleting completed jobs - [Multiprocessing] Fixed args mismatch error when passing list of tuples to Pool.map -- [Standalone] Fixed cloud-init script that ocasionaly fails to set ssh credentials +- [Standalone] Fixed cloud-init script that occasionally fails to set ssh credentials ## [v2.5.4] -### Fixes +### Fixed - [Standalone] Avoid deleting the master VM on consume mode ## [v2.5.3] -### Fixes +### Fixed - [Core] Fixed lithops.map_reduce() jobs. 
Sometimes jobs where not finishing - [Core] Spawn lithops.cleaner only once in the same execution instance - [Tests] Fix when running 'lithops verify' command @@ -309,12 +521,12 @@ - [Core] Allow to spawn the reduce function in map_reduce() after a configurable percentage of completed map activations ### Changed -- [Config] 'max_workers' and 'worker_processess' keys must be set at backend level in config +- [Config] 'max_workers' and 'worker_processes' keys must be set at backend level in config - [Config] 'remote_invoker' key must be set at backend level in config - [Config] 'customized_runtime' key must be set at lithops level in config - [Config] 'serverless' section in config is no longer required -### Fixes +### Fixed - [CodeEngine] Fixed 'max_workers' parameter to limit the number of max workers per map invocation - [IBM CF] Create the runtime if not deployed when invoked - [Localhost] Fix localhost paths for windows hosts @@ -329,7 +541,7 @@ - [Localhost] Start container with user's uid:gid - [Localhost] Extended default execution timeout to 3600 seconds -### Fixes +### Fixed - [Standalone] Fixed standalone execution on consume mode - [Aliyun FC] Fixed Aliyun Function compute backend - [Core] Fixed 'lithops runtime build' command when the backend is not configured in config @@ -340,7 +552,7 @@ ### Added - [CLI] Add new command in cli to list deployed runtimes - [Standalone] Add reuse mode that allows to reuse the same VMs for all the maps -- [Config] alow to configure worker_processes parameter in serverless and standalone sections +- [Config] Allow to configure worker_processes parameter in serverless and standalone sections - [Localhost] Prevent multiple jobs in the same executor to run at the same time - [Standalone] Prevent multiple jobs submitted to the same master VM to run at the same time - [CE] Added COS Direct endpoints for free bandwidth from/to CodeEngine @@ -354,7 +566,7 @@ - [AWS Lambda] Add support for Python3.9 - [Standalone] ssh VM password is now a 37 chars random and dynamic password (for create and resue modes) -### Fixes +### Fixed - [CE] Create a new token when it expires after 20 minutes when using the same FunctionExecutor - [CE] Prevent exception when detecting the docker username in k8s and CE backends - [Core] Fix minor issue in jobrunner @@ -363,7 +575,7 @@ ## [v2.4.1] -### Fixes +### Fixed - [IBM VPC] Fixed a data inconsistency on consume mode ## [v2.4.0] @@ -380,8 +592,9 @@ ### Changed - [Core] Improved performance and efficiency of the lithops cleaner background process - [AWS Lambda] Use layer from Klayers API for pre-compiled Amazon Linux numpy binaries +- [Core] Moved invoke_pool_threads param from map and map_reduce calls. Now it must be set at backend level in config -### Fixes +### Fixed - [Localhost] Fixed error when processing localhost objects - [Localhost] Allow to create a localhost storage instance when a config file exists with a cloud configuration - [Core] Fixed an unusual inconsistency in configuration between 'backend' and 'mode' parameters @@ -391,9 +604,6 @@ - [Core] Fixed 'lithops storage list' CLI when a bucket is empty - [Standalone] Fixed execution -### Deleted -- [Core] Deleted invoke_pool_threads param from map and map_reduce calls. 
Now it must be set at backend level in config - ## [v2.3.5] @@ -408,7 +618,7 @@ - [Core] Add 'key' and 'bucket' attrs in localhost partitioner for compatibility with OS - [Serverless] runtime, runtime_memory and runtime_timeout can only be set at backend level -### Fixes +### Fixed - [Standalone] Fix execution - [Core] Avoid loading the config file twice @@ -431,7 +641,7 @@ - [multiprocessing] Improved nanomsg Pipe implementation - [joblib] Optimized joblib backend (concurrent args data upload/download) -### Fixes +### Fixed - [Core] Fixed module analyzer - [Core] Clear only present jobs instead of all after wait() or get_result() - [multiprocessing] Fix put/get slice to/from mp.Array or mp.RawArray @@ -439,7 +649,7 @@ ## [v2.3.3] -### Fixes +### Fixed - [Core] Allow to execute class methods as lithops function @@ -453,7 +663,7 @@ - [Core] New monitoring system - [Core] Deleted strong dependency to pika==0.13.1 -### Fixes +### Fixed - [Partitioner] Fixed partitioner when obj url contains more than one subfolder - [Cli] Fixed serverless runtime lifecycle methods @@ -484,7 +694,7 @@ - [Core] Improved worker when chunksize is set to values > 1 - [Core] Check lithops version mismatch in host instead of in worker -### Fixes +### Fixed - [Core] Overwrite the runtime set in config with the runtime set in the FunctionExecutor - [Cli] Fixed --config param in lithops cli - [Standalone] Fixed internal executions @@ -521,12 +731,18 @@ - [IBM VPC] Improved IBM VPC backend - [AWS Lambda] Lambda layer modules update -### Fixes +### Fixed - [Multiprocessing] Fix issues related to Pipes and Queues - [Multiprocessing] Fix multiprocessing.context methods - [CodeEngine/knative] Fix getting docker username in MAC OS hosts +## [v2.2.16] + +### Fixed +- [Code Engine] Fixing code engine docker image + + ## [v2.2.15] ### Added @@ -630,7 +846,7 @@ ### Added - [Core] Add joblib backend for scikit-learn -- [Cli] Add more config paramters in lithops cli +- [Cli] Add more config parameters in lithops cli - [IBM COS] Add 'region' config param - [Knative] Add 'min_instances', 'max_instances' and 'concurrency' config params @@ -750,18 +966,24 @@ - [Core] IBM VPC service client lib - [Docker] Docker backend compatible with IBM VPC VM -### Fixed -- [Ceph] Fix in ceph endpoint - ### Changed - [Docker] Improved Docker executor +### Fixed +- [Ceph] Fix in Ceph endpoint + + ## [v1.7.2] ### Added - [GCR] Added Google Cloud Run Backend + +### Changed +- [Core] Improved Storage abstraction +- [Core] InternalStorage uses storage abstraction + ### Fixed - [Core] Fixed invoker token bucket when quota limit is reached - [Core] Fixed logging @@ -770,11 +992,6 @@ - [Localhost] Fixed invocations ability to launch subprocesses - [Docker] Fixed docker running as user and not root -### Changed -- [Core] Improved Storage abstraction -- [Core] InternalStorage uses storage abstraction - - ## [v1.7.0] ### Added @@ -868,6 +1085,7 @@ - [Core] Fixed issue with windows hosts - [Core] Some other Internal fixes + ## [v1.4.2] ### Added @@ -1405,7 +1623,7 @@ - Moved some info prints to debug - improved remote function invocation mechanism -### Fixes +### Fixed - Fixing flask security issues CVE-2018-1000656 - Fixed minor issue when futures is not a list - Fixed default config exception. API KEY is not mandatory. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0eaaab558..bfd81394a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,23 +16,39 @@ To contribute a patch: 1. Break your work into small, single-purpose patches if possible. 
It's much harder to merge in a large change with a lot of disjoint features. 2. Submit the patch as a GitHub pull request against the master branch. -3. Make sure that your code passes the unit tests. -4. Make sure that your code passes the linter. -5. Add new unit tests for your code. +3. Make sure that your code passes the tests. +4. Make sure that your code passes the linter. Install `flake8` with `pip3 install flake8` and run the next command until you don't see any linitng error: + ```bash + flake8 lithops --count --max-line-length=180 --statistics --ignore W605,W503 + ``` +5. Add new tests for your code. -Unit testing ------------- +Testing +------- -To test that all is working as expected, run either: +To test that all is working as expected, you must install `pytest`, navigate to the tests folder `lithops/tests/`, and execute: +```bash +pytest -v +``` +If you made changes to a specific backend, please run tests on that backend. +For example, if you made changes to the AWS Lambda backend, execute the tests with: ```bash -$ lithops test +pytest -v --backend aws_lambda --storage aws_s3 ``` -or +You can list all the available tests using: ```bash -$ python3 -m lithops.tests.tests_main +pytest --collect-only ``` -Please follow the guidelines in [docs/testing.md](docs/source/testing.rst) for more details. \ No newline at end of file +To run a specific test or group of tests, use the `-k` parameter, for example: +```bash +pytest -v --backend localhost --storage localhost -k test_map +``` + +To view all the Lithops logs during the tests, and in DEBUG mode, execute: +```bash +pytest -o log_cli=true --log-cli-level=DEBUG --backend localhost --storage localhost +``` diff --git a/README.md b/README.md index 97d5e63b0..d48fc74a7 100644 --- a/README.md +++ b/README.md @@ -5,22 +5,12 @@

- - - - - - - - - +    PyPI - Downloads Ask DeepWiki

-Lithops is a Python multi-cloud distributed computing framework. It allows you to run unmodified local python code at massive scale in the main -serverless computing platforms. Lithops delivers the user’s code into the cloud without requiring knowledge of how it is deployed and run. Moreover, its multicloud-agnostic architecture ensures portability across cloud providers. - -Lithops is specially suited for highly-parallel programs with little or no need for communication between processes, but it also supports parallel applications that need to share state among processes. Examples of applications that run with Lithops include Monte Carlo simulations, deep learning and machine learning processes, metabolomics computations, and geospatial analytics, to name a few. +Lithops is a Python multi-cloud distributed computing framework that lets you run unmodified Python code at massive scale across cloud, HPC, and on-premise platforms. It supports major cloud providers and Kubernetes platforms, running your code transparently without requiring you to manage deployment or infrastructure. +Lithops is ideal for highly parallel workloads—such as Monte Carlo simulations, machine learning, metabolomics, or geospatial analytics—and lets you tailor execution to your priorities: you can optimize for performance using AWS Lambda to launch hundreds of functions in milliseconds, or reduce costs by running the same code on AWS Batch with Spot Instances. ## Installation @@ -37,7 +27,7 @@ Lithops is specially suited for highly-parallel programs with little or no need ``` ## Configuration -Lithops provides an extensible backend architecture (compute, storage) that is designed to work with different Cloud providers and on-premise backends. In this sense, you can code in python and run it unmodified in IBM Cloud, AWS, Azure, Google Cloud, Aliyun and Kubernetes or OpenShift. +Lithops provides an extensible backend architecture (compute, storage) designed to work with various cloud providers and on-premise platforms. You can write your code in Python and run it unmodified across major cloud providers and Kubernetes environments. [Follow these instructions to configure your compute and storage backends](config/) @@ -71,12 +61,12 @@ Lithops is shipped with 2 different high-level Compute APIs, and 2 high-level St ```python from lithops import FunctionExecutor -def hello(name): - return f'Hello {name}!' +def double(i): + return i * 2 with FunctionExecutor() as fexec: - fut = fexec.call_async(hello, 'World') - print(fut.result()) + f = fexec.map(double, [1, 2, 3, 4]) + print(f.result()) ``` @@ -120,14 +110,14 @@ if __name__ == "__main__": key='test.txt', body='Hello World') - print(st.get_object(bucket='mybucket', + print(st.get_object(bucket='lithops', key='test.txt')) ``` ```python -from lithops.storage.cloud_proxy import os +from lithops.storage.cloud_proxy import os if __name__ == "__main__": filepath = 'bar/foo.txt' @@ -146,47 +136,36 @@ if __name__ == "__main__": You can find more usage examples in the [examples](/examples) folder. -## Execution Modes - -Lithops is shipped with 3 different modes of execution. The execution mode allows you to decide where and how the functions are executed. - -* [Localhost Mode](docs/source/execution_modes.rst#localhost-mode) - - This mode allows you to execute functions on the local machine using processes, providing a convenient and efficient way to leverage Lithops' distributed computing capabilities without relying on cloud resources. 
This mode is particularly useful for development, testing, and debugging purposes. This is the default mode of execution if no configuration is provided. - -* [Serverless Mode](docs/source/execution_modes.rst#serverless-mode) - - This mode allows you to efficiently execute functions on popular serverless compute services, leveraging the scalability, isolation, and automatic resource provisioning provided by these platforms. With serverless mode, you can easily parallelize task execution, harness the elastic nature of serverless environments, and simplify the development and deployment of scalable data processing workloads and parallel applications. - -* [Standalone Mode](docs/source/execution_modes.rst#standalone-mode) - - This mode provides the capability to execute functions on one or multiple virtual machines (VMs) simultaneously, in a serverless-like fashion, without requiring manual provisioning as everything is automatically created. This mode can be used in a private cluster or in the cloud, where functions within each VM are executed using parallel processes, similar to the functionality offered in localhost mode. - - ## Documentation -For documentation on using Lithops, see [latest release documentation](https://lithops-cloud.github.io/docs/) or [current github docs](docs/user_guide.md). +For documentation on using Lithops, see [latest release documentation](https://lithops-cloud.github.io/docs/) If you are interested in contributing, see [CONTRIBUTING.md](./CONTRIBUTING.md). ## Additional resources ### Blogs and Talks + +* [How to run Lithops over EC2 VMs using the new K8s backend](https://danielalecoll.medium.com/how-to-run-lithops-over-ec2-vms-using-the-new-k8s-backend-4b0a4377c4e9) * [Simplify the developer experience with OpenShift for Big Data processing by using Lithops framework](https://medium.com/@gvernik/simplify-the-developer-experience-with-openshift-for-big-data-processing-by-using-lithops-framework-d62a795b5e1c) * [Speed-up your Python applications using Lithops and Serverless Cloud resources](https://itnext.io/speed-up-your-python-applications-using-lithops-and-serverless-cloud-resources-a64beb008bb5) -* [Serverless Without Constraints](https://www.ibm.com/cloud/blog/serverless-without-constraints) * [Lithops, a Multi-cloud Serverless Programming Framework](https://itnext.io/lithops-a-multi-cloud-serverless-programming-framework-fd97f0d5e9e4) * [CNCF Webinar - Toward Hybrid Cloud Serverless Transparency with Lithops Framework](https://www.youtube.com/watch?v=-uS-wi8CxBo) +* [Your easy move to serverless computing and radically simplified data processing](https://www.slideshare.net/gvernik/your-easy-move-to-serverless-computing-and-radically-simplified-data-processing-238929020) Strata Data Conference, NY 2019. 
See video of Lithops usage [here](https://www.youtube.com/watch?v=EYa95KyYEtg&list=PLpR7f3Www9KCjYisaG7AMaR0C2GqLUh2G&index=3&t=0s) and the example of Monte Carlo [here](https://www.youtube.com/watch?v=vF5HI2q5VKw&list=PLpR7f3Www9KCjYisaG7AMaR0C2GqLUh2G&index=2&t=0s) + + ### Papers - +* [Serverful Functions: Leveraging Servers in Complex Serverless Workflows](https://dl.acm.org/doi/10.1145/3700824.3701095) - ACM Middleware Industrial Track 2024 +* [Transparent serverless execution of Python multiprocessing applications](https://dl.acm.org/doi/10.1016/j.future.2022.10.038) - Elsevier Future Generation Computer Systems 2023 * [Outsourcing Data Processing Jobs with Lithops](https://ieeexplore.ieee.org/document/9619947) - IEEE Transactions on Cloud Computing 2022 * [Towards Multicloud Access Transparency in Serverless Computing](https://www.computer.org/csdl/magazine/so/5555/01/09218932/1nMMkpZ8Ko8) - IEEE Software 2021 * [Primula: a Practical Shuffle/Sort Operator for Serverless Computing](https://dl.acm.org/doi/10.1145/3429357.3430522) - ACM/IFIP International Middleware Conference 2020. [See presentation here](https://www.youtube.com/watch?v=v698iu5YfWM) @@ -195,4 +174,4 @@ If you are interested in contributing, see [CONTRIBUTING.md](./CONTRIBUTING.md). # Acknowledgements -This project has received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 825184. +This project has received funding from the European Union's Horizon 2020 research and innovation programme under grant agreement No 825184 (CloudButton). diff --git a/config/README.md b/config/README.md index e94d2576a..dbf4fd1d7 100644 --- a/config/README.md +++ b/config/README.md @@ -46,29 +46,30 @@ Storage Backends - [Localhost](../docs/source/compute_config/localhost.md) -

-Serverless Backends: -- [IBM Cloud Functions](../docs/source/compute_config/ibm_cf.md) -- [IBM Code Engine](../docs/source/compute_config/code_engine.md) + +Serverless (FaaS) Backends: - [AWS Lambda](../docs/source/compute_config/aws_lambda.md) -- [AWS Batch](../docs/source/compute_config/aws_batch.md) - [Google Cloud Functions](../docs/source/compute_config/gcp_functions.md) -- [Google Cloud Run](../docs/source/compute_config/gcp_cloudrun.md) - [Azure Functions](../docs/source/compute_config/azure_functions.md) -- [Azure Container APPs](../docs/source/compute_config/azure_containers.md) - [Aliyun Functions](../docs/source/compute_config/aliyun_functions.md) - [Oracle Cloud Functions](../docs/source/compute_config/oracle_functions.md) +- [OpenWhisk](../docs/source/compute_config/openwhisk.md) + +Serverless (CaaS) Backends: +- [IBM Code Engine](../docs/source/compute_config/code_engine.md) +- [AWS Batch](../docs/source/compute_config/aws_batch.md) +- [Google Cloud Run](../docs/source/compute_config/gcp_cloudrun.md) +- [Azure Container APPs](../docs/source/compute_config/azure_containers.md) - [Kubernetes](../docs/source/compute_config/kubernetes.md) - [Knative](../docs/source/compute_config/knative.md) -- [OpenWhisk](../docs/source/compute_config/openwhisk.md) -

+- [Singularity](../docs/source/compute_config/singularity.md) + Standalone Backends: - [Virtual Machine](../docs/source/compute_config/vm.md) - [IBM Virtual Private Cloud](../docs/source/compute_config/ibm_vpc.md) - [AWS Elastic Compute Cloud (EC2)](../docs/source/compute_config/aws_ec2.md) - [Azure Virtual Machines](../docs/source/compute_config/azure_vms.md) - @@ -112,41 +113,50 @@ if __name__ == '__main__': ``` ### Providing configuration in runtime -Example of providing configuration keys for IBM Cloud Functions and IBM Cloud Object Storage +Example of providing configuration keys for IBM Code Engine and IBM Cloud Object Storage ```python import lithops -config = {'lithops': {'backend': 'ibm_cf', 'storage': 'ibm_cos'}, - 'ibm': {'region': 'REGION', - 'iam_api_key': 'IAM_API_KEY', - 'resource_group_id': 'RESOURCE_GROUP_ID'}, - 'ibm_cos': {'storage_bucket': 'STORAGE_BUCKET'}} - -def hello_world(name): - return f'Hello {name}!' +config = { + 'lithops': { + 'backend': 'code_engine', + 'storage': 'ibm_cos' + }, + 'ibm': { + 'region': 'REGION', + 'iam_api_key': 'IAM_API_KEY', + 'resource_group_id': 'RESOURCE_GROUP_ID' + }, + 'ibm_cos': { + 'storage_bucket': 'STORAGE_BUCKET' + } +} + +def hello_world(number): + return f'Hello {number}!' if __name__ == '__main__': fexec = lithops.FunctionExecutor(config=config) - fexec.call_async(hello_world, 'World') + fexec.map(hello_world, [1, 2, 3, 4]) print(fexec.get_result()) ``` ## Summary of configuration keys for Lithops -|Group|Key|Default|Mandatory|Additional info| -|---|---|---|---|---| -|lithops | backend | ibm_cf | no | Compute backend implementation. IBM Cloud Functions is the default | -|lithops | storage | ibm_cos | no | Storage backend implementation. IBM Cloud Object Storage is the default | -|lithops | data_cleaner | True | no |If set to True, then the cleaner will automatically delete all the temporary data that was written into `storage_bucket/lithops.jobs`| -|lithops | monitoring | storage | no | Monitoring system implementation. One of: **storage** or **rabbitmq** | -|lithops | monitoring_interval | 2 | no | Monitoring check interval in seconds in case of **storage** monitoring | -|lithops | data_limit | 4 | no | Max (iter)data size (in MB). Set to False for unlimited size | -|lithops | execution_timeout | 1800 | no | Functions will be automatically killed if they exceed this execution time (in seconds). Alternatively, it can be set in the `call_async()`, `map()` or `map_reduce()` calls using the `timeout` parameter.| -|lithops | include_modules | [] | no | Explicitly pickle these dependencies. All required dependencies are pickled if default empty list. No one dependency is pickled if it is explicitly set to None | -|lithops | exclude_modules | [] | no | Explicitly keep these modules from pickled dependencies. It is not taken into account if you set include_modules | -|lithops | log_level | INFO |no | Logging level. One of: WARNING, INFO, DEBUG, ERROR, CRITICAL, Set to None to disable logging | -|lithops | log_format | "%(asctime)s [%(levelname)s] %(name)s -- %(message)s" |no | Logging format string | -|lithops | log_stream | ext://sys.stderr |no | Logging stream. eg.: ext://sys.stderr, ext://sys.stdout| -|lithops | log_filename | |no | Path to a file. log_filename has preference over log_stream. | -|lithops | customized_runtime | False | no | Enables to build a new runtime with the map() function and its dependencies integrated. Only docker-based backends support this feature. 
| +| Group | Key | Default | Mandatory | Additional info | +|---------|---------------------|--------------|-----------|--------------------------------------------------------------------------------------------------| +| lithops | backend | aws_lambda | no | Compute backend implementation. `localhost` is the default if no config or config file is provided. | +| lithops | storage | aws_s3 | no | Storage backend implementation. `localhost` is the default if no config or config file is provided. | +| lithops | data_cleaner | True | no | If True, automatically deletes temporary data written to `storage_bucket/lithops.jobs`. | +| lithops | monitoring | storage | no | Monitoring system implementation. Options: **storage** or **rabbitmq**. | +| lithops | monitoring_interval | 2 | no | Interval in seconds for monitoring checks when using **storage** monitoring. | +| lithops | data_limit | 4 | no | Maximum size (in MB) for iterator data chunks. Set to False for unlimited size. | +| lithops | execution_timeout | 1800 | no | Maximum execution time in seconds for functions. Functions exceeding this time are terminated. Can also be set per call via the `timeout` parameter. | +| lithops | include_modules | [] | no | List of dependencies to explicitly include for pickling. If empty, all required dependencies are included. If set to None, no dependencies are included. | +| lithops | exclude_modules | [] | no | List of dependencies to exclude from pickling. Ignored if `include_modules` is set. | +| lithops | log_level | INFO | no | Logging level. Options: WARNING, INFO, DEBUG, ERROR, CRITICAL. Set to None to disable logging. | +| lithops | log_format | "%(asctime)s [%(levelname)s] %(name)s -- %(message)s" | no | Format string for log messages. | +| lithops | log_stream | ext://sys.stderr | no | Logging output stream, e.g., ext://sys.stderr or ext://sys.stdout. | +| lithops | log_filename | (empty) | no | File path for logging output. Overrides `log_stream` if set. | +| lithops | retries | 0 | no | Number of retries for failed function invocations when using the `RetryingFunctionExecutor`. Default is 0. Can be overridden per API call. | diff --git a/config/config_template.yaml b/config/config_template.yaml index 11b1e81fa..33bc462a9 100644 --- a/config/config_template.yaml +++ b/config/config_template.yaml @@ -1,6 +1,6 @@ #lithops: - #backend: ibm_cf - #storage: ibm_cos + #backend: aws_lambda + #storage: aws_s3 #data_cleaner: #monitoring: storage #monitoring_interval: 2 @@ -12,13 +12,11 @@ #log_format: "%(asctime)s [%(levelname)s] %(name)s -- %(message)s" #log_stream: ext://sys.stdout #log_filename - #customized_runtime: #localhost: #runtime: python3 #worker_processes: CPU_COUNT - #ibm: #iam_api_key: #region : @@ -36,6 +34,7 @@ #remote_invoker: #max_workers: Default is 1200 #worker_processes: Default is 1 + #runtime_include_function: #code_engine: #namespace: @@ -51,6 +50,7 @@ #max_workers: Default is 1000 #worker_processes: Default is 1 #connection_retries: + #runtime_include_function: #ibm_vpc: #region : @@ -72,9 +72,8 @@ #singlesocket: # Optional, default is False #runtime: - #exec_mode: consume + #exec_mode: reuse #auto_dismantle: True - #pull_runtime: #hard_dismantle_timeout: 3600 #soft_dismantle_timeout: 300 #workers_policy: # Optional, strict/permissive. 
Default is 'permissive' @@ -100,9 +99,8 @@ #worker_processes: Default is 2 #runtime: - #exec_mode: consume + #exec_mode: reuse #auto_dismantle: True - #pull_runtime: #hard_dismantle_timeout: 3600 #soft_dismantle_timeout: 300 #workers_policy: # Optional, strict/permissive. Default is 'permissive' @@ -143,6 +141,7 @@ #ibm_cos: #storage_bucket: #region: + #service_instance_id: #endpoint: #private_endpoint: #api_key: diff --git a/docs/api_futures.md b/docs/api_futures.md deleted file mode 100644 index 4eb0d4798..000000000 --- a/docs/api_futures.md +++ /dev/null @@ -1,425 +0,0 @@ -# Lithops Futures API Details - -## Executor -The primary object in Lithops is the executor. The standard way to get everything set up is to import `lithops`, and create an instance of one of the available modes of executions. - -Lithops is shipped with 3 modes of execution: **Localhost**, **Serverless** and **Standalone**. In this sense, each mode of execution has its own executor class: - -* `lithops.LocalhostExecutor()`: Executor that uses local processes to run jobs in the local machine. -* `lithops.ServerlessExecutor()`: Executor to run jobs in one of the available serverless compute backends. -* `lithops.StandaloneExecutor()`: Executor to run jobs in one of the available standalone compute backends. - -Additionally, Lithops includes a top-level function executor, which encompasses all three previous executors: - -* `lithops.FunctionExecutor()`: Generic executor that will use the configuration to determine its mode of execution, i.e., based on the configuration it will be **localhost**, **serverless** or **standalone**. - - -By default, the executor load the configuration from the config file. Alternatively, you can pass the configuration with a python dictionary. In any case, note that all the parameters set in the executor will overwrite those set in the configuration. - - -The available calls within an executor are: - -|API Call| Type | Description| -|---|---|---| -|[call_async()](api_futures.md#executorcall_async) | Async. | Method used to spawn one function activation | -|[map()](api_futures.md#executormap) | Async. | Method used to spawn multiple function activations | -|[map_reduce()](api_futures.md#executormap_reduce) | Async. | Method used to spawn multiple function activations with one (or multiple) reducers| -|[wait()](api_futures.md#executorwait) | Sync. | Wait for the function activations to complete. It blocks the local execution until all the function activations finished their execution (configurable)| -|[get_result()](api_futures.md#executorget_result) | Sync. | Method used to retrieve the results of all function activations. The results are returned within an ordered list, where each element of the list is the result of one activation| -|[plot()](api_futures.md#executorplot) | Sync. | Method used to create execution plots | -|[job_summary()](api_futures.md#jobsummary) | Sync. | Method used to create a summary file of the executed jobs. It includes times and money | -|[clean()](api_futures.md#executorclean) | Async. | Method used to clean the temporary data generated by Lithops| - - -**LocalhostExecutor(\*\*kwargs)** - -Initialize and return Localhost executor object. 
- -|Parameter | Default | Description| -|---|---|---| -|config | None | Settings passed in here will override those in lithops_config| -|runtime | None | Name of the docker image to run the functions | -|workers | cpu_count | Max number of parallel workers | -|storage | localhost | Storage backend to store temp data| -|monitoring | storage | Monitoring system implementation. One of: storage, rabbitmq | -|log_level | INFO | Log level printing (INFO, DEBUG, ...). Set it to None to hide all logs. If this is param is set, all logging params in config are disabled| - -Usage: - -```python -import lithops -fexec = lithops.LocalhostExecutor() -``` - -**ServerlessExecutor(\*\*kwargs)** - -Initialize and return a Serverless executor object. - -|Parameter | Default | Description| -|---|---|---| -|config | None | Settings passed in here will override those in lithops_config| -|backend | ibm_cf | Serverless compute backend to run the functions| -|runtime | None | Name of the docker image to run the functions | -|runtime_memory | 256 | Memory (in MB) to use to run the functions | -|storage | ibm_cos | Storage backend to store temp data| -|workers | *depends of the backend* | Max number of parallel workers | -|monitoring | storage | Monitoring system implementation. One of: storage, rabbitmq | -|remote_invoker | False | Spawn a function that will perform the actual job invocation (True/False) | -|log_level | INFO | Log level printing (INFO, DEBUG, ...). Set it to None to hide all logs. If this is param is set, all logging params in config are disabled| - -Usage: - -```python -import lithops -fexec = lithops.ServerlessExecutor() -``` - -**StandaloneExecutor(\*\*kwargs)** - -Initialize and return an Standalone executor object. - -|Parameter | Default | Description| -|---|---|---| -|config | None | Settings passed in here will override those in lithops_config| -|backend | ibm_vpc | Standalone compute backend to run the functions| -|runtime | python3 | Name of the runtime to run the functions. It can be a docker image or *python3* | -|workers | cpu_count | Max number of parallel workers | -|storage | ibm_cos | Storage backend to store temp data| -|monitoring | storage | Monitoring system implementation. One of: storage, rabbitmq | -|log_level | INFO | Log level printing (INFO, DEBUG, ...). Set it to None to hide all logs. If this is param is set, all logging params in config are disabled| - -Usage: - -```python -import lithops -fexec = lithops.StandaloneExecutor() -``` - -**FunctionExecutor(\*\*kwargs)** - -Initialize and return a generic function executor. - -|Parameter | Default | Description| -|---|---|---| -|mode | serverless | Execution mode. One of: localhost, serverless or standalone| -|config | None | Settings passed in here will override those in lithops_config| -|backend | None | Compute backend to run the functions| -|runtime | None | Name of the runtime to run the functions. | -|runtime_memory | None | Memory (in MB) to use to run the functions | -|workers | None | Max number of parallel workers | -|storage | ibm_cos | Storage backend to store temp data| -|monitoring | storage | Monitoring system implementation. One of: storage, rabbitmq | -|remote_invoker | False | Spawn a function that will perform the actual job invocation (True/False) | -|log_level | INFO | Log level printing (INFO, DEBUG, ...). Set it to None to hide all logs. 
If this is param is set, all logging params in config are disabled| - -Usage: - -```python -import lithops -fexec = lithops.FunctionExecutor() -``` - - -## Executor.call_async() - -Spawn only one function activation. - -**call_async**(func, data, \*\*kwargs) - -|Parameter | Default |Description| -|---|---|---| -|func | |The function to map over the data | -|data | |A single value of data | -|extra_env| None |Additional environment variables for CF environment| -|runtime_memory| 256 |Memory (in MB) to use to run the functions| -|timeout| 600 |Max time per function activation (seconds)| -|include_modules| [] |Explicitly pickle these dependencies. All required dependencies are pickled if default empty list. No one dependency is pickled if it is explicitly set to None | -|exclude_modules| [] |Explicitly keep these modules from pickled dependencies. It is not taken into account if you set include_modules | - -* **Returns**: One future for each job (Futures are also internally stored by Lithops). - -* **Usage**: - - ```python - future = fexec.call_async(foo, data) - ``` - -* **Code example**: [call_async.py](../examples/call_async.py) - -## Executor.map() - -Spawn multiple function activations based on the items of an input list. - -**map**(map_function, map_iterdata, \*\*kwargs) - -|Parameter| Default |Description| -|---|---|---| -|map_function | |The function to map over the data | -|map_iterdata | |An iterable of input data (e.g python list) | -|chunksize | 1 | Split map_iteradata in chunks of this size. Lithops spawns 1 worker per resulting chunk | -|worker_processes | 1 | Number of concurrent/parallel processes in each worker| -|extra_args| None | Additional arguments to pass to each map_function activation | -|extra_env| None |Additional environment variables for CF environment | -|runtime_memory| 256 |Memory (in MB) to use to run the functions | -|timeout| 600 |Max time per function activation (seconds) | -|include_modules| [] |Explicitly pickle these dependencies. All required dependencies are pickled if default empty list. No one dependency is pickled if it is explicitly set to None | -|exclude_modules| [] |Explicitly keep these modules from pickled dependencies. It is not taken into account if you set include_modules | -|obj_chunk_size| None | Used for data_processing. Chunk size to split each object in bytes. Must be >= 1MiB. 'None' for processing the whole file in one function activation| -|obj_chunk_number| None | Used for data_processing. Number of chunks to split each object. 'None' for processing the whole file in one function activation. chunk_n has prevalence over chunk_size if both parameters are set| -|obj_newline| '\n' | New line character for keeping line integrity of partitions. 'None' for disabling line integrity logic and get partitions of the exact same size in the functions| - -* **Returns**: A list with size len(map_iterdata) of futures for each job (Futures are also internally stored by Lithops). - -* **Usage**: - - ```python - iterdata = [1, 2, 3, 4] - futures = fexec.map(foo, iterdata) - ``` - -* **Code example**: [map.py](../examples/map.py) - -## Executor.map_reduce() - -Spawn multiple *map_function* activations, based on the items of an input list, eventually spawning one (or multiple) *reduce_function* activations over the results of the map phase. 
- -**map_reduce**(map_function, map_iterdata, reduce_function, \*\*kwargs) - -|Parameter| Default |Description| -|---|---|---| -|map_function| |The function to map over the data | -|map_iterdata | |An iterable of input data (e.g python list)| -|chunksize | 1 | Split map_iteradata in chunks of this size. Lithops spawns 1 worker per resulting chunk | -|worker_processes | 1 | Number of concurrent/parallel processes in each worker| -|extra_args| None | Additional arguments to pass to each map_function activation | -|reduce_function| |The function to map over the results of map_function | -|spawn_reducer| 20 | Percentage of done map functions before spawning the reduce function. By default the reducer is spawned when 20% of the map activations are done. | -|extra_env| None | Additional environment variables for CF environment| -|map_runtime_memory| 256 | Memory (in MB) to use to run the map_function| -|reduce_runtime_memory| 256| Memory (in MB) to use to run the reduce_function| -|timeout| 600 | Max time per function activation (seconds)| -|include_modules| [] |Explicitly pickle these dependencies. All required dependencies are pickled if default empty list. No one dependency is pickled if it is explicitly set to None | -|exclude_modules| [] |Explicitly keep these modules from pickled dependencies. It is not taken into account if you set include_modules | -|obj_chunk_size| None | Used for data_processing. Chunk size to split each object in bytes. Must be >= 1MiB. 'None' for processing the whole file in one function activation| -|obj_chunk_number| None | Used for data_processing. Number of chunks to split each object. 'None' for processing the whole file in one function activation. chunk_n has prevalence over chunk_size if both parameters are set| -|obj_newline| '\n' | New line character for keeping line integrity of partitions. 'None' for disabling line integrity logic and get partitions of the exact same size in the functions| -|obj_reduce_by_key| False| Used for data_processing. Set one reducer per object after running the partitioner (reduce-by-key) | - - -* **Returns**: A list with size len(map_iterdata) of futures for each job (Futures are also internally stored by Lithops). - -* **Usage**: - - ```python - iterdata = [1, 2, 3, 4] - futures = fexec.map_reduce(foo, iterdata, bar) - ``` - -* **Code example**: [map_reduce.py](../examples/map_reduce.py) - - -## Executor.wait() - -Waits for the function activations to finish. - -**wait**(\*\*kwargs) - -|Parameter| Default |Description| -|---|---|---| -|fs| None | List of futures to wait. If None, Lithops uses the internally stored futures | -|throw_except | True | Re-raise exception if call raised| -|return_when| ALL_COMPLETED | One of 'ALL_COMPLETED', 'ANY_COMPLETED', 'ALWAYS' | -|download_results| False | Whether or not download the results while monitoring activations | -|timeout| None | Timeout of waiting for results (in seconds)| -|THREADPOOL_SIZE| 128 | Number of threads to use waiting for results| -|WAIT_DUR_SEC| 1 | Time interval between each check (seconds) if no rabbitmq_monitor activated | -|show_progressbar| True | whether or not to show the progress bar | - - -* **Returns**: `(fs_done, fs_notdone)` where `fs_done` is a list of futures that have completed and `fs_notdone` is a list of futures that have not completed. 
- -* **Usage**: - - ```python - iterdata = [1, 2, 3, 4] - futures = fexec.map(foo, iterdata) - fexec.wait() - ``` - -* **Code example**: [wait.py](../examples/wait.py) - -## Executor.get_result() - -Gets the results from all the function activations. It internally makes use of the `Executor.wait()` method. - -**get_result**(\*\*kwargs) - -|Parameter| Default |Description| -|---|---|---| -|fs| None | List of futures to get the results. If None, Lithops uses the internally stored futures | -|throw_except | True | Re-raise exception if call raised| -|timeout| None | Timeout of waiting for results (in seconds)| -|THREADPOOL_SIZE| 128 | Number of threads to use waiting for results| -|WAIT_DUR_SEC| 1 | Time interval between each check (seconds) if no rabbitmq_monitor activated | -|show_progressbar| True | whether or not to show the progress bar | - -* **Returns**: The results are returned within an ordered list, where each element of the list is the result of one activation. - -* **Usage**: - - ```python - iterdata = [1, 2, 3, 4] - futures = fexec.map(foo, iterdata) - results = fexec.get_result() - ``` - -* **Code example**: [call_async.py](../examples/call_async.py), [map.py](../examples/map.py), [map_reduce.py](../examples/map_reduce.py) - -## Executor.plot() - -Creates 2 detailed execution plots: A timeline plot and a histogram plot. - -**plot**(\*\*kwargs) - -|Parameter| Default |Description| -|---|---|---| -|fs| None | List of futures to plot. If None, Lithops uses the internally stored futures| -|dst| None | Path to destination file, either absolute or relative. If set, you must specify the path + the file prefix (see example below), then lithops will generate the *prefix*_histogram.png and *prefix*_timeline.png files. If None, Lithops will create a new folder called *plots* in the current directory and use the current timestamp as file *prefix* | - -* **Returns**: *Nothing*. It stores 2 different plots in the selected `dst` path. - -* **Usage**: - - ```python - iterdata = [1, 2, 3, 4] - fexec.map(foo, iterdata) - results = fexec.get_result() # or fexec.wait() - # The next command will generate test_timeline.png and test_histogram.png in ~/lithops_plots - fexec.plot(dst='~/lithops_plots/test') - ``` - -* **Example**: - -

- - -

- -## Executor.clean() - -Cleans the temporary data generated by Lithops in IBM COS. This process runs asynchronously to the main execution since Lithops starts another process to do the task. If `data_cleaner=True` (default), this method is executed automatically after calling `get_result()`. - -**clean**(\*\*kwargs) - -|Parameter| Default |Description| -|---|---|---| -|fs| None | List of futures to clean temp data. If None, Lithops uses the internally stored futures | -|cs| None | List of cloudobjects to clean | -|clean_cloudobjects| True | Clean or not the cloudobjects generated in the executor | -|spawn_cleaner| True | Spawn cleaner process. If false it stores the data to be cleaned in a tmp dir | - -* **Returns**: *Nothing*. - -* **Usage**: - - ```python - iterdata = [1, 2, 3, 4] - futures = fexec.map(foo, iterdata) - results = fexec.get_result() - fexec.clean() - ``` - -* **Code example**: [map.py](../examples/map.py) - - -# Function chaining - -Function chaining is a pattern where multiple functions are called on the same executor consecutively. Using the same `lithops.FunctionExecutor` object reference, multiple functions can be invoked. It increases the readability of the code and means less redundancy. This means we chain multiple functions together with the same element reference. It’s not necessary to attach the `lithops.FunctionExecutor` reference multiple times for each function call. - -This patter is specially useful when the output of one invocation is the input of another invocation. In this case, Lithops does not download the intermediate results to the local client, instead, the intermediate results are directly read from the next function. - -It currently works with the Futures API, and you can chain the `map()`, `map_reuce()`, `wait()` and `get_result()` methods. Note that the returning value of one function must match the signature of the next function when chaining multiple `map()` calls. View the next examples: - - -Getting the result from a single `map()` call: - -```python -import lithops - -def my_func1(x): - return x*2 - -iterdata = [1, 2, 3] - -fexec = lithops.FunctionExecutor() -res = fexec.map(my_func1, iterdata).get_result() -print(res) -``` - - -Chain multiple map() calls and get the final result: - -```python -import lithops - - -def my_func1(x): - return x*2, 5 - -def my_func2(x, y): - return x+y - -iterdata = [1, 2, 3] - -fexec = lithops.FunctionExecutor() -res = fexec.map(my_func1, iterdata).map(my_func2).get_result() -print(res) -``` - -There is no limit in the number of map() calls that can be chained: - -```python -def my_func1(x): - return x+2, 5 - - -def my_func2(x, y): - return x+y, 5, 2 - - -def my_func3(x, y, z): - return x+y+z - - -iterdata = [1, 2, 3] - -fexec = lithops.FunctionExecutor() -res = fexec.map(my_func1, iterdata).map(my_func2).map(my_func3).get_result() -print(res) -``` - -Alternatively, you can pass the `futures` generated in a `map()` or `map_reduce()` call to the `iterdata` parameter with the same effect. Not that in this case you will only get the results of the last `map()` execution. 
Results of intermediate `map()`s are never downloaded: - -```python -def my_func1(x): - return x+2, 5 - - -def my_func2(x, y): - return x+y, 5, 2 - - -def my_func3(x, y, z): - return x+y+z - - -iterdata = [1, 2, 3] - -fexec = lithops.FunctionExecutor() -futures1 = fexec.map(my_func1, iterdata) -futures2 = fexec.map(my_func2, futures1) -futures3 = fexec.map(my_func3, futures2) -final_result = fexec.get_result() - -print(final_result) -``` \ No newline at end of file diff --git a/docs/api_storage.md b/docs/api_storage.md deleted file mode 100644 index 5d126201c..000000000 --- a/docs/api_storage.md +++ /dev/null @@ -1,322 +0,0 @@ -# Lithops Storage API Details - -Lithops allows to create a **Storage** instance and abstract away the backend implementation details. The standard way to get a Storage object set up is to import the lithops `Storage` class and create an instance. - - -**Storage(\*\*kwargs)** - -Initialize and return a Storage object. - -|Parameter | Default | Description| -|---|---|---| -|config | None | Lithops configuration dictionary | -|backend | None | Name of the backend | - - - -By default, the configuration is loaded from the lithops config file, so there is no need to provide any parameter to create a Storage instance: - -```python -from lithops import Storage - -storage = Storage() -``` - -Alternatively, you can pass the lithops configuration through a dictionary. In this case, it will load the storage backend set in the `storage` key of the `lithops` section: - -```python -from lithops import Storage - -config = {'lithops' : {'storage' : 'ibm_cos'}, - 'ibm_cos': {'region': 'REGION', 'api_key': 'API_KEY'}} - -storage = Storage(config=config) -``` - -In case you have multiple storage set in your configuration, you can force the storage backend by using the `backend` parameter: - -```python -from lithops import Storage - -storage = Storage(backend='redis') # this will create a redis Storage instance -``` - -or: - -```python -from lithops import Storage - -config = {'lithops' : {'storage' : 'ibm_cos'}, - 'ibm_cos': {'region': 'REGION', 'api_key': 'API_KEY'}} - 'redis': {'host': 'HOST', 'port':'PORT'}} - - -storage = Storage(config=config) # this will create an ibm_cos Storage instance -storage = Storage(config=config, backend='redis') # this will create a redis Storage instance -``` - -## Storage API Calls - -### `Storage.put_object()` - -Adds an object to a bucket of the storage backend. - -**put_object**(bucket, key, data) - -|Parameter | Description| -|---|---| -|bucket | Name of the bucket (String)| -|key | Name of the object (String)| -|data| Object data (bytes/string or seekable file-like object)| - -* **Usage**: - - ```python - storage = Storage() - # Bytes/string data - storage.put_object('my_bucket', 'test.txt', 'Hello World') - ``` - - ```python - storage = Storage() - # Seekable file-like object - with open('/tmp/my_big_file.csv', 'rb') as fl: - storage.put_object('my_bucket', 'my_big_file.csv', fl) - ``` - - -### `Storage.get_object()` - -Retrieves objects from the storage backend. - -**get_object**(bucket, key, \*\*kwargs) - -|Parameter | Description| -|---|---| -|bucket | Name of the bucket (String)| -|key | Name of the object (String)| -|stream | Get the object data or a file-like object (True/False) | -|extra_get_args | Extra get arguments to be passed to the underlying backend implementation (dict). 
For example, to specify the byte-range to read: `extra_get_args={'Range': 'bytes=0-100'}`| - -* **Usage**: - - ```python - storage = Storage() - data = storage.get_object('my_bucket', 'test.txt') - ``` - - -### `Storage.head_object()` -The HEAD operation retrieves metadata from an object without returning the object itself. This operation is useful if you're only interested in an object's metadata. - -**head_object**(bucket, key) - -|Parameter | Description| -|---|---| -|bucket | Name of the bucket (String)| -|key | Name of the object (String)| - -* **Usage**: - - ```python - storage = Storage() - obj_metadata = storage.head_object('my_bucket', 'test.txt') - ``` - - -### `Storage.delete_object()` - -Removes objects from the storage backend - -**delete_object**(bucket, key) - -|Parameter | Description| -|---|---| -|bucket | Name of the bucket (String)| -|key | Name of the object (String)| - -* **Usage**: - - ```python - storage = Storage() - storage.delete_object('my_bucket', 'test.txt') - ``` - -### `Storage.delete_objects()` - -This operation enables you to delete multiple objects from a bucket using a single HTTP request. If you know the object keys that you want to delete, then this operation provides a suitable alternative to sending individual delete requests, reducing per-request overhead. - -**delete_objects**(bucket, key_list) - -|Parameter | Description| -|---|---| -|bucket | Name of the bucket (String)| -|key_list | Name of the objects (list)| - -* **Usage**: - - ```python - storage = Storage() - storage.delete_objects('my_bucket', ['test1.txt', 'test2.txt']) - ``` - - -### `Storage.head_bucket()` - -This operation is useful to determine if a bucket exists and you have permission to access it. The operation returns a 200 OK if the bucket exists and you have permission to access it. Otherwise, the operation might return responses such as 404 Not Found and 403 Forbidden . - -**head_bucket**(bucket) - -|Parameter | Description| -|---|---| -|bucket | Name of the bucket (String)| - -* **Usage**: - - ```python - storage = Storage() - storage.head_bucket('my_bucket') - ``` - - -### `Storage.list_objects()` - -Returns all of the objects in a bucket. For each object, the list contains the name of the object (key) and the size. - -**list_objects**(bucket, \*\*kwargs) - -|Parameter | Description| -|---|---| -|bucket | Name of the bucket (String)| -|prefix | key prefix for filtering (String)| - -* **Usage**: - - ```python - storage = Storage() - storage.list_objects('my_bucket', prefix='temp/') - ``` - - -### `Storage.list_keys()` - -Similar to list_objects(), it returns all of the objects in a bucket. For each object, the list contains only the names of the objects (keys). - -**list_keys**(bucket, \*\*kwargs) - -|Parameter | Description| -|---|---| -|bucket | Name of the bucket (String)| -|prefix | key prefix for filtering (String)| - -* **Usage**: - - ```python - storage = Storage() - storage.list_keys('my_bucket') - ``` - - -### `Storage.get_client()` -Returns the underlying storage backend client. For example, if `Storage` is an instance built on top of AWS S3, it returns a boto3 client. - -**get_client**() - -* **Usage**: - - ```python - storage = Storage() - boto3_client = storage.get_client() - ``` - -### `Storage.put_cloudobject()` - -Adds objects to a bucket of the storage backend. Returns a **cloudobject** that is a reference to the object. 
- -**put_cloudobject**(body, \*\*kwargs) - -|Parameter | Description| -|---|---| -|body| Object data (bytes/string or seekable file-like object)| -|bucket | Name of the bucket (String). By default it uses the `storage_bucket`| -|key | Name of the object (String). By default it creates a random key| - -If `bucket` paramter is not provided, it will use the `storage_bucket` set in the lithops config. If `key` is not provided, it will create a random temporary key. - -* **Usage**: - - ```python - storage = Storage() - # Bytes/string - cobj = storage.put_cloudobject('Hello World!') - ``` - - ```python - storage = Storage() - # Seekable file-like object - with open('/tmp/my_big_file.csv', 'rb') as fl: - cobj = storage.put_cloudobject(fl) - ``` - - -### `Storage.get_cloudobject()` - -Retrieves CloudObjects from a bucket of the storage backend. - -**get_cloudobject**(cloudobject, \*\*kwargs) - - -|Parameter | Description| -|---|---| -|cloudobject| CloudObject Instance| -|stream | Get the object data or a file-like object (True/False) | - - -* **Usage**: - - ```python - storage = Storage() - cobj = storage.put_cloudobject('Hello World!', 'my-bucket', 'test.txt') - data = storage.get_cloudobject(cobj) - ``` - - -### `Storage.delete_cloudobject()` - -Removes CloudObjects from a bucket of the storage backend. - -**delete_cloudobject**(cloudobject) - - -|Parameter | Description| -|---|---| -|cloudobject| CloudObject Instance| - - -* **Usage**: - - ```python - storage = Storage() - cobj = storage.put_cloudobject('Hello World!', 'test.txt') - storage.delete_cloudobject(cobj) - ``` - -### `Storage.delete_cloudobjects()` - -This operation enables you to delete multiple objects from a bucket using a single HTTP request. If you know the object keys that you want to delete, then this operation provides a suitable alternative to sending individual delete requests, reducing per-request overhead. 
- -**delete_cloudobject**(cloudobjects, \*\*kwargs) - - -|Parameter | Description| -|---|---| -|cloudobjects| CloudObject Instances (list)| - - -* **Usage**: - - ```python - storage = Storage() - cobj1 = storage.put_cloudobject('Hello World!', 'test1.txt') - cobj2 = storage.put_cloudobject('Hello World!', 'test2.txt') - storage.delete_cloudobjects([cobj1, cobj2]) \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index db7db47a1..20e19b567 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,6 +12,7 @@ import os import sys +import lithops sys.path.insert(0, os.path.abspath('.')) sys.path.insert(0, os.path.abspath("../")) @@ -72,8 +73,6 @@ # html_theme = 'sphinx_material' # html_theme = 'karma_sphinx_theme' html_theme = 'sphinx_book_theme' - -html_logo = "_static/lithops_logo_readme.png" html_favicon = '_static/favicon.png' language = 'en' @@ -81,16 +80,23 @@ html_theme_options = { 'repository_url': 'https://github.com/lithops-cloud/lithops', 'repository_branch': 'master', - 'google_analytics_id': 'G-7YKZHZYDCR', 'use_issues_button': False, 'use_download_button': True, 'use_fullscreen_button': False, 'use_repository_button': True, - 'show_navbar_depth': 1 + 'show_navbar_depth': 1, + # https://pydata-sphinx-theme.readthedocs.io/en/latest/user_guide/branding.html + "logo": { + "text": f"Lithops v{lithops.__version__}", + "image_light": "_static/lithops_logo_readme.png", + "image_dark": "_static/lithops_logo_readme.png", + }, + # https://pydata-sphinx-theme.readthedocs.io/en/latest/user_guide/analytics.html + "analytics": { + "google_analytics_id": "G-7YKZHZYDCR", + } } -# html_title = f"Lithops v{lithops.__version__}" -html_title = '' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/index.rst b/docs/index.rst index 4f4f4264d..762d48d15 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,92 +1,198 @@ -What is Lithops? -**************** +Welcome to Lithops! +******************** + +**Lithops is a Python multi-cloud serverless computing framework** that empowers you to **run unmodified Python code at massive scale** on leading serverless platforms and beyond. + +Whether you're processing terabytes of data or launching thousands of parallel tasks, Lithops lets you **focus on your code, not infrastructure**. It brings simplicity, performance, and flexibility to cloud-native computing. + + +Why Lithops? +============ + +Serverless computing makes it easy to run code in the cloud — but scaling data-intensive workloads across clouds is hard. Lithops solves this by providing: + +- ✅ **Zero-configuration scale-out**: Run your Python functions on thousands of cloud workers with no infrastructure management. +- 🌍 **True multi-cloud portability**: Move seamlessly between AWS, GCP, Azure, IBM Cloud, etc... +- 💡 **Developer-first experience**: Write standard Python code, including NumPy, pandas, and scikit-learn — no cloud-specific boilerplate required. +- 🧠 **Optimized for big data and AI**: Efficiently process massive datasets stored in object storage services with automatic partitioning. + + +What You Can Build +=================== + +Lithops is ideal for **highly parallel, data-heavy workloads**. 
These include: + +- 🔁 Monte Carlo simulations +- 🧬 Metabolomics and genomics pipelines +- 🗺️ Geospatial analytics +- 🧠 Deep learning and hyperparameter tuning +- 📊 Big Data ETL and analytics workflows + +If your problem can be broken down into many small, independent tasks, Lithops will help you solve it at scale — fast. + +Key Features +============ -.. image:: source/images/lithops_logo_readme.png - :alt: Lithops +Compute Anywhere +---------------- +**Lithops features a modular and extensible backend architecture**, allowing you to run workloads across: + +- Serverless functions +- Cloud VMs and Kubernetes clusters +- On-premise compute resources + +No matter where your data lives, Lithops can execute your code right next to it. + +.. image:: source/images/multicloud.jpg + :alt: Available backends :align: center -| -**Lithops is a Python multi-cloud serverless computing framework. It allows to run unmodified local python code at massive scale in the main serverless computing platforms.** +Object Storage Made Easy +------------------------- -Lithops delivers the user’s code into the cloud without requiring knowledge of how it is deployed and run. -Moreover, its multicloud-agnostic architecture ensures portability across cloud providers, overcoming vendor lock-in. +**Seamlessly process large-scale data stored in object storage.** ------------- +Lithops simplifies working with data lakes and object storage by providing: -**Lithops provides great value for data-intensive applications like Big Data analytics and embarrassingly parallel jobs.** +- 🔍 **Automatic data discovery**: Detects and lists files across nested directories. +- 📂 **Transparent data partitioning**: Splits large files (e.g., CSV, Parquet, JSON) into chunks for efficient parallel processing. +- 🧰 **Unified, Pythonic API**: Interact with your data using a single interface, regardless of where it's stored. -It is specially suited for highly-parallel programs with little or no need for communication between processes. +You write simple Python code — Lithops handles the complexity of parallel I/O, data distribution, and storage backends under the hood. -Examples of applications that run with Lithops include Monte Carlo simulations, deep learning and machine learning processes, metabolomics computations, and geospatial -analytics, to name a few. ------------- +Get Started Quickly +==================== -**Lithops facilitates consuming data from object storage (like AWS S3, GCP Storage or IBM Cloud Object Storage) by providing automatic partitioning and data discovery for common data formats like CSV.** +To start using Lithops: -Lithops abstracts away the underlying cloud-specific APIs for accessing storage and provides an intuitive and easy to use interface to process high volumes of data. +1. Install via pip: + .. code-block:: bash -Quick Start -*********** + pip install lithops -Lithops is available for Python 3.6 and up. Install it using ``pip``: +2. Configure your cloud credentials (see the :doc:`Configuration Guide `) -.. code-block:: +3. Write and run your first parallel job: - pip install -U lithops + .. code-block:: python -You're ready to execute a simple example! + import lithops -.. code:: python + def my_function(x): + return x * 2 - from lithops import FunctionExecutor + fexec = lithops.FunctionExecutor() + fexec.map(my_function, range(10)) + print(fexec.get_result()) - def hello(name): - return 'Hello {}!'.format(name) +You're now running massively parallel workloads with just a few lines of code! 
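As a small illustration of the object storage processing described above, the same `map()` call can also take a bucket prefix as its input. The sketch below is illustrative only: the `s3://my-bucket/csv-data/` prefix is a placeholder, the URL scheme depends on your configured storage backend, and the `obj_chunk_size` value is arbitrary.

.. code-block:: python

    import lithops

    def count_lines(obj):
        # obj.data_stream is a file-like view over the partition
        # assigned to this function activation
        return len(obj.data_stream.read().splitlines())

    fexec = lithops.FunctionExecutor()
    # Lithops discovers the objects under the prefix and splits them
    # into ~64 MiB partitions, one function activation per partition
    fexec.map(count_lines, 's3://my-bucket/csv-data/',
              obj_chunk_size=64 * 1024 * 1024)
    print(sum(fexec.get_result()))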
- with FunctionExecutor() as fexec: - fut = fexec.call_async(hello, 'World') - print(fut.result()) -Use any Cloud -************* -**Lithops provides an extensible backend architecture that is designed to work with different compute and storage services available on Cloud providers and on-premise backends.** +Success stories +=============== -In this sense, you can code your application in Python and run it unmodified wherever your data is located at: IBM Cloud, AWS, Azure, Google Cloud and Alibaba Aliyun... +* `Metaspace Metabolomics Platform `_ is running in production in AWS with hundreds of users. + MetaSpace is using Lithops over Lambda Functions and EC2 VMs to access metabolomics data in Amazon S3. + MetaSpace moved from Spark to Lithops to simplify dynamic and elastic resource provisioning. -.. image:: source/images/multicloud.jpg - :alt: Available backends - :align: center +* `OpenNebula Open Source Cloud and Edge Computing platform `_ integrates Lithops as an easy-to-use appliance + for data analytics. OpenNebula also deploys MinIO storage and Lithops Kubernetes backend to facilitate data analytics + in on-premise and edge deployments. -| +* `Cubed `_ is a popular library for scalable multidimensional array processing with bounded memory. + Cubed is a drop-in replacement for Dask's Array API. + Cubed integrates Lithops as a fast compute backend enabling scalable array processing in the Cloud. + +* `BSC Marenostrum 5 SuperComputer `_ is a pre-exascale EuroHPC supercomputer with + a peak computational power of 314 PFlops. A new Lithops HPC compute backend has been created enabling large-scale computing + reaching tens of thousands of concurrent functions. LithopsHPC is now being used in the neardata.eu project for extreme + data analytics of genomics pipelines. 
-Additional resources -******************** Blogs and Talks ---------------- -* `Simplify the developer experience with OpenShift for Big Data processing by using Lithops framework `_ -* `Speed-up your Python applications using Lithops and Serverless Cloud resources `_ -* `Serverless Without Constraints `_ -* `Lithops, a Multi-cloud Serverless Programming Framework `_ -* `CNCF Webinar - Toward Hybrid Cloud Serverless Transparency with Lithops Framework `_ -* `Using Serverless to Run Your Python Code on 1000 Cores by Changing Two Lines of Code `_ -* `Decoding dark molecular matter in spatial metabolomics with IBM Cloud Functions `_ -* `Your easy move to serverless computing and radically simplified data processing `_ Strata Data Conference, NY 2019 -* `Speed up data pre-processing with Lithops in deep learning `_ -* `Predicting the future with Monte Carlo simulations over IBM Cloud Functions `_ -* `Process large data sets at massive scale with Lithops over IBM Cloud Functions `_ -* `Industrial project in Technion on Lithops `_ +=============== + +* `Simplify the developer experience with OpenShift for Big Data processing by using Lithops framework + `_ + +* `Speed-up your Python applications using Lithops and Serverless Cloud resources + `_ + +* `Serverless Without Constraints + `_ + +* `Lithops, a Multi-cloud Serverless Programming Framework + `_ + +* `CNCF Webinar - Toward Hybrid Cloud Serverless Transparency with Lithops Framework + `_ + +* `Using Serverless to Run Your Python Code on 1000 Cores by Changing Two Lines of Code + `_ + +* `Decoding dark molecular matter in spatial metabolomics with IBM Cloud Functions + `_ + +* `Your easy move to serverless computing and radically simplified data processing + `_ + Strata Data Conference, NY 2019 + +* `Speed up data pre-processing with Lithops in deep learning + `_ + +* `Predicting the future with Monte Carlo simulations over IBM Cloud Functions + `_ + +* `Process large data sets at massive scale with Lithops over IBM Cloud Functions + `_ + +* `Industrial project in Technion on Lithops + `_ + Papers ------- -* `Outsourcing Data Processing Jobs with Lithops `_ - IEEE Transactions on Cloud Computing 2022 -* `Towards Multicloud Access Transparency in Serverless Computing `_ - IEEE Software 2021 -* `Primula: a Practical Shuffle/Sort Operator for Serverless Computing `_ - ACM/IFIP International Middleware Conference 2020. `See Primula presentation here `_ -* `Bringing scaling transparency to Proteomics applications with serverless computing `_ - 6th International Workshop on Serverless Computing (WoSC6) 2020. `See Workshop presentation here `_ -* `Serverless data analytics in the IBM Cloud `_ - ACM/IFIP International Middleware Conference 2018 +====== + +* `Serverful Functions: Leveraging Servers in Complex Serverless Workflows + `_ - ACM Middleware Industrial Track 2024 + +* `Transparent serverless execution of Python multiprocessing applications + `_ - Elsevier Future Generation Computer Systems 2023 + +* `Outsourcing Data Processing Jobs with Lithops + `_ - IEEE Transactions on Cloud Computing 2022 + +* `Towards Multicloud Access Transparency in Serverless Computing + `_ - IEEE Software 2021 + +* `Primula: a Practical Shuffle/Sort Operator for Serverless Computing + `_ - ACM/IFIP International Middleware Conference 2020. + `See Primula presentation here `_ + +* `Bringing scaling transparency to Proteomics applications with serverless computing + `_ - 6th International Workshop on Serverless Computing (WoSC6) 2020. 
+ `See Workshop presentation here `_ + +* `Serverless data analytics in the IBM Cloud + `_ - ACM/IFIP International Middleware Conference 2018 + + +Join the Community +================== + +Lithops is an open-source project, actively maintained and supported by a community of contributors and users. You can: + +- 💬 Join the discussion on `GitHub Discussions `_ +- 🐞 Report issues or contribute on `GitHub `_ +- 📖 Read more in the full documentation + + +--- + +**Start writing scalable cloud applications — with Lithops.** .. toctree:: @@ -123,6 +229,7 @@ Papers source/api_futures.rst source/functions.md + source/worker_granularity.rst source/notebooks/function_chaining.ipynb source/api_stats.rst @@ -156,10 +263,8 @@ Papers :maxdepth: 0 :caption: Advanced Features - source/worker_granularity.rst source/monitoring.rst - Custom Runtime - source/customized_runtime.rst + Custom Runtimes .. toctree:: @@ -178,5 +283,4 @@ Papers Applications source/contributing.rst - source/testing.rst Changelog diff --git a/docs/source/api_futures.rst b/docs/source/api_futures.rst index a2cee0c5f..b42999795 100644 --- a/docs/source/api_futures.rst +++ b/docs/source/api_futures.rst @@ -3,19 +3,43 @@ Lithops Futures API =================== -The primary object in Lithops is the executor. The standard way to get everything set up is to import `lithops`, and create an instance of one of the available modes of executions. +The core abstraction in Lithops is the **executor**, responsible for orchestrating the execution of your functions across different environments. -Lithops is shipped with 3 modes of execution: **Localhost**, **Serverless** and **Standalone**. In this sense, each mode of execution has its own executor class: +To get started, you typically import `lithops` and create an executor instance to run your code. Lithops provides a flexible set of executors to suit different needs. -* `lithops.LocalhostExecutor()`: Executor that uses local processes to run jobs in the local machine. -* `lithops.ServerlessExecutor()`: Executor to run jobs in one of the available serverless compute backends. -* `lithops.StandaloneExecutor()`: Executor to run jobs in one of the available standalone compute backends. +Primary Executors +----------------- -Additionally, Lithops includes a top-level function executor, which encompasses all three previous executors: +* **FunctionExecutor** (`lithops.FunctionExecutor()`): + The main, generic executor that automatically selects its execution mode based on the provided configuration. + This lets you write your code once and run it seamlessly on localhost, serverless, or standalone backends without changing your code. -* `lithops.FunctionExecutor()`: Generic executor that will use the configuration to determine its mode of execution, i.e., based on the configuration it will be **localhost**, **serverless** or **standalone**. +* **RetryingFunctionExecutor** (`lithops.RetryingFunctionExecutor()`): + A robust wrapper around `FunctionExecutor` that transparently handles retries on failed tasks. + It supports all features of `FunctionExecutor` with added automatic retry logic, improving fault tolerance and reliability for unstable or transient failure-prone environments. + +Secondary Executors +------------------- + +For more specialized use cases, Lithops also provides explicit executors for each execution mode: + +* **LocalhostExecutor** (`lithops.LocalhostExecutor()`): + Runs jobs locally using multiple processes on your machine. Ideal for development, debugging, or small-scale workloads. 
+ +* **ServerlessExecutor** (`lithops.ServerlessExecutor()`): + Executes jobs on serverless compute platforms, managing scaling and deployment automatically. Best for massively parallel, ephemeral workloads. + +* **StandaloneExecutor** (`lithops.StandaloneExecutor()`): + Runs jobs on standalone compute backends such as clusters or virtual machines, suitable for long-running or resource-heavy tasks. + + +Configuration and Initialization +================================ + +By default, executors load configuration from the Lithops configuration file (e.g., `lithops_config.yaml`). You can also supply configuration parameters programmatically via a Python dictionary when creating an executor instance. Parameters passed explicitly override those in the config file, allowing for flexible customization on the fly. + +This layered executor design lets Lithops provide a powerful, unified API for parallel function execution — from local development to multi-cloud production deployments with fault tolerance and retries built-in. -By default, the executor load the configuration from the config file. Alternatively, you can pass the configuration with a python dictionary. In any case, note that all the parameters set in the executor will overwrite those set in the configuration. Futures API Reference --------------------- @@ -24,3 +48,8 @@ Futures API Reference :members: :undoc-members: :show-inheritance: + +.. autoclass:: lithops.retries.RetryingFunctionExecutor + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/api_multiprocessing.rst b/docs/source/api_multiprocessing.rst index 67cf5588d..8ee24c5d2 100644 --- a/docs/source/api_multiprocessing.rst +++ b/docs/source/api_multiprocessing.rst @@ -3,6 +3,13 @@ Multiprocessing API Lithops implements Python's `multiprocessing API `_ to transparently run local-parallel applications but using serverless functions for Processes and a Redis instance for shared state and Inter-Process Communication (IPC). +Before utilizing this API, you will need to install its dependencies: + +.. code-block:: bash + + python3 -m pip install lithops[multiprocessing] + + Process and Pool ---------------- @@ -50,9 +57,9 @@ The Redis credentials (host, password...) is loaded from the ``redis`` section o The fastest way to deploy a Redis instance is using Docker in a VM located in the cloud of your choice: -.. code:: +.. code:: bash - $ docker run --rm -d --network host --name redis redis:6.2.1 --requirepass redispassword + docker run --rm -d --network host --name redis redis:6.2.1 --requirepass redispassword To have lower latency, you can deploy the functions and the VM in the same VPC and use route through internal traffic instead of the internet. For example, in AWS, the functions and VM can be deployed in the same VPC: Lambdas go to a private subnet and the VM in a public subnet. This way, the VM has access to the internet and the local Lithops process can also access it. 
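As a quick end-to-end check of this setup, the following minimal sketch runs a `Pool` over serverless workers. It assumes the `multiprocessing` extra is installed and that the `redis` section of your configuration points to the instance deployed above.

.. code-block:: python

    from lithops.multiprocessing import Pool

    def square(x):
        return x * x

    if __name__ == '__main__':
        # Each task runs as a function activation; results and shared
        # state go through the configured Redis instance
        with Pool() as pool:
            print(pool.map(square, range(8)))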
@@ -79,7 +86,7 @@ For this reason, to set specific configuration in runtime, the ``Lithops.multipr # To set a config parameter, use the set_parameter # function and specify the parameter and the desired value - mp_config.set_parameter(mp_config.LITHOPS_CONFIG, {'lithops': {'mode': 'localhost'}}) + mp_config.set_parameter(mp_config.LITHOPS_CONFIG, {'lithops': {'backend': 'localhost'}}) mp_config.set_parameter(mp_config.STREAM_STDOUT, True) mp_config.set_parameter(mp_config.REDIS_EXPIRY_TIME, 1800) mp_config.set_parameter(mp_config.PIPE_CONNECTION_TYPE, 'redislist') @@ -120,5 +127,4 @@ Multiprocessing configuration keys - ``None`` - * To use nanomsg for Pipes, you must still deploy a Redis instance (used for pipe directory). Note that this feature only works in environments where functions can open a port and communicate with each other. diff --git a/docs/source/api_stats.rst b/docs/source/api_stats.rst index afcdcf466..6da164720 100644 --- a/docs/source/api_stats.rst +++ b/docs/source/api_stats.rst @@ -10,6 +10,12 @@ Execution summary plots The :code:`plot()` method from :code:`FunctionExecutor` creates a scatter plot and a histogram plot showing a summary of the tasks executed by a :code:`FunctionExecutor`. By default, lithops creates a :code:`plots/` directory in the working directory path containing both plots in PNG format. For more details refer to the `FunctionExecutor API reference `_. +To get started, first install Lithops and the plotting dependencies with: + +.. code-block:: bash + + python3 -m pip install lithops[plotting] + * **Scatter Plot**: the scatter plot shows a timeline on the horizontal axis where the stages of all invocations are arranged on the vertical axis. - :code:`host submit` indicates the time that the orchestrator process has invoked the function. - :code:`call start` indicates the timestamp at which the function starts its execution. @@ -85,13 +91,21 @@ The user can obtain these statistics through the future object: 'worker_cold_start': True, 'worker_end_tstamp': 1647526902.397567, 'worker_exec_time': 0.23604679, + 'worker_func_cpu_usage': [0.0, 25.0], + 'worker_func_cpu_user_time': 70566.78125, + 'worker_func_cpu_system_time': 16418.34375, 'worker_func_end_tstamp': 1647526902.2985177, 'worker_func_exec_time': 1.91e-06, + 'worker_func_recv_net_io': 5968, + 'worker_func_sent_net_io': 1223, 'worker_func_start_tstamp': 1647526902.2985158, - 'worker_result_upload_time': 0.07001352, - 'worker_start_tstamp': 1647526902.1615202, + 'worker_func_rss': 60678144, + 'worker_func_uss': 44838912, + 'worker_func_vms': 552267776, 'worker_peak_memory_start': 88469504, - 'worker_peak_memory_end': 126469504} + 'worker_peak_memory_end': 126469504, + 'worker_result_upload_time': 0.07001352, + 'worker_start_tstamp': 1647526902.1615202} .. list-table:: @@ -119,11 +133,11 @@ The user can obtain these statistics through the future object: * - :code:`host_result_done_tstamp` - Timestamp of when host received the function result from cloud object storage. * - :code:`host_result_query_count` - - Number of queries to the object storage to get the status object (synchronize the completion of the function) + - Number of queries to the object storage to get the result object. * - :code:`host_status_done_tstamp` - Timestamp of when the host received the signal that the function has finished its execution. 
* - :code:`host_status_query_count` - - Number of queries to the object storage to get the result object + - Number of queries to the object storage to get the status object (synchronize the completion of the function). * - :code:`host_submit_tstamp` - Timestamp of function invocation. * - :code:`worker_cold_start` @@ -132,12 +146,28 @@ The user can obtain these statistics through the future object: - Timestamp in which the worker function had finished its execution. * - :code:`worker_exec_time` - Total execution time of the worker function (lithops wrapper + user defined funtion execution time). + * - :code:`worker_func_cpu_usage` + - Array of CPU usage percentages, with each element representing the average usage of each CPU core during user-defined function execution. + * - :code:`worker_func_cpu_user_time` + - CPU user time during the execution of the user-defined function. + * - :code:`worker_func_cpu_system_time` + - CPU system time during the execution of the user-defined function. * - :code:`worker_func_end_tstamp` - Timestamp of the end of execution of the user-defined function. * - :code:`worker_func_exec_time` - Total execution time of the user-defined function. + * - :code:`worker_func_recv_net_io` + - Network I/O bytes received during the execution of the user-defined function. + * - :code:`worker_func_sent_net_io` + - Network I/O bytes sent during the execution of the user-defined function. * - :code:`worker_func_start_tstamp` - Timestamp of the start of execution of the user-defined function. + * - :code:`worker_func_rss` + - Resident Set Size (RSS) in bytes, indicating the amount of physical memory occupied by the user-defined function during its execution. + * - :code:`worker_func_uss` + - Unique Set Size (USS) in bytes, representing the memory exclusively used by the function that is not shared with other processes. + * - :code:`worker_func_vms` + - Virtual Memory Size (VMS) in bytes used by the user-defined function. This metric quantifies the total virtual memory allocated. * - :code:`worker_result_upload_time` - Total time taken for the function to upload the result to cloud object storage. * - :code:`worker_start_tstamp` diff --git a/docs/source/cli.rst b/docs/source/cli.rst index e953d317d..d2c7fd317 100644 --- a/docs/source/cli.rst +++ b/docs/source/cli.rst @@ -10,7 +10,7 @@ Lithops management ------------------ ``lithops hello`` -~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~ Runs a *hello-world* function. @@ -31,7 +31,7 @@ Runs a *hello-world* function. - **Usage example**: ``lithops hello -b ibm_cf -s ibm_cos`` ``lithops test`` -~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~ Runs the unit testing suite. For more instructions about testing `view this page `__. @@ -58,7 +58,7 @@ this page `__. 
| --keep\_datasets, -k | Keeps datasets in storage after the test run (Flag) | +------------------------+----------------------------------------------------------------+ -- **Usage example**: ``lithops test -b ibm_cf -s ibm_cos -f`` +- **Usage example**: ``lithops test -b ibm_cf -s ibm_cos`` ``lithops clean`` ~~~~~~~~~~~~~~~~~ @@ -108,12 +108,50 @@ Open an ssh connection to the master VM (Only available for standalone backends) - **Usage example**: ``lithops attach -b ibm_vpc`` +``lithops worker list`` +~~~~~~~~~~~~~~~~~~~~~~~ + +Lists the available workers in the master VM (Only available for standalone backends) + ++------------------------+----------------------------------------------------------------+ +| Parameter | Description | ++========================+================================================================+ +| --config, -c | Path to your config file | ++------------------------+----------------------------------------------------------------+ +| --backend, -b | Compute backend name | ++------------------------+----------------------------------------------------------------+ +| --region, -r | Compute backend region | ++------------------------+----------------------------------------------------------------+ +| --debug, -d | Activate debug logs (Flag) | ++------------------------+----------------------------------------------------------------+ + +- **Usage example**: ``lithops worker list -b ibm_vpc`` + +``lithops job list`` +~~~~~~~~~~~~~~~~~~~~ + +Lists the jobs submitted to the master VM (Only available for standalone backends) + ++------------------------+----------------------------------------------------------------+ +| Parameter | Description | ++========================+================================================================+ +| --config, -c | Path to your config file | ++------------------------+----------------------------------------------------------------+ +| --backend, -b | Compute backend name | ++------------------------+----------------------------------------------------------------+ +| --region, -r | Compute backend region | ++------------------------+----------------------------------------------------------------+ +| --debug, -d | Activate debug logs (Flag) | ++------------------------+----------------------------------------------------------------+ + +- **Usage example**: ``lithops job list -b ibm_vpc`` + Runtime management ------------------ For complete instructions on how to build runtimes for Lithops, please -refer to ``runtime/`` folder and choose your compute backend. +refer to the ``runtime/`` folder and choose your compute backend. ``lithops runtime build `` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -121,7 +159,7 @@ refer to ``runtime/`` folder and choose your compute backend. Build a new runtime image. Depending of the compute backend, there must be a Dockerfile located in the same folder you run the command, otherwise use ``-f`` parameter. Note that this command only builds the -image and puts it to a container registry. This command do not deploy +image and puts it into a container registry. This command do not deploy the runtime to the compute backend. +-----------------+-----------------------------------+ @@ -409,7 +447,7 @@ Deletes objects from a given bucket. ``lithops storage list `` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Deletes objects from a given bucket. +Lists objects from a given bucket. 
+-----------------+---------------------------------+ | Parameter | Description | @@ -431,5 +469,3 @@ Deletes objects from a given bucket. - To list all objects that start with given prefix :``lithops storage list -b ibm_cos cloudbucket -p test/`` - - diff --git a/docs/source/comparing_lithops.rst b/docs/source/comparing_lithops.rst index 64bd843b7..07565ce7b 100644 --- a/docs/source/comparing_lithops.rst +++ b/docs/source/comparing_lithops.rst @@ -1,77 +1,61 @@ -Comparing Lithops with other distributed computing frameworks +Comparing Lithops with Other Distributed Computing Frameworks ============================================================= -In a nutshell, Lithops differs from other distributed computing frameworks in that Lithops leverages serverless -functions to compute massively parallel computations. +Lithops introduces a novel approach to distributed computing by leveraging **serverless functions** for massively parallel computations. Unlike traditional frameworks that require managing a cluster of nodes, Lithops utilizes Function-as-a-Service (FaaS) platforms to dynamically scale execution resources — down to zero when idle and massively up when needed. -In addition, Lithops provides a simple and easy-to-use interface to access and process data stored in Object Storage -from your serverless functions. - -Moreover, Lithops abstract design allows seamlessly portability between clouds and FaaS services, avoiding vendor -lock-in. +In addition, Lithops offers a simple and consistent programming interface to transparently process data stored in **Object Storage** from within serverless functions. Its **modular and cloud-agnostic architecture** enables seamless portability across different cloud providers and FaaS platforms, effectively avoiding vendor lock-in. PyWren ------ -.. image:: https://www.faasification.com/assets/img/tools/pywren-logo-big.png - :align: center - :width: 250 +`PyWren `_ is the precursor to Lithops. Initially designed to run exclusively on AWS Lambda using a Conda runtime and supporting only Python 2.7, it served as a proof of concept for using serverless functions in scientific computing. +In 2018, the Lithops team forked PyWren to adapt it for **IBM Cloud Functions**, which offered a Docker-based runtime. This evolution also introduced support for **Object Storage as a primary data source** and opened the door to more advanced use cases such as Big Data analytics. -`PyWren `_ is Lithops' "father" project. PyWren was only designed to run in AWS Lambda with a -Conda environment and only supported Python 2.7. In 2018, Lithops' creators forked PyWren and adapted it to IBM Cloud -Functions, which, in contrast, uses a Docker runtime. The authors also explored new usages for PyWren, like processing Big Data from -Object Storage. Then, on September 2020, IBM PyWren authors decided that the project had evolved enough to no longer be -considered a simple fork of PyWren for IBM cloud and became Lithops. With this change, the project would no longer be -tied to the old PyWren model and could move to more modern features such as mulit-cloud support or the transparent -multiprocessing interface. +By September 2020, the IBM PyWren fork had diverged significantly. The maintainers rebranded the project as **Lithops**, reflecting its broader goals — including multi-cloud compatibility, improved developer experience, and support for modern Python environments and distributed computing patterns. 
-You can read more about PyWren IBM Cloud at the Middleware'18 industry paper `Serverless Data Analytics in the IBM Cloud `_. +For more details, refer to the Middleware'18 industry paper: +`Serverless Data Analytics in the IBM Cloud `_. Ray and Dask ------------ -.. image:: https://warehouse-camo.ingress.cmh1.psfhosted.org/98ae79911b7a91517ba16ef2dc7dc3b972214820/68747470733a2f2f6769746875622e636f6d2f7261792d70726f6a6563742f7261792f7261772f6d61737465722f646f632f736f757263652f696d616765732f7261795f6865616465725f6c6f676f2e706e67 - :align: center +.. image:: https://github.com/ray-project/ray/raw/master/doc/source/images/ray_logo.png :width: 250 - .. image:: https://docs.dask.org/en/stable/_images/dask_horizontal.svg - :align: center :width: 250 -In comparison with Lithops, both `Ray `_ and `Dask `_ leverage a cluster of nodes for distributed computing, while Lithops -mainly leverages serverless functions. This restraint makes Ray much less flexible than Lithops in terms of scalability. +`Ray `_ and `Dask `_ are distributed computing frameworks designed to operate on a **predefined cluster of nodes** (typically virtual machines). In contrast, Lithops relies on **serverless runtimes**, which allows for *elastic and fine-grained scaling* — including scaling to zero — with no idle infrastructure costs. -Although Dask and Ray can scale and adapt the resources to the amount of computation needed, they don't scale to zero since -they must keep a "head node" or "master" that controls the cluster and must be kept up. +While Ray and Dask provide dynamic task scheduling and can autoscale within an IaaS environment, they always require a **centralized "head node" or controller** to manage the cluster, making them less suitable for ephemeral and cost-efficient cloud-native computing. -In any case, the capacity and scalability of Ray or Dask in IaaS using virtual machines is not comparable to that of serverless functions. +Additionally, the performance and elasticity of Ray and Dask in IaaS environments are not directly comparable to Lithops' **fully serverless model**, which benefits from the near-infinite parallelism offered by cloud functions. PySpark ------- .. image:: https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/Apache_Spark_logo.svg/2560px-Apache_Spark_logo.svg.png - :align: center :width: 250 +`PySpark `_ is the Python interface for Apache Spark, a well-established distributed computing engine. Spark is typically deployed on a **static cluster of machines**, either on-premises or in cloud environments using HDFS or cloud-native file systems. -Much like Ray or Dask, PySpark is a distributed computing framework that uses cluster technologies. PySpark provides Python bindings for Spark. -Spark is designed to work with a fixed-size node cluster, and it is typically used to process data from on-prem HDFS -and analyze it using SparkSQL and Spark DataFrame. - +PySpark is optimized for **batch analytics** using DataFrames and SparkSQL, but it lacks native integration with FaaS models. Its operational model is not inherently elastic and requires continuous management of a Spark cluster, which may not align with modern, fully managed, or serverless computing paradigms. Serverless Framework -------------------- .. image:: https://cdn.diegooo.com/media/20210606183353/serverless-framework-icon.png - :align: center :width: 250 +`Serverless Framework `_ is a deployment toolchain designed primarily for **building and deploying serverless web applications**, especially on AWS, GCP, and Azure. 
It is widely used to manage HTTP APIs, event-driven services, and infrastructure-as-code (IaC) for cloud-native apps. -Serverless Framework is a tool to develop serverless applications (mainly NodeJS) and deploy them seemlessly on AWS, GCP -or Azure. +Although both Lithops and Serverless Framework leverage **serverless functions**, their objectives are fundamentally different: + +- **Serverless Framework** focuses on application deployment (e.g., microservices, REST APIs). +- **Lithops** targets **parallel and data-intensive workloads**, enabling large-scale execution of Python functions over scientific datasets, data lakes, and unstructured data in object storage. + +Summary +------- -Although both Serverless Framework and Lithops use serverless functions, their objective is completely different: -Serverless Framework aims to provide an easy-to-use tool to develop applications related to web services, like HTTP APIs, -while Lithops aims to develop applications related to highly parallel scientific computation and Big Data processing. +Lithops stands out as a **cloud-native, serverless-first framework** purpose-built for **parallel computing, data analytics, and scientific workloads**. By abstracting away infrastructure management and providing built-in object storage integration, it delivers a unique balance of **simplicity**, **performance**, and **multi-cloud compatibility** — distinguishing it from traditional cluster-based frameworks and generic serverless tools alike. diff --git a/docs/source/compute_backends.rst b/docs/source/compute_backends.rst index 437a7d711..49c52f14f 100644 --- a/docs/source/compute_backends.rst +++ b/docs/source/compute_backends.rst @@ -7,27 +7,34 @@ Compute Backends compute_config/localhost.md -**Serverless Compute Backends:** +**Serverless (FaaS) Backends:** .. toctree:: :glob: :maxdepth: 1 - compute_config/ibm_cf.md - compute_config/code_engine.md compute_config/aws_lambda.md - compute_config/aws_batch.md compute_config/gcp_functions.md - compute_config/gcp_cloudrun.md compute_config/azure_functions.md - compute_config/azure_containers.md compute_config/oracle_functions.md compute_config/aliyun_functions.md + compute_config/openwhisk.md + +**Serverless (CaaS) Backends:** + +.. toctree:: + :glob: + :maxdepth: 1 + + compute_config/code_engine.md + compute_config/aws_batch.md + compute_config/gcp_cloudrun.md + compute_config/azure_containers.md compute_config/kubernetes.md compute_config/knative.md - compute_config/openwhisk.md + compute_config/singularity.md -**Standalone Compute Backends:** +**Standalone Backends:** .. toctree:: :glob: diff --git a/docs/source/compute_config/aliyun_functions.md b/docs/source/compute_config/aliyun_functions.md index 5702fb60f..8d15966f5 100644 --- a/docs/source/compute_config/aliyun_functions.md +++ b/docs/source/compute_config/aliyun_functions.md @@ -6,11 +6,11 @@ Lithops with *Aliyun Function Compute* as serverless compute backend. 1. Install Alibaba Cloud backend dependencies: -``` +```bash python3 -m pip install lithops[aliyun] ``` -1. Access to your [console](https://homenew-intl.console.aliyun.com/) and activate your Functions service instance. +2. Access to your [console](https://homenew-intl.console.aliyun.com/) and activate your Functions service instance. 
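Once the configuration described in the next section is in place, a quick way to verify the backend end to end is the built-in hello test. This is only a sketch: the backend and storage names used here (`aliyun_fc` and `aliyun_oss`) are assumptions and should match the names in your own config.

```bash
# Runs a single hello-world function on Aliyun Function Compute,
# storing temporary data in Aliyun Object Storage Service
lithops hello -b aliyun_fc -s aliyun_oss
```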
## Configuration diff --git a/docs/source/compute_config/aws_batch.md b/docs/source/compute_config/aws_batch.md index 530340695..d03642b99 100644 --- a/docs/source/compute_config/aws_batch.md +++ b/docs/source/compute_config/aws_batch.md @@ -4,54 +4,104 @@ Lithops with *AWS Batch* as serverless batch compute backend. ## Installation -1. Install Amazon Web Services backend dependencies: +1. Install AWS backend dependencies: -``` +```bash python3 -m pip install lithops[aws] ``` -2. [Login](https://console.aws.amazon.com/?nc2=h_m_mc) to Amazon Web Services Console (or signup if you don't have an account) +## Configuration + +1. [Login](https://console.aws.amazon.com/?nc2=h_m_mc) to Amazon Web Services Console (or signup if you don't have an account) -3. Navigate to **IAM > Roles** to create the ECS Task Execution Role. AWS provides a defualt role named `ecsTaskExecutionRole`, which can be used instead. If you want to create another role or it is missing, create a new role attached to `Elastic Container Service Task`, and add the following policies: +2. Navigate to **IAM > Roles** to create the ECS Task Execution Role. AWS provides a default role named `ecsTaskExecutionRole`, which can be used instead. If you want to create another role or it is missing, create a new role attached to `Elastic Container Service Task`, and add the following policies: - `SecretsManagerReadWrite` - `AmazonEC2ContainerRegistryFullAccess` - `CloudWatchFullAccess` - `AmazonECSTaskExecutionRolePolicy` -4. Navigate to **IAM > Roles** to create the ECS Instance Role. AWS provides a defualt role named `ecsInstanceRole`, which can be used instead. If you want to create another role or it is missing, create a new role attached to `EC2`, and add the following policy: +3. Navigate to **IAM > Roles** to create the ECS Instance Role. AWS provides a default role named `ecsInstanceRole`, which can be used instead. If you want to create another role or it is missing, create a new role attached to `EC2`, and add the following policy: - `AmazonEC2ContainerServiceforEC2Role` -## Configuration - -5. Edit your lithops config and add the following keys: - -```yaml -aws: - region: - access_key_id: - secret_access_key: - -aws_batch: - runtime : - runtime_timeout: - runtime_memory: - worker_processes: - container_vcpus: - execution_role: - instance_role: - env_type: - env_max_cpus: - assign_public_ip: - subnets: - - - - - - ... - security_groups: - - - - - - ... +4. Navigate to **IAM > Policies**. Click on **Create policy**. If you already created this policy for the AWS Lambda or AWS EC2 backend, jump to step 7. + +5. Select **JSON** tab and paste the following JSON policy: +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:*", + "lambda:*", + "ec2:*", + "ecr:*", + "sts:GetCallerIdentity", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Resource": "*" + } + ] +} ``` +6. Click **Next: Tags** and **Next: Review**. Fill the policy name field (you can name it `lithops-policy` or similar) and create the policy. + +7. Go back to **IAM** and navigate to **Roles** tab. Click **Create role**. + +8. Choose **Elastic Container Service** on the use case list and then click on **Elastic Container Service Task**. Click **Next: Permissions**. Select the policy created before (`lithops-policy`). Click **Next: Tags** and **Next: Review**. Type a role name, for example `ecsTaskJobRole`. Click on **Create Role**. 
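+Before moving on, you can optionally confirm from Python that the roles referenced above are visible to your account. This is an illustrative sketch, not an official step; `ecsTaskExecutionRole` and `ecsTaskJobRole` are the example role names used in this guide, and the check requires AWS credentials with IAM read permissions:
+
+```python
+# Optional sanity check: verify that the IAM roles used by the AWS Batch backend exist.
+import boto3
+
+iam = boto3.client("iam")
+for role_name in ("ecsTaskExecutionRole", "ecsTaskJobRole"):
+    role = iam.get_role(RoleName=role_name)["Role"]
+    print(f"{role_name}: {role['Arn']}")
+```
+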
+ +## AWS Credential setup + +Lithops loads AWS credentials as specified in the [boto3 configuration guide](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html). + +In summary, you can use one of the following settings: + +1. Provide the credentials via the `~/.aws/config` file, or set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables. + + You can run `aws configure` command if the AWS CLI is installed to setup the credentials. Then set in the Lithops config file: + ```yaml + lithops: + backend: aws_batch + + aws_batch: + region : + execution_role: + job_role: + subnets: + - + - + - ... + security_groups: + - + - ... + ``` + +2. Provide the credentials in the `aws` section of the Lithops config file. In this case you can omit setting the `job_role`: + ```yaml + lithops: + backend: aws_batch + + aws: + access_key_id: + secret_access_key: + region: + + aws_batch: + execution_role: + job_role: # Not mandatory if the credentials are in the aws section + subnets: + - + - + - ... + security_groups: + - + - ... + ``` + ## Summary of configuration keys for AWS ### AWS @@ -59,8 +109,8 @@ aws_batch: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| |aws | region | |yes | AWS region name. For example `us-east-1` | -|aws | access_key_id | |yes | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | -|aws | secret_access_key | |yes | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | access_key_id | |no | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | secret_access_key | |no | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | |aws | session_token | |no | Session token for temporary AWS credentials | |aws | account_id | |no | *This field will be used if present to retrieve the account ID instead of using AWS STS. The account ID is used to format full image names for container runtimes. | @@ -69,17 +119,18 @@ aws_batch: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| | aws_batch | execution_role | | yes | ARN of the execution role used to execute AWS Batch tasks on ECS for Fargate environments | -| aws_batch | instance_role | | yes | ARN of the execution role used to execute AWS Batch tasks on ECS for EC2 environments | +| aws_batch | job_role | | yes | ARN of the job role used to execute AWS Batch tasks on ECS for Fargate environments. Not mandatory if the credentials are in the `aws` section of the configuration| | aws_batch | security_groups | | yes | List of Security groups to attach for ECS task containers. By default, you can use a security group that accepts all outbound traffic but blocks all inbound traffic. | -| aws_batch | subnets | | yes | List of subnets from a VPC where to deploy the ECS task containers. Note that if you are using a **private subnet**, you can set `assing_public_ip` to `false` but make sure containers can reach other AWS services like ECR, Secrets service, etc., by, for example, using a NAT gateway. 
If you are using a **public subnet** you must set `assing_public_ip` to `true` |
+| aws_batch | subnets | | yes | List of subnets from a VPC where to deploy the ECS task containers. Note that if you are using a **private subnet**, you can set `assign_public_ip` to `false` but make sure containers can reach other AWS services like ECR, Secrets service, etc., by, for example, using a NAT gateway. If you are using a **public subnet** you must set `assign_public_ip` to `true` |
+| aws_batch | instance_role | | no | ARN of the execution role used to execute AWS Batch tasks on ECS for EC2 environments. Mandatory if using the **EC2** or **SPOT** `env_type` |
| aws_batch | region | | no | Region name (like `us-east-1`) where to deploy the ECS cluster. Lithops will use the region set under the `aws` section if it is not set here |
-| aws_batch | assign_public_ip | `true` | no | Assing public IPs to ECS task containers. Set to `true` if the tasks are being deployed in a public subnet. Set to `false` when deploying on a private subnet. |
-| aws_batch | runtime | `default_runtime-v3X` | no | Runtime name |
-| aws_batch | runtime_timeout | 180 | no | Runtime timeout |
-| aws_batch | runtime_memory | 1024 | no | Runtime memory |
-| aws_batch | worker_processes | 1 | no | Worker processes |
-| aws_batch | container_vcpus | 0.5 | no | Number of vCPUs assigned to each task container. It can be different from `worker_processes`. Use it to run a task that uses multiple processes within a container. |
-| aws_batch | service_role | `None` | no | Service role for AWS Batch. Leave empty for use a service-linked execution role. More info [here](https://docs.aws.amazon.com/batch/latest/userguide/using-service-linked-roles.html) |
+| aws_batch | assign_public_ip | `true` | no | Assign public IPs to ECS task containers. Set to `true` if the tasks are being deployed in a public subnet. Set to `false` when deploying on a private subnet. |
+| aws_batch | runtime | | no | Container runtime name in ECR. If not provided Lithops will automatically build a default runtime |
+| aws_batch | runtime_timeout | 180 | no | Runtime timeout managed by the cloud provider. |
+| aws_batch | runtime_memory | 1024 | no | Runtime memory assigned to each task container. |
+| aws_batch | runtime_cpu | 0.5 | no | Number of vCPUs assigned to each task container. It can be different from `worker_processes`. |
+| aws_batch | worker_processes | 1 | no | Number of parallel Lithops processes in a worker. This is used to parallelize function activations within the worker. |
+| aws_batch | service_role | | no | Service role for AWS Batch. Leave empty to use a service-linked execution role.
More info [here](https://docs.aws.amazon.com/batch/latest/userguide/using-service-linked-roles.html) |
| aws_batch | env_max_cpus | 10 | no | Maximum total CPUs of the compute environment |
| aws_batch | env_type | FARGATE_SPOT | no | Compute environment type, one of: `["EC2", "SPOT", "FARGATE", "FARGATE_SPOT"]` |
@@ -97,4 +148,4 @@ You can view the function executions logs in your local machine using the *litho
```bash
lithops logs poll
-```
\ No newline at end of file
+```
diff --git a/docs/source/compute_config/aws_ec2.md b/docs/source/compute_config/aws_ec2.md
index 45a19efab..0b19b61ef 100644
--- a/docs/source/compute_config/aws_ec2.md
+++ b/docs/source/compute_config/aws_ec2.md
@@ -1,124 +1,240 @@
# AWS Elastic Compute Cloud (EC2)
-The AWS EC2 client of Lithops can provide a truely serverless user experience on top of EC2 where Lithops creates new Virtual Machines (VMs) dynamically in runtime and scale Lithops jobs against them. Alternatively Lithops can start and stop an existing VM instances.
+The AWS EC2 client of Lithops can provide a truly serverless user experience on top of EC2, where Lithops dynamically creates new Virtual Machines (VMs) at runtime and scales Lithops jobs against them (Create & Reuse modes). Alternatively, Lithops can start and stop an existing VM instance (Consume mode).
## AWS
The assumption that you already familiar with AWS, and you have AUTH credentials to your account (HMAC Credentials).
### Choose an operating system image for the VM
-Any Virtual Machine (VM) need to define the instance’s operating system and version. Lithops support both standard operating system choices provided by the VPC or using pre-defined custom images that already contains all dependencies required by Lithops.
+Any Virtual Machine (VM) needs to define the instance’s operating system and version. Lithops supports both the standard operating system images provided by the VPC and pre-defined custom images that already contain all dependencies required by Lithops.
-- Option 1: By default, Lithops uses an Ubuntu 22.04 image. In this case, no further action is required and you can continue to the next step. Lithops will install all required dependencies in the VM by itself. Notice this can consume about 3 min to complete all installations.
+- Option 1: By default, Lithops uses an Ubuntu 22.04 image. In this case, no further action is required and you can continue to the next step. Lithops will install all required dependencies in the VM by itself. Note this can take about 3 min to complete all installations.
- Option 2: Alternatively, you can use a pre-built custom image that will greatly improve VM creation time for Lithops jobs. To benefit from this approach, navigate to [runtime/aws_ec2](https://github.com/lithops-cloud/lithops/tree/master/runtime/aws_ec2), and follow the instructions.
-## Lithops and the VM consume mode
+## Installation
-In this mode, Lithops can start and stop an existing VM, and deploy an entire job to that VM. The partition logic in this scenario is different from the `create/reuse` modes, since the entire job is executed in the same VM.
-
-### Lithops configuration for the consume mode
+1.
Install AWS backend dependencies: -Edit your lithops config and add the relevant keys: +```bash +python3 -m pip install lithops[aws] +``` -```yaml -lithops: - backend: aws_ec2 +## Create and reuse modes +In the `create` mode, Lithops will automatically create new worker VM instances in runtime, scale Lithops job against generated VMs, and automatically delete the VMs when the job is completed. +Alternatively, you can set the `reuse` mode to keep running the started worker VMs, and reuse them for further executions. In the `reuse` mode, Lithops checks all the available worker VMs and start new workers only if necessary. -aws: - region : - access_key_id: - secret_access_key: -aws_ec2: - instance_id : +### Configuration + +1. Navigate to **IAM > Policies**. Click on **Create policy**. If you already created this policy for the AWS Lambda or AWS Batch backend, jump to step 4. + +2. Select **JSON** tab and paste the following JSON policy: +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "s3:*", + "lambda:*", + "ec2:*", + "ecr:*", + "sts:GetCallerIdentity", + "logs:CreateLogGroup", + "logs:CreateLogStream", + "logs:PutLogEvents" + ], + "Resource": "*" + } + ] +} ``` +3. Click **Next: Tags** and **Next: Review**. Fill the policy name field (you can name it `lithops-policy` or similar) and create the policy. + +4. Go back to **IAM** and navigate to **Roles** tab. Click **Create role**. + +5. Choose **EC2** on the use case list. Click **Next: Permissions**. Select the policy created before (`lithops-policy`). Click **Next: Tags** and **Next: Review**. Type a role name, for example `ec2LithopsInstanceRole`. Click on **Create Role**. + + +### AWS Credential setup + +Lithops loads AWS credentials as specified in the [boto3 configuration guide](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html). + +In summary, you can use one of the following settings: + +1. Provide the credentials via the `~/.aws/config` file, or set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables. + + You can run `aws configure` command if the AWS CLI is installed to setup the credentials. Then set in the Lithops config file: + ```yaml + lithops: + backend: aws_ec2 + + aws_ec2: + region : + instance_role: + exec_mode: reuse + ``` + +2. Provide the credentials in the `aws` section of the Lithops config file: + ```yaml + lithops: + backend: aws_ec2 + + aws: + access_key_id: + secret_access_key: + region: + + aws_ec2: + instance_role: + exec_mode: reuse + ``` ### Summary of configuration keys for AWS |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| |aws | region | |yes | AWS Region. For example `us-east-1` | -|aws | access_key_id | |yes | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | -|aws | secret_access_key | |yes | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | access_key_id | |no | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | secret_access_key | |no | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. 
| |aws | session_token | |no | Session token for temporary AWS credentials | |aws | account_id | |no | *This field will be used if present to retrieve the account ID instead of using AWS STS. The account ID is used to format full image names for container runtimes. | -### EC2 - Consume Mode +### EC2 - Create and Reuse Modes |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|aws_ec2 | instance_id | | yes | virtual server instance ID | -|aws_ec2 | region | |no | Region name of the VPC. For example `us-east-1`. Lithops will use the region set under the `aws` section if it is not set here | +|aws_ec2 | region | |no | Region name, for example: `eu-west-1`. Lithops will use the `region` set under the `aws` section if it is not set here | +|aws_ec2 | instance_role | | yes | EC2 Instance role name created in the configuration section above. Do not use the full ARN here; only the role name is required. For example: `ec2LithopsInstanceRole`| +|aws_ec2 | vpc_id | | no | VPC id. You can find all the available VPCs in the [VPC Console page](https://console.aws.amazon.com/vpc/v2/home#vpcs:). If not provided, Lithops will create a new VPC | +|aws_ec2 | public_subnet_id | | no | Public subnet id. You can find all the available Subnets in the [VPC Console page](https://console.aws.amazon.com/vpc/v2/home#subnets:). If not provided, Lithops will create a new public subnet | +|aws_ec2 | public_subnet_cidr_block | 10.0.1.0/24 | no | In case a `public_subnet_id` is not provided, Lithops will create a new subnet with this CIDR block | +|aws_ec2 | security_group_id | | no | Security group ID. You can find the available security groups in the [VPC console page](https://console.aws.amazon.com/vpc/v2/home#SecurityGroups:). The security group must have ports 22, 6379, 8080 and 8081 open. If not provided, Lithops will create a new security group | +|aws_ec2 | ssh_key_name | | no | SSH Key name. You can find the available keys in the [EC2 console page](https://console.aws.amazon.com/ec2/v2/home#KeyPairs:). Create a new one or upload your own key if it does not exist| |aws_ec2 | ssh_username | ubuntu |no | Username to access the VM | -|aws_ec2 | ssh_key_filename | ~/.ssh/id_rsa | no | Path to the ssh key file provided to create the VM. It will use the default path if not provided | -|aws_ec2 | worker_processes | 2 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of the VM. | -|aws_ec2 | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the defeuv python3 interpreter of the VM | +|aws_ec2 | ssh_password | |no | Password for accessing the worker VMs. If not provided, it is created randomly| +|aws_ec2 | ssh_key_filename | ~/.ssh/id_rsa | no | Path to the ssh key file provided to access the VPC. If not provided, Lithops will use the default path and create a new ssh key for the VPC | +|aws_ec2 | request_spot_instances | True | no | Request spot instance for worker VMs| +|aws_ec2 | target_ami | | no | Virtual machine image id. Default is Ubuntu Server 22.04 | +|aws_ec2 | master_instance_type | t2.micro | no | Profile name for the master VM | +|aws_ec2 | worker_instance_type | t2.medium | no | Profile name for the worker VMs | +|aws_ec2 | delete_on_dismantle | True | no | Delete the worker VMs when they are stopped. 
Master VM is never deleted when stopped |
+|aws_ec2 | max_workers | 100 | no | Max number of workers per `FunctionExecutor()`|
+|aws_ec2 | worker_processes | AUTO | no | Number of parallel Lithops processes in a worker. This is used to parallelize function activations within the worker. By default it detects the number of CPUs in the `worker_instance_type` VM|
+|aws_ec2 | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the default python3 interpreter of the VM |
|aws_ec2 | auto_dismantle | True |no | If False then the VM is not stopped automatically.|
|aws_ec2 | soft_dismantle_timeout | 300 |no| Time in seconds to stop the VM instance after a job **completed** its execution |
|aws_ec2 | hard_dismantle_timeout | 3600 | no | Time in seconds to stop the VM instance after a job **started** its execution |
+|aws_ec2 | exec_mode | reuse | no | One of: **consume**, **create** or **reuse**. If set to **create**, Lithops will automatically create new VMs for each map() call based on the number of elements in iterdata. If set to **reuse**, Lithops will try to reuse running workers if they exist |
-## Lithops and the VM auto create|reuse mode
-In the `create` mode, Lithops will automatically create new worker VM instances in runtime, scale Lithops job against generated VMs, and automatically delete the VMs when the job is completed.
-Alternatively, you can set the `reuse` mode to keep running the started worker VMs, and reuse them for further executions. In the `reuse` mode, Lithops checks all the available worker VMs and start new workers only if necessary.
+## Additional configuration
-### Lithops configuration for the auto create mode
+### Elastic Block Store (EBS)
-Edit your lithops config and add the relevant keys:
+To attach EBS volumes to an EC2 instance in Lithops, you can configure the `aws_ec2` section as follows.
```yaml
-lithops:
-    backend: aws_ec2
-
-aws:
-    access_key_id:
-    secret_access_key:
-    region:
-
aws_ec2:
-    iam_role:
-    exec_mode: reuse
+    execution_role:
+    region:
+    ...
+    ebs_volumes:
+      - device_name: /dev/xvda
+        ebs:
+          volume_size: 100
+          volume_type: gp2
+          delete_on_termination: true
+          encrypted: false
+          kms_key_id:
+      - device_name: /dev/xvdf
+        ebs:
+          volume_size: 50
+          volume_type: gp3
+          delete_on_termination: true
+          encrypted: false
+          iops: 3000
+          throughput: 125
+    ...
```
+|Group|Key|Default|Mandatory|Additional info|
+|---|---|---|---|---|
+| ebs | volume_size | 8 | No | Size of the volume in GiB |
+| ebs | volume_type | gp2 | No | Type of volume. Options: `gp2`, `gp3`, `io1`, `io2`, `sc1`, `st1`, `standard`|
+| ebs | delete_on_termination| True | No | Whether the volume is deleted automatically when the instance is terminated |
+| ebs | encrypted | False | No | Whether the volume is encrypted |
+| ebs | kms_key_id | | No | ARN of the KMS key used for encryption. If not provided, the default AWS-managed key is used |
+| ebs | iops | | No | Provisioned IOPS for `io1`, `io2`, or `gp3` volumes |
+| ebs | throughput | | No | Throughput in MiB/s for `gp3` volumes |
+
+
+## Consume mode
+
+In this mode, Lithops can start and stop an existing VM, and deploy an entire job to that VM. The partition logic in this scenario is different from the `create/reuse` modes, since the entire job is executed in the same VM.
+
+### AWS Credential setup
+
+Lithops loads AWS credentials as specified in the [boto3 configuration guide](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html).
+ +In summary, you can use one of the following settings: + +1. Provide the credentials via the `~/.aws/config` file, or set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables. + + You can run `aws configure` command if the AWS CLI is installed to setup the credentials. Then set in the Lithops config file: + ```yaml + lithops: + backend: aws_ec2 + + aws_ec2: + region : + exec_mode: consume + instance_id : + ``` + +2. Provide the credentials in the `aws` section of the Lithops config file: + ```yaml + lithops: + backend: aws_ec2 + + aws: + access_key_id: + secret_access_key: + region: + + aws_ec2: + exec_mode: consume + instance_id : + ``` + ### Summary of configuration keys for AWS |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|aws | region | |yes | AWS Region. For example `us-east-1` | -|aws | access_key_id | |yes | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | -|aws | secret_access_key | |yes | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | region | |no | AWS Region. For example `us-east-1` | +|aws | access_key_id | |no | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | secret_access_key | |no | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | |aws | session_token | |no | Session token for temporary AWS credentials | |aws | account_id | |no | *This field will be used if present to retrieve the account ID instead of using AWS STS. The account ID is used to format full image names for container runtimes. | -### EC2 - Create and Reuse Modes +### Summary of configuration keys for the consume Mode |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|aws_ec2 | region | |yes | Region name, for example: `eu-west-1`. Lithops will use the `region` set under the `aws` section if it is not set here | -|aws_ec2 | iam_role | | yes | IAM EC2 role name. You can find it in the [IAM Console page](https://console.aws.amazon.com/iamv2/home#/roles). Create a new EC2 role if it does not exist| -|aws_ec2 | vpc_id | | no | VPC id. You can find all the available VPCs in the [VPC Console page](https://console.aws.amazon.com/vpc/v2/home#vpcs:) | -|aws_ec2 | subnet_id | | no | Subnet id. You can find all the available Subnets in the [VPC Console page](https://console.aws.amazon.com/vpc/v2/home#subnets:) | -|aws_ec2 | security_group_id | | no | Security group ID. You can find the available security groups in the [VPC console page](https://console.aws.amazon.com/vpc/v2/home#SecurityGroups:). The security group must have ports 22 and 8080 open | -|aws_ec2 | ssh_key_name | | no | SSH Key name. You can find the available keys in the [EC2 console page](https://console.aws.amazon.com/ec2/v2/home#KeyPairs:). Create a new one or upload your own key if it does not exist| +|aws_ec2 | instance_id | | yes | virtual server instance ID | +|aws_ec2 | region | |yes | Region name of the VPC. For example `us-east-1`. Lithops will use the region set under the `aws` section if it is not set here | |aws_ec2 | ssh_username | ubuntu |no | Username to access the VM | -|aws_ec2 | ssh_password | |no | Password for accessing the worker VMs. 
If not provided, it is created randomly| -|aws_ec2 | ssh_key_filename | ~/.ssh/id_rsa | no | Path to the ssh key file provided to access the VPC. It will use the default path if not provided | -|aws_ec2 | request_spot_instances | True | no | Request spot instance for worker VMs| -|aws_ec2 | target_ami | | no | Virtual machine image id. Default is Ubuntu Server 20.04 | -|aws_ec2 | master_instance_type | t2.micro | no | Profile name for the master VM | -|aws_ec2 | worker_instance_type | t2.medium | no | Profile name for the worker VMs | -|aws_ec2 | delete_on_dismantle | True | no | Delete the worker VMs when they are stopped. Master VM is never deleted when stopped | -|aws_ec2 | max_workers | 100 | no | Max number of workers per `FunctionExecutor()`| -|aws_ec2 | worker_processes | 2 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of a worker VM. | -|aws_ec2 | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the default python3 interpreter of the VM | +|aws_ec2 | ssh_key_filename | ~/.ssh/id_rsa | no | Path to the ssh key file provided to create the VM. It will use the default path if not provided | +|aws_ec2 | worker_processes | AUTO | no | Number of parallel Lithops processes in a worker. This is used to parallelize function activations within the worker. By default it detects the amount of CPUs in the VM| +|aws_ec2 | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the default python3 interpreter of the VM | |aws_ec2 | auto_dismantle | True |no | If False then the VM is not stopped automatically.| |aws_ec2 | soft_dismantle_timeout | 300 |no| Time in seconds to stop the VM instance after a job **completed** its execution | |aws_ec2 | hard_dismantle_timeout | 3600 | no | Time in seconds to stop the VM instance after a job **started** its execution | -|aws_ec2 | exec_mode | consume | no | One of: **consume**, **create** or **reuse**. If set to **create**, Lithops will automatically create new VMs for each map() call based on the number of elements in iterdata. If set to **reuse** will try to reuse running workers if exist | -|aws_ec2 | pull_runtime | False | no | If set to True, Lithops will execute the command `docker pull ` in each VSI before executing the a job (in case of using a docker runtime)| -|aws_ec2 | workers_policy | permissive | no | One of: **permissive**, **strict**. If set to **strict** will force creation of required workers number | + ## Test Lithops Once you have your compute and storage backends configured, you can run a hello world function with: @@ -135,10 +251,40 @@ You can view the function executions logs in your local machine using the *litho lithops logs poll ``` -The master and worker VMs contain the Lithops service logs in `/tmp/lithops-root/service.log` +## VM Management + +Lithops for AWS EC2 follows a Master-Worker architecture (1:N). + +All the VMs, including the master VM, are automatically stopped after a configurable timeout (see hard/soft dismantle timeouts). 
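+As a reference, the dismantle behaviour can also be tuned programmatically by passing a configuration dictionary to the executor. The snippet below is an illustrative sketch: the timeout values are arbitrary examples, and it assumes the remaining settings (credentials, storage backend, instance role) are already available to Lithops:
+
+```python
+# Example: run a small map() on the aws_ec2 backend with custom dismantle timeouts.
+# The dictionary keys mirror the YAML configuration documented above.
+import lithops
+
+config = {
+    'lithops': {'backend': 'aws_ec2'},
+    'aws_ec2': {
+        'exec_mode': 'reuse',
+        'soft_dismantle_timeout': 120,   # stop worker VMs 2 minutes after the job completes
+        'hard_dismantle_timeout': 3600,  # hard stop 1 hour after the job starts
+    },
+}
+
+def double(x):
+    return x * 2
+
+fexec = lithops.FunctionExecutor(config=config)
+fexec.map(double, range(8))
+print(fexec.get_result())
+```
+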
You can login to the master VM and get a live ssh connection with: ```bash lithops attach -b aws_ec2 ``` + +The master and worker VMs contain the Lithops service logs in `/tmp/lithops-root/*-service.log` + +To list all the available workers in the current moment, use the next command: + +```bash +lithops worker list -b aws_ec2 +``` + +You can also list all the submitted jobs with: + +```bash +lithops job list -b aws_ec2 +``` + +You can delete all the workers with: + +```bash +lithops clean -b aws_ec2 -s aws_s3 +``` + +You can delete all the workers including the Master VM with the `--all` flag: + +```bash +lithops clean -b aws_ec2 -s aws_s3 --all +``` diff --git a/docs/source/compute_config/aws_lambda.md b/docs/source/compute_config/aws_lambda.md index 57696f287..8653d72c8 100644 --- a/docs/source/compute_config/aws_lambda.md +++ b/docs/source/compute_config/aws_lambda.md @@ -4,23 +4,24 @@ Lithops with *AWS Lambda* as serverless compute backend. ## Installation -1. Install Amazon Web Services backend dependencies: +1. Install AWS backend dependencies: -``` +```bash python3 -m pip install lithops[aws] ``` -2. [Login](https://console.aws.amazon.com/?nc2=h_m_mc) to Amazon Web Services Console (or signup if you don't have an account) +## Configuration + +1. [Login](https://console.aws.amazon.com/?nc2=h_m_mc) to Amazon Web Services Console (or signup if you don't have an account) -3. Navigate to **IAM > Policies**. Click on **Create policy**. +2. Navigate to **IAM > Policies**. Click on **Create policy**. -4. Select **JSON** tab and paste the following JSON policy: +3. Select **JSON** tab and paste the following JSON policy: ```json { "Version": "2012-10-17", "Statement": [ { - "Sid": "VisualEditor0", "Effect": "Allow", "Action": [ "s3:*", @@ -38,28 +39,43 @@ python3 -m pip install lithops[aws] } ``` -5. Click **Next: Tags** and **Next: Review**. Fill the policy name field (you can name it `lithops-policy` or simmilar) and create the policy. +4. Click **Next: Tags** and **Next: Review**. Fill the policy name field (you can name it `lithops-policy` or similar) and create the policy. -6. Go back to **IAM** and navigate to **Roles** tab. Click **Create role**. +5. Go back to **IAM** and navigate to **Roles** tab. Click **Create role**. -7. Choose **Lambda** on the use case list and click **Next: Permissions**. Select the policy created before (`lithops-policy`). Click **Next: Tags** and **Next: Review**. Type a role name, for example `lithops-execution-role`. Click on *Create Role*. +6. Choose **Lambda** on the use case list and click **Next: Permissions**. Select the policy created before (`lithops-policy`). Click **Next: Tags** and **Next: Review**. Type a role name, for example `lambdaLithopsExecutionRole`. Click on *Create Role*. -## Configuration +## AWS Credential setup -6. Edit your lithops config and add the following keys: +Lithops loads AWS credentials as specified in the [boto3 configuration guide](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html). -```yaml -lithops: - backend: aws_lambda +In summary, you can use one of the following settings: -aws: - region: - access_key_id: - secret_access_key: +1. Provide the credentials via the `~/.aws/config` file, or set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables. -aws_lambda: - execution_role: -``` + You can run `aws configure` command if the AWS CLI is installed to setup the credentials. 
Then set in the Lithops config file: + ```yaml + lithops: + backend: aws_lambda + + aws_lambda: + execution_role: + region: + ``` + +2. Provide the credentials in the `aws` section of the Lithops config file: + ```yaml + lithops: + backend: aws_lambda + + aws: + access_key_id: + secret_access_key: + region: + + aws_lambda: + execution_role: + ``` ## Summary of configuration keys for AWS @@ -68,28 +84,32 @@ aws_lambda: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| |aws | region | |yes | AWS Region. For example `us-east-1` | -|aws | access_key_id | |yes | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | -|aws | secret_access_key | |yes | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | access_key_id | |no | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | secret_access_key | |no | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | |aws | session_token | |no | Session token for temporary AWS credentials | |aws | account_id | |no | *This field will be used if present to retrieve the account ID instead of using AWS STS. The account ID is used to format full image names for container runtimes. | ### AWS Lambda -| Group | Key | Default | Mandatory | Additional info | -|------------|---------------------|---------|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| aws_lambda | execution_role | | yes | ARN of the execution role created at step 3. You can find it in the Role page at the *Roles* list in the *IAM* section (e.g. `arn:aws:iam::1234567890:role/lithops-execution-role` | -| aws_lambda | region | | no | Region where the S3 bucket is located and where Lambda functions will be invoked (e.g. `us-east-1`). Lithops will use the `region` set under the `aws` section if it is not set here | -| aws_lambda | max_workers | 1000 | no | Max number of workers per `FunctionExecutor()` | -| aws_lambda | worker_processes | 1 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker | -| aws_lambda | runtime | | no | Docker image name | -| aws_lambda | runtime_memory | 256 | no | Memory limit in MB. Default 256MB | -| aws_lambda | runtime_timeout | 180 | no | Runtime timeout in seconds. Default 3 minutes | -| aws_lambda | invoke_pool_threads | 64 | no | Number of concurrent threads used for invocation | -| aws_lambda | remote_invoker | False | no | Activate the remote invoker feature that uses one cloud function to spawn all the actual `map()` activations | -| aws_lambda | architecture | x86_64 | no | Runtime architecture. 
One of **x86_64** or **arm64** | -| aws_lambda | ephemeral_storage | 512 | no | Ephemeral storage (`/tmp`) size in MB (must be between 512 MB and 10240 MB) | -| aws_lambda | env_vars | {} | no | List of {name: ..., value: ...} pairs for Lambda instance environment variables | - +| Group | Key | Default | Mandatory | Additional info | +| --- | --- | --- | --- | --- | +| aws_lambda | execution_role | | yes | ARN of the execution role created at step 3. You can find it in the Role page at the *Roles* list in the *IAM* section (e.g. `arn:aws:iam::1234567890:role/lambdaLithopsExecutionRole` | +| aws_lambda | region | | no | Region where Lambda functions will be invoked (e.g. `us-east-1`). Lithops will use the `region` set under the `aws` section if it is not set here | +| aws_lambda | max_workers | 1000 | no | Max number of workers per `FunctionExecutor()` | +| aws_lambda | worker_processes | 1 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker | +| aws_lambda | runtime | | no | Docker image name | +| aws_lambda | runtime_memory | 256 | no | Memory limit in MB. Default 256MB | +| aws_lambda | runtime_timeout | 180 | no | Runtime timeout in seconds. Default 3 minutes | +| aws_lambda | invoke_pool_threads | 64 | no | Number of concurrent threads used for invocation | +| aws_lambda | remote_invoker | False | no | Activate the remote invoker feature that uses one cloud function to spawn all the actual `map()` activations | +| aws_lambda | architecture | x86_64 | no | Runtime architecture. One of **x86_64** or **arm64** | +| aws_lambda | ephemeral_storage | 512 | no | Ephemeral storage (`/tmp`) size in MB (must be between 512 MB and 10240 MB) | +| aws_lambda | user_tags | {} | no | List of {name: ..., value: ...} pairs for Lambda instance user tags | +| aws_lambda | env_vars | {} | no | List of {name: ..., value: ...} pairs for Lambda instance environment variables | +| aws_lambda | namespace | | no | Virtual namespace. This can be useful to virtually group Lithops function workers. The functions deployed by lithops will be prefixed by this namespace. For example you can set it to differentiate between `prod`, `dev` and `stage` environments. | +| aws_lambda | runtime_include_function | False | no | If set to true, Lithops will automatically build a new runtime, including the function's code, instead of transferring it through the storage backend at invocation time. This is useful when the function's code size is large (in the order of 10s of MB) and the code does not change frequently | + + ## Additional configuration ### VPC @@ -155,4 +175,4 @@ You can view the function executions logs in your local machine using the *litho ```bash lithops logs poll -``` \ No newline at end of file +``` diff --git a/docs/source/compute_config/azure_containers.md b/docs/source/compute_config/azure_containers.md index 656e805d6..637042d35 100644 --- a/docs/source/compute_config/azure_containers.md +++ b/docs/source/compute_config/azure_containers.md @@ -6,7 +6,7 @@ Lithops with Azure Container Apps as serverless compute backend. 1. Install Microsoft Azure backend dependencies: -``` +```bash python3 -m pip install lithops[azure] ``` @@ -95,7 +95,7 @@ az containerapp env create --name lithops --resource-group LithopsResourceGroup |---|---|---|---|---| |azure_containers| resource_group | |no | Name of a resource group, for example: `LithopsResourceGroup`. 
Lithops will use the `resource_group` set under the `azure` section if it is not set here | |azure_containers| region | |no | The location where you created the `lithops` Container APP environment. For example: `westeurope`, `westus2`, etc. Lithops will use the `region` set under the `azure` section if it is not set here| -|azure_containers| environment | lithops |no | The environemnt name you created in the step 5 of the installation | +|azure_containers| environment | lithops |no | The environment name you created in the step 5 of the installation | |azure_containers | docker_server | index.docker.io |no | Container registry URL | |azure_containers | docker_user | |no | Container registry user name | |azure_containers | docker_password | |no | Container registry password/token. In case of Docker hub, login to your docker hub account and generate a new access token [here](https://hub.docker.com/settings/security)| @@ -106,6 +106,8 @@ az containerapp env create --name lithops --resource-group LithopsResourceGroup |azure_containers | runtime_timeout | 600 |no | Runtime timeout in seconds. Default 10 minutes | |azure_containers| trigger | pub/sub | no | Currently it supports pub/sub invocation| |azure_containers | invoke_pool_threads | 32 |no | Number of concurrent threads used for invocation | +|azure_containers | runtime_include_function | False | no | If set to true, Lithops will automatically build a new runtime, including the function's code, instead of transferring it through the storage backend at invocation time. This is useful when the function's code size is large (in the order of 10s of MB) and the code does not change frequently | + ## Test Lithops diff --git a/docs/source/compute_config/azure_functions.md b/docs/source/compute_config/azure_functions.md index 561a14bd7..d48d83160 100644 --- a/docs/source/compute_config/azure_functions.md +++ b/docs/source/compute_config/azure_functions.md @@ -6,7 +6,7 @@ Lithops with Azure Functions as serverless compute backend. 1. Install Microsoft Azure backend dependencies: -``` +```bash python3 -m pip install lithops[azure] ``` @@ -89,7 +89,7 @@ az login |---|---|---|---|---| |azure_functions| resource_group | |no | Name of a resource group, for example: `LithopsResourceGroup`. Lithops will use the `resource_group` set under the `azure` section if it is not set here | |azure_functions| region | |no | The location of the consumption plan for the runtime. Use `az functionapp list-consumption-locations` to view the available locations. For example: `westeurope`, `westus2`, etc. Lithops will use the `region` set under the `azure` section if it is not set here| -|azure_functions | max_workers | 1000 | no | Max number of parallel workers. Altough Azure limits the number of workrs to 200, it is convenient to keep this value high| +|azure_functions | max_workers | 1000 | no | Max number of parallel workers. Although Azure limits the number of workers to 200, it is convenient to keep this value high| |azure_functions | worker_processes | 1 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker | |azure_functions| runtime | |no | Runtime name already deployed in the service| |azure_functions | runtime_timeout | 300 |no | Runtime timeout in seconds. 
Default 5 minutes |
diff --git a/docs/source/compute_config/azure_vms.md b/docs/source/compute_config/azure_vms.md
index cdfcd5ab3..53916033c 100644
--- a/docs/source/compute_config/azure_vms.md
+++ b/docs/source/compute_config/azure_vms.md
@@ -1,18 +1,58 @@
-# Azure Virtual Machines (Beta)
+# Azure Virtual Machines
-The Azure Virtual Machines client of Lithops can provide a truely serverless user experience on top of Azure VMs where Lithops creates new Virtual Machines (VMs) dynamically in runtime and scale Lithops jobs against them. Alternatively Lithops can start and stop an existing VM instances.
+The Azure Virtual Machines client of Lithops can provide a truly serverless user experience on top of Azure VMs, where Lithops dynamically creates new Virtual Machines (VMs) at runtime and scales Lithops jobs against them (Create & Reuse modes). Alternatively, Lithops can start and stop an existing VM instance (Consume mode).
+## Installation
-### Choose an operating system image for the VM
+1. Install Microsoft Azure backend dependencies:
+
+```bash
+python3 -m pip install lithops[azure]
+```
+
+2. Install [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest)
+
+3. Sign in with the Azure CLI:
+
+```bash
+az login
+```
+
+4. Create a Resource Group and a Storage Account:
+
+   Option 1:
+
+   1. Access the [Azure portal Resource Groups](https://portal.azure.com/#view/HubsExtension/BrowseResourceGroups) and create a new Resource group named **LithopsResourceGroup** (or similar) in your preferred region. If you already have a resource group, omit this step.
+
+   2. Access the [Azure portal Storage Accounts](https://portal.azure.com/#view/HubsExtension/BrowseResource/resourceType/Microsoft.Storage%2FStorageAccounts) and create a new Storage Account with a unique name, for example: **lithops0sa25s1**. If you already have a storage account, omit this step.
+
+   Option 2:
+
+   1. Create a Resource Group in a specific location. If you already have a resource group, omit this step.
+
+   ```bash
+   az group create --name LithopsResourceGroup --location westeurope
+   ```
+
+   2. Create a Storage Account with a unique name. If you already have a storage account, omit this step.
+
+   ```bash
+   storage_account_name=lithops$(openssl rand -hex 3)
+   echo $storage_account_name
+   az storage account create --name $storage_account_name --location westeurope \
+      --resource-group LithopsResourceGroup --sku Standard_LRS
+   ```
+
+## Choose an operating system image for the VM
- Option 1: By default, Lithops uses an Ubuntu 22.04 image. In this case, no further action is required and you can continue to the next step. Lithops will install all required dependencies in the VM by itself. Notice this can consume about 3 min to complete all installations.
- Option 2: Alternatively, you can use a pre-built custom image that will greatly improve VM creation time for Lithops jobs. To benefit from this approach, navigate to [runtime/azure_vms](https://github.com/lithops-cloud/lithops/tree/master/runtime/azure_vms), and follow the instructions.
-## Lithops and the VM consume mode
-
-In this mode, Lithops can start and stop an existing VM, and deploy an entire job to that VM. The partition logic in this scenario is different from the `create/reuse` modes, since the entire job is executed in the same VM.
+## Create and reuse modes +In the `create` mode, Lithops will automatically create new worker VM instances in runtime, scale Lithops job against generated VMs, and automatically delete the VMs when the job is completed. +Alternatively, you can set the `reuse` mode to keep running the started worker VMs, and reuse them for further executions. In the `reuse` mode, Lithops checks all the available worker VMs and start new workers only if necessary. -### Lithops configuration for the consume mode +### Lithops configuration for the create or reuse modes Edit your lithops config and add the relevant keys: @@ -26,9 +66,7 @@ Edit your lithops config and add the relevant keys: subscription_id: azure_vms: - instance_name: - ssh_username: - ssh_key_filename: + exec_mode: reuse ``` @@ -40,26 +78,32 @@ Edit your lithops config and add the relevant keys: |azure| region | |yes | Location of the resource group, for example: `westeurope`, `westus2`, etc| |azure| subscription_id | |yes | Subscription ID from your account. Find it [here](https://portal.azure.com/#view/Microsoft_Azure_Billing/SubscriptionsBlade)| -### Azure VMs - Consume Mode +### Azure VMs - Create and Reuse Modes |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|azure_vms | instance_name | | yes | virtual server instance Name. The instance must exists in your resource group | -|azure_vms | ssh_username | ubuntu | yes | Username to access the VM. It will use `ubuntu` if not provided | -|azure_vms | ssh_key_filename | ~/.ssh/id_rsa | yes | Path to the ssh key file provided to create the VM. It will use the default path if not provided | -|azure_vms | region | |no | Location of the resource group, for example: `westeurope`, `westus2`, etc. Lithops will use the region set under the `azure` section if it is not set here | -|azure_vms | worker_processes | 2 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of the VM. | -|azure_vms | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the defeuv python3 interpreter of the VM | +|azure_vms| region | |no | Azure location for deploying the VMS. For example: `westeurope`, `westus2`, etc. Lithops will use the `region` set under the `azure` section if it is not set here| +|azure_vms | image_id | Canonical:0001-com-ubuntu-server-jammy:22_04-lts-gen2:latest |no | Image ID. ARM resource identifier | +|azure_vms | ssh_username | ubuntu |no | Username to access the VM | +|azure_vms | ssh_password | |no | Password for accessing the worker VMs. If not provided, it is created randomly| +|azure_vms | ssh_key_filename | ~/.ssh/id_rsa | no | Path to the ssh key file provided to access the VPC. It will use the default path if not provided | +|azure_vms | master_instance_type | Standard_B1s | no | Profile name for the master VM | +|azure_vms | worker_instance_type | Standard_B2s | no | Profile name for the worker VMs | +|azure_vms | delete_on_dismantle | False | no | Delete the worker VMs when they are stopped. Master VM is never deleted when stopped. `True` is NOT YET SUPPORTED | +|azure_vms | max_workers | 100 | no | Max number of workers per `FunctionExecutor()`| +|azure_vms | worker_processes | AUTO | no | Number of parallel Lithops processes in a worker. This is used to parallelize function activations within the worker. 
By default it detects the amount of CPUs in the `worker_instance_type` VM| +|azure_vms | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the default python3 interpreter of the VM | |azure_vms | auto_dismantle | True |no | If False then the VM is not stopped automatically.| |azure_vms | soft_dismantle_timeout | 300 |no| Time in seconds to stop the VM instance after a job **completed** its execution | |azure_vms | hard_dismantle_timeout | 3600 | no | Time in seconds to stop the VM instance after a job **started** its execution | +|azure_vms | exec_mode | reuse | no | One of: **consume**, **create** or **reuse**. If set to **create**, Lithops will automatically create new VMs for each map() call based on the number of elements in `iterdata`. If set to **reuse** will try to reuse running workers if exist | -## Lithops and the VM auto create and reuse modes -In the `create` mode, Lithops will automatically create new worker VM instances in runtime, scale Lithops job against generated VMs, and automatically delete the VMs when the job is completed. -Alternatively, you can set the `reuse` mode to keep running the started worker VMs, and reuse them for further executions. In the `reuse` mode, Lithops checks all the available worker VMs and start new workers only if necessary. +## Consume mode -### Lithops configuration for the create or reuse modes +In this mode, Lithops can start and stop an existing VM, and deploy an entire job to that VM. The partition logic in this scenario is different from the `create/reuse` modes, since the entire job is executed in the same VM. + +### Lithops configuration for the consume mode Edit your lithops config and add the relevant keys: @@ -73,7 +117,10 @@ Edit your lithops config and add the relevant keys: subscription_id: azure_vms: - exec_mode: reuse + exec_mode: consume + instance_name: + ssh_username: + ssh_key_filename: ``` @@ -85,27 +132,20 @@ Edit your lithops config and add the relevant keys: |azure| region | |yes | Location of the resource group, for example: `westeurope`, `westus2`, etc| |azure| subscription_id | |yes | Subscription ID from your account. Find it [here](https://portal.azure.com/#view/Microsoft_Azure_Billing/SubscriptionsBlade)| -### Azure VMs - Create and Reuse Modes +### Azure VMs - Consume Mode |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|azure_vms| region | |no | Azure location for deploying the VMS. For example: `westeurope`, `westus2`, etc. Lithops will use the `region` set under the `azure` section if it is not set here| -|azure_vms | image_id | Canonical:0001-com-ubuntu-server-jammy:22_04-lts-gen2:latest |no | Image ID. ARM resource identifier | -|azure_vms | ssh_username | ubuntu |no | Username to access the VM | -|azure_vms | ssh_password | |no | Password for accessing the worker VMs. If not provided, it is created randomly| -|azure_vms | ssh_key_filename | ~/.ssh/id_rsa | no | Path to the ssh key file provided to access the VPC. It will use the default path if not provided | -|azure_vms | master_instance_type | Standard_B1s | no | Profile name for the master VM | -|azure_vms | worker_instance_type | Standard_B2s | no | Profile name for the worker VMs | -|azure_vms | delete_on_dismantle | False | no | Delete the worker VMs when they are stopped. Master VM is never deleted when stopped. 
`True` is NOT YET SUPPORTED |
-|azure_vms | max_workers | 100 | no | Max number of workers per `FunctionExecutor()`|
-|azure_vms | worker_processes | 2 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of a worker VM. |
-|azure_vms | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the default python3 interpreter of the VM |
+|azure_vms | instance_name | | yes | Virtual server instance name. The instance must exist in your resource group |
+|azure_vms | ssh_username | ubuntu | yes | Username to access the VM. It will use `ubuntu` if not provided |
+|azure_vms | ssh_key_filename | ~/.ssh/id_rsa | yes | Path to the ssh key file provided to create the VM. It will use the default path if not provided |
+|azure_vms | region | |no | Location of the resource group, for example: `westeurope`, `westus2`, etc. Lithops will use the region set under the `azure` section if it is not set here |
+|azure_vms | worker_processes | AUTO | no | Number of parallel Lithops processes in a worker. This is used to parallelize function activations within the worker. By default it detects the number of CPUs in the VM|
+|azure_vms | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the default python3 interpreter of the VM |
|azure_vms | auto_dismantle | True |no | If False then the VM is not stopped automatically.|
|azure_vms | soft_dismantle_timeout | 300 |no| Time in seconds to stop the VM instance after a job **completed** its execution |
|azure_vms | hard_dismantle_timeout | 3600 | no | Time in seconds to stop the VM instance after a job **started** its execution |
-|azure_vms | exec_mode | consume | no | One of: **consume**, **create** or **reuse**. If set to **create**, Lithops will automatically create new VMs for each map() call based on the number of elements in iterdata. If set to **reuse** will try to reuse running workers if exist |
-|azure_vms | pull_runtime | False | no | If set to True, Lithops will execute the command `docker pull ` in each VSI before executing the a job (in case of using a docker runtime)|
-|azure_vms | workers_policy | permissive | no | One of: **permissive**, **strict**. If set to **strict** will force creation of required workers number |
+
## Test Lithops
Once you have your compute and storage backends configured, you can run a hello world function with:
@@ -122,10 +162,40 @@ You can view the function executions logs in your local machine using the *litho
lithops logs poll
```
-The master and worker VMs contain the Lithops service logs in `/tmp/lithops-root/service.log`
+## VM Management
+
+Lithops for Azure VMs follows a Master-Worker architecture (1:N).
+
+All the VMs, including the master VM, are automatically stopped after a configurable timeout (see hard/soft dismantle timeouts).
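+A minimal end-to-end check of the Azure VMs backend from Python might look as follows. This is an illustrative sketch; it assumes the `azure`, `azure_vms` and storage sections are already present in your Lithops configuration file:
+
+```python
+# Run a trivial map() on the azure_vms backend.
+# Worker VMs are created or reused, and later stopped according to the
+# auto_dismantle / soft_dismantle_timeout settings described above.
+import lithops
+
+def hello(name):
+    return f'Hello {name} from an Azure VM worker'
+
+fexec = lithops.FunctionExecutor(backend='azure_vms')
+fexec.map(hello, ['World', 'Lithops'])
+print(fexec.get_result())
+```
+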
You can login to the master VM and get a live ssh connection with: ```bash lithops attach -b azure_vms ``` + +The master and worker VMs contain the Lithops service logs in `/tmp/lithops-root/*-service.log` + +To list all the available workers in the current moment, use the next command: + +```bash +lithops worker list -b azure_vms +``` + +You can also list all the submitted jobs with: + +```bash +lithops job list -b azure_vms +``` + +You can delete all the workers with: + +```bash +lithops clean -b azure_vms -s azure_storage +``` + +You can delete all the workers including the Master VM with the `--all` flag: + +```bash +lithops clean -b azure_vms -s azure_storage --all +``` diff --git a/docs/source/compute_config/code_engine.md b/docs/source/compute_config/code_engine.md index fd927788d..e34e3542a 100644 --- a/docs/source/compute_config/code_engine.md +++ b/docs/source/compute_config/code_engine.md @@ -2,8 +2,13 @@ [IBM Code Engine](https://cloud.ibm.com/codeengine/overview) allows you to run your application, job or container on a managed serverless platform. Auto-scale workloads and only pay for the resources you consume. -IBM Code Engine exposes both Knative and Kubernetes Job Descriptor API. Lithops supports both of them. Follow IBM Code Engine documentation to get more details on the difference between those APIs. +## Installation +1. Install IBM Cloud backend dependencies: + +```bash +python3 -m pip install lithops[ibm] +``` ## Configuration @@ -13,7 +18,7 @@ IBM Code Engine exposes both Knative and Kubernetes Job Descriptor API. Lithops 3. Copy the generated IAM API key (You can only see the key the first time you create it, so make sure to copy it). -4. Naviagete to the [resource groups dashboard](https://cloud.ibm.com/account/resource-groups), and copy the desired resource group ID. +4. Navigate to the [resource groups dashboard](https://cloud.ibm.com/account/resource-groups), and copy the desired resource group ID. 5. Edit your lithops config and add the following keys: @@ -27,6 +32,37 @@ IBM Code Engine exposes both Knative and Kubernetes Job Descriptor API. Lithops resource_group_id: ``` + +## Summary of configuration keys for IBM Cloud: + +### IBM IAM: + +|Group|Key|Default|Mandatory|Additional info| +|---|---|---|---|---| +|ibm | iam_api_key | |yes | IBM Cloud IAM API key to authenticate against IBM services. Obtain the key [here](https://cloud.ibm.com/iam/apikeys) | +|ibm | region | |yes | IBM Region. One of: `eu-gb`, `eu-de`, `us-south`, `us-east`, `br-sao`, `ca-tor`, `jp-tok`, `jp-osa`, `au-syd` | +|ibm | resource_group_id | | yes | Resource group id from your IBM Cloud account. Get it from [here](https://cloud.ibm.com/account/resource-groups) | + +## Code Engine: + +|Group|Key|Default|Mandatory|Additional info| +|---|---|---|---|---| +|code_engine | project_name | |no | Project name that already exists in Code Engine. If not provided lithops will automatically create a new project| +|code_engine | namespace | |no | Alternatively to `project_name`, you can provide `namespace`. Get it from you code engine k8s config file.| +|code_engine | region | | no | Cluster region. One of: `eu-gb`, `eu-de`, `us-south`, `us-east`, `br-sao`, `ca-tor`, `jp-tok`, `jp-osa`, `au-syd`. Lithops will use the `region` set under the `ibm` section if it is not set here | +|code_engine | docker_server | docker.io |no | Container registry URL | +|code_engine | docker_user | |no | Container registry user name | +|code_engine | docker_password | |no | Container registry password/token. 
In case of Docker hub, login to your docker hub account and generate a new access token [here](https://hub.docker.com/settings/security)| +|code_engine | max_workers | 1000 | no | Max number of workers per `FunctionExecutor()`| +|code_engine | worker_processes | 1 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of the container. | +|code_engine | runtime | |no | Docker image name.| +|code_engine | runtime_cpu | 0.125 |no | CPU limit. Default 0.125vCPU. See [valid combinations](https://cloud.ibm.com/docs/codeengine?topic=codeengine-mem-cpu-combo) | +|code_engine | runtime_memory | 256 |no | Memory limit in MB. Default 256Mi. See [valid combinations](https://cloud.ibm.com/docs/codeengine?topic=codeengine-mem-cpu-combo) | +|code_engine | runtime_timeout | 600 |no | Runtime timeout in seconds. Default 600 seconds | +|code_engine | connection_retries | |no | If specified, number of job invoke retries in case of connection failure with error code 500 | +|code_engine | runtime_include_function | False | no | If set to true, Lithops will automatically build a new runtime, including the function's code, instead of transferring it through the storage backend at invocation time. This is useful when the function's code size is large (in the order of 10s of MB) and the code does not change frequently | + + ## Runtime ### Use your own runtime @@ -54,7 +90,7 @@ code_engine: .... docker_server : docker.io docker_user : - docker_password : + docker_password : ``` #### Configure IBM Container Registry @@ -66,43 +102,10 @@ code_engine: docker_server : us.icr.io # Change-me if you have the CR in another region docker_user : iamapikey docker_password : + docker_namespace : # namespace name from https://cloud.ibm.com/registry/namespaces ``` -## Summary of configuration keys for IBM Cloud: - -### IBM IAM: - -|Group|Key|Default|Mandatory|Additional info| -|---|---|---|---|---| -|ibm | iam_api_key | |yes | IBM Cloud IAM API key to authenticate against IBM services. Obtain the key [here](https://cloud.ibm.com/iam/apikeys) | -|ibm | region | |yes | IBM Region. One of: `eu-gb`, `eu-de`, `us-south`, `us-east`, `br-sao`, `ca-tor`, `jp-tok`, `jp-osa`, `au-syd` | -|ibm | resource_group_id | | yes | Resource group id from your IBM Cloud account. Get it from [here](https://cloud.ibm.com/account/resource-groups) | - -## Code Engine: - -|Group|Key|Default|Mandatory|Additional info| -|---|---|---|---|---| -|code_engine | project_name | |no | Project name that already exists in Code Engine. If not provided lithops will automatically create a new project| -|code_engine | namespace | |no | Alternatively to `project_name`, you can provide `namespace`. Get it from you code engine k8s config file.| -|code_engine | region | | no | Cluster region. One of: `eu-gb`, `eu-de`, `us-south`, `us-east`, `br-sao`, `ca-tor`, `jp-tok`, `jp-osa`, `au-syd`. Lithops will use the `region` set under the `ibm` section if it is not set here | -|code_engine | docker_server | docker.io |no | Container registry URL | -|code_engine | docker_user | |no | Container registry user name | -|code_engine | docker_password | |no | Container registry password/token. 
In case of Docker hub, login to your docker hub account and generate a new access token [here](https://hub.docker.com/settings/security)| -|code_engine | max_workers | 1000 | no | Max number of workers per `FunctionExecutor()`| -|code_engine | worker_processes | 1 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of the container. | -|code_engine | runtime | |no | Docker image name.| -|code_engine | runtime_cpu | 0.125 |no | CPU limit. Default 0.125vCPU. See [valid combinations](https://cloud.ibm.com/docs/codeengine?topic=codeengine-mem-cpu-combo) | -|code_engine | runtime_memory | 256 |no | Memory limit in MB. Default 256Mi. See [valid combinations](https://cloud.ibm.com/docs/codeengine?topic=codeengine-mem-cpu-combo) | -|code_engine | runtime_timeout | 600 |no | Runtime timeout in seconds. Default 600 seconds | -|code_engine | connection_retries | |no | If specified, number of job invoke retries in case of connection failure with error code 500 | - - -## Lithops using Knative API of Code Engine - -The preferable way to run Lithops in Code Engine is by using the JOB API. However, Lithops can be also executed in Code Engine using the Knative API. To configure this mode of execution refer to the [Knative documentation](https://github.com/lithops-cloud/lithops/blob/master/config/compute/knative.md#configuration) and follow the steps to configure Knative. - - ## Test Lithops Once you have your compute and storage backends configured, you can run a hello world function with: @@ -118,4 +121,3 @@ You can view the function executions logs in your local machine using the *litho ```bash lithops logs poll ``` - diff --git a/docs/source/compute_config/gcp_cloudrun.md b/docs/source/compute_config/gcp_cloudrun.md index 9777ad095..a36454356 100644 --- a/docs/source/compute_config/gcp_cloudrun.md +++ b/docs/source/compute_config/gcp_cloudrun.md @@ -7,7 +7,7 @@ Lithops with *GCP Cloud Run* as serverless compute backend. 1. Install Google Cloud Platform backend dependencies: ```bash -python3 -m install lithops[gcp] +python3 -m pip install lithops[gcp] ``` 2. [Login](https://console.cloud.google.com) to Google Cloud Console (or sign up if you don't have an account). @@ -29,6 +29,8 @@ python3 -m install lithops[gcp] 9. Enable the **Cloud Run API** : Navigate to *APIs & services* tab on the menu. Click *ENABLE APIS AND SERVICES*. Look for "Cloud Run API" at the search bar. Click *Enable*. +10. Enable the **Artifact Registry API**: Navigate to *APIs & services* tab on the menu. Click *ENABLE APIS AND SERVICES*. Look for "Artifact Registry API" at the search bar. Click *Enable*. + ## Configuration 1. Edit your lithops config and add the following keys: @@ -64,7 +66,7 @@ python3 -m install lithops[gcp] |gcp_cloudrun | runtime_timeout | 300 |no | Runtime timeout in seconds. Default 5 minutes | |gcp_cloudrun | trigger | https | no | Currently it supports 'https' trigger| |gcp_cloudrun | invoke_pool_threads | 100 |no | Number of concurrent threads used for invocation | - +|gcp_cloudrun | runtime_include_function | False | no | If set to true, Lithops will automatically build a new runtime, including the function's code, instead of transferring it through the storage backend at invocation time. 
This is useful when the function's code size is large (in the order of 10s of MB) and the code does not change frequently | ## Test Lithops Once you have your compute and storage backends configured, you can run a hello world function with: diff --git a/docs/source/compute_config/gcp_functions.md b/docs/source/compute_config/gcp_functions.md index a3a22b2d9..e1cc082e3 100644 --- a/docs/source/compute_config/gcp_functions.md +++ b/docs/source/compute_config/gcp_functions.md @@ -7,7 +7,7 @@ Lithops with *GCP Functions* as serverless compute backend. 1. Install Google Cloud Platform backend dependencies: ```bash -python3 -m install lithops[gcp] +python3 -m pip install lithops[gcp] ``` 2. [Login](https://console.cloud.google.com) to Google Cloud Console (or sign up if you don't have an account). @@ -30,6 +30,8 @@ python3 -m install lithops[gcp] 9. Enable the **Cloud Functions API** : Navigate to *APIs & services* tab on the menu. Click *ENABLE APIS AND SERVICES*. Look for "Cloud Functions API" at the search bar. Click *Enable*. +10. Enable the **Artifact Registry API**: Navigate to *APIs & services* tab on the menu. Click *ENABLE APIS AND SERVICES*. Look for "Artifact Registry API" at the search bar. Click *Enable*. + ## Configuration 1. Edit your lithops config and add the following keys: diff --git a/docs/source/compute_config/ibm_cf.md b/docs/source/compute_config/ibm_cf.md index 5dc906d3f..60f58646d 100644 --- a/docs/source/compute_config/ibm_cf.md +++ b/docs/source/compute_config/ibm_cf.md @@ -2,6 +2,16 @@ Lithops with *IBM Cloud Functions* as compute backend. +**Note**: This backend is deprecated. See the [deprecation overview](https://cloud.ibm.com/docs/openwhisk?topic=openwhisk-dep-overview) + +## Installation + +1. Install IBM Cloud backend dependencies: + +```bash +python3 -m pip install lithops[ibm] +``` + ## Configuration 1. Login to IBM Cloud and open up your [dashboard](https://cloud.ibm.com/). @@ -12,7 +22,7 @@ Lithops with *IBM Cloud Functions* as compute backend. 4. Copy the generated IAM API key (You can only see the key the first time you create it, so make sure to copy it). -5. Naviagete to the [resource groups dashboard](https://cloud.ibm.com/account/resource-groups), and copy the desired resource group ID. +5. Navigate to the [resource groups dashboard](https://cloud.ibm.com/account/resource-groups), and copy the desired resource group ID. 5. Edit your lithops config and add the following keys: @@ -40,8 +50,8 @@ Lithops with *IBM Cloud Functions* as compute backend. |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|ibm_cf| namespace | |no | Value of CURRENT NAMESPACE from [here](https://cloud.ibm.com/functions/namespace-settings). Provide it if you want to use an existsing `namespace`. Lithops will automatically create a new namespace if not provided.| -|ibm_cf| namespace_id | |no | Value of 'GUID' from [here](https://cloud.ibm.com/functions/namespace-settings). Provide it if you want to use an existsing `namespace`. Provide it along with `namespace`.| +|ibm_cf| namespace | |no | Value of CURRENT NAMESPACE from [here](https://cloud.ibm.com/functions/namespace-settings). Provide it if you want to use an existing `namespace`. Lithops will automatically create a new namespace if not provided.| +|ibm_cf| namespace_id | |no | Value of 'GUID' from [here](https://cloud.ibm.com/functions/namespace-settings). Provide it if you want to use an existing `namespace`. Provide it along with `namespace`.| |ibm_cf | region | |no | Service region. 
One of: `jp-tok`, `au-syd`, `eu-gb`, `eu-de`, `us-south`, `us-east`. Lithops will use the `region` set under the `ibm` section if it is not set here |
 |ibm_cf| endpoint | |no | IBM Cloud Functions endpoint (if region not provided). Make sure to use https:// prefix, for example: https://us-east.functions.cloud.ibm.com |
 |ibm_cf | max_workers | 1200 | no | Max number of workers per `FunctionExecutor()`|
@@ -51,6 +61,8 @@ Lithops with *IBM Cloud Functions* as compute backend.
 |ibm_cf | runtime_timeout | 600 |no | Runtime timeout in seconds. Default 600 seconds |
 |ibm_cf | invoke_pool_threads | 500 |no | Number of concurrent threads used for invocation |
 |ibm_cf | remote_invoker | False | no | Activate the remote invoker feature that uses one cloud function to spawn all the actual `map()` activations |
+|ibm_cf | runtime_include_function | False | no | If set to true, Lithops will automatically build a new runtime, including the function's code, instead of transferring it through the storage backend at invocation time. This is useful when the function's code size is large (in the order of 10s of MB) and the code does not change frequently |
+
 
 ## Test Lithops
diff --git a/docs/source/compute_config/ibm_vpc.md b/docs/source/compute_config/ibm_vpc.md
index 8ac8aa614..0930f8f34 100644
--- a/docs/source/compute_config/ibm_vpc.md
+++ b/docs/source/compute_config/ibm_vpc.md
@@ -1,21 +1,30 @@
 # IBM Virtual Private Cloud
 
-The IBM VPC client of Lithops can provide a truely serverless user experience on top of IBM VPC where Lithops creates new VSIs (Virtual Server Instance) dynamically in runtime, and scale Lithops jobs against them. Alternatively Lithops can start and stop an existing VSI instances.
+The IBM VPC client of Lithops can provide a truly serverless user experience on top of IBM VPC, where Lithops dynamically creates new VSIs (Virtual Server Instances) at runtime and scales Lithops jobs against them (Create & Reuse modes). Alternatively, Lithops can start and stop an existing VSI instance (Consume mode).
+
+## Installation
+
+1. Install IBM Cloud backend dependencies:
+
+```bash
+python3 -m pip install lithops[ibm]
+```
 
 ## IBM VPC
 
 The assumption that you already familiar with IBM Cloud, have your IBM IAM API key created (you can create new keys [here](https://cloud.ibm.com/iam/apikeys)), have valid IBM COS account, region and resource group.
 
-Follow [IBM VPC setup](https://cloud.ibm.com/vpc-ext/overview) if you need to create IBM Virtual Private Cloud. Decide the region for your VPC. The best practice is to use the same region both for VPC and IBM COS, hoewever there is no requirement to keep them in the same region.
+Follow [IBM VPC setup](https://cloud.ibm.com/vpc-ext/overview) if you need to create an IBM Virtual Private Cloud. Decide the region for your VPC. The best practice is to use the same region both for VPC and IBM COS, however there is no requirement to keep them in the same region.
 
 ## Choose an operating system image for VSI
 
-Any Virtual Service Instance (VSI) need to define the instance’s operating system and version. Lithops support both standard Ubuntu operting system choices provided by the VPC and using pre-defined custom images that already contains all dependencies required by Lithops.
+Any Virtual Server Instance (VSI) needs to define the instance’s operating system and version. Lithops supports both the standard Ubuntu operating system images provided by the VPC and pre-defined custom images that already contain all the dependencies required by Lithops.
- Option 1: Lithops is compatible with any Ubuntu 22.04 image provided in IBM Cloud. In this case, no further action is required and you can continue to the next step. Lithops will install all required dependencies in the VSI by itself. Notice this can consume about 3 min to complete all installations. - Option 2: Alternatively, you can use a pre-built custom image (based on Ubuntu) that will greatly improve VSI creation time for Lithops jobs. To benefit from this approach, navigate to [runtime/ibm_vpc](https://github.com/lithops-cloud/lithops/tree/master/runtime/ibm_vpc), and follow the instructions. -## Lithops and the VM auto create|reuse mode +## Create and reuse modes + In the `create` mode, Lithops will automatically create new worker VM instances in runtime, scale Lithops job against generated VMs, and automatically delete the VMs when the job is completed. Alternatively, you can set the `reuse` mode to keep running the started worker VMs, and reuse them for further executions. In the `reuse` mode, Lithops checks all the available worker VMs and start new workers only if necessary. @@ -56,6 +65,7 @@ ibm_vpc: docker_server : us.icr.io # Change-me if you have the CR in another region docker_user : iamapikey docker_password : + docker_namespace : # namespace name from https://cloud.ibm.com/registry/namespaces ``` @@ -73,36 +83,34 @@ ibm_vpc: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|ibm_vpc | region | |no | VPC Region. For example `us-south`. Choose one region from [here](https://cloud.ibm.com/docs/vpc?topic=vpc-service-endpoints-for-vpc). Lithops will use the `region` set under the `ibm` section if it is not set here | +|ibm_vpc | region | |no | VPC Region. For example `us-south`. Choose one region from [here](https://cloud.ibm.com/docs/vpc?topic=vpc-service-endpoints-for-vpc). Lithops will use the `region` set under the `ibm` section if it is not set here. Alternatively you can specify a Zone, for example: `eu-gb-2` | |ibm_vpc | vpc_id | | no | VPC id of an existing VPC. Get it from [here](https://cloud.ibm.com/vpc-ext/network/vpcs) | |ibm_vpc | vpc_name | | no | VPC name of an existing VPC (if `vpc_id` is not provided) | |ibm_vpc | security_group_id | | no | Security group id of an existing VPC. Get it from [here](https://cloud.ibm.com/vpc-ext/network/securityGroups)| |ibm_vpc | subnet_id | | no | Subnet id of an existing VPC. Get it from [here](https://cloud.ibm.com/vpc-ext/network/subnets)| |ibm_vpc | ssh_key_id | | no | SSH public key id. Get it from [here](https://cloud.ibm.com/vpc-ext/compute/sshKeys)| |ibm_vpc | gateway_id | | no | Gateway id. Get it from [here](https://cloud.ibm.com/vpc-ext/network/publicGateways)| -|ibm_vpc | image_name | lithops-worker-default | no | Virtual machine image name | -|ibm_vpc | image_id | | no | Virtual machine image id | +|ibm_vpc | image_id | | no | Virtual machine image id. Default is Ubuntu Server 22.04 | +|ibm_vpc | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the default python3 interpreter of the VM | |ibm_vpc | ssh_username | root |no | Username to access the VM | |ibm_vpc | ssh_password | |no | Password for accessing the worker VMs. If not provided, it is created randomly| |ibm_vpc | ssh_key_filename | ~/.ssh/id_rsa | no | Path to the ssh key file provided to access the VPC. 
It will use the default path if not provided | |ibm_vpc | boot_volume_profile | general-purpose | no | Virtual machine boot volume profile | -|ibm_vpc | boot_volume_capacity | 100 | no | Virtual machine boot volume capacity in GB. Set it to 10 if using a custom image. | +|ibm_vpc | boot_volume_capacity | 100 | no | Virtual machine boot volume capacity in GB. | |ibm_vpc | worker_profile_name | cx2-2x4 | no | Profile name for the worker VMs | |ibm_vpc | master_profile_name | cx2-2x4 | no | Profile name for the master VM | -|ibm_vpc | delete_on_dismantle | True | no | Delete the worekr VMs when they are stopped | +|ibm_vpc | verify_resources | True | no | Verify the resources that are stored in the local cache, and expected to be already created (VPC, subnet, floating IP, etc.), exist every time a `FunctionExecutor()` is created | +|ibm_vpc | delete_on_dismantle | True | no | Delete the worker VMs when they are stopped | |ibm_vpc | max_workers | 100 | no | Max number of workers per `FunctionExecutor()`| -|ibm_vpc | worker_processes | 2 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of a worker VM. | -|ibm_vpc | singlesocket | False | no | Try to allocate workers with single socket CPU. If eventually running on multiple socket, a warning message printed to user. Is **True** standalone **workers_policy** must be set to **strict** to trace workers states| -|ibm_vpc | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the default python3 interpreter of the VM | +|ibm_vpc | worker_processes | AUTO | no | Number of Lithops processes within a given worker. This is used to parallelize function activations within a worker. By default it detects the amount of CPUs in the worker VM| |ibm_vpc | auto_dismantle | True |no | If False then the VM is not stopped automatically.| |ibm_vpc | soft_dismantle_timeout | 300 |no| Time in seconds to stop the VM instance after a job **completed** its execution | |ibm_vpc | hard_dismantle_timeout | 3600 | no | Time in seconds to stop the VM instance after a job **started** its execution | -|ibm_vpc | exec_mode | consume | no | One of: **consume**, **create** or **reuse**. If set to **create**, Lithops will automatically create new VMs for each map() call based on the number of elements in iterdata. If set to **reuse** will try to reuse running workers if exist | -|ibm_vpc | pull_runtime | False | no | If set to True, Lithops will execute the command `docker pull ` in each VSI before executing the a job (in case of using a docker runtime)| -|ibm_vpc | workers_policy | permissive | no | One of: **permissive**, **strict**. If set to **strict** will force creation of required workers number | -|ibm_vpc | gpu | False | no | If True docker started with gpu support. Requires host to have neccessary hardware and software preconfigured and docker image runtime with gpu support specified | +|ibm_vpc | exec_mode | reuse | no | One of: **consume**, **create** or **reuse**. If set to **create**, Lithops will automatically create new VMs for each map() call based on the number of elements in iterdata. If set to **reuse** will try to reuse running workers if exist | +|ibm_vpc | singlesocket | False | no | Try to allocate workers with single socket CPU. If eventually running on multiple socket, a warning message printed to user. 
Is **True** standalone **workers_policy** must be set to **strict** to trace workers states| +|ibm_vpc | gpu | False | no | If `True` docker started with gpu support. Requires host to have necessary hardware and software pre-configured, and docker image runtime with gpu support specified | -## Lithops and the VSI consume mode +## Consume mode In this mode, Lithops can start and stop an existing VM, and deploy an entire job to that VM. The partition logic in this scenario is different from the `create/reuse` modes, since the entire job is executed in the same VM. @@ -110,18 +118,19 @@ In this mode, Lithops can start and stop an existing VM, and deploy an entire jo Edit your lithops config and add the relevant keys: - ```yaml - lithops: - backend: ibm_vpc +```yaml +lithops: + backend: ibm_vpc - ibm: - iam_api_key: +ibm: + iam_api_key: - ibm_vpc: - region : - instance_id : - floating_ip : - ``` +ibm_vpc: + exec_mode: consume + region : + instance_id : + floating_ip : +``` If you need to create new VM, then follow the steps to create and update Lithops configuration: @@ -133,7 +142,7 @@ If you need to create new VM, then follow the steps to create and update Lithops ## Summary of configuration keys for IBM Cloud: -### IBM IAM: +### IBM: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| @@ -146,15 +155,14 @@ If you need to create new VM, then follow the steps to create and update Lithops |---|---|---|---|---| |ibm_vpc | region | |yes | VPC Region. For example `us-south`. Choose one region from [here](https://cloud.ibm.com/docs/vpc?topic=vpc-service-endpoints-for-vpc). Lithops will use the region set under the `ibm` section if it is not set here | |ibm_vpc | instance_id | | yes | virtual server instance ID | -|ibm_vpc | floating_ip | | yes | Floatting IP address atached to your VM instance| +|ibm_vpc | floating_ip | | yes | Floating IP address attached to your VM instance| |ibm_vpc | ssh_username | root |no | Username to access the VM | |ibm_vpc | ssh_key_filename | ~/.ssh/id_rsa | no | Path to the ssh key file provided to create the VM. It will use the default path if not provided | -|ibm_vpc | worker_processes | 2 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of the VM. | -|ibm_vpc | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. If not set Lithops will use the defeuv python3 interpreter of the VM | +|ibm_vpc | worker_processes | AUTO | no | Number of Lithops processes within a given worker. This is used to parallelize function activations within the worker. By default it detects the amount of CPUs in the VM| +|ibm_vpc | runtime | python3 | no | Runtime name to run the functions. Can be a container image name. 
If not set, Lithops will use the default `python3` interpreter of the VM |
 |ibm_vpc | auto_dismantle | True |no | If False then the VM is not stopped automatically.|
 |ibm_vpc | soft_dismantle_timeout | 300 |no| Time in seconds to stop the VM instance after a job **completed** its execution |
 |ibm_vpc | hard_dismantle_timeout | 3600 | no | Time in seconds to stop the VM instance after a job **started** its execution |
-|ibm_vpc | pull_runtime | False | no | If set to True, Lithops will execute the command `docker pull ` in each VSI before executing the a job (in case of using a docker runtime)|
 
 ## Test Lithops
@@ -173,10 +181,40 @@ You can view the function executions logs in your local machine using the *litho
 lithops logs poll
 ```
 
-The master and worker VMs contain the Lithops service logs in `/tmp/lithops-root/service.log`
+## VM Management
+
+Lithops for IBM VPC follows a Master-Worker architecture (1:N).
+
+All the VMs, including the master VM, are automatically stopped after a configurable timeout (see hard/soft dismantle timeouts).
 
 You can login to the master VM and get a live ssh connection with:
 
 ```bash
 lithops attach -b ibm_vpc
 ```
+
+The master and worker VMs contain the Lithops service logs in `/tmp/lithops-root/*-service.log`
+
+To list all the workers available at any given moment, use the following command:
+
+```bash
+lithops worker list -b ibm_vpc
+```
+
+You can also list all the submitted jobs with:
+
+```bash
+lithops job list -b ibm_vpc
+```
+
+You can delete all the workers with:
+
+```bash
+lithops clean -b ibm_vpc -s ibm_cos
+```
+
+You can delete all the workers including the Master VM with the `--all` flag:
+
+```bash
+lithops clean -b ibm_vpc -s ibm_cos --all
+```
diff --git a/docs/source/compute_config/knative.md b/docs/source/compute_config/knative.md
index 315fef72b..51f51a4b9 100644
--- a/docs/source/compute_config/knative.md
+++ b/docs/source/compute_config/knative.md
@@ -6,51 +6,57 @@ Lithops with *Knative* as serverless compute backend. Lithops also supports vani
 
 Note that Lithops automatically builds the default runtime the first time you run a script. For this task it uses the **docker** command installed locally in your machine.
 
-1. [Install the Docker CE version](https://docs.docker.com/get-docker/).
+1. Install Knative backend dependencies:
 
-2. Login to your docker account:
+```bash
+python3 -m pip install lithops[knative]
+```
+
+2. [Install the Docker CE version](https://docs.docker.com/get-docker/).
+
+3. Login to your docker account:
 
 ```bash
 docker login
 ```
 
-3. Choose one of these 3 installation options:
+4. Choose one of these 3 installation options:
 
 ### Option 1 - Minikube:
 
-4. Start minikube with the 'ingress' addon:
+5. Start minikube with the 'ingress' addon:
 
 ```bash
 minikube start --addons=ingress
 ```
 
-5. [Follow this instructions to install knative serving.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/)
+6. [Follow these instructions to install knative serving.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/)
 
-6. Install a networking layer. Currently Lithops supports **Kourier**. [Follow these instructions to install Kourier.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/#install-a-networking-layer)
+7. Install a networking layer. Currently Lithops supports **Kourier**. [Follow these instructions to install Kourier.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/#install-a-networking-layer)
 
-7.
Edit your lithops config and add: +8. Edit your lithops config and add: ```yaml knative: ingress_endpoint : http://127.0.0.1:80 ``` -8. On a separate terminal, keep running: +9. On a separate terminal, keep running: ```bash minikube tunnel ``` ### Option 2 - IBM IKS: -4. Access to the [IBM dashboard](https://cloud.ibm.com/kubernetes/landing) and create a new Kubernetes cluster. +5. Access to the [IBM dashboard](https://cloud.ibm.com/kubernetes/landing) and create a new Kubernetes cluster. -5. Once the cluster is running, follow the instructions of the "Actions"--> "Connect via CLI" option of the dashboard to configure the *kubectl* client in your local machine. +6. Once the cluster is running, follow the instructions of the "Actions"--> "Connect via CLI" option of the dashboard to configure the *kubectl* client in your local machine. -6. [Follow this instructions to install knative serving.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/) +7. [Follow this instructions to install knative serving.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/) -7. Install a networking layer. Currently Lithops supports **Kourier**. [Follow these instructions to install Kourier.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/#install-a-networking-layer) +8. Install a networking layer. Currently Lithops supports **Kourier**. [Follow these instructions to install Kourier.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/#install-a-networking-layer) ### Option 3 - IBM IKS or any other Kubernetes Cluster: -4. Install Kubernetes >= v1.16 and make sure the *kubectl* client is running. +5. Install Kubernetes >= v1.16 and make sure the *kubectl* client is running. 6. [Follow this instructions to install knative serving.](https://knative.dev/docs/install/yaml-install/serving/install-serving-with-yaml/) @@ -92,6 +98,7 @@ knative: docker_server : us.icr.io docker_user : iamapikey docker_password : + docker_namespace : # namespace name from https://cloud.ibm.com/registry/namespaces ``` ## Summary of configuration keys for Knative: diff --git a/docs/source/compute_config/kubernetes.md b/docs/source/compute_config/kubernetes.md index 8c158b804..796f67d89 100644 --- a/docs/source/compute_config/kubernetes.md +++ b/docs/source/compute_config/kubernetes.md @@ -2,6 +2,13 @@ Lithops with kubernetes as serverless compute backend. +## Installation + +1. Install kubernetes backend dependencies: + +```bash +python3 -m pip install lithops[kubernetes] +``` ## Configuration @@ -36,7 +43,7 @@ k8s: .... docker_server : docker.io docker_user : - docker_password : + docker_password : ``` ### Configure IBM Container Registry @@ -48,6 +55,7 @@ k8s: docker_server : us.icr.io docker_user : iamapikey docker_password : + docker_namespace : # namespace name from https://cloud.ibm.com/registry/namespaces ``` ## Summary of configuration keys for kubernetes: @@ -60,12 +68,14 @@ k8s: |k8s | docker_server | docker.io |no | Container registry URL | |k8s | docker_user | |no | Container registry user name | |k8s | docker_password | |no | Container registry password/token. In case of Docker hub, login to your docker hub account and generate a new access token [here](https://hub.docker.com/settings/security)| +|k8s | rabbitmq_executor | False | no | Alternative K8s backend accelerating parallel function execution (map) thanks to rabbitmq group calls and warm-state pods of higher granularity. 
For more information [here](./kubernetes_rabbitmq.md).| |k8s | max_workers | 100 | no | Max number of workers per `FunctionExecutor()`| |k8s | worker_processes | 1 | no | Number of Lithops processes within a given worker. This can be used to parallelize function activations within a worker. It is recommendable to set this value to the same number of CPUs of the container. | |k8s | runtime | |no | Docker image name.| |k8s | runtime_cpu | 1 |no | CPU limit. Default 1vCPU | |k8s | runtime_memory | 512 |no | Memory limit in MB. Default 512MB | |k8s | runtime_timeout | 600 |no | Runtime timeout in seconds. Default 600 seconds | +|k8s | master_timeout | 600 |no | Master pod timeout in seconds. Default 600 seconds | ## Test Lithops diff --git a/docs/source/compute_config/kubernetes_rabbitmq.md b/docs/source/compute_config/kubernetes_rabbitmq.md new file mode 100644 index 000000000..33db5533c --- /dev/null +++ b/docs/source/compute_config/kubernetes_rabbitmq.md @@ -0,0 +1,131 @@ +# Kubernetes RabbitMQ (batch/job) + +**Lithops for Kubernetes RabbitMQ** is an **experimental k8s backend** designed to leverage the capabilities of RabbitMQ for more efficient execution of **group invocations**. It introduces the use of pods for **warm starts**, optimizes resource allocation, and offers various enhancements. + +All of these changes are **ideal** for pipelines where launching **hundreds of parallel tasks as quickly as possible** is a critical requirement, in a fixed size heterogeneous cluster. + +## Changes of K8s RabbitMQ + +* **Utilization of RabbitMQ:** Within this architecture, RabbitMQ is employed to launch group invocations in a single call, avoiding the need for multiple calls for each function execution. Additionally, it enables data exchange between the client and running pods, bypassing the Storage Backend as an intermediary, which is slower. This accelerates and streamlines communication significantly. + +* **Warm Start Capability:** Unlike K8s, Lithops K8s RabbitMQ introduces the ability to perform warm starts on the workers pods. This means that previous workers pods still listening to RabbitMQ to launch a new task, further reducing invocation time to almost 0. + +* **Improved Invocation Time:** Lithops K8s RabbitMQ offers an **up x4** significant enhancement in cold start time, effectively reducing the delay before your functions start executing. + +* **Resource Utilization:** In this backend, CPU assignment is employed by allocating identifiers to each CPU. This approach facilitates more effective resource management within our cluster, enabling the creation of pods that by default match the entire capacity of a machine (worker_processes) and allowing the launch of precisely the number of tasks that can run concurrently. + +## Installation + +1. Install kubernetes backend dependencies: + +```bash +python3 -m pip install lithops[kubernetes] +``` + +## Configuration + +1. Edit your Lithops config and add the following keys: + +```yaml + lithops: + backend : k8s + + k8s: + .... + docker_server : docker.io + docker_user : + docker_password : + .... + rabbitmq_executor : True +``` + +2. Make sure you have a kubernetes cluster configuration file. 
+ - Option 1: You have the config file in `~/.kube/config` + + - Option 2: You have the config file in another location, and you exported the KUBECONFIG variable: + ```bash + export KUBECONFIG= + ``` + + - Option 3: You have the config file in another location, and you set the `kubecfg_path` var in the Lithops config: + ```yaml + k8s: + kubecfg_path: + ``` + + 3. For this version, a connection to [rabbitMQ](../monitoring.rst) is required. + To enable Lithops to use this service, add the AMQP_URL key into the rabbitmq section in the configuration, for example: + ```yaml + rabbitmq: + amqp_url: # amqp:// + ``` + In addition, you need to activate the monitoring service in the configuration (Lithops section): + + ```yaml + lithops: + monitoring: rabbitmq + ``` + +## Comparison + +In these graphs, we will compare the usage of the original K8s architecture with the RabbitMQ implementation. + +All of these tests consist of running 225 functions on a 2-node cluster, each with 128 CPUs. The executed function involves a 5-second sleep. + +In this scenario, it is evident that the invocation time is consistently reduced by a factor of **up to 5x** on cold start and **up to 7x** on warm start. This represents a significant enhancement for parallel function execution. + +- Plot 1: Kubernetes K8s original. + +*Elapsed time = 16,9 sec.* + +![Kubernetes K8s original plot](../images/plots_kubernetes/k8s_original_histogram.png) + +- Plot 2: Kubernetes K8s original with master on Warm Start. + +*Elapsed time = 8,1 sec.* + +![Kubernetes K8s original with Warm Start plot](../images/plots_kubernetes/k8s_original_warm_start_histogram.png) + +- Plot 3: Kubernetes K8s RabbitMQ. + +*Elapsed time = 8 sec.* + +![Kubernetes K8s RabbitMQ plot](../images/plots_kubernetes/rabbitmq_histogram.png) + +- Plot 4: Kubernetes K8s RabbitMQ with workers on Warm Start. + +*Elapsed time = 5,9 sec.* + +![Kubernetes K8s RabbitMQ with Warm Start plot](../images/plots_kubernetes/rabbitmq_warm_start_histogram.png) + +## Summary of configuration keys for kubernetes: + +|Group|Key|Default|Mandatory|Additional info| +|---|---|---|---|---| +|k8s | kubecfg_path | |no | Path to kubecfg file. Mandatory if config file not in `~/.kube/config` or KUBECONFIG env var not present| +|k8s | kubecfg_context | |no | kubernetes context to use from your kubeconfig file. It will use the default active context if not provided | +|k8s | namespace | default |no | Kubernetes namespace to use for lithops execution | +|k8s | docker_server | docker.io |no | Container registry URL | +|k8s | docker_user | |no | Container registry user name | +|k8s | docker_password | |no | Container registry password/token. In case of Docker hub, login to your docker hub account and generate a new access token [here](https://hub.docker.com/settings/security)| +|k8s | rabbitmq_executor | False | yes | Alternative K8s backend accelerating parallel function execution (map) thanks to rabbitmq group calls and warm-state pods of higher granularity.| +|k8s | worker_processes | |no | CPUs per pod. This enables pod granularity. Default gets all CPUs of the nodes. | +|k8s | runtime | |no | Docker image name.| +|k8s | runtime_cpu | 1 |no | CPU limit. Default 1vCPU | +|k8s | runtime_memory | 512 |no | Memory limit in MB per pod. 
Default 512MB | + +## Test Lithops + +Once you have your compute and storage backends configured, you can run a hello world function with: + +```bash +lithops hello -b k8s -s ibm_cos +``` + +## Viewing the execution logs + +You can view the function executions logs in your local machine using the *lithops client*: + +```bash +lithops logs poll +``` \ No newline at end of file diff --git a/docs/source/compute_config/localhost.md b/docs/source/compute_config/localhost.md index a6fc9a511..6d0f4abe3 100644 --- a/docs/source/compute_config/localhost.md +++ b/docs/source/compute_config/localhost.md @@ -14,7 +14,7 @@ lithops: ## Execution Environments -The localhost backend can run functions both using the local ``python3`` interpreter, or using a ``docker container`` image. The environment is automatically chosen depending on whether or not you provided a Docker image as a runtime. +The localhost backend can run functions both using the local ``python3`` interpreter, or using a ``container`` image. The environment is automatically chosen depending on whether or not you provided a Docker image as a runtime. In both cases, you can view the executions logs in your local machine using the *lithops client*: @@ -24,7 +24,7 @@ lithops logs poll ### Default Environment -By default Lithops uses the local python interpreter to run the functions. That is, if for example you executed the main script with ``python3.8``, your functions will run with ``python3.8``. in this case, you must ensure that all the dependencies of your script are installed in your machine. +By default Lithops uses the local python interpreter to run the functions. That is, if for example you executed the main script with ``python3.12``, your functions will run with ``python3.12``. in this case, you must ensure that all the dependencies of your script are installed in your machine. ```python # As we use the default FunctionExecutor(), backend must be set to localhost in config @@ -38,25 +38,25 @@ or alternatively, you can force the Localhost executor with: fexec = lithops.LocalhostExecutor() ``` -### Docker Environment +### Container Environment -The Docker environment runs the functions within a ``docker container``. In this case you must [install the Docker CE version](https://docs.docker.com/get-docker/) in your machine. This environment is automatically activated when you provide a docker image as a runtime. For example, by adding the following keys in the config: +The Container environment runs the functions within a ``docker container``. In this case you must [install the Docker CE version](https://docs.docker.com/get-docker/) in your machine. This environment is automatically activated when you provide a docker image as a runtime. 
For example, by adding the following keys in the config: ```yaml localhost: - runtime: ibmfunctions/action-python-v3.8 + runtime: docker.io/lithopscloud/ibmcf-python-v312 ``` or by using the ``runtime`` param in a function executor: ```python -# As we use the default FunctionExecutor(), backend must be set to localhost in config -fexec = lithops.FunctionExecutor(runtime='jsampe/action-python-v3.8') +# As we use the default FunctionExecutor(), the "backend" config parameter must be set to localhost in config +fexec = lithops.FunctionExecutor(runtime='docker.io/lithopscloud/ibmcf-python-v312') ``` ```python -# As we use/force the LocalhostExecutor(), backend does not need to be set to localhost in config -fexec = lithops.LocalhostExecutor(runtime='jsampe/action-python-v3.8') +# As we use/force the LocalhostExecutor(), the "backend" config parameter does not need to be set to localhost in config +fexec = lithops.LocalhostExecutor(runtime='docker.io/lithopscloud/ibmcf-python-v312') ``` In this mode of execution, you can use any docker image that contains all the required dependencies. For example, the IBM Cloud Functions and Knative runtimes are compatible with it. @@ -65,7 +65,8 @@ In this mode of execution, you can use any docker image that contains all the re |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|localhost | runtime | python3 | no | Docker image name | +|localhost | runtime | python3 | no | By default it uses the `python3` interpreter. It can be a container image name | +|localhost | version | 2 | no | There are 2 different localhost implementations. Use '1' for using the alternative version | |localhost | worker_processes | CPU_COUNT | no | Number of Lithops processes. This is used to parallelize function activations. By default it is set to the number of CPUs of your machine | ## Test Lithops @@ -82,4 +83,6 @@ You can view the function executions logs in your local machine using the *litho ```bash lithops logs poll -``` \ No newline at end of file +``` + +You can view the localhost runner logs in `/tmp/lithops-*/localhost-runner.log` \ No newline at end of file diff --git a/docs/source/compute_config/openwhisk.md b/docs/source/compute_config/openwhisk.md index a700e3145..ce32361f6 100644 --- a/docs/source/compute_config/openwhisk.md +++ b/docs/source/compute_config/openwhisk.md @@ -80,6 +80,7 @@ Lithops with *OpenWhisk* as serverless compute backend. Lithops can also run fun |openwhisk | runtime_memory | 256 |no | Memory limit in MB. Default 256MB | |openwhisk | runtime_timeout | 600 |no | Runtime timeout in seconds. Default 10 minutes | |openwhisk | invoke_pool_threads | 500 |no | Number of concurrent threads used for invocation | +|openwhisk | runtime_include_function | False | no | If set to true, Lithops will automatically build a new runtime, including the function's code, instead of transferring it through the storage backend at invocation time. This is useful when the function's code size is large (in the order of 10s of MB) and the code does not change frequently | ## Test Lithops diff --git a/docs/source/compute_config/oracle_functions.md b/docs/source/compute_config/oracle_functions.md index 6b7fbbce8..f7a999960 100644 --- a/docs/source/compute_config/oracle_functions.md +++ b/docs/source/compute_config/oracle_functions.md @@ -1,4 +1,4 @@ -# Oracle Functions (beta) +# Oracle Functions Lithops with *Oracle Functions* as serverless compute backend. @@ -7,7 +7,8 @@ Lithops with *Oracle Functions* as serverless compute backend. ## Installation 1. 
Install Oracle Cloud backend dependencies: -``` + +```bash python3 -m pip install lithops[oracle] ``` @@ -117,6 +118,7 @@ docker login .ocir.io -u / -p :@:/ + ``` + + Replace ``, ``, ``, ``, and `` with your RabbitMQ credentials. + +### Configure Singularity backend + + ```yaml + singularity: + worker_processes: + runtime: + sif_path: + ``` + +## Summary of Configuration Keys for Singularity + +| Group | Key | Default | Mandatory | Additional info | +|-------------|----------------------|---------|-----------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| singularity | worker_processes | 1 | no | Number of functions sent in each RabbitMQ package. Ideally, set to a multiple of the node's CPU count. | +| singularity | runtime | | yes | Name of the Singularity runtime image (`.sif`) file containing the Lithops runtime environment. | +| singularity | sif_path | /tmp | no | Directory path where the Singularity runtime image `.sif` will be stored. | + +## Deploying the Runtime Image + +Since Lithops doesn't directly manage Singularity instances on your cluster, you need to ensure the runtime image is available on **each** node: + +1. **Transfer:** Manually copy the built `.sif` runtime image to each node in your cluster. + +2. **Start:** Start a new Singularity instance on each node using the `.sif` file. Then run the instance and add the RabbitMQ server details to the environment variables. + + ```bash + singularity instance start --fakeroot /path/to/sif/your-singularity-runtime.sif + singularity run instance:// --env AMQP_URL=amqp://:@:/ + ``` + +Depending on your cluster setup, you might need to adjust permissions of the `.sif` file or the [singularity flags](https://docs.sylabs.io/guides/latest/user-guide/cli/singularity_exec.html#singularity-exec) to ensure that the user running the Lithops worker can access and execute it. + + +## Test Lithops +Once you have your compute and storage backends configured, you can run a hello world function with: + +```bash +lithops hello -b singularity +``` + +## Viewing the execution logs + +You can view the function executions logs in your local machine using the *lithops client*: + +```bash +lithops logs poll +``` diff --git a/docs/source/compute_config/vm.md b/docs/source/compute_config/vm.md index 4e8d77da4..c828c0c1d 100644 --- a/docs/source/compute_config/vm.md +++ b/docs/source/compute_config/vm.md @@ -1,6 +1,6 @@ # Virtual Machine -Lithops can run functions using a remote host or a virtual machine (VM). In this backend, Lithops uses all the available VM CPUs to parallelize the tasks of a job. For testing purposes, it is preferable to have an Ubuntu > 20.04 host. +Lithops can run functions using a remote host or a virtual machine (VM). In this backend, Lithops uses all the available VM CPUs to parallelize the tasks of a job. For testing purposes, it is preferable to have an Ubuntu > 22.04 host. ## Configuration @@ -33,13 +33,13 @@ The Docker environment runs the functions within a ``docker container``. In this ```yaml vm: - runtime: lithopscloud/ibmcf-python-v38 + runtime: lithopscloud/ibmcf-python-v312 ``` or by using the ``runtime`` param in a function executor: ```python -fexec = lithops.FunctionExecutor(runtime='lithopscloud/ibmcf-python-v38') +fexec = lithops.FunctionExecutor(runtime='lithopscloud/ibmcf-python-v312') ``` In this backend, you can use any docker image that contains all the required dependencies. 
For example, the IBM Cloud Functions and Knative runtimes are compatible with it. diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 00f20f11d..273bb5254 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -29,8 +29,9 @@ Choose your compute and storage engines from the table below: || `Azure Container Apps `_ || `Redis `_ | || `Aliyun Function Compute `_ || `OpenStack Swift `_ | || `Oracle Functions `_ || `Oracle Object Storage `_ | -|| `Kubernetes `_ || | +|| `Kubernetes `_ || | || `Knative `_ || | +|| `Singularity `_ || | || `OpenWhisk `_ || | || `Remote Host / Virtual Machine `_ || | || `IBM Virtual Private Cloud `_ || | @@ -44,7 +45,7 @@ Configuration File To configure Lithops through a `configuration file `_ you have multiple options: -1. Create e new file called ``config`` in the ``~/.lithops`` folder. +1. Create a new file called ``config`` in the ``~/.lithops`` folder. 2. Create a new file called ``.lithops_config`` in the root directory of your project from where you will execute your Lithops scripts. @@ -70,22 +71,30 @@ Here is an example of providing configuration keys for IBM Cloud Functions and I .. code:: python - import lithops - - - config = {'lithops': {'backend': 'ibm_cf', 'storage': 'ibm_cos'}, - 'ibm': {'region': 'REGION', - 'iam_api_key': 'IAM_API_KEY', - 'resource_group_id': 'RESOURCE_GROUP_ID'} - 'ibm_cos': {'storage_bucket': 'STORAGE_BUCKET'}} - - def hello_world(name): - return 'Hello {}!'.format(name) - - if __name__ == '__main__': - fexec = lithops.FunctionExecutor(config=config) - fexec.call_async(hello_world, 'World') - print(fexec.get_result()) + import lithops + + config = { + 'lithops': { + 'backend': 'code_engine', + 'storage': 'ibm_cos' + }, + 'ibm': { + 'region': 'REGION', + 'iam_api_key': 'IAM_API_KEY', + 'resource_group_id': 'RESOURCE_GROUP_ID' + }, + 'ibm_cos': { + 'storage_bucket': 'STORAGE_BUCKET' + } + } + + def hello_world(number): + return f'Hello {number}!' + + if __name__ == '__main__': + fexec = lithops.FunctionExecutor(config=config) + fexec.map(hello_world, [1, 2, 3, 4]) + print(fexec.get_result()) .. _config-reference-label: diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index d0bb84ec4..01aa4e074 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -18,24 +18,39 @@ To contribute a patch 1. Break your work into small, single-purpose patches if possible. It's much harder to merge in a large change with a lot of disjoint features. 2. Submit the patch as a GitHub pull request against the master branch. -3. Make sure that your code passes the unit tests. -4. Make sure that your code passes the linter. -5. Add new unit tests for your code. - - -Unit testing ------------- - -To test that all is working as expected, run either: - -.. code:: - - $ lithops test - - -.. code:: - - $ python3 -m lithops.tests.tests_main - - -Please follow the guidelines in :ref:`testing` for more details. \ No newline at end of file +3. Make sure that your code passes the tests. +4. Make sure that your code passes the linter. Install `flake8` with `pip3 install flake8` and run the next command until you don't see any linitng error: + ```bash + flake8 lithops --count --max-line-length=180 --statistics --ignore W605,W503 + ``` +5. Add new tests for your code. 
+ + +Testing +------- + +To test that all is working as expected, you must install `pytest`, navigate to the tests folder `lithops/tests/`, and execute: +```bash +pytest -v +``` + +If you made changes to a specific backend, please run tests on that backend. +For example, if you made changes to the AWS Lambda backend, execute the tests with: +```bash +pytest -v --backend aws_lambda --storage aws_s3 +``` + +You can list all the available tests using: +```bash +pytest --collect-only +``` + +To run a specific test or group of tests, use the `-k` parameter, for example: +```bash +pytest -v --backend localhost --storage localhost -k test_map +``` + +To view all the Lithops logs during the tests, and in DEBUG mode, execute: +```bash +pytest -o log_cli=true --log-cli-level=DEBUG --backend localhost --storage localhost +``` diff --git a/docs/source/customized_runtime.rst b/docs/source/customized_runtime.rst deleted file mode 100644 index ab1143408..000000000 --- a/docs/source/customized_runtime.rst +++ /dev/null @@ -1,19 +0,0 @@ - -Dynamic Runtime Customization -============================= - -.. note:: Currently this feature only works with dcoker-based backends. - -This feature enables early preparation of Lithops workers with the map function and custom Lithops -runtime already deployed, and ready to be used in consequent computations. This can reduce overall map/reduce -computation latency significantly, especially when the computation overhead (pickle stage) is longer compared to -the actual computation performed at the workers. - -.. warning:: To protect your privacy, use a private docker registry instead of public docker hub. - -To activate this mode, set to True the ``customized_runtime`` property under ``lithops`` section of the config file. - -.. code:: yaml - - lithops: - customized_runtime: True diff --git a/docs/source/data_processing.rst b/docs/source/data_processing.rst index 1fa0d48a1..4ddfb9479 100644 --- a/docs/source/data_processing.rst +++ b/docs/source/data_processing.rst @@ -1,156 +1,292 @@ .. _data-processing: -Processing data from the Cloud -=========================================== +Processing Data from the Cloud +============================== -Lithops has built-in logic for processing data objects from public URLs and object storage services. This logic is automatically activated with the reseverd parameter named **obj**. When you write in the parameters of a function the parameter name **obj**, you are telling to Lithops that you want to process objects located in an object storage service, public urls, or localhost files. +Lithops provides built-in support for reading and processing data from **object storage**, **public URLs**, and **local files**. This functionality is automatically enabled when your function includes a reserved parameter named **obj**. -Additionally, the built-in data-processing logic integrates a **data partitioner** system that allows to automatically split the dataset in smallest chunks. Splitting a file into smaller chunks permit to leverage the parallelism provided by the compute backends to process the data. We designed the partitioner within the ``map()`` and ``map_reduce()`` API calls, an it is configurable by specifying the *size of the chunk*, or the *number of chunks* to split each file. The current implementation of the data partitioner supports to split files that contain multiple lines (or rows) ended by '\n', for example, a .txt book or a common .csv file among others. More data-types will be supported in future releases. 
+When you define a function with the parameter `obj`, Lithops knows to pass in a special object representing a file (or a chunk of a file) from an external data source. This allows you to write scalable data processing workflows with minimal boilerplate. +Data Partitioning +----------------- -Cloud Object Storage --------------------- -For processing data from a cloud object storage service, the input data must be either a list of buckets, a list of buckets with object prefix, or a list of data objects. If you set the *size of the chunk* or the *number of chunks*, the partitioner is activated inside Lithops and it is responsible to split the objects into smaller chunks, eventually running one function activation for each generated chunk. If *size of the chunk* and *number of chunks* are not set, chunk is an entire object, so one function activation is executed for each individual object. +Lithops includes an integrated **data partitioner** that allows you to automatically split large datasets into smaller, more manageable chunks. This partitioning enables massive parallelism across the compute backend, accelerating data processing tasks. -The **obj** parameter is a python class from where you can access all the information related to the object (or chunk) that the function is processing. For example, consider the following function that shows all the available attributes in **obj** when you are processing objects from an object store: +Partitioning is supported directly within the :meth:`map()` and :meth:`map_reduce()` APIs and can be controlled via: +- **`obj_chunk_size`**: The size (in bytes) of each chunk to split the object into. +- **`obj_chunk_number`**: The total number of chunks to split each object into. -.. code:: python +Currently, the partitioner supports **text-based files** where rows are separated by newline characters (`\n`), such as `.txt` and `.csv`. Support for additional data types is planned in future releases. + +Cloud Object Storage Integration +-------------------------------- + +When processing data from cloud object storage, your input must be one of the following: + +1. A single bucket or a list of buckets +2. A bucket prefix (e.g., a folder path) +3. A list of specific object keys + +Based on your configuration: + +- If `obj_chunk_size` or `obj_chunk_number` is set, **each object is automatically split into smaller chunks**, and Lithops runs one function activation per chunk. +- If chunking is not configured, Lithops runs one function activation per full object. + +Accessing Object Metadata +-------------------------- + +Inside your function, the `obj` parameter gives you access to metadata and data for the current chunk being processed. + +Example: + +.. code-block:: python def my_map_function(obj): - print(obj.bucket) - print(obj.key) - print(obj.part) - print(obj.data_byte_range) - print(obj.chunk_size) - - data = obj.data_stream.read() + print(obj.bucket) # Bucket name + print(obj.key) # Object key + print(obj.part) # Chunk number + print(obj.data_byte_range) # Byte range for this chunk + print(obj.chunk_size) # Chunk size in bytes + + data = obj.data_stream.read() # Read the data for this chunk + +Accepted Input Formats +----------------------- -The allowed inputs of a function can be: +Lithops accepts **only one type** of input format per execution. Do not mix formats in the same list. The supported formats are: -- Input data is a bucket or a list of buckets. 
See an example in [map_reduce_cos_bucket.py](../../examples/map_reduce_cos_bucket.py): +- **Buckets**: One or more buckets + *(See: `map_reduce_cos_bucket.py <../../examples/map_reduce_cos_bucket.py>`_)* -.. code:: python + .. code-block:: python - iterdata = 'bucket1' + iterdata = ['my-bucket-1', 'my-bucket-2'] -- Input data is a bucket(s) with object prefix. See an example in [map_cos_prefix.py](../../examples/map_cos_prefix.py): +- **Object Prefixes**: Folder-like paths ending with `/` + *(See: `map_cos_prefix.py <../../examples/map_cos_prefix.py>`_)* -.. code:: python + .. code-block:: python - iterdata = ['bucket1/images/', 'bucket1/videos/'] + iterdata = ['my-bucket/data/csvs/', 'my-bucket/logs/'] -Notice that you must write the end slash (/) to inform partitioner you are providing an object prefix. + ⚠️ Prefixes must end with a `/` to indicate to the partitioner that you're specifying a folder-like path. -- Input data is a list of object keys. See an example in [map_reduce_cos_key.py](../../examples/map_reduce_cos_key.py): +- **Object Keys**: Specific file paths + *(See: `map_reduce_cos_key.py <../../examples/map_reduce_cos_key.py>`_)* -.. code:: python + .. code-block:: python - iterdata = ['bucket1/object1', 'bucket1/object2', 'bucket1/object3'] + iterdata = ['my-bucket/file1.csv', 'my-bucket/file2.csv'] -Notice that *iterdata* must be only one of the previous 3 types. Intermingled types are not allowed. For example, you cannot set in the same *iterdata* a bucket and some object keys: +❌ **Mixing formats is not allowed**: -.. code:: python +.. code-block:: python - iterdata = ['bucket1', 'bucket1/object2', 'bucket1/object3'] # Not allowed + # This will raise an error + iterdata = ['my-bucket', 'my-bucket/file2.csv'] -Once iterdata is defined, you can execute Lithops as usual, either using *map()* or *map_reduce()* calls. If you need to split the files in smaller chunks, you can set (optionally) the *obj_chunk_size* or *obj_chunk_number* parameters. +Putting It All Together +------------------------ -.. code:: python +Once you've defined your input and function, you can run Lithops as usual with optional chunking: + +.. code-block:: python import lithops - object_chunksize = 4*1024**2 # 4MB + object_chunksize = 4 * 1024 ** 2 # 4 MB per chunk fexec = lithops.FunctionExecutor() fexec.map_reduce(my_map_function, iterdata, obj_chunk_size=object_chunksize) result = fexec.get_result() -Processing data from public URLs --------------------------------- -For processing data from public URLs, the input data must be either a single URL or a list of URLs. As in the previous case, if you set the *size of the chunk* or the *number of chunks*, the partitioner is activated inside Lithops and it is responsible to split the objects into smaller chunks, as long as the remote storage server allows requests in chunks (ranges). If range requests are not allowed in the remote storage server, each URL is treated as a single object. -The **obj** parameter is a python class from where you can access all the information related to the object (or chunk) that the function is processing. For example, consider the following function that shows all the available attributes in **obj** when you are processing URLs: +Processing Data from Public URLs +================================ + +Lithops also supports processing data directly from **public URLs**. The input can be a single URL or a list of URLs. 
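+For instance, *iterdata* can be defined directly as a list of URLs (the URLs below are placeholders):
+
+.. code-block:: python
+
+    iterdata = ['https://example.com/dataset/part-1.csv',
+                'https://example.com/dataset/part-2.csv']
+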
+If you set the `obj_chunk_size` or `obj_chunk_number`, Lithops activates its internal partitioner to split each file into smaller chunks—**provided that the remote server supports HTTP range requests**. If range requests are not supported, each URL is processed as a single object. -.. code:: python +As with other backends, the special **`obj`** parameter gives you access to metadata and the content of the chunk being processed. + +Example: + +.. code-block:: python import lithops def my_map_function(obj): - print(obj.url) - print(obj.part) - print(obj.data_byte_range) - print(obj.chunk_size) + print(obj.url) # Full URL of the object + print(obj.part) # Chunk number + print(obj.data_byte_range) # Byte range for this chunk + print(obj.chunk_size) # Size of this chunk (in bytes) data = obj.data_stream.read() for line in data.splitlines(): - # Do some process - return partial_intersting_data + # Process each line + pass + + return partial_result def my_reduce_function(results): - for partial_intersting_data in results: - # Do some process + for partial_result in results: + # Aggregate results + pass + return final_result - iterdata = ['http://myurl/my_file_1.csv', 'http://myurl/my_file_2.csv'] - object_chunk_number= 2 + iterdata = ['http://example.com/file1.csv', 'http://example.com/file2.csv'] + chunk_number = 2 fexec = lithops.FunctionExecutor() fexec.map_reduce(my_map_function, iterdata, my_reduce_function, - obj_chunk_number=object_chunk_number) + obj_chunk_number=chunk_number) result = fexec.get_result() -See a complete example in `map_reduce_url.py `_ +📄 See the full example in: +`map_reduce_url.py `_ + + +Processing Data from Localhost Files +==================================== +.. note:: This feature is only available when using the **localhost backend**. -Processing data from localhost files ------------------------------------- +Lithops can also process files stored on the local filesystem. The input can be: -.. note:: This is only allowed when running Lithops with the localhost backend +- A single file path +- A list of file paths +- A directory path +- A list of directory paths -For processing data from localhost files, the input data must be either a directory path, a list of directory paths, a file path a list of file paths. As in the previous cases, if you set the *size of the chunk* or the *number of chunks*, the partitioner is activated inside Lithops and it is responsible to split the objects into smaller chunks, eventually spawning one function for each generated chunk. If *size of the chunk* and *number of chunks* are not set, chunk is an entire object, so one function activation is executed for each individual object. +As in other cases, if you set `obj_chunk_size` or `obj_chunk_number`, the file(s) will be split into chunks and processed in parallel. If not set, each file is processed as a single object. -The **obj** parameter is a python class from where you can access all the information related to the object (or chunk) that the function is processing. For example, consider the following function that shows all the available attributes in **obj** when you are processing localhost files: +The **`obj`** parameter again exposes the metadata and content of the chunk. -.. code:: python +Example: + +.. 
code-block:: python import lithops def my_map_function(obj): - print(obj.path) - print(obj.part) - print(obj.data_byte_range) - print(obj.chunk_size) + print(obj.path) # Full local file path + print(obj.part) # Chunk number + print(obj.data_byte_range) # Byte range for this chunk + print(obj.chunk_size) # Size of this chunk (in bytes) data = obj.data_stream.read() for line in data.splitlines(): - # Do some process - return partial_intersting_data + # Process each line + pass + + return partial_result def my_reduce_function(results): - for partial_intersting_data in results: - # Do some process + for partial_result in results: + # Aggregate results + pass + return final_result - iterdata = ['/home/user/data/my_file_1.csv', '/home/user/data/my_file_2.csv'] - object_chunk_number= 2 + iterdata = ['/home/user/file1.csv', '/home/user/file2.csv'] + chunk_number = 2 fexec = lithops.FunctionExecutor() fexec.map_reduce(my_map_function, iterdata, my_reduce_function, - obj_chunk_number=object_chunk_number) + obj_chunk_number=chunk_number) result = fexec.get_result() -See a complete example in `map_reduce_localhost.py `_. +📄 See the full example in: +`map_reduce_localhost.py `_ -Reducer granularity +Reducer Granularity ------------------- -When using the ``map_reduce()`` API call with ``obj_chunk_size`` or ``obj_chunk_number``, by default there will be only one reducer for all the object chunks from all the objects. Alternatively, you can spawn one reducer for each object by setting the parameter ``obj_reduce_by_key=True``. -.. code:: python +When using the :meth:`map_reduce()` API along with `obj_chunk_size` or `obj_chunk_number`, Lithops defaults to using **a single reducer** to aggregate results across **all chunks and objects**. + +If you'd prefer to reduce results **per original object** (e.g., one reducer per file), you can set the parameter `obj_reduce_by_key=True`. + +Example: + +.. code-block:: python fexec.map_reduce(my_map_function, bucket_name, my_reduce_function, - obj_chunk_size=obj_chunk_size, obj_reduce_by_key=True) + obj_chunk_size=obj_chunk_size, + obj_reduce_by_key=True) + + +Elastic Data Processing and Cloud-Optimized Formats +=================================================== + +Lithops is especially powerful for **massively parallel data processing**. When the input to `map()` or `map_reduce()` is a **storage bucket** or a collection of large files, Lithops will automatically: + +- Launch one function per file, or +- Partition large files into chunks and assign each chunk to a different function + +This behavior enables **elastic scaling** that fully utilizes the underlying compute backend. + +Cloud-Optimized Formats +------------------------ + +Lithops is ideally suited for processing **cloud-optimized data formats** such as: + +- **ZARR** +- **COG** (Cloud Optimized GeoTIFF) +- **COPC** (Cloud Optimized Point Clouds) +- **FlatGeoBuf** + +These formats are designed to support **random access via HTTP range requests**, making them a perfect match for cloud object storage and serverless computing. + +By leveraging HTTP range primitives, Lithops enables fast and scalable parallel processing — distributing workload across many concurrent function activations, each fetching only the data it needs. This approach takes full advantage of the **high aggregate bandwidth** provided by modern object storage systems. 
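+To illustrate the primitive these formats build on, the sketch below fetches a single byte range over HTTP using the ``requests`` package (the URL is a placeholder). Each Lithops function activation can issue such a request to retrieve only the slice of the object it needs:
+
+.. code-block:: python
+
+    import requests
+
+    url = 'https://example.com/data/image.tif'   # placeholder cloud-optimized object
+    headers = {'Range': 'bytes=0-65535'}         # request only the first 64 KiB
+
+    response = requests.get(url, headers=headers)
+    if response.status_code == 206:              # 206 Partial Content: range honored
+        chunk = response.content                 # only the requested bytes
+    else:
+        chunk = None                             # server ignored the range request
+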
+ +Partitioning Non-Optimized Formats with Dataplug +------------------------------------------------- + +Thanks to the `DATAPLUG `_ library, Lithops also supports **on-the-fly partitioning** of data formats that are **not cloud-optimized**. Supported formats include: + +- Genomics: **FASTA**, **FASTQ**, **FASTQ.GZ** +- Metabolomics: **mlMZ** +- Geospatial: **LIDAR (.laz)** + +Dataplug wraps these formats into cloud-native interfaces and exposes partitioning strategies that Lithops can consume directly. + +Example: Parallel Processing of a Cloud-Hosted LIDAR File +---------------------------------------------------------- + +In the example below, we use Dataplug to wrap a COPC (Cloud Optimized Point Cloud) file stored in S3, partition it into spatial chunks, and process each chunk in parallel using Lithops: + +.. code-block:: python + + from dataplug import CloudObject + from dataplug.formats.geospatial.copc import CloudOptimizedPointCloud, square_split_strategy + import laspy + import lithops + + # Function to process each LiDAR slice + def process_lidar_slice(data_slice): + las_data = data_slice.get() + lidar_file = laspy.open(las_data) + ... + + # Load the COPC file from S3 using Dataplug + co = CloudObject.from_s3( + CloudOptimizedPointCloud, + "s3://geospatial/copc/CA_YosemiteNP_2019/USGS_LPC_CA_YosemiteNP_2019_D19_11SKB6892.laz", + s3_config=local_minio, + ) + + # Partition the point cloud into 9 spatial chunks + slices = co.partition(square_split_strategy, num_chunks=9) + + # Process slices in parallel using Lithops + with lithops.FunctionExecutor() as executor: + futures = executor.map(process_lidar_slice, slices) + results = executor.get_result(futures) + +This enables truly **elastic and serverless geospatial processing pipelines**, with no infrastructure overhead and full cloud-native efficiency. diff --git a/docs/source/design.rst b/docs/source/design.rst index 0f1eaa80f..2286c8463 100644 --- a/docs/source/design.rst +++ b/docs/source/design.rst @@ -43,7 +43,7 @@ In Lithops, each map or reduce computation is executed as a separate compute *jo As mentioned above, the ``FunctionExecutor`` class is responsible for orchestrating the computation in Lithops. One ``FunctionExecutor`` object is instantiated prior to any use of Lithops. Its initialization includes these important steps: 1. It sets up the workers (depending on the specific compute backend), such as constructing docker images, defining IBM Cloud Functions, etc. This step may not include actually creating the workers, as this may be done automatically by the backend on-demand. 2. It defines a bucket in object storage (depending on the storage backend) in which each job will store job and call data (prior to computation) and results (when computation is complete). 3. It creates a ``FunctionInvoker`` object, which is responsible for executing a job as a set of independent per-worker calls. -Compute jobs are created in the functions of the ``job`` module (see chart above), invoked from the respective API method of ``FunctionExecutor``. Map jobs are created in ``create_map_job()`` and reduce jobs in ``create_reduce_job()``. The flow in both functions is quite similar. First, data is partitioned, with the intention of each partition be processed by one worker. For map jobs, this is done by invoking the ``create_partitions()`` function of the ``partitioner`` module, yielding a partition map. 
+Compute jobs are created in the functions of the ``job`` module (see chart above), invoked from the respective API method of ``FunctionExecutor``. Map jobs are created in ``create_map_job()`` and reduce jobs in ``create_reduce_job()``. The flow in both functions is quite similar. First, data is partitioned, with the intention that each partition be processed by one worker. For map jobs, this is done by invoking the ``create_partitions()`` function of the ``partitioner`` module, yielding a partition map. For reduce jobs, Lithops currently supports two modes: reduce per object, where each object is processed by a reduce function, and global (default) reduce, where all data is processed by a single reduce function. Respectively, data is partitioned as either one partition per storage object, or one global partition with all data. This process yields a partition map similar to map jobs. Additionally, ``create_reduce_job()`` wraps the reduce function in a special wrapper function that forces waiting for data before the actual reduce function is invoked. This is because reduce jobs follow map jobs, so the output of the map jobs needs to finish before reduce can run. @@ -58,4 +58,4 @@ Completion of a computation job in Lithops is detected in one of two techniques: **RabbitMQ**: A unique RabbitMQ topic is defined for each job. combining the executor id and job id. Each worker, once completes a call, posts a notification message on that topic (code in ``function_handler()`` in ``handler`` module, called from ``entry_point`` module of the worker). The ``wait_rabbitmq()`` function from ``wait_rabbitmq`` module, which is called from ``FunctionExecutor.wait()``, consumes a number of messages on that topic equal to ``total_calls`` and determines completion. -**Object Storage**: As explained above, each call persists its computation results in a specific object. Determining completion of a job is by the ``FunctionExecutor.wait()`` invoking the ``wait_storage()`` function from the ``wait_storage`` module. This function repeatedly, once per fixed period (controllable), polls the executor’s bucket for status objects of a subset of calls that have still not completed. This allows control of resource usage and eventual detection of all calls. \ No newline at end of file +**Object Storage**: As explained above, each call persists its computation results in a specific object. Determining completion of a job is by the ``FunctionExecutor.wait()`` invoking the ``wait_storage()`` function from the ``wait_storage`` module. This function repeatedly, once per fixed period (controllable), polls the executor’s bucket for status objects of a subset of calls that have still not completed. This allows control of resource usage and eventual detection of all calls. diff --git a/docs/source/execution_modes.rst b/docs/source/execution_modes.rst index 8649bffc6..1d280d883 100644 --- a/docs/source/execution_modes.rst +++ b/docs/source/execution_modes.rst @@ -1,20 +1,25 @@ Execution Modes =============== -Lithops compute backends can be classified in 3 different execution modes depending on the backend you choose. +Lithops compute backends can be classified in 3 different execution modes depending +on the backend you choose. Localhost mode -------------- -The "localhost mode" in Lithops is a convenient feature that enables you to execute functions on your local machine using processes. It serves as the default execution mode if no specific configuration is provided. 
+The "localhost mode" in Lithops is a convenient feature that enables you to execute +functions on your local machine using processes, without relying on cloud resources +or serverless computing environments. It serves as the default execution +mode if no specific configuration is provided. -In localhost mode, you can run your code locally without relying on cloud resources or serverless computing environments. It allows you to leverage the power of Lithops and its distributed computing capabilities right on your own machine. +To use localhost mode, you can simply write your functions using the Lithops programming +model and execute them locally. Lithops will handle the distribution and coordination +of the function executions, optimizing performance by leveraging multiple processes. -By utilizing processes, Lithops efficiently manages the execution of functions in parallel, taking advantage of the available resources on your local system. This mode is particularly useful for development, testing, and debugging purposes, as it eliminates the need to deploy code to a cloud environment during the development phase. +This mode is particularly useful for development, testing, and debugging purposes, +as it eliminates the need to deploy code to a cloud environment during the +development phase. -To use localhost mode, you can simply write your functions using the Lithops programming model and execute them locally. Lithops will handle the distribution and coordination of the function executions, optimizing performance by leveraging multiple processes. - -Whether you're exploring Lithops for the first time or working on local development tasks, the localhost mode offers a seamless experience, empowering you to harness the capabilities of Lithops without the need for cloud infrastructure. .. note:: This is the preferable option for starting with Lithops, and for testing (debugging) your applications. @@ -28,33 +33,49 @@ Whether you're exploring Lithops for the first time or working on local developm Serverless mode --------------- -The "serverless mode" in Lithops is designed to execute functions using publicly accessible serverless compute services, including IBM Cloud Functions, Amazon Lambda, Google Cloud Functions, and more, enabling parallel task execution in isolated cloud environments. - -In serverless mode, Lithops leverages the power of these serverless platforms to execute functions as independent tasks. Each function invocation is treated as a separate parallel task, benefiting from the scalability, automatic provisioning of resources, and isolation provided by the serverless compute service. - -By utilizing serverless platforms, developers can offload the burden of managing infrastructure and focus solely on writing and deploying their functions. The serverless mode in Lithops abstracts away the complexities of configuring and scaling embarrassingly parallel applications, making it easier to develop and deploy large-scale data processing workloads. - -This execution mode offers flexibility and elasticity, as resources are dynamically allocated based on workload demands, ensuring efficient utilization of compute power. It allows developers to seamlessly leverage the scalability and reliability of serverless platforms while benefiting from Lithops' programming model and distributed computing capabilities. 
- -Whether you're processing large datasets, handling real-time event-driven tasks, or building serverless applications, Lithops' serverless mode provides a convenient and scalable approach to execute functions on popular serverless compute services, simplifying the development and deployment process. +The "serverless mode" in Lithops is designed to execute functions using publicly +accessible serverless compute services, including IBM Cloud Functions, Amazon Lambda, +Google Cloud Functions, and more, enabling parallel task execution in isolated cloud +environments. + +In serverless mode, Lithops leverages the power of these serverless platforms to execute +functions as independent tasks. Each function invocation is treated as a separate parallel +task, benefiting from the scalability, automatic provisioning of resources, and isolation +provided by the serverless compute service. + +By utilizing serverless platforms, developers can offload the burden of managing +infrastructure and focus solely on writing and deploying their functions. +The serverless mode in Lithops abstracts away the complexities of configuring and +scaling embarrassingly parallel applications, making it easier to develop and deploy +large-scale data processing workloads. + +This execution mode offers flexibility and elasticity, as resources are dynamically +allocated based on workload demands, ensuring efficient utilization of compute power. +It allows developers to seamlessly leverage the scalability and reliability of +serverless platforms while benefiting from Lithops' programming model and distributed +computing capabilities. .. code:: python fexec = lithops.ServerlessExecutor() -- Available backends: `IBM Cloud Functions `_, `IBM Code Engine `_, `AWS Lambda `_, `AWS Batch `_, `Google Cloud Functions `_, `Google Cloud Run `_, `Azure Functions `_, `Azure Container APPs `_, `Aliyun Function Compute `_, `Oracle Functions `_, `Kubernetes Jobs `_, `Knative `_, `OpenWhisk `_ +- Available backends: `IBM Cloud Functions `_, `IBM Code Engine `_, `AWS Lambda `_, `AWS Batch `_, `Google Cloud Functions `_, `Google Cloud Run `_, `Azure Functions `_, `Azure Container APPs `_, `Aliyun Function Compute `_, `Oracle Functions `_, `Kubernetes Jobs `_, `Knative `_, `Singularity `_, `OpenWhisk `_ Standalone mode --------------- -The "standalone mode" in Lithops provides the capability to execute functions on one or multiple virtual machines (VMs) simultaneously, in a serverless-like fashion, without requiring manual provisioning as everything is automatically created. This mode can be deployed in a private cluster or in the cloud, where functions within each VM are executed using parallel processes, similar to the functionality offered in localhost mode. - -In standalone mode, Lithops simplifies the deployment and management of VMs, enabling users to effortlessly scale their compute resources to meet the demands of their workloads. By leveraging the automatic creation and configuration of VMs, developers can focus on writing their functions while Lithops takes care of the underlying infrastructure. - -Each VM within the standalone mode operates independently, allowing for parallel processing of functions. This parallelism enhances performance and enables efficient execution of computationally intensive tasks across multiple VMs. Whether deployed in a private cluster or in the cloud, standalone mode provides flexibility and scalability to process large volumes of data or perform complex computations. 
- -Standalone mode in Lithops expands the possibilities for distributed computing by combining the convenience of serverless-like provisioning with the power of parallel processing on VMs. It offers developers a seamless experience for executing functions in an isolated and scalable environment, simplifying the development and execution of data-intensive workloads and parallel applications. +The "standalone mode" in Lithops provides the capability to execute functions on one +or multiple virtual machines (VMs) simultaneously, in a serverless-like fashion, +without requiring manual provisioning as everything is automatically created. +This mode can be deployed in a private cluster or in the cloud, where functions +within each VM are executed using parallel processes. + +In standalone mode, Lithops simplifies the deployment and management of VMs, enabling +users to effortlessly scale their compute resources to meet the demands of their workloads. +By leveraging the automatic creation and configuration of VMs provided by Lithops, +developers can focus on writing their functions while Lithops takes care of the +underlying infrastructure. .. note:: This is the preferable option if your application (or a part) requires a more powerful environment than the ones provided by the Serverless backends (in terms of CPU and Memory). diff --git a/docs/source/images/plots_kubernetes/k8s_original_histogram.png b/docs/source/images/plots_kubernetes/k8s_original_histogram.png new file mode 100644 index 000000000..90ea11812 Binary files /dev/null and b/docs/source/images/plots_kubernetes/k8s_original_histogram.png differ diff --git a/docs/source/images/plots_kubernetes/k8s_original_warm_start_histogram.png b/docs/source/images/plots_kubernetes/k8s_original_warm_start_histogram.png new file mode 100644 index 000000000..4e8ee62a8 Binary files /dev/null and b/docs/source/images/plots_kubernetes/k8s_original_warm_start_histogram.png differ diff --git a/docs/source/images/plots_kubernetes/rabbitmq_histogram.png b/docs/source/images/plots_kubernetes/rabbitmq_histogram.png new file mode 100644 index 000000000..e422611e3 Binary files /dev/null and b/docs/source/images/plots_kubernetes/rabbitmq_histogram.png differ diff --git a/docs/source/images/plots_kubernetes/rabbitmq_warm_start_histogram.png b/docs/source/images/plots_kubernetes/rabbitmq_warm_start_histogram.png new file mode 100644 index 000000000..a13af85d3 Binary files /dev/null and b/docs/source/images/plots_kubernetes/rabbitmq_warm_start_histogram.png differ diff --git a/docs/source/lithops_config_keys.csv b/docs/source/lithops_config_keys.csv index 58cea506a..330453304 100644 --- a/docs/source/lithops_config_keys.csv +++ b/docs/source/lithops_config_keys.csv @@ -1,15 +1,15 @@ Group;Key;Default;Mandatory;Additional info -lithops;backend;``ibm_cf``;no;Compute backend implementation. IBM Cloud Functions is the default. If not set, Lithops will check the `mode` and use the `backend` set under the `serverless` or `standalone` sections described below. -lithops;storage;``ibm_cos``;no;Storage backend implementation. IBM Cloud Object Storage is the default. -lithops;data_cleaner;``True``;no;If set to True, then the cleaner will automatically delete all the temporary data that was written into `storage_bucket/lithops.jobs`. -lithops;monitoring;``storage``;no;Monitoring system implementation. One of: **storage** or **rabbitmq**. -lithops;monitoring_interval;``2``;no;Monitoring check interval in seconds in case of **storage** monitoring. 
-lithops;data_limit;``4``;no;Max (iter)data size (in MB). Set to False for unlimited size. -lithops;execution_timeout;``1800``;no;Functions will be automatically killed if they exceed this execution time (in seconds). Alternatively, it can be set in the `call_async()`, `map()` or `map_reduce()` calls using the `timeout` parameter. -lithops;include_modules;``[]``;no;Explicitly pickle these dependencies. All required dependencies are pickled if default empty list. No one dependency is pickled if it is explicitly set to None. -lithops;exclude_modules;``[]``;no;Explicitly keep these modules from pickled dependencies. It is not taken into account if you set include_modules. -lithops;log_level;``INFO``;no;Logging level. One of: WARNING, INFO, DEBUG, ERROR, CRITICAL, Set to None to disable logging. -lithops;log_format;``%(asctime)s [%(levelname)s] %(name)s -- %(message)``;no; Logging format string. -lithops;log_stream;``ext://sys.stderr``;no;Logging stream. eg.: ext://sys.stderr, ext://sys.stdout -lithops;log_filename;```` ;no;Path to a file. log_filename has preference over log_stream. -lithops;customized_runtime;False;no;Enables to build a new runtime with the map() function and its dependencies integrated. Only docker-based backends support this feature. | \ No newline at end of file +lithops;backend;`aws_lambda`;no;Compute backend implementation. Default is AWS Lambda. +lithops;storage;`aws_s3`;no;Storage backend implementation. Default is AWS S3. +lithops;data_cleaner;`True`;no;If True, automatically deletes temporary data written to `storage_bucket/lithops.jobs`. +lithops;monitoring;`storage`;no;Monitoring system implementation. Options: **storage** or **rabbitmq**. +lithops;monitoring_interval;`2`;no;Interval in seconds for monitoring checks (used if monitoring is set to **storage**). +lithops;data_limit;`4`;no;Maximum size (in MB) for iterator data chunks. Set to False for unlimited size. +lithops;execution_timeout;`1800`;no;Maximum execution time for functions in seconds. Functions exceeding this time are killed. Can also be set per call using the `timeout` parameter. +lithops;include_modules;`[]`;no;List of dependencies to explicitly include for pickling. If empty, all required dependencies are included. If set to None, no dependencies are included. +lithops;exclude_modules;`[]`;no;List of dependencies to explicitly exclude from pickling. Ignored if `include_modules` is set. +lithops;log_level;`INFO`;no;Logging level. Options: WARNING, INFO, DEBUG, ERROR, CRITICAL. Set to None to disable logging. +lithops;log_format;`%(asctime)s [%(levelname)s] %(name)s -- %(message)s`;no;Format string for log messages. +lithops;log_stream;`ext://sys.stderr`;no;Logging output stream, e.g., ext://sys.stderr or ext://sys.stdout. +lithops;log_filename;``;no;File path for logging output. Takes precedence over `log_stream` if set. +lithops;retries;`0`;no;Number of retries for failed function invocations when using the `RetryingFunctionExecutor`. Default is 0. Can be overridden per API call. diff --git a/docs/source/sklearn_joblib.rst b/docs/source/sklearn_joblib.rst index 45b38b15e..8c216f553 100644 --- a/docs/source/sklearn_joblib.rst +++ b/docs/source/sklearn_joblib.rst @@ -3,7 +3,14 @@ Distributed Scikit-learn / Joblib Lithops supports running distributed scikit-learn programs by implementing a Lithops backend for joblib using Functions instead of local processes. This makes it easy to scale existing applications that use scikit-learn from a single node to a cluster. 
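+For reference, here is a minimal end-to-end sketch (the estimator, dataset, and parameter grid are only illustrative; the installation and registration steps are detailed below):
+
+.. code-block:: python
+
+    import joblib
+    from sklearn.datasets import load_digits
+    from sklearn.model_selection import GridSearchCV
+    from sklearn.svm import SVC
+
+    from lithops.util.joblib import register_lithops
+
+    register_lithops()   # register Lithops as a joblib backend
+
+    digits = load_digits()
+    param_grid = {'C': [1, 10, 100], 'gamma': [0.001, 0.01, 0.1]}
+    search = GridSearchCV(SVC(), param_grid, n_jobs=-1)
+
+    # scikit-learn parallelism now runs as Lithops function activations
+    with joblib.parallel_backend('lithops'):
+        search.fit(digits.data, digits.target)
+
+    print(search.best_params_)
+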
-To get started, first install Lithops, then use from ``lithops.util.joblib import register_lithops`` and run ``register_lithops()``. This will register Lithops as a joblib backend for scikit-learn to use. Then run your original scikit-learn code inside with ``joblib.parallel_backend('lithops')``. +To get started, first install Lithops and the joblib dependencies with: + +.. code-block:: bash + + python3 -m pip install lithops[joblib] + + +Once installed, use ``from lithops.util.joblib import register_lithops`` and run ``register_lithops()``. This will register Lithops as a joblib backend for scikit-learn to use. Then run your original scikit-learn code inside with ``joblib.parallel_backend('lithops')``. Refer to the official `JobLib `_ and `SkLearn `_ documentation to operate with these libraries. diff --git a/docs/source/storage_config/aliyun_oss.md b/docs/source/storage_config/aliyun_oss.md index 32d1bc0d8..e818ddec8 100644 --- a/docs/source/storage_config/aliyun_oss.md +++ b/docs/source/storage_config/aliyun_oss.md @@ -7,8 +7,8 @@ Lithops with Aliyun Object Storage Service as storage backend. 1. Install Alibaba Cloud backend dependencies: -``` -$ python3 -m pip install lithops[aliyun] +```bash +python3 -m pip install lithops[aliyun] ``` ## Configuration @@ -41,5 +41,5 @@ aliyun: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|aliyun_oss | region | | no | Region Name from [here](https://www.alibabacloud.com/help/en/object-storage-service/latest/regions-and-endpoints). Ommit the `oss-` prefix. For example: `eu-west-1`. Lithops will use the region set under the `aliyun` section if it is not set here | +|aliyun_oss | region | | no | Region Name from [here](https://www.alibabacloud.com/help/en/object-storage-service/latest/regions-and-endpoints). Omit the `oss-` prefix. For example: `eu-west-1`. Lithops will use the region set under the `aliyun` section if it is not set here | |aliyun_oss | storage_bucket | | no | The name of a bucket that exists in you account. This will be used by Lithops for intermediate data. Lithops will automatically create a new one if it is not provided| \ No newline at end of file diff --git a/docs/source/storage_config/aws_s3.md b/docs/source/storage_config/aws_s3.md index 21f89f609..81ddd6171 100644 --- a/docs/source/storage_config/aws_s3.md +++ b/docs/source/storage_config/aws_s3.md @@ -4,31 +4,48 @@ Lithops with AWS S3 as storage backend. ## Installation -1. Install Amazon Web Services backend dependencies: +1. Install AWS backend dependencies: - ``` - $ python3 -m pip install lithops[aws] - ``` +```bash +python3 -m pip install lithops[aws] +``` -2. [Login](https://console.aws.amazon.com/?nc2=h_m_mc) to Amazon Web Services Console (or signup if you don't have an account) +## Configuration -3. Navigate to *S3* and *create a bucket*. Type a name (e.g. `lithops-data`). The bucket should be created in the same region as the Lambda functions are expected to be run (mainly to avoid inter-region data transfer charges). +Lithops automatically creates a bucket with a unique name for your user. If you want to use a different bucket, you can create it manually and provide the name in the lithops config file. For this: +1. [Login](https://console.aws.amazon.com/?nc2=h_m_mc) to Amazon Web Services Console (or signup if you don't have an account) -## Configuration +2. Navigate to *S3* and *create a bucket*. Type a name (e.g. `lithops-data-mysuer`). 
The bucket should be created in the same region as the Lambda functions are expected to be run (mainly to avoid inter-region data transfer charges). + + +## AWS Credential setup + +Lithops loads AWS credentials as specified in the [boto3 configuration guide](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html). -4. Edit your lithopsa config and add the following keys: +In summary, you can use one of the following settings: +1. Provide the credentials via the `~/.aws/config` file, or set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables. + + You can run `aws configure` command if the AWS CLI is installed to setup the credentials. Then set in the Lithops config file: ```yaml lithops: storage: aws_s3 aws: - region : - access_key_id : - secret_access_key : + region: ``` +2. Provide the credentials in the `aws` section of the Lithops config file: + ```yaml + lithops: + storage: aws_s3 + + aws: + access_key_id: + secret_access_key: + region: + ``` ## Summary of configuration keys for AWS: @@ -37,14 +54,14 @@ Lithops with AWS S3 as storage backend. |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| |aws | region | |yes | AWS Region. For example `us-east-1` | -|aws | access_key_id | |yes | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | -|aws | secret_access_key | |yes | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | access_key_id | |no | Account access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | +|aws | secret_access_key | |no | Account secret access key to AWS services. To find them, navigate to *My Security Credentials* and click *Create Access Key* if you don't already have one. | |aws | session_token | |no | Session token for temporary AWS credentials | ### Summary of configuration keys for AWS S3: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|aws_s3 | region | |no | Region of your Bcuket. e.g `us-east-1`, `eu-west-1`, etc. Lithops will use the region set under the `aws` section if it is not set here | +|aws_s3 | region | |no | Region of your Bucket. e.g `us-east-1`, `eu-west-1`, etc. Lithops will use the region set under the `aws` section if it is not set here | |aws_s3 | storage_bucket | | no | The name of a bucket that exists in you account. This will be used by Lithops for intermediate data. Lithops will automatically create a new one if it is not provided | diff --git a/docs/source/storage_config/azure_blob.md b/docs/source/storage_config/azure_blob.md index e70e5967f..4233a38e5 100644 --- a/docs/source/storage_config/azure_blob.md +++ b/docs/source/storage_config/azure_blob.md @@ -7,11 +7,11 @@ Lithops with Azure Blob Storage as storage backend. 1. Install Microsoft Azure backend dependencies: -``` +```bash $ python3 -m pip install lithops[azure] ``` -5. Create a Resource Group and a Storage Account: +2. Create a Resource Group and a Storage Account: Option 1: diff --git a/docs/source/storage_config/ceph.md b/docs/source/storage_config/ceph.md index 844753037..c53729e67 100644 --- a/docs/source/storage_config/ceph.md +++ b/docs/source/storage_config/ceph.md @@ -5,15 +5,20 @@ Lithops with Ceph storage backend. ## Installation -1. Install Ceph. +1. Install Ceph backend dependencies: -2. 
Create a new user. +```bash +python3 -m pip install lithops[ceph] +``` + +2. Install Ceph. + +3. Create a new user. -3. Create a new bucket (e.g. `lithops-data`). Remember to update the corresponding Lithops config field with this bucket name. ## Configuration -3. Edit your lithops config file and add the following keys: +1. Edit your lithops config file and add the following keys: ```yaml lithops: @@ -30,7 +35,8 @@ Lithops with Ceph storage backend. |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|ceph | endpoint | |yes | The host ip adress where you installed the Ceph server. Must start with http:// or https:// | +|ceph | endpoint | |yes | The host ip address where you installed the Ceph server. Must start with http:// or https:// | +|ceph | region | |no | Region name. For example 'eu-west-1' | |ceph | access_key_id | |yes | Account user access key | |ceph | secret_access_key | |yes | Account user secret access key | |ceph | session_token | |no | Session token for temporary AWS credentials | diff --git a/docs/source/storage_config/gcp_storage.md b/docs/source/storage_config/gcp_storage.md index 4a0b669b7..177a93511 100644 --- a/docs/source/storage_config/gcp_storage.md +++ b/docs/source/storage_config/gcp_storage.md @@ -6,31 +6,29 @@ Lithops with GCP Storage as storage backend. 1. Install Google Cloud Platform backend dependencies: +```bash +python3 -m pip install lithops[gcp] ``` -$ python3 -m pip install lithops[gcp] -``` - 2. [Login](https://console.cloud.google.com) to Google Cloud Console (or signup if you don't have an account). +## Configuration + +1. [Login](https://console.cloud.google.com) to Google Cloud Console (or signup if you don't have an account). - 3. Create a new project. Name it `lithops` or similar. +2. Create a new project. Name it `lithops` or similar. - 4. Navigate to *IAM & Admin* > *Service Accounts*. +3. Navigate to *IAM & Admin* > *Service Accounts*. - 5. Click on *Create Service Account*. Name the service account `lithops-executor` or similar. Then click on *Create*. +4. Click on *Create Service Account*. Name the service account `lithops-executor` or similar. Then click on *Create*. - 6. Add the following roles to the service account: - - Service Account User - - Cloud Functions Admin - - Pub/Sub Admin - - Storage Admin +5. Add the following roles to the service account: + - Service Account User + - Cloud Functions Admin + - Pub/Sub Admin + - Storage Admin - 7. Click on *Continue*. Then, click on *Create key*. Select *JSON* and then *Create*. Download the JSON file to a secure location in you computer. Click *Done*. - - 8. Navigate to *Storage* on the menu. Create a bucket and name it `lithops-data` or similar. Remember to update the corresponding Lithops config field with this bucket name. - -## Configuration +6. Click on *Continue*. Then, click on *Create key*. Select *JSON* and then *Create*. Download the JSON file to a secure location in you computer. Click *Done*. -9. Edit your lithops config file and add the following keys: +7. Edit your lithops config file and add the following keys: ```yaml lithops: diff --git a/docs/source/storage_config/ibm_cos.md b/docs/source/storage_config/ibm_cos.md index acacf2631..08a130767 100644 --- a/docs/source/storage_config/ibm_cos.md +++ b/docs/source/storage_config/ibm_cos.md @@ -1,14 +1,18 @@ # IBM Cloud Object Storage - Lithops with IBM COS as storage backend. - ## Installation -1. Create an [IBM Cloud Object Storage account](https://www.ibm.com/cloud/object-storage). +1. 
Install IBM Cloud backend dependencies: + +```bash +python3 -m pip install lithops[ibm] +``` -2. Crate a bucket in your desired region. Remember to update the corresponding Lithops config field with this bucket name. +2. Create an [IBM Cloud Object Storage account](https://www.ibm.com/cloud/object-storage). + +3. Crate a bucket in your desired region. Remember to update the corresponding Lithops config field with this bucket name. ## Configuration @@ -87,7 +91,7 @@ Lithops with IBM COS as storage backend. ## Lithops COS Endpoint configuration ### Using region -The easiest apporach is to let Lithops to choose the right endpoint by itself. To enable this, just configure Lithops with the region name of your `storage_bucket`, as follows: +The easiest approach is to let Lithops to choose the right endpoint by itself. To enable this, just configure Lithops with the region name of your `storage_bucket`, as follows: ```yaml ibm_cos: @@ -123,12 +127,13 @@ Alternative to using region, you can configure the public and private endpoints ### IBM Cloud Object Storage: -|Group|Key|Default|Mandatory|Additional info| -|---|---|---|---|---| -|ibm_cos | region | |yes | Region of your bucket. One of: `eu-gb`, `eu-de`, `us-south`, `us-east`, `br-sao`, `ca-tor`, `jp-tok`, `jp-osa`, `au-syd`. Lithops will use the region set under the `ibm` section if it is not set here| -|ibm_cos | api_key | |yes | API Key to your COS account. Not needed if using IAM API Key| -|ibm_cos | storage_bucket | | yes | The name of a bucket that exists in you account. This will be used by Lithops for intermediate data. You must provide HMAC Credentials if you want the bucket to be automatically created | -|ibm_cos | access_key_id | |no | HMAC Credentials. **Mandatory** if no api_key. Not needed if using IAM API Key| -|ibm_cos | secret_access_key | |no | HMAC Credentials. **Mandatory** if no api_key. Not needed if using IAM API Key| -|ibm_cos | endpoint | |no | Endpoint to your COS account. **Mandatory** if no region. Make sure to use the full path with 'https://' as prefix | -|ibm_cos | private_endpoint | |no | Private endpoint to your COS account. **Mandatory** if no region. Make sure to use the full path with 'https://' or http:// as prefix | +| Group | Key |Default|Mandatory| Additional info | +|---------|---------------------|---|---|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| ibm_cos | region | |yes | Region of your bucket. One of: `eu-gb`, `eu-de`, `us-south`, `us-east`, `br-sao`, `ca-tor`, `jp-tok`, `jp-osa`, `au-syd`. Lithops will use the region set under the `ibm` section if it is not set here | +| ibm_cos | api_key | |yes | API Key to your COS account. Not needed if using IAM API Key | +| ibm_cos | storage_bucket | | yes | The name of a bucket that exists in you account. This will be used by Lithops for intermediate data. You must provide HMAC Credentials if you want the bucket to be automatically created | +| ibm_cos | service_instance_id | |no | The service instance (CRN format) of your COS instance. **Mandatory** if no HMAC Credentials provided. | | | | | +| ibm_cos | access_key_id | |no | HMAC Credentials. **Mandatory** if no api_key. Not needed if using IAM API Key | +| ibm_cos | secret_access_key | |no | HMAC Credentials. **Mandatory** if no api_key. Not needed if using IAM API Key | +| ibm_cos | endpoint | |no | Endpoint to your COS account. 
**Mandatory** if no region. Make sure to use the full path with 'https://' as prefix | +| ibm_cos | private_endpoint | |no | Private endpoint to your COS account. **Mandatory** if no region. Make sure to use the full path with 'https://' or http:// as prefix | diff --git a/docs/source/storage_config/infinispan.md b/docs/source/storage_config/infinispan.md index fbe7c3a1d..af117f476 100644 --- a/docs/source/storage_config/infinispan.md +++ b/docs/source/storage_config/infinispan.md @@ -11,7 +11,7 @@ infinispan_hotrod (native binary) endpoint. ## Configuration -2. Edit your lithops config file and add the following keys: +Edit your lithops config file and add the following keys: ### Rest endpoint ```yaml @@ -30,7 +30,7 @@ infinispan_hotrod (native binary) endpoint. - ... ``` -### Summary of configuration keys for Infinispan: +#### Summary of configuration keys for Infinispan: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| @@ -61,7 +61,7 @@ To run this endpoint you need to compile and install the Infinispan python clien - ... ``` -### Summary of configuration keys for Infinispan_hotrod: +#### Summary of configuration keys for Infinispan_hotrod: |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| diff --git a/docs/source/storage_config/minio.md b/docs/source/storage_config/minio.md index ad468fa4a..1a91947e8 100644 --- a/docs/source/storage_config/minio.md +++ b/docs/source/storage_config/minio.md @@ -5,15 +5,19 @@ Lithops with MinIO storage backend. ## Installation -1. Install MinIO. +1. Install MinIO backend dependencies: -2. Create a new user. +```bash +python3 -m pip install lithops[minio] +``` + +2. Install MinIO. -3. Create a new bucket (e.g. `lithops-data`). Remember to update the corresponding Lithops config field with this bucket name. +3. Create a new user. ## Configuration -3. Edit your lithops config file and add the following keys: +Edit your lithops config file and add the following keys: ```yaml lithops: diff --git a/docs/source/storage_config/oracle_oss.md b/docs/source/storage_config/oracle_oss.md index 6ebdbd07d..be3843546 100644 --- a/docs/source/storage_config/oracle_oss.md +++ b/docs/source/storage_config/oracle_oss.md @@ -1,4 +1,4 @@ -# Oracle Object Storage (beta) +# Oracle Object Storage Lithops with Oracle Object Storage as storage backend. @@ -8,6 +8,10 @@ Lithops with Oracle Object Storage as storage backend. 1. Install Oracle Cloud backend dependencies: +```bash +python3 -m pip install lithops[oracle] +``` + ## Configuration 1. Navigate to the [API keys page](https://cloud.oracle.com/identity/domains/my-profile/api-keys) and generate and download a new API signing keys. Omit this step if you already generated and downloaded one key. When you generate a new Key, oracle provides a sample config file with most of the required parameters by lithops. Copy all the `key:value` pairs and configure lithops as follows: diff --git a/docs/source/storage_config/redis.md b/docs/source/storage_config/redis.md index b2deef999..859b186c9 100644 --- a/docs/source/storage_config/redis.md +++ b/docs/source/storage_config/redis.md @@ -5,14 +5,20 @@ Lithops with Redis as storage backend. ## Installation -1. Install Redis >= 5. +1. Install Redis backend dependencies: -2. Secure your installation by setting a password in the redis configuration file. +```bash +python3 -m pip install lithops[redis] +``` + +2. Install Redis >= 5. + +3. Secure your installation by setting a password in the redis configuration file. ## Configuration -3. 
Edit your lithops config file and add the following keys: +Edit your lithops config file and add the following keys: ```yaml lithops: @@ -29,10 +35,10 @@ Lithops with Redis as storage backend. |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| -|redis | host | localhost |no | The host ip adress where you installed the Redis server. | +|redis | host | localhost |no | The host ip address where you installed the Redis server. | |redis | port | 6379 |no | The port where the redis server is listening | |redis | username | None |no | The username (if any)| |redis | password | None |no | The password you set in the Redis configuration file (if any) | |redis | db | 0 |no | Number of database to use | -|redis | ssl | False |no | Activate ssl conection | +|redis | ssl | False |no | Activate ssl connection | |redis | ... | |no | All the parameters set in this lithops `redis` config section are directly passed to a [`reds.Redis()`](https://redis-py.readthedocs.io/en/stable/index.html#redis.Redis) instance, so you can set all the same parameters if necessary. | diff --git a/docs/source/storage_config/swift.md b/docs/source/storage_config/swift.md index 1585ba294..d9c8e793d 100644 --- a/docs/source/storage_config/swift.md +++ b/docs/source/storage_config/swift.md @@ -26,6 +26,8 @@ Lithops with OpenStack Swift as storage backend. user_id : project_id : password : + user_domain_name: + project_domain_name: ``` @@ -34,8 +36,10 @@ Lithops with OpenStack Swift as storage backend. |Group|Key|Default|Mandatory|Additional info| |---|---|---|---|---| |swift | storage_bucket | | yes | The name of a container that exists in you account. This will be used by Lithops for intermediate data. If set, this will overwrite the `storage_bucket` set in `lithops` section | -|swift | auth_url | |yes | The keystone endpoint for authenthication | +|swift | auth_url | |yes | The keystone endpoint for authentication | |swift | region | |yes | The region of your container | |swift | project_id | |yes | The Project ID | |swift | user_id | |yes | The user ID | |swift | password | |yes | The password | +|swift | user_domain_name | | no | The domain to which the user belongs, by default is set to "default" | +|swift | project_domain_name | | no | The domain associated with the project, by default is set to "default" | diff --git a/docs/source/supported_clouds.rst b/docs/source/supported_clouds.rst index 96eb1221e..fd30fb62c 100644 --- a/docs/source/supported_clouds.rst +++ b/docs/source/supported_clouds.rst @@ -114,9 +114,9 @@ Currently, Lithops for Kubernetes and On-Premise supports these backends: - `OpenStack Swift `_ * - `Knative `_ - `Redis `_ - * - `OpenWhisk `_ + * - `Singularity `_ - `Ceph `_ - * - + * - `OpenWhisk `_ - `MinIO `_ * - - `Infinispan `_ diff --git a/docs/source/testing.rst b/docs/source/testing.rst deleted file mode 100644 index 32f5cc304..000000000 --- a/docs/source/testing.rst +++ /dev/null @@ -1,82 +0,0 @@ -.. _testing: - -Testing Guidelines -================== - -Verify current Lithops status ------------------------------ - -Run ``lithops test`` to run an extensive inspection, which simulates your lithops installation on a variety of tests. - -* Get all available options by running ``lithops test --help``. - -* Get all available test functions and their parent group by running ``lithops test -t help``. - -* Run all test instances named ````, via ``lithops test -t ``. - Run a test from a specific group by prefixing the test name with group name, e.g. : ``lithops test -t .``. 
- Run multiple tests by separating them with a comma, e.g. ``lithops test -t ,``. - -* To get all available groups, run ``lithops test -g help``. - -* Run a single group of tests via ``lithops test -g ``. - Run multiple tests by separating them with a comma, e.g. ``lithops test -g ,``. - -* To stop the test procedure upon first encountering a failed test, add the -f flag, e.g. ``lithops test -f``. -* To remove datasets, uploaded during the test procedure, use the -r flag, ``lithops test -r``. - WARNING - do not use this flag on a github workflow, due to race condition issues. - -* Get a complete list of the available flags by running ``lithops test --help``. - -* A summarizing example: ``lithops test -t test_map,storage.test_cloudobject -g call_async -f``. - -Alternatively, you may run the tests via ``python3 -m lithops.tests.tests_main``, followed by aforementioned flags. - -Contribute ----------- - -Add a test to an existing test group -.................................... - -Locate the matching test group in ``lithops/tests`` (indicated by file name) and add your test function appropriately: -* Add your test function as a method inside the class inheriting from ``unittest.testcase``. - -* Use existing util functions from ``lithops/tests/util_func`` or add new ones to that package. - -* A test that's simultaneously testing any two of the following functions: {async, map, map_reduce} will be placed in the proper file by complying with the following hierarchy: map_reduce > map > async. - -Add a new test group -.................... - -Before adding a test function that aims to test an untested feature: - -* Create a new file in ``lithops/tests`` using the template file as your guide: - - * Create a new copy of the template file and name it "test_feature_name", thus automatically creating a new test group named "feature_name". - - * Figure out which rows are necessary for your new test group, by following documentation beside the rows, then, proceed to un-comment said rows. - -* Continue to add the function by adhering to the instructions in the clause above. - - -Additional information -...................... - -Whether you're adding a new function, or a new test group you'd be better off knowing the available resources at your disposal: - -* ``lithops/tests/util_func`` contains many functions, divided into categories, that you may find helpful. - -* ``lithops/tests`` contains a template_file called "test_template" which contains documentation regarding each common resource/import that may interest you. - -* Many examples utilizing said resources may be found across the "test_*" files of ``lithops/tests``. - -* For a variety of evaluation functions ("assert*") belonging to unittest, browse this [documentation page](https://docs.python.org/3/library/unittest.html). - -Examples -........ - -* Access the functions in your chosen backend storage class via "STORAGE", e.g. `STORAGE.put_object`. - -* Access your bucket via `STORAGE_CONFIG['bucket']`. - -* Pass on "CONFIG" to your function executor instance, to allow your test function to work with users that provided a path to the config file via a flag, e.g. `fexec = lithops.FunctionExecutor(config=CONFIG)`. 
- \ No newline at end of file diff --git a/docs/source/worker_granularity.rst b/docs/source/worker_granularity.rst index c7cc3be79..f15d66582 100644 --- a/docs/source/worker_granularity.rst +++ b/docs/source/worker_granularity.rst @@ -1,10 +1,119 @@ -Customize Worker Granularity -============================ +Worker Granularity +================== -By default, Lithops worker granularity is et to 1. That is, each function activations is run in a different runtime instance. -You can change this by using the ``chunksize`` parameter of the map and map_reduce calls. +Lithops was initially designed with Function as a Service (FaaS) platforms in mind. As such, its default worker +granularity is set to 1. This means that each function activation is executed within its own isolated +runtime instance. This design choice aligns with the nature of FaaS, where functions are typically short-lived +and stateless, making it well-suited for tasks like event-driven processing and serverless computing. -By default, Lithops uses one process in each runtime instance. -You can change this by using the ``worker_processes`` parameter in the configuration of your backend. -This parameter allows to start multiple processes within the same runtime instance. -This is convenient if your runtime have access to more than one CPU. +However, it's essential to understand the distinctions between FaaS and Container as a Service (CaaS) platforms. +In CaaS, you have greater flexibility in selecting the appropriate resources (CPU and memory) for each worker. +This flexibility allows you to fine-tune the execution environment to your specific requirements. In contrast +to FaaS, where the granularity is often fixed at 1, CaaS platforms enable you to adjust the number of CPUs +allocated to a container. + +When using Lithops on a CaaS platform, it can be more advantageous to increase the number of CPUs assigned to each +worker and subsequently adjust the granularity, rather than adhering to a 1:1 granularity ratio. This approach +significantly reduces cold start times. For instance, if you need to execute 100 tasks with a 1:1 granularity, +it would attempt to initiate all 100 containers simultaneously, potentially overloading the CaaS platform. However, +by configuring each worker to utilize 4 CPUs and updating Lithops accordingly, it would only need to start 25 containers. +This allows you to leverage the resource flexibility provided by CaaS without attempting to impose FaaS-like granularity. +Understanding these distinctions between FaaS and CaaS platforms is crucial for optimizing the performance and efficient +resource utilization of your Lithops-based applications. + +How to customize worker granularity? +------------------------------------ + +To customize the worker granularity, you first need to use the ``worker_processes`` parameter. +The ``worker_processes`` config parameter is employed to define the number of parallel sub-workers +initiated within a single worker. To fully utilize the allocated resources for your containers, +it is advisable to set this parameter to a value that matches or exceeds the number of CPUs in +your container or VM. + +You can provide the ``worker_processes`` parameter either in the Lithops config, under the +compute backend section: + +.. code:: yaml + + gcp_cloudrun: + .... + worker_processes : 4 + +or during a ``FunctionExecutor()`` instantiation: + +.. 
code:: python + + import lithops + + fexec = lithops.FunctionExecutor(worker_processes=4) + + +Alongside the ``worker_processes`` configuration parameter, it is possible to specify the ``chunksize`` parameter. +The ``chunksize`` parameter determines the number of functions allocated to each worker for processing. +By default, the ``chunksize`` parameter is automatically configured to match the ``worker_processes``. However, you have the +flexibility to customize it by setting it to a higher value. For example, if you have 200 tasks to execute and you set +``worker_processes`` to 4 and ``chunksize`` to 8, this configuration will result in the initiation of 25 workers (instead of 50). +Within each worker, 4 parallel sub-workers will start execution. Each worker will receive 8 tasks to process. The first 4 +tasks will begin immediately since there are 4 available sub-workers per worker. Meanwhile, the remaining 4 tasks will be +queued for execution as the initial tasks start to complete. + + +To customize the ``chunksize`` parameter, you have to edit your ``map()`` or ``map_reduce()`` calls and specify the desired value, for example: + +.. code:: python + + import lithops + + + def my_map_function(id, x): + print(f"I'm activation number {id}") + return x + 7 + + + if __name__ == "__main__": + fexec = lithops.FunctionExecutor(worker_processes=4) + fexec.map(my_map_function, range(200), chunksize=8) + print(fexec.get_result()) + + +Worker granularity in the standalone mode using VMs +--------------------------------------------------- + +In addition to supporting FaaS and CaaS platforms, Lithops also extends its compatibility to Virtual Machine (VM) backends, +such as EC2. Similar to CaaS environments, VMs offer a high degree of resource customization. When utilizing VMs with Lithops, +you gain the ability to tailor your VM instance with the appropriate resources, including CPU cores. In scenarios where +parallelism is crucial, it may be more efficient to configure a VM with a higher core count, such as 16 CPUs, rather than +attempting to manage and coordinate eight separate VM instances with single cores each. This approach simplifies resource +management and optimizes the performance of your Lithops-based applications running on VM backends. As with CaaS, +understanding the flexibility VMs provide is essential for effectively utilizing your compute resources. + +Unlike FaaS and CaaS platforms, when deploying Lithops on Virtual Machine backends, such as EC2, a master-worker architecture +is adopted. In this paradigm, the master node holds a work queue containing tasks for a specific job, and workers pick up and +process tasks one by one. In this sense, the chunksize parameter, which determines the number of functions allocated +to each worker for parallel processing, is not applicable in this context. Consequently, the worker granularity is inherently +determined by the number of worker processes in the VM setup. Adjusting the number of VM instances or the configuration of +each VM, such as the CPU core count, becomes crucial for optimizing performance and resource utilization in this master-worker +approach. + +In this scenario, specifying either the ``worker_instance_type`` or ``worker_processes`` config parameter is enough to achieve +the desired parallelism inside worker VMs. By default, Lithops determines the total number of worker processes based on the +number of CPUs in the specified instance type.
For example, an AWS EC2 instance of type ``t2.medium``, with 2 CPUs, would set +``worker_processes`` to 2. Additionally, users have the flexibility to manually adjust parallelism by setting a different +value for ``worker_processes``. Depending on the use case, it may be convenient to set more ``worker_processes`` than CPUs, +or fewer ``worker_processes`` than CPUs. For example, we can use a ``t2.medium`` instance type that has 2 CPUs, but +set ``worker_processes`` to 4: + +.. code:: python + + import lithops + + + def my_map_function(id, x): + print(f"I'm activation number {id}") + return x + 7 + + + if __name__ == "__main__": + fexec = lithops.FunctionExecutor(worker_instance_type='t2.medium', worker_processes=4) + fexec.map(my_map_function, range(50)) + print(fexec.get_result()) diff --git a/docs/user_guide.md b/docs/user_guide.md deleted file mode 100644 index 81c20af93..000000000 --- a/docs/user_guide.md +++ /dev/null @@ -1,41 +0,0 @@ -# User Guide - -1. [Lithops design overview](source/design.rst) - -1. [Supported Clouds](source/supported_clouds.rst) - -1. [Execution Modes](source/execution_modes.rst) - -1. High-level Compute and Storage APIs - - [Futures API](api_futures.md) - - [Multiprocessing API](source/api_multiprocessing.rst) - - [Storage API](api_storage.md) - - [Storage OS API](source/api_storage_os.rst) - -1. [Lithops Monitoring](source/monitoring.rst) - -1. [Functions design and parameters](source/functions.md) - - [Reserved parameters](source/functions.md#reserved-parameters) - - [Parameters format for a *single* call](source/functions.md#parameters-in-the-call_async-method) - - [Parameters format for a *map* call](source/functions.md#parameters-in-the-map-and-map_reduce-methods) - - [Common parameters across functions](source/functions.md#common-parameters-across-functions-invocations) - -1. [Distributed shared objects across function activations](source/dso.rst) - -1. [Distributed Scikit-learn / Joblib](source/sklearn_joblib.rst) - -1. [Lithops for big data analytics](source/data_processing.rst) - - [Processing data from a cloud object store](source/data_processing.rst#processing-data-from-a-cloud-object-storage-service) - - [Processing data from public URLs](source/data_processing.rst#processing-data-from-public-urls) - - [Processing data from localhost files](source/data_processing.rst#processing-data-from-localhost-files) - -1. [Run Lithops on Jupyter notebooks](../examples/hello_world.ipynb) - -1. [Execute Airflow workflows using Lithops](https://github.com/lithops-cloud/airflow-plugin) - -1. [Lithops end-to-end Applications](https://github.com/lithops-cloud/applications) - -1. [Build and manage custom runtimes to run the functions](../runtime/) - - [Dynamic runtime customization](source/customized_runtime.rst) - -1.
[Command Line Tool](source/cli.rst) \ No newline at end of file diff --git a/examples/call_async_cython.py b/examples/call_async_cython.py index 48e27e95a..1fc4a77a0 100755 --- a/examples/call_async_cython.py +++ b/examples/call_async_cython.py @@ -1,10 +1,13 @@ """ -Simple Lithops example using one single function invocation -with a cythonized function located in function.so +Simple Lithops example using one single function invocation with a +cythonized function located in function.cpython-39-x86_64-linux-gnu.so Commands to compile the function.py into function.so (Ubuntu): cython3 -3 --embed -X always_allow_keywords=true -o function.c function.py gcc -shared -o function.so -fPIC -I /usr/include/python3.9 function.c + +To make this example work, you have to delete function.py and rename +function.cpython-39-x86_64-linux-gnu.so -> function.so """ import lithops from function import my_c_function diff --git a/examples/failed_futures.py b/examples/failed_futures.py new file mode 100644 index 000000000..73a89441d --- /dev/null +++ b/examples/failed_futures.py @@ -0,0 +1,37 @@ +""" +Simple Lithops example using the map method. +In this example the map() method will launch one +map function for each entry in 'iterdata'. Finally +it will print the results for each invocation with +fexec.get_result() +""" +import lithops +import time + + +def my_map_function(id, x): + print(f"I'm activation number {id}") + time.sleep(2) + if id in [2, 4]: + raise MemoryError() + return x + + +if __name__ == "__main__": + iterdata = ["a", "b", "c", "d", "e"] + + fexec = lithops.FunctionExecutor(log_level='DEBUG') + futures = fexec.map(my_map_function, iterdata) + return_vals = fexec.get_result(fs=futures, throw_except=False) + + failed_callids = [int(f.call_id) for f in futures if f.error] + + if failed_callids: + new_iterdata = [iterdata[i] for i in failed_callids] + futures = fexec.map(my_map_function, new_iterdata) + new_return_vals = fexec.get_result(fs=futures, throw_except=False) + + for i, failed_callid in enumerate(failed_callids): + return_vals[failed_callid] = new_return_vals[i] + + print(return_vals) diff --git a/examples/function.py b/examples/function.py index 0d1401f8a..d70886e42 100644 --- a/examples/function.py +++ b/examples/function.py @@ -5,5 +5,7 @@ cython3 -3 --embed -X always_allow_keywords=true -o function.c function.py gcc -shared -o function.so -fPIC -I /usr/include/python3.9 function.c """ + + def my_c_function(x): return x + 7 diff --git a/examples/map.py b/examples/map.py index 68df993d0..c93eab821 100644 --- a/examples/map.py +++ b/examples/map.py @@ -6,16 +6,18 @@ fexec.get_result() """ import lithops +import time def my_map_function(id, x): print(f"I'm activation number {id}") + time.sleep(5) return x + 7 if __name__ == "__main__": iterdata = [1, 2, 3, 4] fexec = lithops.FunctionExecutor() - fexec.map(my_map_function, iterdata) + fexec.map(my_map_function, range(2)) + fexec.map(my_map_function, range(6)) print(fexec.get_result()) - fexec.clean() diff --git a/examples/multiprocessing/pool_args.py b/examples/multiprocessing/pool_args.py index 5aa5a39d2..a112cc1e0 100644 --- a/examples/multiprocessing/pool_args.py +++ b/examples/multiprocessing/pool_args.py @@ -51,7 +51,7 @@ def single_arg(arg): except Exception as e: print(e) - # Pool.apply con accept positional arguments, key word arguments or both + # Pool.apply can accept positional arguments, keyword arguments or both pool.apply(multiple_args, args=(1, 2, 3)) pool.apply(multiple_args, kwds={'arg1': 1, 'arg2': 2, 'arg3': 3})
pool.apply(multiple_args, args=(1, 2), kwds={'arg3': 3}) diff --git a/examples/retry.py b/examples/retry.py new file mode 100644 index 000000000..c6aac1e43 --- /dev/null +++ b/examples/retry.py @@ -0,0 +1,46 @@ +import lithops +from lithops.storage import Storage +from lithops import RetryingFunctionExecutor + +# Dictionary of known failures: how many times each input should fail before succeeding +# This must be available to each function at runtime, so hardcoded or passed in as data +FAILURE_MAP = { + 0: 1, # fail once + 1: 2, # fail twice + 2: 0, # succeed immediately + 3: 3, # fail three times (requires at least retries=3) +} + +bucket = 'storage' + + +def my_retry_function(x): + storage = Storage() + + key = f"retries-demo/input-{x}" + try: + count = int(storage.get_object(bucket, key)) + except Exception: + count = 0 + + print(f"[Input {x}] Attempt #{count + 1}") + + if count < FAILURE_MAP.get(x, 0): + # Store updated count before failing + storage.put_object(bucket, key, str(count + 1)) + raise RuntimeError(f"Deliberate failure for input {x}, attempt {count + 1}") + + return x + 100 + + +if __name__ == "__main__": + iterdata = [0, 1, 2, 3] + + with lithops.FunctionExecutor() as fexec: + with RetryingFunctionExecutor(fexec) as retry_exec: + futures = retry_exec.map(my_retry_function, iterdata) + done, not_done = retry_exec.wait(futures, throw_except=False) + outputs = set(f.result() for f in done) + + Storage().delete_objects(bucket, [f"retries-demo/input-{x}" for x in iterdata]) + print("Final results:", outputs) diff --git a/examples/sklearn_job.py b/examples/sklearn_job_1.py similarity index 100% rename from examples/sklearn_job.py rename to examples/sklearn_job_1.py diff --git a/examples/sklearn_job_2.py b/examples/sklearn_job_2.py new file mode 100644 index 000000000..d7aa5f13f --- /dev/null +++ b/examples/sklearn_job_2.py @@ -0,0 +1,25 @@ +import joblib +from lithops.util.joblib import register_lithops +from sklearn.datasets import load_digits +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import GridSearchCV + +digits = load_digits() +param_grid = { + "n_estimators": [100, 50, 25], +} +model = RandomForestClassifier() +search = GridSearchCV(model, param_grid, cv=2, refit=True) + + +register_lithops() + +with joblib.parallel_backend("lithops"): + search.fit( + digits.data, + digits.target, + ) +print("Best score: %0.3f" % search.best_score_) +# print("Best parameters set:") +# # best_parameters = search.best_estimator_.get_params() +# # print(best_parameters) diff --git a/examples/sklearn_job_3.py b/examples/sklearn_job_3.py new file mode 100644 index 000000000..f35238080 --- /dev/null +++ b/examples/sklearn_job_3.py @@ -0,0 +1,38 @@ +import joblib +import pandas as pd +from lithops.util.joblib import register_lithops +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import GridSearchCV +from sklearn.pipeline import FunctionTransformer, Pipeline + +data = pd.DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "features": [[1, 2, 3, 4, 5] for _ in range(6)]} +) + +target = pd.Series([0, 1, 0, 1, 0, 1]) +param_grid = { + "classifier__n_estimators": [100, 50, 25], +} + +# Expands nested columns +format_transformer = FunctionTransformer(lambda df: df.features.apply(pd.Series)) + +pipeline = Pipeline( + [ + ("format", format_transformer), + ("classifier", RandomForestClassifier()), + ] +) +search = GridSearchCV(pipeline, param_grid, cv=2, refit=True) + + +register_lithops() +with joblib.parallel_backend("lithops"): + 
search.fit( + data, + target, + ) + +print("Best CV score: %0.3f" % search.best_score_) +print("Best parameters set:") +print(search.best_params_) diff --git a/lithops/__init__.py b/lithops/__init__.py index ae906e449..80383ee9f 100644 --- a/lithops/__init__.py +++ b/lithops/__init__.py @@ -2,6 +2,19 @@ from lithops.executors import LocalhostExecutor from lithops.executors import ServerlessExecutor from lithops.executors import StandaloneExecutor +from lithops.retries import RetryingFunctionExecutor from lithops.storage import Storage from lithops.version import __version__ from lithops.wait import wait, get_result + +__all__ = [ + 'FunctionExecutor', + 'LocalhostExecutor', + 'ServerlessExecutor', + 'StandaloneExecutor', + 'RetryingFunctionExecutor', + 'Storage', + 'wait', + 'get_result', + '__version__', +] diff --git a/lithops/config.py b/lithops/config.py index 8dba6b8bb..50e768803 100644 --- a/lithops/config.py +++ b/lithops/config.py @@ -33,8 +33,6 @@ os.makedirs(c.LOGS_DIR, exist_ok=True) os.makedirs(c.CLEANER_DIR, exist_ok=True) -CPU_COUNT = os.cpu_count() - def load_yaml_config(config_filename): import yaml @@ -174,20 +172,8 @@ def default_config(config_file=None, config_data=None, config_overwrite={}, load if mode == c.LOCALHOST: logger.debug("Loading compute backend module: localhost") - - config_data[backend]['max_workers'] = 1 - - if 'execution_timeout' not in config_data['lithops']: - config_data['lithops']['execution_timeout'] = c.EXECUTION_TIMEOUT_LOCALHOST_DEFAULT - - if 'storage' not in config_data['lithops']: - config_data['lithops']['storage'] = c.LOCALHOST - - if 'worker_processes' not in config_data[c.LOCALHOST]: - config_data[backend]['worker_processes'] = CPU_COUNT - - if 'runtime' not in config_data[c.LOCALHOST]: - config_data[backend]['runtime'] = c.LOCALHOST_RUNTIME_DEFAULT + cb_config = importlib.import_module('lithops.localhost.config') + cb_config.load_config(config_data) elif mode == c.SERVERLESS: logger.debug(f"Loading Serverless backend module: {backend}") @@ -198,22 +184,20 @@ def default_config(config_file=None, config_data=None, config_overwrite={}, load logger.debug(f"Loading Standalone backend module: {backend}") sb_config = importlib.import_module(f'lithops.standalone.backends.{backend}.config') sb_config.load_config(config_data) - - if 'monitoring' not in config_data['lithops']: - config_data['lithops']['monitoring'] = c.MONITORING_DEFAULT - - if 'execution_timeout' not in config_data['lithops']: - config_data['lithops']['execution_timeout'] = c.EXECUTION_TIMEOUT_DEFAULT + config_data['lithops']['chunksize'] = 0 if 'chunksize' not in config_data['lithops']: config_data['lithops']['chunksize'] = config_data[backend]['worker_processes'] if load_storage_config: config_data = default_storage_config(config_data=config_data) - if config_data['lithops']['storage'] == c.LOCALHOST \ - and backend != c.LOCALHOST: + if config_data['lithops']['storage'] == c.LOCALHOST and backend != c.LOCALHOST: raise Exception(f'Localhost storage backend cannot be used with {backend}') + for key in c.LITHOPS_DEFAULT_CONFIG_KEYS: + if key not in config_data['lithops']: + config_data['lithops'][key] = c.LITHOPS_DEFAULT_CONFIG_KEYS[key] + return config_data @@ -241,13 +225,14 @@ def default_storage_config(config_file=None, config_data=None, backend=None): def extract_storage_config(config): s_config = {} + s_config['monitoring_interval'] = config['lithops'].get( + 'monitoring_interval', c.LITHOPS_DEFAULT_CONFIG_KEYS['monitoring_interval'] + ) backend = config['lithops']['storage'] 
s_config['backend'] = backend s_config[backend] = config[backend] if backend in config and config[backend] else {} s_config[backend]['user_agent'] = f'lithops/{__version__}' - s_config['bucket'] = s_config[backend].get('storage_bucket') - return s_config diff --git a/lithops/constants.py b/lithops/constants.py index 63507e393..84303102f 100644 --- a/lithops/constants.py +++ b/lithops/constants.py @@ -15,7 +15,6 @@ # import os -import sys import tempfile LOGGER_LEVEL = 'info' @@ -24,6 +23,8 @@ LOGGER_FORMAT_SHORT = "[%(levelname)s] %(filename)s:%(lineno)s -- %(message)s" LOGGER_LEVEL_CHOICES = ["debug", "info", "warning", "error", "critical"] +CPU_COUNT = os.cpu_count() + STORAGE_CLI_MSG = '{} client created' COMPUTE_CLI_MSG = '{} client created' @@ -33,41 +34,15 @@ MODE_DEFAULT = SERVERLESS -MONITORING_DEFAULT = 'storage' -MONITORING_INTERVAL = 2 - -SERVERLESS_BACKEND_DEFAULT = 'ibm_cf' -STANDALONE_BACKEND_DEFAULT = 'ibm_vpc' -STORAGE_BACKEND_DEFAULT = 'ibm_cos' +SERVERLESS_BACKEND_DEFAULT = 'aws_lambda' +STANDALONE_BACKEND_DEFAULT = 'aws_ec2' +STORAGE_BACKEND_DEFAULT = 'aws_s3' JOBS_PREFIX = "lithops.jobs" TEMP_PREFIX = "lithops.jobs/tmp" LOGS_PREFIX = "lithops.logs" RUNTIMES_PREFIX = "lithops.runtimes" -EXECUTION_TIMEOUT_DEFAULT = 1800 -EXECUTION_TIMEOUT_LOCALHOST_DEFAULT = 3600 - -LOCALHOST_RUNTIME_DEFAULT = os.path.basename(sys.executable) - -SA_INSTALL_DIR = '/opt/lithops' -SA_TMP_DIR = '/tmp/lithops-root' -SA_LOG_FILE = f'{SA_TMP_DIR}/service.log' -SA_SERVICE_PORT = 8080 -SA_CONFIG_FILE = os.path.join(SA_INSTALL_DIR, 'config') -SA_DATA_FILE = os.path.join(SA_INSTALL_DIR, 'access.data') -SA_IMAGE_NAME_DEFAULT = 'lithops-worker-default' - -SA_DEFAULT_CONFIG_KEYS = { - 'runtime': 'python3', - 'exec_mode': 'consume', - 'start_timeout': 300, - 'pull_runtime': False, - 'auto_dismantle': True, - 'soft_dismantle_timeout': 300, - 'hard_dismantle_timeout': 3600 -} - MAX_AGG_DATA_SIZE = 4 # 4MiB WORKER_PROCESSES_DEFAULT = 1 @@ -78,9 +53,10 @@ JOBS_DIR = os.path.join(LITHOPS_TEMP_DIR, 'jobs') LOGS_DIR = os.path.join(LITHOPS_TEMP_DIR, 'logs') MODULES_DIR = os.path.join(LITHOPS_TEMP_DIR, 'modules') -CUSTOM_RUNTIME_DIR = os.path.join(LITHOPS_TEMP_DIR, 'custom_runtime') +CUSTOM_RUNTIME_DIR = os.path.join(LITHOPS_TEMP_DIR, 'custom-runtime') -RN_LOG_FILE = os.path.join(LITHOPS_TEMP_DIR, 'runner.log') +RN_LOG_FILE = os.path.join(LITHOPS_TEMP_DIR, 'localhost-runner.log') +SV_LOG_FILE = os.path.join(LITHOPS_TEMP_DIR, 'localhost-service.log') FN_LOG_FILE = os.path.join(LITHOPS_TEMP_DIR, 'functions.log') CLEANER_DIR = os.path.join(LITHOPS_TEMP_DIR, 'cleaner') @@ -93,6 +69,33 @@ CONFIG_FILE = os.path.join(CONFIG_DIR, 'config') CONFIG_FILE_GLOBAL = os.path.join("/etc", "lithops", "config") +LITHOPS_DEFAULT_CONFIG_KEYS = { + 'monitoring': 'storage', + 'monitoring_interval': 2, + 'execution_timeout': 1800 +} + +SA_INSTALL_DIR = '/opt/lithops' +SA_SETUP_LOG_FILE = f'{SA_INSTALL_DIR}/setup.log' +SA_SETUP_DONE_FILE = f'{SA_INSTALL_DIR}/setup-done.flag' +SA_MASTER_LOG_FILE = f'{LITHOPS_TEMP_DIR}/master-service.log' +SA_WORKER_LOG_FILE = f'{LITHOPS_TEMP_DIR}/worker-service.log' +SA_MASTER_SERVICE_PORT = 8080 +SA_WORKER_SERVICE_PORT = 8081 +SA_CONFIG_FILE = os.path.join(SA_INSTALL_DIR, 'config') +SA_MASTER_DATA_FILE = os.path.join(SA_INSTALL_DIR, 'master.data') +SA_WORKER_DATA_FILE = os.path.join(SA_INSTALL_DIR, 'worker.data') + +SA_DEFAULT_CONFIG_KEYS = { + 'runtime': 'python3', + 'exec_mode': 'reuse', + 'use_gpu': False, + 'start_timeout': 300, + 'auto_dismantle': True, + 'soft_dismantle_timeout': 300, + 
'hard_dismantle_timeout': 3600 +} + SERVERLESS_BACKENDS = [ 'ibm_cf', 'code_engine', @@ -107,7 +110,8 @@ 'azure_containers', 'aliyun_fc', 'oracle_f', - 'k8s' + 'k8s', + 'singularity' ] STANDALONE_BACKENDS = [ @@ -116,27 +120,3 @@ 'azure_vms', 'vm' ] - -FAAS_BACKENDS = [ - 'ibm_cf', - 'knative', - 'openwhisk', - 'aws_lambda', - 'gcp_cloudrun', - 'gcp_functions', - 'cloudrun', - 'azure_functions', - 'azure_containers', - 'aliyun_fc', - 'oracle_f' -] - -BATCH_BACKENDS = [ - 'ibm_vpc', - 'aws_ec2', - 'azure_vms', - 'aws_batch', - 'k8s', - 'code_engine' - 'vm' -] diff --git a/lithops/executors.py b/lithops/executors.py index c1c62b84f..2cf28f9cf 100644 --- a/lithops/executors.py +++ b/lithops/executors.py @@ -30,18 +30,18 @@ from lithops.future import ResponseFuture from lithops.invokers import create_invoker from lithops.storage import InternalStorage -from lithops.wait import wait, ALL_COMPLETED, THREADPOOL_SIZE, WAIT_DUR_SEC, ALWAYS +from lithops.wait import wait, ALL_COMPLETED, THREADPOOL_SIZE, ALWAYS from lithops.job import create_map_job, create_reduce_job from lithops.config import default_config, \ extract_localhost_config, extract_standalone_config, \ extract_serverless_config, get_log_info, extract_storage_config from lithops.constants import LOCALHOST, CLEANER_DIR, \ SERVERLESS, STANDALONE -from lithops.utils import is_notebook, setup_lithops_logger, \ +from lithops.utils import setup_lithops_logger, \ is_lithops_worker, create_executor_id, create_futures_list -from lithops.localhost.localhost import LocalhostHandler -from lithops.standalone.standalone import StandaloneHandler -from lithops.serverless.serverless import ServerlessHandler +from lithops.localhost import LocalhostHandlerV1, LocalhostHandlerV2 +from lithops.standalone import StandaloneHandler +from lithops.serverless import ServerlessHandler from lithops.storage.utils import create_job_key, CloudObject from lithops.monitor import JobMonitor from lithops.utils import FuturesList @@ -61,7 +61,8 @@ class FunctionExecutor: :param backend: Compute backend to run the functions :param storage: Storage backend to store Lithops data :param monitoring: Monitoring system implementation. One of: storage, rabbitmq - :param log_level: Log level printing (INFO, DEBUG, ...). Set it to None to hide all logs. If this is param is set, all logging params in config are disabled + :param log_level: Log level printing (INFO, DEBUG, ...). Set it to None to hide all logs. 
+ If this is param is set, all logging params in config are disabled :param kwargs: Any parameter that can be set in the compute backend section of the config file, can be set here """ @@ -107,7 +108,7 @@ def __init__( self.data_cleaner = self.config['lithops'].get('data_cleaner', True) if self.data_cleaner and not self.is_lithops_worker: - atexit.register(self.clean, clean_cloudobjects=False, clean_fn=True) + atexit.register(self.clean, clean_cloudobjects=False, clean_fn=True, on_exit=True) storage_config = extract_storage_config(self.config) self.internal_storage = InternalStorage(storage_config) @@ -118,7 +119,10 @@ def __init__( if self.mode == LOCALHOST: localhost_config = extract_localhost_config(self.config) - self.compute_handler = LocalhostHandler(localhost_config) + if localhost_config.get('version', 2) == 1: + self.compute_handler = LocalhostHandlerV1(localhost_config) + else: + self.compute_handler = LocalhostHandlerV2(localhost_config) elif self.mode == SERVERLESS: serverless_config = extract_serverless_config(self.config) self.compute_handler = ServerlessHandler(serverless_config, self.internal_storage) @@ -126,6 +130,8 @@ def __init__( standalone_config = extract_standalone_config(self.config) self.compute_handler = StandaloneHandler(standalone_config) + self.config['lithops']['backend_type'] = self.compute_handler.get_backend_type() + # Create the monitoring system self.job_monitor = JobMonitor( executor_id=self.executor_id, @@ -159,7 +165,7 @@ def __exit__(self, exc_type, exc_value, traceback): def _create_job_id(self, call_type): job_id = str(self.total_jobs).zfill(3) self.total_jobs += 1 - return '{}{}'.format(call_type, job_id) + return f'{call_type}{job_id}' def call_async( self, @@ -231,11 +237,15 @@ def map( :param extra_args: Additional arguments to pass to each map_function activation :param extra_env: Additional environment variables for function environment :param runtime_memory: Memory (in MB) to use to run the functions - :param obj_chunk_size: Used for data processing. Chunk size to split each object in bytes. Must be >= 1MiB. 'None' for processing the whole file in one function activation - :param obj_chunk_number: Used for data processing. Number of chunks to split each object. 'None' for processing the whole file in one function activation. chunk_n has prevalence over chunk_size if both parameters are set - :param obj_newline: new line character for keeping line integrity of partitions. 'None' for disabling line integrity logic and get partitions of the exact same size in the functions + :param obj_chunk_size: Used for data processing. Chunk size to split each object in bytes. + Must be >= 1MiB. 'None' for processing the whole file in one function activation + :param obj_chunk_number: Used for data processing. Number of chunks to split each object. + 'None' for processing the whole file in one function activation. chunk_n has prevalence over chunk_size if both parameters are set + :param obj_newline: new line character for keeping line integrity of partitions. + 'None' for disabling line integrity logic and get partitions of the exact same size in the functions :param timeout: Max time per function activation (seconds) - :param include_modules: Explicitly pickle these dependencies. All required dependencies are pickled if default empty list. No one dependency is pickled if it is explicitly set to None + :param include_modules: Explicitly pickle these dependencies. All required dependencies are pickled if default empty list. 
+ No one dependency is pickled if it is explicitly set to None :param exclude_modules: Explicitly keep these modules from pickled dependencies. It is not taken into account if you set include_modules. :return: A list with size `len(map_iterdata)` of futures for each job (Futures are also internally stored by Lithops). @@ -310,7 +320,8 @@ def map_reduce( :param timeout: Time that the functions have to complete their execution before raising a timeout :param obj_chunk_size: the size of the data chunks to split each object. 'None' for processing the whole file in one function activation :param obj_chunk_number: Number of chunks to split each object. 'None' for processing the whole file in one function activation - :param obj_newline: New line character for keeping line integrity of partitions. 'None' for disabling line integrity logic and get partitions of the exact same size in the functions + :param obj_newline: New line character for keeping line integrity of partitions. + 'None' for disabling line integrity logic and get partitions of the exact same size in the functions :param obj_reduce_by_key: Set one reducer per object after running the partitioner. By default there is one reducer for all the objects :param spawn_reducer: Percentage of done map functions before spawning the reduce function :param include_modules: Explicitly pickle these dependencies. @@ -379,8 +390,7 @@ def map_reduce( reduce_futures = self.invoker.run_job(reduce_job) self.futures.extend(reduce_futures) - for f in map_futures: - f._produce_output = False + [f._set_mapreduce() for f in map_futures] return create_futures_list(map_futures + reduce_futures, self) @@ -392,7 +402,7 @@ def wait( download_results: Optional[bool] = False, timeout: Optional[int] = None, threadpool_size: Optional[int] = THREADPOOL_SIZE, - wait_dur_sec: Optional[int] = WAIT_DUR_SEC, + wait_dur_sec: Optional[int] = None, show_progressbar: Optional[bool] = True ) -> Tuple[FuturesList, FuturesList]: """ @@ -409,16 +419,17 @@ def wait( :param download_results: Download results. Default false (Only get statuses) :param timeout: Timeout of waiting for results :param threadpool_size: Number of threads to use. Default 64 - :param wait_dur_sec: Time interval between each check + :param wait_dur_sec: Time interval between each check. Default 1 second :param show_progressbar: whether or not to show the progress bar. - :return: `(fs_done, fs_notdone)` where `fs_done` is a list of futures that have completed and `fs_notdone` is a list of futures that have not completed. + :return: `(fs_done, fs_notdone)` where `fs_done` is a list of futures that have + completed and `fs_notdone` is a list of futures that have not completed. 
""" futures = fs or self.futures - if type(futures) != list and type(futures) != FuturesList: + + if type(futures) not in [list, FuturesList]: futures = [futures] - # Start waiting for results try: wait(fs=futures, internal_storage=self.internal_storage, @@ -429,7 +440,8 @@ def wait( timeout=timeout, threadpool_size=threadpool_size, wait_dur_sec=wait_dur_sec, - show_progressbar=show_progressbar) + show_progressbar=show_progressbar, + futures_from_executor_wait=False if fs else True) if self.data_cleaner and return_when == ALL_COMPLETED: present_jobs = {f.job_key for f in futures} @@ -438,12 +450,11 @@ def wait( except (KeyboardInterrupt, Exception) as e: self.invoker.stop() - self.job_monitor.stop() - if not fs and is_notebook(): - del self.futures[len(self.futures) - len(futures):] + self.job_monitor.remove(futures) + [f._set_exception() for f in futures] if self.data_cleaner: present_jobs = {f.job_key for f in futures} - self.compute_handler.clear(present_jobs) + self.compute_handler.clear(present_jobs, exception=e) self.clean(clean_cloudobjects=False, force=True) raise e @@ -462,7 +473,7 @@ def get_result( throw_except: Optional[bool] = True, timeout: Optional[int] = None, threadpool_size: Optional[int] = THREADPOOL_SIZE, - wait_dur_sec: Optional[int] = WAIT_DUR_SEC, + wait_dur_sec: Optional[int] = None, show_progressbar: Optional[bool] = True ): """ @@ -472,11 +483,19 @@ def get_result( :param throw_except: Reraise exception if call raised. Default True. :param timeout: Timeout for waiting for results. :param threadpool_size: Number of threads to use. Default 128 - :param wait_dur_sec: Time interval between each check. + :param wait_dur_sec: Time interval between each check. Default 1 second :param show_progressbar: whether or not to show the progress bar. :return: The result of the future/s """ + pending_to_read = len(fs) if fs else len( + [f for f in self.futures if not f._read and not f.futures]) + + logger.info( + (f'ExecutorID {self.executor_id} - Getting results from ' + f'{pending_to_read} function activations') + ) + fs_done, _ = self.wait( fs=fs, throw_except=throw_except, @@ -488,14 +507,11 @@ def get_result( ) result = [] - fs_done = [f for f in fs_done if not f.futures and f._produce_output] - for f in fs_done: - if fs: - # Process futures provided by the user + for f in [f for f in fs_done if not f.futures and f._produce_output]: + if fs: # Process futures provided by the user result.append(f.result(throw_except=throw_except, internal_storage=self.internal_storage)) - elif not fs and not f._read: - # Process internally stored futures + elif not fs and not f._read: # Process internally stored futures result.append(f.result(throw_except=throw_except, internal_storage=self.internal_storage)) f._read = True @@ -510,7 +526,8 @@ def get_result( def plot( self, fs: Optional[Union[ResponseFuture, List[ResponseFuture], FuturesList]] = None, - dst: Optional[str] = None + dst: Optional[str] = None, + figsize: Optional[tuple] = (10, 6) ): """ Creates timeline and histogram of the current execution in dst_dir. 
@@ -529,13 +546,18 @@ def plot( logger.debug(f'ExecutorID {self.executor_id} - No futures ready to plot') return - logging.getLogger('matplotlib').setLevel(logging.WARNING) - from lithops.plots import create_timeline, create_histogram + try: + logging.getLogger('matplotlib').setLevel(logging.WARNING) + from lithops.plots import create_timeline, create_histogram + except ImportError: + raise ModuleNotFoundError( + "Please install 'pip3 install lithops[plotting]' for " + "making use of the plot() method") logger.info(f'ExecutorID {self.executor_id} - Creating execution plots') - create_timeline(ftrs_to_plot, dst) - create_histogram(ftrs_to_plot, dst) + create_timeline(ftrs_to_plot, dst, figsize) + create_histogram(ftrs_to_plot, dst, figsize) def clean( self, @@ -543,7 +565,8 @@ def clean( cs: Optional[List[CloudObject]] = None, clean_cloudobjects: Optional[bool] = True, clean_fn: Optional[bool] = False, - force: Optional[bool] = False + force: Optional[bool] = False, + on_exit: Optional[bool] = False ): """ Deletes all the temp files from storage. These files include the function, @@ -555,6 +578,7 @@ def clean( :param clean_cloudobjects: Delete all cloudobjects created with this executor :param clean_fn: Delete cached functions in this executor :param force: Clean all future objects even if they have not benn completed + :param on_exit: do not print logs on exit """ global CLEANER_PROCESS @@ -584,13 +608,14 @@ def save_data_to_clean(data): save_data_to_clean(data) futures = fs or self.futures - futures = [futures] if type(futures) != list else futures + futures = [futures] if type(futures) is not list else futures present_jobs = {create_job_key(f.executor_id, f.job_id) for f in futures if (f.executor_id.count('-') == 1 and f.done) or force} jobs_to_clean = present_jobs - self.cleaned_jobs if jobs_to_clean: - logger.info(f'ExecutorID {self.executor_id} - Cleaning temporary data') + if not on_exit: + logger.info(f'ExecutorID {self.executor_id} - Cleaning temporary data') data = { 'jobs_to_clean': jobs_to_clean, 'clean_cloudobjects': clean_cloudobjects, @@ -611,8 +636,13 @@ def job_summary(self, cloud_objects_n: Optional[int] = 0): :param cloud_objects_n: number of cloud object used in COS, declared by user.
""" - import pandas as pd - import numpy as np + try: + import pandas as pd + import numpy as np + except ImportError: + raise ModuleNotFoundError( + "Please install 'pip3 install lithops[plotting]' for " + "making use of the job_summary() method") def init(): headers = ['Job_ID', 'Function', 'Invocations', 'Memory(MB)', 'AvgRuntime', 'Cost', 'CloudObjects'] @@ -646,7 +676,7 @@ def get_object_num(): init() futures = self.futures - if type(futures) != list: + if type(futures) is not list: futures = [futures] memory = [] diff --git a/lithops/future.py b/lithops/future.py index c11666b37..cc465c445 100644 --- a/lithops/future.py +++ b/lithops/future.py @@ -27,8 +27,11 @@ from six import reraise from lithops.storage import InternalStorage -from lithops.storage.utils import check_storage_path, get_storage_path,\ +from lithops.storage.utils import ( + check_storage_path, + get_storage_path, create_job_key +) from lithops.constants import FN_LOG_FILE, LOGS_DIR logger = logging.getLogger(__name__) @@ -45,12 +48,9 @@ class State(): Running = "Running" Ready = "Ready" Success = "Success" - Futures = "Futures" Error = "Error" Done = "Done" - - GET_RESULT_SLEEP_SECS = 1 - GET_RESULT_MAX_RETRIES = 10 + Unknown = "Unknown" def __init__(self, call_id, job, job_metadata, storage_config): self.call_id = call_id @@ -108,34 +108,26 @@ def running(self): @property def ready(self): - return self._state in [ResponseFuture.State.Ready, - ResponseFuture.State.Futures] + return self._state == ResponseFuture.State.Ready @property def error(self): return self._state == ResponseFuture.State.Error - @property - def futures(self): - """ - The response of a call was a FutureResponse instance. - It has to wait to the new invocation output. - """ - return self._state == ResponseFuture.State.Futures - @property def success(self): - if self._state in [ResponseFuture.State.Success, - ResponseFuture.State.Error]: - return True - return False + return self._state in [ResponseFuture.State.Success, + ResponseFuture.State.Error] @property def done(self): - if self._state in [ResponseFuture.State.Done, - ResponseFuture.State.Error]: - return True - return False + return self._state in [ResponseFuture.State.Done, + ResponseFuture.State.Error, + ResponseFuture.State.Unknown] + + @property + def futures(self): + return self._new_futures is not None def _set_invoked(self): """ Set the future as invoked""" @@ -147,19 +139,34 @@ def _set_running(self, call_status): self.activation_id = self._call_status['activation_id'] self._state = ResponseFuture.State.Running + def _set_exception(self): + """ Set the future as error""" + self._read = True + self._host_status_done_tstamp = time.time() + if not self.done: + self._state = ResponseFuture.State.Unknown + def _set_ready(self, call_status): - """ Set the future as running""" + """ Set the future as ready""" self._call_status = call_status self._host_status_done_tstamp = time.time() self._state = ResponseFuture.State.Ready def _set_futures(self, call_status): - """ Set the future as running""" + """ Set the future as futures""" self._call_status = call_status self._host_status_done_tstamp = time.time() self.status(throw_except=False) + self._state = ResponseFuture.State.Ready - def status(self, throw_except=True, internal_storage=None, check_only=False): + def _set_mapreduce(self): + """ Set the future as mapreduce map""" + self._read = True + self._produce_output = False + if self.success: + self._state = ResponseFuture.State.Done + + def status(self, throw_except=True, 
internal_storage=None, check_only=False, wait_dur_sec=1): """ Return the status returned by the call. If the call raised an exception, this method will raise the same exception @@ -168,6 +175,8 @@ def status(self, throw_except=True, internal_storage=None, check_only=False): :param check_only: Return None immediately if job is not complete. Default False. :param throw_except: Reraise exception if call raised. Default true. :param internal_storage: Storage handler to poll cloud storage. Default None. + :param wait_dur_sec: Time interval between each check + :return: Result of the call. :raises CancelledError: If the job is cancelled before completed. :raises TimeoutError: If job is not complete after `timeout` seconds. @@ -178,10 +187,6 @@ def status(self, throw_except=True, internal_storage=None, check_only=False): if self.success or self.done: return self._call_status - if self.ready and self._new_futures: - self._set_state(ResponseFuture.State.Done) - return self._call_status - if self._call_status is None or self._call_status['type'] == '__init__': if internal_storage is None: internal_storage = InternalStorage(self._storage_config) @@ -193,7 +198,7 @@ def status(self, throw_except=True, internal_storage=None, check_only=False): return self._call_status while self._call_status is None: - time.sleep(self.GET_RESULT_SLEEP_SECS) + time.sleep(wait_dur_sec) self._call_status = internal_storage.get_call_status(self.executor_id, self.job_id, self.call_id) self._status_query_count += 1 self._host_status_done_tstamp = time.time() @@ -214,13 +219,22 @@ def status(self, throw_except=True, internal_storage=None, check_only=False): with open(FN_LOG_FILE, 'a') as lf: lf.write(header + ' ' + output + tail) + for key in self._call_status: + if any(key.startswith(ss) for ss in ['func', 'host', 'worker']): + self.stats[key] = self._call_status[key] + + self.stats['worker_exec_time'] = round(self.stats['worker_end_tstamp'] - self.stats['worker_start_tstamp'], 8) + total_time = format(round(self.stats['worker_exec_time'], 2), '.2f') + + logger.debug( + f'ExecutorID {self.executor_id} | JobID {self.job_id} - Got status from call {self.call_id} ' + f'- Activation ID: {self.activation_id} - Time: {str(total_time)} seconds' + ) + if self._call_status['exception']: self._set_state(ResponseFuture.State.Error) self._exception = pickle.loads(eval(self._call_status['exc_info'])) - msg1 = ('ExecutorID {} | JobID {} - There was an exception - Activation ' - 'ID: {}'.format(self.executor_id, self.job_id, self.activation_id)) - if not self._call_status.get('exc_pickle_fail', False): fn_exctype = self._exception[0] fn_exc = self._exception[1] @@ -237,13 +251,15 @@ def status(self, throw_except=True, internal_storage=None, check_only=False): self._exception = (fn_exctype, fn_exc, self._exception['exc_traceback']) + logger.warning( + 'ExecutorID {} | JobID {} - CallID: {} - There was an exception - Activation ID: {} - {}' + .format(self.executor_id, self.job_id, self.call_id, self.activation_id, fn_exctype.__name__) + ) + def exception_hook(exctype, exc, trcbck): if exctype == fn_exctype and str(exc) == str(fn_exc): - logger.warning(msg1) if self._handler_exception: - msg2 = 'Exception: {} - {}'.format(fn_exctype.__name__, - fn_exc) - logger.warning(msg2) + logger.warning(f'Exception: {fn_exctype.__name__} - {fn_exc}') else: traceback.print_exception(*self._exception) else: @@ -254,46 +270,32 @@ def exception_hook(exctype, exc, trcbck): sys.excepthook = exception_hook reraise(*self._exception) else: - logger.warning(msg1) 
- msg2 = 'Exception: {} - {}'.format(self._exception[0].__name__, - self._exception[1]) - logger.warning(msg2) return None - for key in self._call_status: - if any(key.startswith(ss) for ss in ['func', 'host', 'worker']): - self.stats[key] = self._call_status[key] - - self.stats['worker_exec_time'] = round(self.stats['worker_end_tstamp'] - self.stats['worker_start_tstamp'], 8) - total_time = format(round(self.stats['worker_exec_time'], 2), '.2f') - - logger.debug(f'ExecutorID {self.executor_id} | JobID {self.job_id} - Got status from call {self.call_id} ' - f'- Activation ID: {self.activation_id} - Time: {str(total_time)} seconds') - - self._set_state(ResponseFuture.State.Success) - - if self._call_status['func_result_size'] == 0: - self._produce_output = False - - if not self._produce_output: - self._set_state(ResponseFuture.State.Done) - if 'new_futures' in self._call_status and not self._new_futures: new_futures = pickle.loads(eval(self._call_status['new_futures'])) - self._new_futures = [new_futures] if type(new_futures) == ResponseFuture else new_futures - self._set_state(ResponseFuture.State.Futures) + self._new_futures = [new_futures] if type(new_futures) is ResponseFuture else new_futures + + elif self._call_status['func_result_size'] == 0: + self._produce_output = False if 'result' in self._call_status: self._call_output = pickle.loads(eval(self._call_status['result'])) self.stats['host_result_done_tstamp'] = time.time() self.stats['host_result_query_count'] = 0 - logger.debug(f'ExecutorID {self.executor_id} | JobID {self.job_id} - Got output ' - f'from call {self.call_id} - Activation ID: {self.activation_id}') + logger.debug( + f'ExecutorID {self.executor_id} | JobID {self.job_id} - Got output ' + f'from call {self.call_id} - Activation ID: {self.activation_id}' + ) + + if self._call_output is not None or not self._produce_output: self._set_state(ResponseFuture.State.Done) + else: + self._set_state(ResponseFuture.State.Success) return self._call_status - def result(self, throw_except=True, internal_storage=None): + def result(self, throw_except=True, internal_storage=None, retries=10, wait_dur_sec=1): """ Return the value returned by the call. If the call raised an exception, this method will raise the same exception @@ -301,6 +303,9 @@ def result(self, throw_except=True, internal_storage=None): :param throw_except: Reraise exception if call raised. Default true. :param internal_storage: Storage handler to poll cloud storage. Default None. + :param retries: Number of times to check if the result file is in the storage + :param wait_dur_sec: Time interval between each retry check + :return: Result of the call. :raises CancelledError: If the job is cancelled before completed. :raises TimeoutError: If job is not complete after `timeout` seconds. 
@@ -308,20 +313,14 @@ def result(self, throw_except=True, internal_storage=None): if self._state == ResponseFuture.State.New: raise ValueError("Task not yet invoked") - if not self._produce_output: - self.status(throw_except=throw_except, internal_storage=internal_storage) - self._set_state(ResponseFuture.State.Done) - - if self.done: - return self._call_output - - if self._state == ResponseFuture.State.Futures: - return self._new_futures - - if internal_storage is None: + if not self.done and internal_storage is None: internal_storage = InternalStorage(storage_config=self._storage_config) - self.status(throw_except=throw_except, internal_storage=internal_storage) + self.status(throw_except=throw_except, internal_storage=internal_storage, wait_dur_sec=wait_dur_sec) + + if self.futures: + self._call_output = self._new_futures + self._set_state(ResponseFuture.State.Done) if self.done: return self._call_output @@ -330,8 +329,8 @@ def result(self, throw_except=True, internal_storage=None): call_output = internal_storage.get_call_output(self.executor_id, self.job_id, self.call_id) self._output_query_count += 1 - while call_output is None and self._output_query_count < self.GET_RESULT_MAX_RETRIES: - time.sleep(self.GET_RESULT_SLEEP_SECS) + while call_output is None and self._output_query_count < retries: + time.sleep(wait_dur_sec) call_output = internal_storage.get_call_output(self.executor_id, self.job_id, self.call_id) self._output_query_count += 1 diff --git a/lithops/invokers.py b/lithops/invokers.py index d60d11698..607308313 100644 --- a/lithops/invokers.py +++ b/lithops/invokers.py @@ -27,8 +27,20 @@ from lithops.future import ResponseFuture from lithops.config import extract_storage_config from lithops.version import __version__ -from lithops.utils import verify_runtime_name, version_str, is_lithops_worker, iterchunks -from lithops.constants import LOGGER_LEVEL, LOGS_DIR, SERVERLESS +from lithops.utils import ( + verify_runtime_name, + version_str, + is_lithops_worker, + iterchunks, + BackendType +) +from lithops.constants import ( + LOGGER_LEVEL, + LOGS_DIR, + SERVERLESS, + SA_INSTALL_DIR, + STANDALONE_BACKENDS +) from lithops.util.metrics import PrometheusExporter logger = logging.getLogger(__name__) @@ -39,7 +51,7 @@ def create_invoker(config, executor_id, internal_storage, """ Creates the appropriate invoker based on the backend type """ - if compute_handler.get_backend_type() == 'batch': + if compute_handler.get_backend_type() == BackendType.BATCH.value: return BatchInvoker( config, executor_id, @@ -48,7 +60,7 @@ def create_invoker(config, executor_id, internal_storage, job_monitor ) - elif compute_handler.get_backend_type() == 'faas': + elif compute_handler.get_backend_type() == BackendType.FAAS.value: return FaaSInvoker( config, executor_id, @@ -82,7 +94,7 @@ def __init__(self, config, executor_id, internal_storage, compute_handler, job_m self.mode = self.config['lithops']['mode'] self.backend = self.config['lithops']['backend'] - self.customized_runtime = self.config['lithops'].get('customized_runtime', False) + self.include_function = self.config[self.backend].get('runtime_include_function', False) self.runtime_info = self.compute_handler.get_runtime_info() self.runtime_name = self.runtime_info['runtime_name'] @@ -133,27 +145,30 @@ def select_runtime(self, job_id, runtime_memory): def _create_payload(self, job): """ - Creates the default pyload dictionary + Creates the default payload dictionary """ - payload = {'config': self.config, - 'chunksize': job.chunksize, - 'log_level': 
self.log_level, - 'func_key': job.func_key, - 'data_key': job.data_key, - 'extra_env': job.extra_env, - 'total_calls': job.total_calls, - 'execution_timeout': job.execution_timeout, - 'data_byte_ranges': job.data_byte_ranges, - 'executor_id': job.executor_id, - 'job_id': job.job_id, - 'job_key': job.job_key, - 'max_workers': self.max_workers, - 'call_ids': None, - 'host_submit_tstamp': time.time(), - 'lithops_version': __version__, - 'runtime_name': job.runtime_name, - 'runtime_memory': job.runtime_memory, - 'worker_processes': job.worker_processes} + payload = { + 'config': self.config, + 'chunksize': job.chunksize, + 'log_level': self.log_level, + 'func_name': job.function_name, + 'func_key': job.func_key, + 'data_key': job.data_key, + 'extra_env': job.extra_env, + 'total_calls': job.total_calls, + 'execution_timeout': job.execution_timeout, + 'data_byte_ranges': job.data_byte_ranges, + 'executor_id': job.executor_id, + 'job_id': job.job_id, + 'job_key': job.job_key, + 'max_workers': self.max_workers, + 'call_ids': None, + 'host_submit_tstamp': time.time(), + 'lithops_version': __version__, + 'runtime_name': job.runtime_name, + 'runtime_memory': job.runtime_memory, + 'worker_processes': job.worker_processes + } return payload @@ -161,20 +176,17 @@ def _run_job(self, job): """ Run a job """ - if self.customized_runtime: - logger.debug('ExecutorID {} | JobID {} - Customized runtime activated' - .format(job.executor_id, job.job_id)) + if self.include_function: + logger.debug('ExecutorID {} | JobID {} - Runtime include function feature ' + ' is activated' .format(job.executor_id, job.job_id)) job.runtime_name = self.runtime_name extend_runtime(job, self.compute_handler, self.internal_storage) self.runtime_name = job.runtime_name - logger.info('ExecutorID {} | JobID {} - Starting function ' - 'invocation: {}() - Total: {} activations' - .format(job.executor_id, job.job_id, - job.function_name, job.total_calls)) - - logger.debug('ExecutorID {} | JobID {} - Worker processes: {} - Chunksize: {}' - .format(job.executor_id, job.job_id, job.worker_processes, job.chunksize)) + logger.info( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - Starting function ' + f'invocation: {job.function_name}() - Total: {job.total_calls} activations' + ) self.prometheus.send_metric( name='job_total_calls', @@ -196,6 +208,12 @@ def _run_job(self, job): ) ) + if self.backend not in STANDALONE_BACKENDS: + logger.debug( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - Worker processes: ' + f'{job.worker_processes} - Chunksize: {job.chunksize}' + ) + try: job.runtime_name = self.runtime_name self._invoke_job(job) @@ -204,8 +222,9 @@ def _run_job(self, job): raise e log_file = os.path.join(LOGS_DIR, job.job_key + '.log') - logger.info("ExecutorID {} | JobID {} - View execution logs at {}" - .format(job.executor_id, job.job_id, log_file)) + logger.info( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - View execution logs at {log_file}' + ) # Create all futures futures = [] @@ -249,19 +268,15 @@ def _invoke_job(self, job): roundtrip = time.time() - start resp_time = format(round(roundtrip, 3), '.3f') - logger.debug('ExecutorID {} | JobID {} - Job invoked ({}s) - Activation ID: {}' - .format(job.executor_id, job.job_id, resp_time, activation_id or job.job_key)) + logger.debug( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - Job invoked ' + f'({resp_time}s) - Activation ID: {activation_id or job.job_key}' + ) def run_job(self, job): """ Run a job """ - # Ensure only self.max_workers are started - 
total_workers = job.total_calls // job.chunksize + (job.total_calls % job.chunksize > 0) - if self.max_workers < total_workers: - job.chunksize = job.total_calls // self.max_workers + (job.total_calls % self.max_workers > 0) - - # Perform the invocation futures = self._run_job(job) self.job_monitor.start(futures) @@ -286,10 +301,10 @@ def __init__(self, config, executor_id, internal_storage, compute_handler, job_m self.should_run = False self.sync = is_lithops_worker() - invoke_pool_threads = self.config[self.backend]['invoke_pool_threads'] - self.executor = ThreadPoolExecutor(invoke_pool_threads) + self.invoke_pool_threads = self.config[self.backend]['invoke_pool_threads'] + self.executor = ThreadPoolExecutor(self.invoke_pool_threads) - logger.debug('ExecutorID {} - Serverless invoker created'.format(self.executor_id)) + logger.debug(f'ExecutorID {self.executor_id} - Serverless invoker created') def _start_async_invokers(self): """Starts the invoker process responsible to spawn pending calls @@ -298,10 +313,9 @@ def _start_async_invokers(self): def invoker_process(inv_id): """Run process that implements token bucket scheduling approach""" - logger.debug('ExecutorID {} - Async invoker {} started' - .format(self.executor_id, inv_id)) + logger.debug(f'ExecutorID {self.executor_id} - Async invoker {inv_id} started') - with ThreadPoolExecutor(max_workers=250) as executor: + with ThreadPoolExecutor(max_workers=min(64, self.invoke_pool_threads // 4)) as executor: while self.should_run: try: self.job_monitor.token_bucket_q.get() @@ -313,10 +327,10 @@ def invoker_process(inv_id): else: break - logger.debug('ExecutorID {} - Async invoker {} finished' - .format(self.executor_id, inv_id)) + logger.debug(f'ExecutorID {self.executor_id} - Async invoker {inv_id} finished') for inv_id in range(self.ASYNC_INVOKERS): + self.job_monitor.token_bucket_q.put('#') p = threading.Thread(target=invoker_process, args=(inv_id,)) self.invokers.append(p) p.daemon = True @@ -327,8 +341,7 @@ def stop(self): Stop async invokers """ if self.invokers: - logger.debug('ExecutorID {} - Stopping async invokers' - .format(self.executor_id)) + logger.debug(f'ExecutorID {self.executor_id} - Stopping async invokers') self.should_run = False while not self.pending_calls_q.empty(): @@ -373,9 +386,10 @@ def _invoke_task(self, job, call_ids_range): self.job_monitor.token_bucket_q.put('#') return - logger.debug('ExecutorID {} | JobID {} - Calls {} invoked ({}s) - Activation' - ' ID: {}'.format(job.executor_id, job.job_id, ', '.join(call_ids), - resp_time, activation_id)) + logger.debug( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - Calls {", ".join(call_ids)} ' + f'invoked ({resp_time}s) - Activation ID: {activation_id}' + ) def _invoke_job_remote(self, job): """ @@ -395,8 +409,10 @@ def _invoke_job_remote(self, job): resp_time = format(round(roundtrip, 3), '.3f') if activation_id: - logger.debug('ExecutorID {} | JobID {} - Remote invoker call done ({}s) - Activation' - ' ID: {}'.format(job.executor_id, job.job_id, resp_time, activation_id)) + logger.debug( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - Remote invoker ' + f'call done ({resp_time}s) - Activation ID: {activation_id}' + ) else: raise Exception('Unable to spawn remote invoker') @@ -415,6 +431,16 @@ def _invoke_job(self, job): self.should_run = True self._start_async_invokers() + if self.running_workers > 0 and not self.job_monitor.token_bucket_q.empty(): + while not self.job_monitor.token_bucket_q.empty(): + try: + 
self.job_monitor.token_bucket_q.get(False) + self.running_workers -= 1 + if self.running_workers == 0: + break + except Exception: + pass + if self.running_workers < self.max_workers: free_workers = self.max_workers - self.running_workers total_direct = free_workers * job.chunksize @@ -427,10 +453,11 @@ def _invoke_job(self, job): consumed_workers = ci // cz + (ci % cz > 0) self.running_workers += consumed_workers - logger.debug('ExecutorID {} | JobID {} - Free workers:' - ' {} - Going to run {} activations in {} workers' - .format(job.executor_id, job.job_id, free_workers, - len(callids_to_invoke_direct), consumed_workers)) + logger.debug( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - Free workers: ' + f'{free_workers} - Going to run {len(callids_to_invoke_direct)} activations ' + f'in {consumed_workers} workers' + ) def _callback(future): future.result() @@ -446,17 +473,17 @@ def _callback(future): # Put into the queue the rest of the callids to invoke within the process if callids_to_invoke_nondirect: - logger.debug('ExecutorID {} | JobID {} - Putting remaining ' - '{} function activations into pending queue' - .format(job.executor_id, job.job_id, - len(callids_to_invoke_nondirect))) + logger.debug( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - Putting remaining ' + f'{len(callids_to_invoke_nondirect)} function activations into pending queue' + ) for call_ids_range in iterchunks(callids_to_invoke_nondirect, job.chunksize): self.pending_calls_q.put((job, call_ids_range)) else: - logger.debug('ExecutorID {} | JobID {} - Reached maximum {} ' - 'workers, queuing {} function activations' - .format(job.executor_id, job.job_id, - self.max_workers, job.total_calls)) + logger.debug( + f'ExecutorID {job.executor_id} | JobID {job.job_id} - Reached maximum {self.max_workers} ' + f'workers, queuing {job.total_calls} function activations' + ) for call_ids_range in iterchunks(range(job.total_calls), job.chunksize): self.pending_calls_q.put((job, call_ids_range)) @@ -477,12 +504,12 @@ def run_job(self, job): def extend_runtime(job, compute_handler, internal_storage): """ - This method is used when customized_runtime is active + This method is used when runtime_include_function is active """ base_docker_image = job.runtime_name uuid = job.ext_runtime_uuid - ext_runtime_name = "{}:{}".format(base_docker_image.split(":")[0], uuid) + ext_runtime_name = f'{base_docker_image.split(":")[0]}:{uuid}' # update job with new extended runtime name job.runtime_name = ext_runtime_name @@ -491,17 +518,14 @@ def extend_runtime(job, compute_handler, internal_storage): runtime_meta = internal_storage.get_runtime_meta(runtime_key) if not runtime_meta: - logger.info('Creating runtime: {}, memory: {}MB'.format(ext_runtime_name, job.runtime_memory)) - ext_docker_file = '/'.join([job.local_tmp_dir, "Dockerfile"]) # Generate Dockerfile extended with function dependencies and function with open(ext_docker_file, 'w') as df: df.write('\n'.join([ - 'FROM {}'.format(base_docker_image), - 'ENV PYTHONPATH=/tmp/lithops/modules:$PYTHONPATH', - # set python path to point to dependencies folder - 'COPY . /tmp/lithops' + f'FROM {base_docker_image}', + f'ENV PYTHONPATH={SA_INSTALL_DIR}/modules:$PYTHONPATH', + f'COPY . {SA_INSTALL_DIR}' ])) # Build new extended runtime tagged by function hash @@ -517,12 +541,15 @@ def extend_runtime(job, compute_handler, internal_storage): # Verify python version and lithops version if __version__ != runtime_meta['lithops_version']: - raise Exception("Lithops version mismatch. 
Host version: {} - Runtime version: {}" - .format(__version__, runtime_meta['lithops_version'])) + raise Exception( + f"Lithops version mismatch. Host version: {__version__} - " + f"Runtime version: {runtime_meta['lithops_version']}" + ) py_local_version = version_str(sys.version_info) py_remote_version = runtime_meta['python_version'] if py_local_version != py_remote_version: - raise Exception(("The indicated runtime '{}' is running Python {} and it " - "is not compatible with the local Python version {}") - .format(job.runtime_name, py_remote_version, py_local_version)) + raise Exception( + f"The runtime '{job.runtime_name}' uses Python {py_remote_version}, " + f"which is incompatible with local Python {py_local_version}" + ) diff --git a/lithops/job/__init__.py b/lithops/job/__init__.py index 434e9d910..3f624091e 100644 --- a/lithops/job/__init__.py +++ b/lithops/job/__init__.py @@ -1,2 +1,7 @@ from .job import create_map_job from .job import create_reduce_job + +__all__ = [ + 'create_map_job', + 'create_reduce_job' +] diff --git a/lithops/job/job.py b/lithops/job/job.py index 59c40d38b..eeb6f44b4 100644 --- a/lithops/job/job.py +++ b/lithops/job/job.py @@ -26,16 +26,17 @@ from lithops import utils from lithops.job.partitioner import create_partitions -from lithops.storage.utils import create_func_key, create_data_key,\ +from lithops.storage.utils import create_func_key, create_data_key, \ create_job_key, func_key_suffix from lithops.job.serialize import SerializeIndependent, create_module_data -from lithops.constants import MAX_AGG_DATA_SIZE, LOCALHOST,\ - SERVERLESS, STANDALONE, CUSTOM_RUNTIME_DIR, FAAS_BACKENDS +from lithops.constants import MAX_AGG_DATA_SIZE, LOCALHOST, \ + SERVERLESS, STANDALONE, CUSTOM_RUNTIME_DIR logger = logging.getLogger(__name__) FUNCTION_CACHE = set() +MAX_DATA_IN_PAYLOAD = 8 * 1024 # Per invocation. 
8KB def create_map_job( @@ -176,8 +177,6 @@ def _create_job( """ Creates a new Job """ - global FUNCTION_CACHE - ext_env = {} if extra_env is None else extra_env.copy() if ext_env: ext_env = utils.convert_bools_to_string(ext_env) @@ -216,8 +215,15 @@ def _create_job( exclude_modules_cfg = config['lithops'].get('exclude_modules', []) include_modules_cfg = config['lithops'].get('include_modules', []) + if type(include_modules_cfg) is str: + if include_modules_cfg.lower() == 'none': + include_modules_cfg = None + else: + raise ValueError("'include_modules' parameter in config must be a list") + exc_modules = set() inc_modules = set() + if exclude_modules_cfg: exc_modules.update(exclude_modules_cfg) if exclude_modules: @@ -231,7 +237,7 @@ def _create_job( if include_modules is None: inc_modules = None - logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id)) + logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Serializing function and data') job_serialize_start = time.time() serializer = SerializeIndependent(runtime_meta['preinstalls']) func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules, exc_modules) @@ -257,11 +263,8 @@ def _create_job( raise Exception(log_msg) # Upload function and data - upload_function = not config['lithops'].get('customized_runtime', False) - upload_data = not ( - (len(str(data_str)) * job.chunksize < 8 * 1204 for data_str in data_strs) - and backend in FAAS_BACKENDS - ) + upload_function = not config[backend].get("runtime_include_function", False) + upload_data = any([(len(data_str) * job.chunksize) > MAX_DATA_IN_PAYLOAD for data_str in data_strs]) # Upload function and modules if upload_function: @@ -286,14 +289,14 @@ def _create_job( function_hash = hashlib.md5(open(function_file, 'rb').read()).hexdigest()[:16] mod_hash = hashlib.md5(repr(sorted(mod_paths)).encode('utf-8')).hexdigest()[:16] job.func_key = func_key_suffix - job.ext_runtime_uuid = '{}{}'.format(function_hash, mod_hash) + job.ext_runtime_uuid = f'{function_hash}{mod_hash}' job.local_tmp_dir = os.path.join(CUSTOM_RUNTIME_DIR, job.ext_runtime_uuid) _store_func_and_modules(job.local_tmp_dir, job.func_key, func_str, module_data) host_job_meta['host_func_upload_time'] = 0 # upload data - if upload_data: - # Upload iterdata to COS only if a single element is greater than 8KB + if upload_data or config['lithops']['backend_type'] == utils.BackendType.BATCH.value: + # Upload iterdata to COS only if a single element is greater than MAX_DATA_IN_PAYLOAD logger.debug('ExecutorID {} | JobID {} - Uploading data to the storage backend' .format(executor_id, job_id)) # pass_iteradata through an object storage file @@ -310,7 +313,7 @@ def _create_job( # pass iteradata as part of the invocation payload logger.debug('ExecutorID {} | JobID {} - Data per activation is < ' '{}. Passing data through invocation payload' - .format(executor_id, job_id, utils.sizeof_fmt(8 * 1024))) + .format(executor_id, job_id, utils.sizeof_fmt(MAX_DATA_IN_PAYLOAD))) job.data_key = None job.data_byte_ranges = None job.data_byte_strs = data_strs diff --git a/lithops/job/partitioner.py b/lithops/job/partitioner.py index 95c7a4eee..f2ec4ed2e 100644 --- a/lithops/job/partitioner.py +++ b/lithops/job/partitioner.py @@ -302,7 +302,7 @@ def _split_objects_from_object_storage( # check that only one schemma provided. 
Throw exception if more than one provided for elem in map_func_args_list: - if type(elem['obj']) == CloudObject: + if type(elem['obj']) is CloudObject: elem['obj'] = f"{elem['obj'].backend}://{elem['obj'].bucket}/{elem['obj'].key}" sb, bucket, prefix, obj_name = utils.split_object_url(elem['obj']) if sb is None: diff --git a/lithops/job/serialize.py b/lithops/job/serialize.py index d99b16e6b..bd53ff015 100644 --- a/lithops/job/serialize.py +++ b/lithops/job/serialize.py @@ -28,6 +28,8 @@ from importlib import import_module from types import CodeType, FunctionType, ModuleType +from lithops.libs import imp +from lithops.libs import inspect as linspect from lithops.utils import bytes_to_b64str from lithops.libs.multyvac.module_dependency import ModuleDependencyAnalyzer @@ -45,61 +47,77 @@ def __call__(self, list_of_objs, include_modules, exclude_modules): """ Serialize f, args, kwargs independently """ - self._modulemgr = ModuleDependencyAnalyzer() preinstalled_modules = [name for name, _ in self.preinstalled_modules] - self._modulemgr.ignore(preinstalled_modules) - if not include_modules: - self._modulemgr.ignore(exclude_modules) - # Inspect modules strs = [] - modules = set() + mod_paths = set() for obj in list_of_objs: - modules.update(self._module_inspect(obj)) strs.append(cloudpickle.dumps(obj)) - # Add modules - direct_modules = set() - mod_paths = set() + if include_modules is None: + # If include_modules is explicitly set to None, no module is included + logger.debug('Module manager disabled. Modules to transmit: None') + return (strs, mod_paths) - for module_name in modules: - if module_name in ['__main__', None]: - continue - try: - mod_spec = importlib.util.find_spec(module_name) - except Exception: - mod_spec = None + if len(include_modules) == 0: + # If include_modules is not provided (empty list by default), + # inspect the objects looking for referenced modules + self._modulemgr = ModuleDependencyAnalyzer() + self._modulemgr.ignore(preinstalled_modules) + self._modulemgr.ignore(exclude_modules) - origin = mod_spec.origin if mod_spec else module_name - if origin and origin.endswith('.so'): - if origin not in exclude_modules and \ - os.path.basename(origin) not in exclude_modules: - mod_paths.add(origin) - else: - self._modulemgr.add(module_name) + ref_modules = set() - direct_modules.add(origin if origin not in ['built-in', None] else module_name) + for obj in list_of_objs: + ref_modules.update(self._module_inspect(obj)) - logger.debug("Referenced modules: {}".format(None if not - direct_modules else ", ".join(direct_modules))) + logger.debug("Referenced Modules: {}".format(None if not + ref_modules else ", ".join(ref_modules))) + + for module_name in ref_modules: + if module_name in ['__main__', None]: + continue + try: + mod_spec = importlib.util.find_spec(module_name) + except Exception: + mod_spec = None + + origin = mod_spec.origin if mod_spec else module_name + if origin and origin.endswith('.so'): + if origin not in exclude_modules and \ + os.path.basename(origin) not in exclude_modules: + mod_paths.add(origin) + else: + self._modulemgr.add(module_name) - if include_modules is not None: tent_mod_paths = self._modulemgr.get_and_clear_paths() - if include_modules: - logger.debug("Tentative modules to transmit: {}" - .format(None if not tent_mod_paths else ", ".join(tent_mod_paths))) - logger.debug("Include modules: {}".format(", ".join(include_modules))) - for im in include_modules: - for mp in tent_mod_paths: - if im in mp: - mod_paths.add(mp) - break - else: - mod_paths = 
mod_paths.union(tent_mod_paths) + mod_paths = mod_paths.union(tent_mod_paths) + + else: + # If include_modules is provided, include only the provided list + logger.debug("Include Modules: {}".format(", ".join(include_modules))) + for module_name in include_modules: + if module_name.endswith('.so') or module_name.endswith('.py'): + pathname = os.path.abspath(module_name) + if os.path.isfile(pathname): + logger.debug(f"Module '{module_name}' found in {pathname}") + mod_paths.add(pathname) + else: + logger.debug(f"Could not find module '{module_name}', skipping") + continue + module_root = module_name.split('.')[0] + if module_root in preinstalled_modules: + logger.debug(f"Module '{module_name}' is already installed in the runtime, skipping") + continue + try: + fp, pathname, description = imp.find_module(module_root) + logger.debug(f"Module '{module_name}' found in {pathname}") + mod_paths.add(pathname) + except ImportError: + logger.debug(f"Could not find module '{module_name}', skipping") - logger.debug("Modules to transmit: {}".format(None if - not mod_paths else ", ".join(mod_paths))) + logger.debug("Modules to transmit: {}".format(None if not mod_paths else ", ".join(mod_paths))) return (strs, mod_paths) @@ -111,17 +129,16 @@ def _module_inspect(self, obj): seen = set() mods = set() - if inspect.isfunction(obj) or (inspect.ismethod(obj) and - inspect.isfunction(obj.__func__)): + if inspect.isfunction(obj) or (inspect.ismethod(obj) and inspect.isfunction(obj.__func__)): # The obj is the user's function worklist.append(obj) elif type(obj).__name__ == 'cython_function_or_method': - for k, v in inspect.getmembers(obj): + for k, v in linspect.getmembers_static(obj): if k == '__globals__': mods.add(v['__file__']) - elif type(obj) == dict: + elif type(obj) is dict: # the obj is the user's iterdata for param in obj.values(): if type(param).__module__ == "__builtin__": @@ -131,16 +148,14 @@ def _module_inspect(self, obj): worklist.append(param) else: # it is a user defined class - for k, v in inspect.getmembers(param): - if inspect.isfunction(v) or (inspect.ismethod(v) and - inspect.isfunction(v.__func__)): + for k, v in linspect.getmembers_static(param): + if inspect.isfunction(v) or (inspect.ismethod(v) and inspect.isfunction(v.__func__)): worklist.append(v) else: # The obj is the user's function but in form of a class found_methods = [] - for k, v in inspect.getmembers(obj): - if inspect.isfunction(v) or (inspect.ismethod(v) and - inspect.isfunction(v.__func__)): + for k, v in linspect.getmembers_static(obj): + if inspect.isfunction(v) or (inspect.ismethod(v) and inspect.isfunction(v.__func__)): found_methods.append(k) worklist.append(v) if "__call__" not in found_methods: @@ -184,7 +199,7 @@ def _module_inspect(self, obj): elif inspect.iscode(v): codeworklist.append(v) - return mods + return set([mod_name.split('.')[0] for mod_name in mods]) def _inner_module_inspect(self, inst): """ diff --git a/lithops/libs/globber/__init__.py b/lithops/libs/globber/__init__.py index 4efb9a9ce..03d720d63 100644 --- a/lithops/libs/globber/__init__.py +++ b/lithops/libs/globber/__init__.py @@ -1,3 +1,7 @@ from .globber import match +__all__ = [ + 'match' +] + name = 'globber' diff --git a/lithops/libs/imp/__init__.py b/lithops/libs/imp/__init__.py new file mode 100644 index 000000000..215b9429d --- /dev/null +++ b/lithops/libs/imp/__init__.py @@ -0,0 +1,28 @@ +from .imp import ( + find_module, + SEARCH_ERROR, + PY_SOURCE, + PY_COMPILED, + C_EXTENSION, + PY_RESOURCE, + PKG_DIRECTORY, + C_BUILTIN, + PY_FROZEN, 
+ PY_CODERESOURCE, + IMP_HOOK, +) + + +__all__ = [ + 'find_module', + 'SEARCH_ERROR', + 'PY_SOURCE', + 'PY_COMPILED', + 'C_EXTENSION', + 'PY_RESOURCE', + 'PKG_DIRECTORY', + 'C_BUILTIN', + 'PY_FROZEN', + 'PY_CODERESOURCE', + 'IMP_HOOK' +] diff --git a/lithops/libs/imp/imp.py b/lithops/libs/imp/imp.py new file mode 100644 index 000000000..9c6d42b6b --- /dev/null +++ b/lithops/libs/imp/imp.py @@ -0,0 +1,78 @@ +from _imp import is_builtin, is_frozen + +from importlib._bootstrap import _ERR_MSG +from importlib import machinery +import os +import sys +import tokenize + + +SEARCH_ERROR = 0 +PY_SOURCE = 1 +PY_COMPILED = 2 +C_EXTENSION = 3 +PY_RESOURCE = 4 +PKG_DIRECTORY = 5 +C_BUILTIN = 6 +PY_FROZEN = 7 +PY_CODERESOURCE = 8 +IMP_HOOK = 9 + + +def get_suffixes(): + extensions = [(s, 'rb', C_EXTENSION) for s in machinery.EXTENSION_SUFFIXES] + source = [(s, 'r', PY_SOURCE) for s in machinery.SOURCE_SUFFIXES] + bytecode = [(s, 'rb', PY_COMPILED) for s in machinery.BYTECODE_SUFFIXES] + + return extensions + source + bytecode + + +def find_module(name, path=None): + """ + Search for a module. + + If path is omitted or None, search for a built-in, frozen or special + module and continue search in sys.path. The module name cannot + contain '.'; to search for a submodule of a package, pass the + submodule name and the package's __path__. + + """ + if not isinstance(name, str): + raise TypeError("'name' must be a str, not {}".format(type(name))) + elif not isinstance(path, (type(None), list)): + # Backwards-compatibility + raise RuntimeError("'path' must be None or a list, " + "not {}".format(type(path))) + + if path is None: + if is_builtin(name): + return None, None, ('', '', C_BUILTIN) + elif is_frozen(name): + return None, None, ('', '', PY_FROZEN) + else: + path = sys.path + + for entry in path: + package_directory = os.path.join(entry, name) + for suffix in ['.py', machinery.BYTECODE_SUFFIXES[0]]: + package_file_name = '__init__' + suffix + file_path = os.path.join(package_directory, package_file_name) + if os.path.isfile(file_path): + return None, package_directory, ('', '', PKG_DIRECTORY) + for suffix, mode, type_ in get_suffixes(): + file_name = name + suffix + file_path = os.path.join(entry, file_name) + if os.path.isfile(file_path): + break + else: + continue + break # Break out of outer loop when breaking out of inner loop. 
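+    # The for/else below only triggers when no entry in the search path yielded a
+    # match, reproducing the behaviour of the deprecated stdlib imp.find_module().
+    # For a package, the loop above already returned early, e.g. (hypothetical path):
+    #   find_module('numpy') -> (None, '/usr/lib/python3.10/site-packages/numpy',
+    #                            ('', '', PKG_DIRECTORY))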
+ else: + raise ImportError(_ERR_MSG.format(name), name=name) + + encoding = None + if 'b' not in mode: + with open(file_path, 'rb') as file: + encoding = tokenize.detect_encoding(file.readline)[0] + file = open(file_path, mode, encoding=encoding) + return file, file_path, (suffix, mode, type_) diff --git a/lithops/libs/inspect/__init__.py b/lithops/libs/inspect/__init__.py new file mode 100644 index 000000000..5305455e0 --- /dev/null +++ b/lithops/libs/inspect/__init__.py @@ -0,0 +1,9 @@ +from .inspect import ( + getmembers, + getmembers_static +) + +__all__ = [ + 'getmembers', + 'getmembers_static' +] diff --git a/lithops/libs/inspect/inspect.py b/lithops/libs/inspect/inspect.py new file mode 100644 index 000000000..689e895c3 --- /dev/null +++ b/lithops/libs/inspect/inspect.py @@ -0,0 +1,160 @@ +""" +From +https://github.com/python/cpython/blob/main/Lib/inspect.py +""" +import types +import functools +from inspect import isclass, getmro + + +def _getmembers(object, predicate, getter): + results = [] + processed = set() + names = dir(object) + if isclass(object): + mro = getmro(object) + # add any DynamicClassAttributes to the list of names if object is a class; + # this may result in duplicate entries if, for example, a virtual + # attribute with the same name as a DynamicClassAttribute exists + try: + for base in object.__bases__: + for k, v in base.__dict__.items(): + if isinstance(v, types.DynamicClassAttribute): + names.append(k) + except AttributeError: + pass + else: + mro = () + for key in names: + # First try to get the value via getattr. Some descriptors don't + # like calling their __get__ (see bug #1785), so fall back to + # looking in the __dict__. + try: + value = getter(object, key) + # handle the duplicate key + if key in processed: + raise AttributeError + except AttributeError: + for base in mro: + if key in base.__dict__: + value = base.__dict__[key] + break + else: + # could be a (currently) missing slot member, or a buggy + # __dir__; discard and move on + continue + if not predicate or predicate(value): + results.append((key, value)) + processed.add(key) + results.sort(key=lambda pair: pair[0]) + return results + + +def getmembers(object, predicate=None): + """Return all members of an object as (name, value) pairs sorted by name. + Optionally, only return members that satisfy a given predicate.""" + return _getmembers(object, predicate, getattr) + + +def getmembers_static(object, predicate=None): + """Return all members of an object as (name, value) pairs sorted by name + without triggering dynamic lookup via the descriptor protocol, + __getattr__ or __getattribute__. Optionally, only return members that + satisfy a given predicate. + + Note: this function may not be able to retrieve all members + that getmembers can fetch (like dynamically created attributes) + and may find members that getmembers can't (like descriptors + that raise AttributeError). It can also return descriptor objects + instead of instance members in some cases. 
+ """ + return _getmembers(object, predicate, getattr_static) + + +# ------------------------------------------------ static version of getattr + +_sentinel = object() +_static_getmro = type.__dict__['__mro__'].__get__ +_get_dunder_dict_of_class = type.__dict__["__dict__"].__get__ + + +def _check_instance(obj, attr): + instance_dict = {} + try: + instance_dict = object.__getattribute__(obj, "__dict__") + except AttributeError: + pass + return dict.get(instance_dict, attr, _sentinel) + + +def _check_class(klass, attr): + for entry in _static_getmro(klass): + if _shadowed_dict(type(entry)) is _sentinel and attr in entry.__dict__: + return entry.__dict__[attr] + return _sentinel + + +@functools.lru_cache() +def _shadowed_dict_from_mro_tuple(mro): + for entry in mro: + dunder_dict = _get_dunder_dict_of_class(entry) + if '__dict__' in dunder_dict: + class_dict = dunder_dict['__dict__'] + if not (isinstance(class_dict, types.GetSetDescriptorType) + and class_dict.__name__ == "__dict__" + and class_dict.__objclass__ is entry): + return class_dict + return _sentinel + + +def _shadowed_dict(klass): + return _shadowed_dict_from_mro_tuple(_static_getmro(klass)) + + +def getattr_static(obj, attr, default=_sentinel): + """Retrieve attributes without triggering dynamic lookup via the + descriptor protocol, __getattr__ or __getattribute__. + + Note: this function may not be able to retrieve all attributes + that getattr can fetch (like dynamically created attributes) + and may find attributes that getattr can't (like descriptors + that raise AttributeError). It can also return descriptor objects + instead of instance members in some cases. See the + documentation for details. + """ + instance_result = _sentinel + + objtype = type(obj) + if type not in _static_getmro(objtype): + klass = objtype + dict_attr = _shadowed_dict(klass) + if (dict_attr is _sentinel or isinstance(dict_attr, types.MemberDescriptorType)): + instance_result = _check_instance(obj, attr) + else: + klass = obj + + klass_result = _check_class(klass, attr) + + if instance_result is not _sentinel and klass_result is not _sentinel: + if _check_class(type(klass_result), "__get__") is not _sentinel and ( + _check_class(type(klass_result), "__set__") is not _sentinel + or _check_class(type(klass_result), "__delete__") is not _sentinel + ): + return klass_result + + if instance_result is not _sentinel: + return instance_result + if klass_result is not _sentinel: + return klass_result + + if obj is klass: + # for types we check the metaclass too + for entry in _static_getmro(type(klass)): + if ( + _shadowed_dict(type(entry)) is _sentinel + and attr in entry.__dict__ + ): + return entry.__dict__[attr] + if default is not _sentinel: + return default + raise AttributeError(attr) diff --git a/lithops/libs/multyvac/module_dependency.py b/lithops/libs/multyvac/module_dependency.py index f74493610..2c0d29fe7 100644 --- a/lithops/libs/multyvac/module_dependency.py +++ b/lithops/libs/multyvac/module_dependency.py @@ -27,9 +27,9 @@ https://github.com/cloudpipe/multyvac-fork/blob/master/multyvac/util/module_dependency.py """ import ast -import imp import logging import pkgutil +from lithops.libs import imp logger = logging.getLogger(__name__) @@ -63,7 +63,7 @@ def add(self, module_name): Adds a module to be analyzed. :param module_name: String of module name. 
""" - logger.debug('Queuing module %r', module_name) + # logger.debug('Queuing module %r', module_name) root_module_name = self._extract_root_module(module_name) self._modules_to_inspect.add(root_module_name) @@ -103,23 +103,23 @@ def _inspect(self, root_module_name): Determines what resources to send over (if any) for a given module. """ if root_module_name in self._inspected_modules: - logger.debug('Already inspected module %r, skipping', - root_module_name) + # logger.debug('Already inspected module %r, skipping', root_module_name) return elif root_module_name in self._modules_to_ignore: - logger.debug('Module %r is to be ignored, skipping', + logger.debug('Module %r is already installed in the runtime, skipping', root_module_name) return else: # Add module to set of scanned modules, before we've analyzed it self._inspected_modules.add(root_module_name) - logger.debug('Inspecting module %r', root_module_name) + # logger.debug('Inspecting module %r', root_module_name) try: fp, pathname, description = imp.find_module(root_module_name) + if pathname: + logger.debug(f"Module '{root_module_name}' found in {pathname}") except ImportError: - logger.debug('Could not find module %r, skipping', - root_module_name) + logger.debug('Could not find module %r, skipping', root_module_name) return _, _, mod_type = description if mod_type == imp.PY_SOURCE: diff --git a/lithops/libs/openwhisk/client.py b/lithops/libs/openwhisk/client.py index d42538597..b4fd1bde9 100644 --- a/lithops/libs/openwhisk/client.py +++ b/lithops/libs/openwhisk/client.py @@ -99,7 +99,7 @@ def delete_namespace(self, namespace): res = self.session.delete(f'{self.url}/{namespace}') if res.status_code == 200: - logger.debug(f"OK --> Namespace deleted") + logger.debug("OK --> Namespace deleted") elif res.status_code == 404: pass else: diff --git a/lithops/localhost/__init__.py b/lithops/localhost/__init__.py index 72cd48eb1..0ad09e0e6 100644 --- a/lithops/localhost/__init__.py +++ b/lithops/localhost/__init__.py @@ -1 +1,10 @@ -from .localhost import LocalhostHandler +from .v1.localhost import LocalhostHandlerV1 +from .v2.localhost import LocalhostHandlerV2 + +# Set the default localhost handler +LocalhostHandler = LocalhostHandlerV2 + +__all__ = [ + 'LocalhostHandlerV1', + 'LocalhostHandlerV2' +] diff --git a/lithops/localhost/config.py b/lithops/localhost/config.py new file mode 100644 index 000000000..1587d7342 --- /dev/null +++ b/lithops/localhost/config.py @@ -0,0 +1,61 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import re +import sys +from enum import Enum + + +DEFAULT_CONFIG_KEYS = { + 'runtime': os.path.basename(sys.executable), + 'worker_processes': os.cpu_count(), +} + +LOCALHOST_EXECUTION_TIMEOUT = 3600 + + +class LocvalhostEnvironment(Enum): + DEFAULT = "default" + CONTAINER = "container" + + +def get_environment(runtime_name): + + windows_path_pattern = re.compile(r'^[A-Za-z]:\\.*$') + if runtime_name.startswith(('python', '/')) \ + or windows_path_pattern.match(runtime_name) is not None: + environment = LocvalhostEnvironment.DEFAULT + else: + environment = LocvalhostEnvironment.CONTAINER + + return environment + + +def load_config(config_data): + + if 'localhost' not in config_data or not config_data['localhost']: + config_data['localhost'] = {} + + for key in DEFAULT_CONFIG_KEYS: + if key not in config_data['localhost']: + config_data['localhost'][key] = DEFAULT_CONFIG_KEYS[key] + + config_data['localhost']['max_workers'] = 1 + + if 'execution_timeout' not in config_data['lithops']: + config_data['lithops']['execution_timeout'] = LOCALHOST_EXECUTION_TIMEOUT + + if 'storage' not in config_data['lithops']: + config_data['lithops']['storage'] = 'localhost' diff --git a/lithops/tests/util_func/__init__.py b/lithops/localhost/v1/__init__.py similarity index 100% rename from lithops/tests/util_func/__init__.py rename to lithops/localhost/v1/__init__.py diff --git a/lithops/localhost/localhost.py b/lithops/localhost/v1/localhost.py similarity index 51% rename from lithops/localhost/localhost.py rename to lithops/localhost/v1/localhost.py index d650a2c1b..5d2cd9cfc 100644 --- a/lithops/localhost/localhost.py +++ b/lithops/localhost/v1/localhost.py @@ -15,7 +15,6 @@ # import os -import sys import json import shlex import queue @@ -29,39 +28,67 @@ from pathlib import Path from lithops.version import __version__ -from lithops.constants import RN_LOG_FILE, TEMP_DIR, USER_TEMP_DIR,\ - LITHOPS_TEMP_DIR, COMPUTE_CLI_MSG, JOBS_PREFIX -from lithops.utils import is_lithops_worker, is_unix_system +from lithops.constants import ( + TEMP_DIR, + USER_TEMP_DIR, + LITHOPS_TEMP_DIR, + COMPUTE_CLI_MSG, + JOBS_PREFIX +) +from lithops.utils import ( + BackendType, + get_docker_path, + is_lithops_worker, + is_podman, + is_unix_system +) +from lithops.localhost.config import ( + LocvalhostEnvironment, + get_environment +) logger = logging.getLogger(__name__) -RUNNER = os.path.join(LITHOPS_TEMP_DIR, 'runner.py') +RUNNER_FILE = os.path.join(LITHOPS_TEMP_DIR, 'localhost-runner.py') LITHOPS_LOCATION = os.path.dirname(os.path.abspath(lithops.__file__)) -class LocalhostHandler: +class LocalhostHandlerV1: """ A localhostHandler object is used by invokers and other components to access underlying localhost backend without exposing the implementation details. 
""" - def __init__(self, localhost_config): + def __init__(self, config): logger.debug('Creating Localhost compute client') - self.config = localhost_config + self.config = config + self.runtime_name = self.config['runtime'] + self.environment = get_environment(self.runtime_name) - self.env = {} # dict to store environments + self.env = None self.job_queue = queue.Queue() self.job_manager = None - self.should_run = True + self.invocation_in_progress = False - msg = COMPUTE_CLI_MSG.format('Localhost compute') - logger.info("{}".format(msg)) + msg = COMPUTE_CLI_MSG.format('Localhost compute v1') + logger.info(f"{msg}") + + def get_backend_type(self): + """ + Wrapper method that returns the type of the backend (Batch or FaaS) + """ + return BackendType.BATCH.value def init(self): """ Init tasks for localhost """ - pass + if self.environment == LocvalhostEnvironment.DEFAULT: + self.env = DefaultEnvironment(self.config) + else: + self.env = ContainerEnvironment(self.config) + + self.env.setup() def start_manager(self): """ @@ -70,83 +97,70 @@ def start_manager(self): def job_manager(): logger.debug('Staring localhost job manager') - self.should_run = True - while self.should_run: + while True: job_payload, job_filename = self.job_queue.get() + if job_payload is None and job_filename is None: - break + if self.invocation_in_progress or not self.job_queue.empty(): + continue + else: + break + executor_id = job_payload['executor_id'] job_id = job_payload['job_id'] - runtime_name = job_payload['runtime_name'] - env = self.get_env(runtime_name) - process = env.run(job_payload, job_filename) + total_calls = len(job_payload['call_ids']) + job_key = job_payload['job_key'] + logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Running ' + f'{total_calls} activations in the localhost worker') + process = self.env.run_job(job_key, job_filename) process.communicate() # blocks until the process finishes + if process.returncode != 0: + logger.error(f"ExecutorID {executor_id} | JobID {job_id} - Job " + f"process failed with return code {process.returncode}") logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Execution finished') + if self.job_queue.empty(): - break + if self.invocation_in_progress: + continue + else: + break self.job_manager = None - logger.debug("Localhost job manager stopped") + logger.debug("Localhost job manager finished") if not self.job_manager: self.job_manager = threading.Thread(target=job_manager) self.job_manager.start() - def _get_env_type(self, runtime_name): - """ - Gets the environment type based on the runtime name - """ - return 'default' if '/' not in runtime_name else 'docker' - - def get_env(self, runtime_name): - """ - Generates the proper runtime environment based on the runtime name - """ - if runtime_name not in self.env: - if '/' not in runtime_name: - env = DefaultEnv() - else: - pull_runtime = self.config.get('pull_runtime', False) - env = DockerEnv(runtime_name, pull_runtime) - env.setup() - self.env[runtime_name] = env - - return self.env[runtime_name] - def deploy_runtime(self, runtime_name, *args): """ Extract the runtime metadata and preinstalled modules """ logger.info(f"Deploying runtime: {runtime_name}") - env = self.get_env(runtime_name) - - logger.debug(f"Extracting runtime metadata from: {runtime_name}") - runtime_metadata = env.get_metadata() - - return runtime_metadata + return self.env.get_metadata() def invoke(self, job_payload): """ Run the job description against the selected environment """ + self.invocation_in_progress = True 
executor_id = job_payload['executor_id'] job_id = job_payload['job_id'] - runtime_name = job_payload['runtime_name'] - logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Putting job into localhost queue') - self.start_manager() - env = self.get_env(runtime_name) - job_filename = env._prepare_job_file(job_payload) + logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Putting job into localhost queue') + job_filename = self.env.prepare_job_file(job_payload) self.job_queue.put((job_payload, job_filename)) + self.start_manager() + self.invocation_in_progress = False + def get_runtime_key(self, runtime_name, *args): """ Generate the runtime key that identifies the runtime """ - env_type = self._get_env_type(runtime_name) - runtime_key = os.path.join('localhost', __version__, env_type, runtime_name.strip("/")) + runtime_key = os.path.join('localhost', __version__, runtime_name.strip("/")) return runtime_key @@ -155,70 +169,57 @@ def get_runtime_info(self): Method that returns a dictionary with all the relevant runtime information set in config """ - runtime_info = { + return { 'runtime_name': self.config['runtime'], - 'runtime_memory': None, - 'runtime_timeout': None, + 'runtime_memory': self.config.get('runtime_memory'), + 'runtime_timeout': self.config.get('runtime_timeout'), 'max_workers': self.config['max_workers'], } - return runtime_info - - def get_backend_type(self): - """ - Wrapper method that returns the type of the backend (Batch or FaaS) - """ - return 'batch' - def clean(self, **kwargs): """ Deletes all local runtimes """ pass - def clear(self, job_keys=None): + def clear(self, job_keys=None, exception=None): """ Kills all running jobs processes """ - self.should_run = False - while not self.job_queue.empty(): try: self.job_queue.get(False) except Exception: pass - for runtime_name in self.env: - self.env[runtime_name].stop(job_keys) + self.env.stop(job_keys) if self.job_manager: self.job_queue.put((None, None)) - self.should_run = True - -class BaseEnv: +class ExecutionEnvironment: """ Base environment class for shared methods """ - def __init__(self, runtime): - self.runtime = runtime + def __init__(self, config): + self.config = config + self.runtime_name = self.config['runtime'] + self.is_unix_system = is_unix_system() self.jobs = {} # dict to store executed jobs (job_keys) and PIDs def _copy_lithops_to_tmp(self): - if is_lithops_worker() and os.path.isfile(RUNNER): + if is_lithops_worker() and os.path.isfile(RUNNER_FILE): return os.makedirs(LITHOPS_TEMP_DIR, exist_ok=True) - try: - shutil.rmtree(os.path.join(LITHOPS_TEMP_DIR, 'lithops')) - except FileNotFoundError: - pass - shutil.copytree(LITHOPS_LOCATION, os.path.join(LITHOPS_TEMP_DIR, 'lithops')) - src_handler = os.path.join(LITHOPS_LOCATION, 'localhost', 'runner.py') - copyfile(src_handler, RUNNER) - - def _prepare_job_file(self, job_payload): + dst_path = os.path.join(LITHOPS_TEMP_DIR, 'lithops') + shutil.rmtree(dst_path, ignore_errors=True) + shutil.copytree(LITHOPS_LOCATION, dst_path, dirs_exist_ok=True) + src_handler = os.path.join(LITHOPS_LOCATION, 'localhost', 'v1', 'runner.py') + copyfile(src_handler, RUNNER_FILE) + + def prepare_job_file(self, job_payload): """ Creates the job file that contains the job payload to be executed """ @@ -236,8 +237,8 @@ def _prepare_job_file(self, job_payload): with open(local_job_filename, 'w') as jl: json.dump(job_payload, jl, default=str) - if isinstance(self, DockerEnv): - job_filename = '{}/{}'.format(docker_job_dir, job_file) + if isinstance(self, 
ContainerEnvironment): + job_filename = f'{docker_job_dir}/{job_file}' else: job_filename = local_job_filename @@ -252,7 +253,7 @@ def kill_job(job_key): if self.jobs[job_key].poll() is None: logger.debug(f'Killing job {job_key} with PID {self.jobs[job_key].pid}') PID = self.jobs[job_key].pid - if is_unix_system(): + if self.is_unix_system: PGID = os.getpgid(PID) os.killpg(PGID, signal.SIGKILL) else: @@ -268,130 +269,125 @@ def kill_job(job_key): pass -class DockerEnv(BaseEnv): +class DefaultEnvironment(ExecutionEnvironment): """ - Docker environment uses a docker runtime image + Default environment uses current python3 installation """ - def __init__(self, docker_image, pull_runtime): - logger.debug(f'Starting Docker Environment for {docker_image}') - super().__init__(runtime=docker_image) - self.pull_runtime = pull_runtime - self.uid = os.getuid() if is_unix_system() else None - self.gid = os.getuid() if is_unix_system() else None + def __init__(self, config): + super().__init__(config) + logger.debug(f'Starting default environment for {self.runtime_name}') def setup(self): - logger.debug('Setting up Docker environment') + logger.debug('Setting up default environment') self._copy_lithops_to_tmp() - if self.pull_runtime: - logger.debug('Pulling Docker runtime {}'.format(self.runtime)) - sp.run(shlex.split(f'docker pull {self.runtime}'), check=True, - stdout=sp.PIPE, universal_newlines=True) def get_metadata(self): - if not os.path.isfile(RUNNER): + if not os.path.isfile(RUNNER_FILE): self.setup() - tmp_path = Path(TEMP_DIR).as_posix() - cmd = 'docker run ' - cmd += f'--user {self.uid}:{self.gid} ' if is_unix_system() else '' - cmd += f'--env USER={os.getenv("USER", "root")} ' - cmd += f'--rm -v {tmp_path}:/tmp --entrypoint "python3" ' - cmd += f'{self.runtime} /tmp/{USER_TEMP_DIR}/runner.py get_metadata' - - process = sp.run(shlex.split(cmd), check=True, stdout=sp.PIPE, - universal_newlines=True, start_new_session=True) + logger.debug(f"Extracting metadata from: {self.runtime_name}") + cmd = [self.runtime_name, RUNNER_FILE, 'get_metadata'] + process = sp.run( + cmd, check=True, + stdout=sp.PIPE, + universal_newlines=True, + start_new_session=True + ) runtime_meta = json.loads(process.stdout.strip()) - return runtime_meta - def run(self, job_payload, job_filename): + def run_job(self, job_key, job_filename): """ Runs a job """ - executor_id = job_payload['executor_id'] - job_id = job_payload['job_id'] - total_calls = len(job_payload['call_ids']) - job_key = job_payload['job_key'] - - logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Running ' - f'{total_calls} activations in the localhost worker') - - if not os.path.isfile(RUNNER): + if not os.path.isfile(RUNNER_FILE): self.setup() - tmp_path = Path(TEMP_DIR).as_posix() - if job_payload['config'].get('standalone', {}).get('gpu', False): - cmd = f'docker run --gpus all --name lithops_{job_key} ' - else: - cmd = f'docker run --name lithops_{job_key} ' - cmd += f'--user {self.uid}:{self.gid} ' if is_unix_system() else '' - cmd += f'--env USER={os.getenv("USER", "root")} ' - cmd += f'--rm -v {tmp_path}:/tmp --entrypoint "python3" ' - cmd += f'{self.runtime} /tmp/{USER_TEMP_DIR}/runner.py run_job {job_filename}' - - log = open(RN_LOG_FILE, 'a') - process = sp.Popen(shlex.split(cmd), stdout=log, stderr=log, start_new_session=True) + cmd = [self.runtime_name, RUNNER_FILE, 'run_job', job_filename] + process = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE, start_new_session=True) self.jobs[job_key] = process return process - def stop(self, 
job_keys=None): - """ - Stops running containers - """ - if job_keys: - for job_key in job_keys: - sp.Popen(shlex.split(f'docker rm -f lithops_{job_key}'), - stdout=sp.DEVNULL, stderr=sp.DEVNULL) - else: - for job_key in self.jobs: - sp.Popen(shlex.split(f'docker rm -f lithops_{job_key}'), - stdout=sp.DEVNULL, stderr=sp.DEVNULL) - super().stop(job_keys) - -class DefaultEnv(BaseEnv): +class ContainerEnvironment(ExecutionEnvironment): """ - Default environment uses current python3 installation + Docker environment uses a docker runtime image """ - def __init__(self): - logger.debug(f'Starting Default Environment for {sys.executable}') - super().__init__(runtime=sys.executable) + def __init__(self, config): + super().__init__(config) + logger.debug(f'Starting container environment for {self.runtime_name}') + self.use_gpu = self.config.get('use_gpu', False) + self.docker_path = get_docker_path() + self.is_podman = is_podman(self.docker_path) + self.uid = os.getuid() if self.is_unix_system else None + self.gid = os.getgid() if self.is_unix_system else None def setup(self): - logger.debug('Setting up Default environment') + logger.debug('Setting up container environment') self._copy_lithops_to_tmp() + if self.config.get('pull_runtime', False): + logger.debug(f'Pulling runtime {self.runtime_name}') + sp.run( + shlex.split(f'docker pull {self.runtime_name}'), check=True, + stdout=sp.PIPE, universal_newlines=True + ) def get_metadata(self): - if not os.path.isfile(RUNNER): + if not os.path.isfile(RUNNER_FILE): self.setup() - cmd = [self.runtime, RUNNER, 'get_metadata'] - process = sp.run(cmd, check=True, stdout=sp.PIPE, universal_newlines=True, - start_new_session=True) + logger.debug(f"Extracting metadata from: {self.runtime_name}") + + tmp_path = Path(TEMP_DIR).as_posix() + + cmd = f'{self.docker_path} run --name lithops_metadata ' + cmd += f'--user {self.uid}:{self.gid} ' if self.is_unix_system and not self.is_podman else '' + cmd += f'--env USER={os.getenv("USER", "root")} ' + cmd += f'--rm -v {tmp_path}:/tmp --entrypoint "python3" ' + cmd += f'{self.runtime_name} /tmp/{USER_TEMP_DIR}/localhost-runner.py get_metadata' + + process = sp.run( + shlex.split(cmd), check=True, stdout=sp.PIPE, + universal_newlines=True, start_new_session=True + ) runtime_meta = json.loads(process.stdout.strip()) + return runtime_meta - def run(self, job_payload, job_filename): + def run_job(self, job_key, job_filename): """ Runs a job """ - executor_id = job_payload['executor_id'] - job_id = job_payload['job_id'] - total_calls = len(job_payload['call_ids']) - job_key = job_payload['job_key'] + if not os.path.isfile(RUNNER_FILE): + self.setup() - logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Running ' - f'{total_calls} activations in the localhost worker') + tmp_path = Path(TEMP_DIR).as_posix() - if not os.path.isfile(RUNNER): - self.setup() + cmd = f'{self.docker_path} run --name lithops_{job_key} ' + cmd += '--gpus all ' if self.use_gpu else '' + cmd += f'--user {self.uid}:{self.gid} ' if self.is_unix_system and not self.is_podman else '' + cmd += f'--env USER={os.getenv("USER", "root")} ' + cmd += f'--rm -v {tmp_path}:/tmp --entrypoint "python3" ' + cmd += f'{self.runtime_name} /tmp/{USER_TEMP_DIR}/localhost-runner.py run_job {job_filename}' - cmd = [self.runtime, RUNNER, 'run_job', job_filename] - log = open(RN_LOG_FILE, 'a') - process = sp.Popen(cmd, stdout=log, stderr=log, start_new_session=True) + process = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE, start_new_session=True) 
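+        # Track the container client process under its job key so stop() can later
+        # kill the process group and remove the lithops_{job_key} container.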
self.jobs[job_key] = process return process + + def stop(self, job_keys=None): + """ + Stops running containers + """ + jk_to_delete = job_keys or list(self.jobs.keys()) + + for job_key in jk_to_delete: + sp.Popen( + shlex.split(f'{self.docker_path} rm -f lithops_{job_key}'), + stdout=sp.DEVNULL, stderr=sp.DEVNULL + ) + + super().stop(job_keys) diff --git a/lithops/localhost/runner.py b/lithops/localhost/v1/runner.py similarity index 99% rename from lithops/localhost/runner.py rename to lithops/localhost/v1/runner.py index 19b3d1893..31920977c 100644 --- a/lithops/localhost/runner.py +++ b/lithops/localhost/v1/runner.py @@ -25,7 +25,7 @@ from lithops.worker import function_handler from lithops.worker.utils import get_runtime_metadata -from lithops.constants import LITHOPS_TEMP_DIR, JOBS_DIR, LOGS_DIR,\ +from lithops.constants import LITHOPS_TEMP_DIR, JOBS_DIR, LOGS_DIR, \ RN_LOG_FILE, LOGGER_FORMAT log_file_stream = open(RN_LOG_FILE, 'a') diff --git a/lithops/localhost/v2/__init__.py b/lithops/localhost/v2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/lithops/localhost/v2/localhost.py b/lithops/localhost/v2/localhost.py new file mode 100644 index 000000000..a873104af --- /dev/null +++ b/lithops/localhost/v2/localhost.py @@ -0,0 +1,455 @@ +# +# (C) Copyright IBM Corp. 2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import copy +import os +import json +import threading +import uuid +import shlex +import signal +import lithops +import logging +import shutil +import queue +import subprocess as sp +from shutil import copyfile +from pathlib import Path + +from lithops.version import __version__ +from lithops.constants import ( + JOBS_DIR, + TEMP_DIR, + LITHOPS_TEMP_DIR, + COMPUTE_CLI_MSG, + CPU_COUNT, + USER_TEMP_DIR, +) +from lithops.utils import ( + BackendType, + CountDownLatch, + get_docker_path, + is_lithops_worker, + is_podman, + is_unix_system +) +from lithops.localhost.config import ( + LocvalhostEnvironment, + get_environment +) + +logger = logging.getLogger(__name__) + +RUNNER_FILE = os.path.join(LITHOPS_TEMP_DIR, 'localhost-runner.py') +LITHOPS_LOCATION = os.path.dirname(os.path.abspath(lithops.__file__)) + + +class LocalhostHandlerV2: + """ + A localhostHandler object is used by invokers and other components to + access underlying localhost backend without exposing the implementation + details. 
+ """ + + def __init__(self, localhost_config): + logger.debug('Creating Localhost compute client') + self.config = localhost_config + self.runtime_name = self.config['runtime'] + self.environment = get_environment(self.runtime_name) + + self.env = None + self.job_manager = None + self.invocation_in_progress = False + + msg = COMPUTE_CLI_MSG.format('Localhost compute v2') + logger.info(f"{msg}") + + def get_backend_type(self): + """ + Wrapper method that returns the type of the backend (Batch or FaaS) + """ + return BackendType.BATCH.value + + def init(self): + """ + Init tasks for localhost + """ + if self.environment == LocvalhostEnvironment.DEFAULT: + self.env = DefaultEnvironment(self.config) + else: + self.env = ContainerEnvironment(self.config) + + self.env.setup() + + def start_manager(self): + """ + Starts manager thread to keep order in tasks + """ + def job_manager(): + logger.debug('Staring localhost job manager') + + while True: + for job_key in list(self.env.jobs.keys()): + self.env.jobs[job_key].wait() + if all(job.done for job in self.env.jobs.values()): + if self.invocation_in_progress: + continue + else: + break + + self.job_manager = None + logger.debug("Localhost job manager finished") + + if not self.job_manager: + self.job_manager = threading.Thread(target=job_manager) + self.job_manager.start() + self.env.start() + + def deploy_runtime(self, runtime_name, *args): + """ + Extract the runtime metadata and preinstalled modules + """ + logger.info(f"Deploying runtime: {runtime_name}") + return self.env.get_metadata() + + def invoke(self, job_payload): + """ + Run the job description against the selected environment + """ + self.invocation_in_progress = True + executor_id = job_payload['executor_id'] + job_id = job_payload['job_id'] + total_calls = len(job_payload['call_ids']) + + logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Running ' + f'{total_calls} activations in the localhost worker') + + self.env.run_job(job_payload) + + self.start_manager() + self.invocation_in_progress = False + + def get_runtime_key(self, runtime_name, *args): + """ + Generate the runtime key that identifies the runtime + """ + runtime_key = os.path.join('localhost', __version__, runtime_name.strip("/")) + + return runtime_key + + def get_runtime_info(self): + """ + Method that returns a dictionary with all the relevant runtime + information set in config + """ + return { + 'runtime_name': self.config['runtime'], + 'runtime_memory': self.config.get('runtime_memory'), + 'runtime_timeout': self.config.get('runtime_timeout'), + 'max_workers': self.config['max_workers'], + } + + def clean(self, **kwargs): + """ + Deletes all local runtimes + """ + pass + + def clear(self, job_keys=None, exception=None): + """ + Kills the running service in case of exception + """ + while not self.env.work_queue.empty(): + try: + self.env.work_queue.get(False) + except Exception: + pass + + self.env.stop(job_keys) + + for job_key in list(self.env.jobs.keys()): + while not self.env.jobs[job_key].done: + self.env.jobs[job_key].unlock() + + +class ExecutionEnvironment: + """ + Base environment class for shared methods + """ + + def __init__(self, config): + self.config = config + self.runtime_name = self.config['runtime'] + self.worker_processes = self.config.get('worker_processes', CPU_COUNT) + self.work_queue = queue.Queue() + self.is_unix_system = is_unix_system() + self.task_processes = {} + self.consumer_threads = [] + self.jobs = {} + + def _copy_lithops_to_tmp(self): + if is_lithops_worker() and 
os.path.isfile(RUNNER_FILE): + return + os.makedirs(LITHOPS_TEMP_DIR, exist_ok=True) + dst_path = os.path.join(LITHOPS_TEMP_DIR, 'lithops') + shutil.rmtree(dst_path, ignore_errors=True) + shutil.copytree(LITHOPS_LOCATION, dst_path, dirs_exist_ok=True) + src_handler = os.path.join(LITHOPS_LOCATION, 'localhost', 'v2', 'runner.py') + copyfile(src_handler, RUNNER_FILE) + + def run_job(self, job_payload): + """ + Adds a job to the localhost work queue + """ + job_key = job_payload['job_key'] + self.jobs[job_key] = CountDownLatch(len(job_payload['call_ids'])) + os.makedirs(os.path.join(JOBS_DIR, job_key), exist_ok=True) + + dbr = job_payload['data_byte_ranges'] + for call_id in job_payload['call_ids']: + task_payload = copy.deepcopy(job_payload) + task_payload['call_ids'] = [call_id] + task_payload['data_byte_ranges'] = [dbr[int(call_id)]] + self.work_queue.put(json.dumps(task_payload)) + + def start(self): + """ + Starts the threads responsible to consume individual tasks from the queue + and execute them in the appropiate environment + """ + if self.consumer_threads: + return + + def process_task(task_payload_str): + task_payload = json.loads(task_payload_str) + job_key = task_payload['job_key'] + call_id = task_payload['call_ids'][0] + + task_filename = os.path.join(JOBS_DIR, job_key, call_id + '.task') + with open(task_filename, 'w') as jl: + json.dump(task_payload, jl, default=str) + + self.run_task(job_key, call_id) + + if os.path.exists(task_filename): + os.remove(task_filename) + + self.jobs[job_key].unlock() + + def queue_consumer(work_queue): + while True: + task_payload_str = work_queue.get() + if task_payload_str is None: + break + process_task(task_payload_str) + + logger.debug("Starting Localhost work queue consumer threads") + for _ in range(self.worker_processes): + t = threading.Thread( + target=queue_consumer, + args=(self.work_queue,), + daemon=True) + t.start() + self.consumer_threads.append(t) + + def stop(self, job_keys=None): + """ + Stops running consumer threads + """ + logger.debug("Stopping Localhost work queue consumer threads") + for _ in range(self.worker_processes): + self.work_queue.put(None) + + for t in self.consumer_threads: + t.join() + + self.consumer_threads = [] + + +class DefaultEnvironment(ExecutionEnvironment): + """ + Default environment uses current python3 installation + """ + + def __init__(self, config): + super().__init__(config) + logger.debug(f'Starting default environment for {self.runtime_name}') + + def setup(self): + logger.debug('Setting up default environment') + self._copy_lithops_to_tmp() + + def get_metadata(self): + if not os.path.isfile(RUNNER_FILE): + self.setup() + + logger.debug(f"Extracting metadata from: {self.runtime_name}") + cmd = [self.runtime_name, RUNNER_FILE, 'get_metadata'] + process = sp.run( + cmd, check=True, + stdout=sp.PIPE, + universal_newlines=True, + start_new_session=True + ) + runtime_meta = json.loads(process.stdout.strip()) + return runtime_meta + + def start(self): + if not os.path.isfile(RUNNER_FILE): + self.setup() + + super().start() + + def run_task(self, job_key, call_id): + """ + Runs a task + """ + job_key_call_id = f'{job_key}-{call_id}' + task_filename = os.path.join(JOBS_DIR, job_key, call_id + '.task') + + logger.debug(f"Going to execute task process {job_key_call_id}") + cmd = [self.runtime_name, RUNNER_FILE, 'run_job', task_filename] + process = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE, start_new_session=True) + self.task_processes[job_key_call_id] = process + process.communicate() # blocks 
until the process finishes + if process.returncode != 0: + logger.error(f"Task process {job_key_call_id} failed with return code {process.returncode}") + del self.task_processes[job_key_call_id] + logger.debug(f"Task process {job_key_call_id} finished") + + def stop(self, job_keys=None): + """ + Stops running processes + """ + def kill_process(process): + if process and process.poll() is None: + PID = process.pid + if self.is_unix_system: + PGID = os.getpgid(PID) + os.killpg(PGID, signal.SIGKILL) + else: + os.kill(PID, signal.SIGTERM) + + job_keys_to_stop = job_keys or list(self.jobs.keys()) + for job_key in job_keys_to_stop: + for job_key_call_id in list(self.task_processes.keys()): + if job_key_call_id.rsplit('-', 1)[0] == job_key: + process = self.task_processes[job_key_call_id] + try: + kill_process(process) + except Exception: + pass + self.task_processes[job_key_call_id] = None + + super().stop(job_keys) + + +class ContainerEnvironment(ExecutionEnvironment): + """ + Container environment uses a container runtime image + """ + + def __init__(self, config): + super().__init__(config) + logger.debug(f'Starting container environment for {self.runtime_name}') + self.use_gpu = self.config.get('use_gpu', False) + self.docker_path = get_docker_path() + self.is_podman = is_podman(self.docker_path) + self.container_name = "lithops_" + str(uuid.uuid4()).replace('-', '')[:12] + self.container_process = None + self.uid = os.getuid() if self.is_unix_system else None + self.gid = os.getgid() if self.is_unix_system else None + + def setup(self): + logger.debug('Setting up container environment') + self._copy_lithops_to_tmp() + + if self.config.get('pull_runtime', False): + logger.debug(f'Pulling runtime {self.runtime_name}') + sp.run( + shlex.split(f'{self.docker_path} pull {self.runtime_name}'), + check=True, stdout=sp.PIPE, universal_newlines=True + ) + + def get_metadata(self): + if not os.path.isfile(RUNNER_FILE): + self.setup() + + logger.debug(f"Extracting metadata from: {self.runtime_name}") + + tmp_path = Path(TEMP_DIR).as_posix() + + cmd = f'{self.docker_path} run --name lithops_metadata ' + cmd += f'--user {self.uid}:{self.gid} ' if self.is_unix_system and not self.is_podman else '' + cmd += f'--env USER={os.getenv("USER", "root")} ' + cmd += f'--rm -v {tmp_path}:/tmp --entrypoint "python3" ' + cmd += f'{self.runtime_name} /tmp/{USER_TEMP_DIR}/localhost-runner.py get_metadata' + + process = sp.run( + shlex.split(cmd), check=True, stdout=sp.PIPE, + universal_newlines=True, start_new_session=True + ) + runtime_meta = json.loads(process.stdout.strip()) + + return runtime_meta + + def start(self): + if not os.path.isfile(RUNNER_FILE): + self.setup() + + tmp_path = Path(TEMP_DIR).as_posix() + + cmd = f'{self.docker_path} run --name {self.container_name} ' + cmd += '--gpus all ' if self.use_gpu else '' + cmd += f'--user {self.uid}:{self.gid} ' if self.is_unix_system and not self.is_podman else '' + cmd += f'--env USER={os.getenv("USER", "root")} ' + cmd += f'--rm -v {tmp_path}:/tmp -it --detach ' + cmd += f'--entrypoint=/bin/bash {self.runtime_name}' + + self.container_process = sp.Popen(shlex.split(cmd), stdout=sp.DEVNULL, start_new_session=True) + self.container_process.communicate() # blocks until the process finishes + + super().start() + + def run_task(self, job_key, call_id): + """ + Runs a task + """ + job_key_call_id = f'{job_key}-{call_id}' + docker_job_dir = f'/tmp/{USER_TEMP_DIR}/jobs/{job_key}' + docker_task_filename = f'{docker_job_dir}/{call_id}.task' + + logger.debug(f"Going to 
execute task process {job_key_call_id}") + cmd = f'{self.docker_path} exec {self.container_name} /bin/bash -c ' + cmd += f'"python3 /tmp/{USER_TEMP_DIR}/localhost-runner.py ' + cmd += f'run_job {docker_task_filename}"' + + process = sp.Popen(shlex.split(cmd), stdout=sp.PIPE, stderr=sp.PIPE, start_new_session=True) + self.task_processes[job_key_call_id] = process + process.communicate() # blocks until the process finishes + if process.returncode != 0: + logger.error(f"Task process {job_key_call_id} failed with return code {process.returncode}") + logger.debug(f"Task process {job_key_call_id} finished") + + def stop(self, job_keys=None): + """ + Stop localhost container + """ + sp.Popen( + shlex.split(f'{self.docker_path} rm -f {self.container_name}'), + stdout=sp.DEVNULL, stderr=sp.DEVNULL + ) + super().stop(job_keys) diff --git a/lithops/localhost/v2/runner.py b/lithops/localhost/v2/runner.py new file mode 100644 index 000000000..d4915dbd5 --- /dev/null +++ b/lithops/localhost/v2/runner.py @@ -0,0 +1,95 @@ +# +# (C) Copyright Cloudlab URV 2020 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import sys +import json +import platform +import logging +import uuid +import multiprocessing as mp + +from lithops.worker import function_handler +from lithops.worker.utils import get_runtime_metadata +from lithops.constants import ( + LITHOPS_TEMP_DIR, + JOBS_DIR, + LOGS_DIR, + LOGGER_FORMAT, + RN_LOG_FILE +) + + +os.makedirs(LITHOPS_TEMP_DIR, exist_ok=True) +os.makedirs(JOBS_DIR, exist_ok=True) +os.makedirs(LOGS_DIR, exist_ok=True) + +log_file_stream = open(RN_LOG_FILE, 'a') +logging.basicConfig(stream=log_file_stream, level=logging.DEBUG, format=LOGGER_FORMAT) +logger = logging.getLogger('lithops.localhost.runner') + + +# Change spawn method for MacOS +if platform.system() == 'Darwin': + mp.set_start_method("fork") + + +def run_job(): + sys.stdout = log_file_stream + sys.stderr = log_file_stream + + task_filename = sys.argv[2] + logger.info(f'Got {task_filename} file') + + with open(task_filename, 'rb') as jf: + task_payload = json.load(jf) + + executor_id = task_payload['executor_id'] + job_id = task_payload['job_id'] + call_id = task_payload['call_ids'][0] + + logger.info(f'ExecutorID {executor_id} | JobID {job_id} | CallID {call_id} - Starting execution') + + act_id = str(uuid.uuid4()).replace('-', '')[:12] + os.environ['__LITHOPS_ACTIVATION_ID'] = act_id + os.environ['__LITHOPS_BACKEND'] = 'Localhost' + + try: + task_payload['worker_processes'] = 1 + function_handler(task_payload) + except KeyboardInterrupt: + pass + + logger.info(f'ExecutorID {executor_id} | JobID {job_id} | CallID {call_id} - Execution Finished') + + +def extract_runtime_meta(): + runtime_meta = get_runtime_metadata() + print(json.dumps(runtime_meta)) + + +if __name__ == "__main__": + logger.info('Starting Localhost task runner') + command = sys.argv[1] + logger.info(f'Received command: {command}') + + switcher = { + 'get_metadata': extract_runtime_meta, + 'run_job': run_job + } + + switcher.get(command, lambda: 
"Invalid command")() + log_file_stream.close() diff --git a/lithops/monitor.py b/lithops/monitor.py index 02cb30adc..ce68fa2b2 100644 --- a/lithops/monitor.py +++ b/lithops/monitor.py @@ -25,7 +25,6 @@ import threading import concurrent.futures as cf from tblib import pickling_support -from lithops.constants import MONITORING_INTERVAL pickling_support.install() @@ -42,15 +41,17 @@ class Monitor(threading.Thread): def __init__(self, executor_id, internal_storage, token_bucket_q, + job_chunksize, generate_tokens, config): super().__init__() self.executor_id = executor_id - self.futures = [] + self.futures = set() self.internal_storage = internal_storage self.should_run = True self.token_bucket_q = token_bucket_q + self.job_chunksize = job_chunksize self.generate_tokens = generate_tokens self.config = config self.daemon = True @@ -59,28 +60,40 @@ def __init__(self, executor_id, self.workers = {} self.workers_done = [] self.callids_done_worker = {} - self.job_chunksize = {} self.present_jobs = set() - def add_futures(self, fs, job_id=None, chunksize=None): + def add_futures(self, fs): """ Extends the current thread list of futures to track """ - self.futures.extend(fs) + self.futures.update(set(fs)) - # this is required for FaaS backends and _generate_tokens - if job_id: - self.job_chunksize[job_id] = chunksize - - present_jobs = {f.job_id for f in fs} + present_jobs = {future.job_id for future in fs} for job_id in present_jobs: self.present_jobs.add(job_id) + def remove_futures(self, fs): + """ + Remove from the current thread a list of futures + """ + self._print_status_log() + + for future in fs: + if future in self.futures: + self.futures.remove(future) + + for job_id in {future.job_id for future in fs}: + if job_id in self.present_jobs: + self.present_jobs.remove(job_id) + def _all_ready(self): """ Checks if all futures are ready, success or done """ - return all([f.ready or f.success or f.done for f in self.futures]) + try: + return all(f.ready or f.success or f.done for f in self.futures) + except Exception: + return False def _check_new_futures(self, call_status, f): """Checks if a functions returned new futures to track""" @@ -88,8 +101,11 @@ def _check_new_futures(self, call_status, f): return False f._set_futures(call_status) - self.futures.extend(f._new_futures) - logger.debug(f'ExecutorID {self.executor_id} - Got {len(f._new_futures)} new futures to track') + self.futures.update(f._new_futures) + logger.debug( + f'ExecutorID {self.executor_id} - Received {len(f._new_futures)} ' + 'new function Futures to track' + ) return True @@ -98,13 +114,13 @@ def _future_timeout_checker(self, futures): Checks if running futures exceeded the timeout """ current_time = time.time() - futures_running = [f for f in futures if f.running] + futures_running = [f for f in futures if f.running and f._call_status] for fut in futures_running: try: start_tstamp = fut._call_status['worker_start_tstamp'] fut_timeout = start_tstamp + fut.execution_timeout + 5 if current_time > fut_timeout: - msg = 'The function did not run as expected.' + msg = f"The function exceeded the execution timeout of {fut.execution_timeout} seconds." 
raise TimeoutError('HANDLER', msg) except TimeoutError: # generate fake TimeoutError call status @@ -115,11 +131,15 @@ def _future_timeout_checker(self, futures): 'executor_id': fut.executor_id, 'job_id': fut.job_id, 'call_id': fut.call_id, - 'activation_id': fut.activation_id} + 'activation_id': fut.activation_id, + 'worker_start_tstamp': start_tstamp, + 'worker_end_tstamp': time.time()} fut._set_ready(call_status) def _print_status_log(self, previous_log=None, log_time=None): """prints a debug log showing the status of the job""" + if not self.futures: + return previous_log, log_time callids_pending = len([f for f in self.futures if f.invoked]) callids_running = len([f for f in self.futures if f.running]) callids_done = len([f for f in self.futures if f.ready or f.success or f.done]) @@ -132,11 +152,27 @@ def _print_status_log(self, previous_log=None, log_time=None): class RabbitmqMonitor(Monitor): - def __init__(self, executor_id, internal_storage, token_bucket_q, generate_tokens, config): - super().__init__(executor_id, internal_storage, token_bucket_q, generate_tokens, config) + def __init__( + self, + executor_id, + internal_storage, + token_bucket_q, + job_chunksize, + generate_tokens, + config + ): + super().__init__( + executor_id, + internal_storage, + token_bucket_q, + job_chunksize, + generate_tokens, + config + ) self.rabbit_amqp_url = config.get('amqp_url') self.queue = f'lithops-{self.executor_id}' + self.tag = None self._create_resources() def _create_resources(self): @@ -157,6 +193,8 @@ def _delete_resources(self): """ connection = pika.BlockingConnection(self.pikaparams) channel = connection.channel() + if self.tag: + channel.basic_cancel(self.tag) channel.queue_delete(queue=self.queue) channel.close() connection.close() @@ -209,9 +247,7 @@ def _generate_tokens(self, call_status): self.token_bucket_q.put('#') def run(self): - logger.debug(f'ExecutorID {self.executor_id} | Starting RabbitMQ job monitor') - prevoius_log = None - log_time = 0 + logger.debug(f'ExecutorID {self.executor_id} | Starting RabbitMQ job monitor') SLEEP_TIME = 2 channel = self.connection.channel() @@ -229,28 +265,47 @@ def callback(ch, method, properties, body): if self._all_ready() or not self.should_run: ch.stop_consuming() ch.close() - self._print_status_log() - logger.debug(f'ExecutorID {self.executor_id} | RabbitMQ job monitor finished') - channel.basic_consume(self.queue, callback, auto_ack=True) - threading.Thread(target=channel.start_consuming, daemon=True).start() + def manage_timeouts(): + prevoius_log = None + log_time = 0 + while self.should_run and not self._all_ready(): + # Format call_ids running, pending and done + prevoius_log, log_time = self._print_status_log(previous_log=prevoius_log, log_time=log_time) + self._future_timeout_checker(self.futures) + time.sleep(SLEEP_TIME) + log_time += SLEEP_TIME - while not self._all_ready() or not self.futures: - # Format call_ids running, pending and done - prevoius_log, log_time = self._print_status_log(previous_log=prevoius_log, log_time=log_time) - self._future_timeout_checker(self.futures) - time.sleep(SLEEP_TIME) - log_time += SLEEP_TIME + threading.Thread(target=manage_timeouts, daemon=True).start() - if not self.should_run: - break + self.tag = channel.basic_consume(self.queue, callback, auto_ack=True) + channel.start_consuming() + self.tag = None + self._print_status_log() + logger.debug(f'ExecutorID {self.executor_id} | RabbitMQ job monitor finished') class StorageMonitor(Monitor): + THREADPOOL_SIZE = 64 - def __init__(self, 
executor_id, internal_storage, token_bucket_q, generate_tokens, config): - super().__init__(executor_id, internal_storage, token_bucket_q, generate_tokens, config) + def __init__( + self, + executor_id, + internal_storage, + token_bucket_q, + job_chunksize, + generate_tokens, + config + ): + super().__init__( + executor_id, + internal_storage, + token_bucket_q, + job_chunksize, + generate_tokens, + config + ) self.monitoring_interval = config['monitoring_interval'] @@ -374,37 +429,54 @@ def _generate_tokens(self, callids_running, callids_done): self.callids_running_processed.update(callids_running_to_process) self.callids_done_processed.update(callids_done_to_process) + def _poll_and_process_job_status(self, previous_log, log_time): + """ + Polls the storage backend for job status, updates futures, + and prints status logs. + + Returns: + new_callids_done (set): New callids that were marked as done. + previous_log (str): Updated log message. + log_time (float): Updated log time counter. + """ + callids_running, callids_done = self.internal_storage.get_job_status(self.executor_id) + new_callids_done = callids_done - self.callids_done_processed_status + + self._generate_tokens(callids_running, callids_done) + self._tag_future_as_running(callids_running) + self._tag_future_as_ready(callids_done) + + previous_log, log_time = self._print_status_log(previous_log, log_time) + + return new_callids_done, previous_log, log_time + def run(self): """ - Run method + Run method for the Storage job monitor thread. """ logger.debug(f'ExecutorID {self.executor_id} - Starting Storage job monitor') - WAIT_DUR_SEC = self.monitoring_interval - prevoius_log = None + wait_dur_sec = self.monitoring_interval + previous_log = None log_time = 0 - while not self._all_ready() or not self.futures: - time.sleep(WAIT_DUR_SEC) - WAIT_DUR_SEC = self.monitoring_interval - log_time += WAIT_DUR_SEC + while not self._all_ready(): + time.sleep(wait_dur_sec) + wait_dur_sec = self.monitoring_interval + log_time += wait_dur_sec if not self.should_run: + logger.debug(f'ExecutorID {self.executor_id} - Monitor stopped externally') break - callids_running, callids_done = \ - self.internal_storage.get_job_status(self.executor_id) - - # verify if there are new callids_done and reduce the sleep - new_callids_done = callids_done - self.callids_done_processed_status - if len(new_callids_done) > 0: - WAIT_DUR_SEC = 0.5 + try: + new_callids_done, previous_log, log_time = self._poll_and_process_job_status(previous_log, log_time) + if new_callids_done: + wait_dur_sec = self.monitoring_interval / 5 + except Exception as e: + logger.error(f'ExecutorID {self.executor_id} - Error during monitor: {e}', exc_info=True) - # generate tokens and mark futures as running/done - self._generate_tokens(callids_running, callids_done) - self._tag_future_as_running(callids_running) - self._tag_future_as_ready(callids_done) - prevoius_log, log_time = self._print_status_log(prevoius_log, log_time) + self._poll_and_process_job_status(previous_log, log_time) logger.debug(f'ExecutorID {self.executor_id} - Storage job monitor finished') @@ -414,34 +486,51 @@ class JobMonitor: def __init__(self, executor_id, internal_storage, config=None): self.executor_id = executor_id self.internal_storage = internal_storage + self.storage_config = internal_storage.get_storage_config() + self.storage_backend = internal_storage.backend self.config = config - self.backend = self.config['lithops']['monitoring'].lower() if config else 'storage' + self.type = 
self.config['lithops']['monitoring'].lower() if config else 'storage' + self.token_bucket_q = queue.Queue() self.monitor = None + self.job_chunksize = {} self.MonitorClass = getattr( lithops.monitor, - f'{self.backend.capitalize()}Monitor' + f'{self.type.capitalize()}Monitor' ) def start(self, fs, job_id=None, chunksize=None, generate_tokens=False): - if self.backend == 'storage': - mi = self.config['lithops'].get('monitoring_interval', MONITORING_INTERVAL) \ - if self.config else MONITORING_INTERVAL - bk_config = {'monitoring_interval': mi} + if self.type == 'storage': + monitoring_interval = self.storage_config['monitoring_interval'] + monitor_config = {'monitoring_interval': monitoring_interval} else: - bk_config = self.config.get(self.backend) + monitor_config = self.config.get(self.type) + + if job_id: + self.job_chunksize[job_id] = chunksize if not self.monitor or not self.monitor.is_alive(): self.monitor = self.MonitorClass( executor_id=self.executor_id, internal_storage=self.internal_storage, token_bucket_q=self.token_bucket_q, + job_chunksize=self.job_chunksize, generate_tokens=generate_tokens, - config=bk_config + config=monitor_config ) + + self.monitor.add_futures(fs) + + if not self.monitor.is_alive(): self.monitor.start() - self.monitor.add_futures(fs, job_id, chunksize) + + def is_alive(self): + return self.monitor.is_alive() + + def remove(self, fs): + if self.monitor and self.monitor.is_alive(): + self.monitor.remove_futures(fs) def stop(self): if self.monitor and self.monitor.is_alive(): diff --git a/lithops/multiprocessing/__init__.py b/lithops/multiprocessing/__init__.py index 1bd5cf9dc..e23b9f35e 100644 --- a/lithops/multiprocessing/__init__.py +++ b/lithops/multiprocessing/__init__.py @@ -5,8 +5,14 @@ # Modifications Copyright (c) 2020 Cloudlab URV # -from .context import (CloudContext, cpu_count, get_context, - get_all_start_methods, set_start_method, get_start_method) +from .context import ( + CloudContext, + cpu_count, + get_context, + get_all_start_methods, + set_start_method, + get_start_method +) from .context import CloudContext as DefaultContext from .connection import Pipe from .managers import SyncManager as Manager @@ -14,12 +20,51 @@ from .process import CloudProcess as Process from .queues import Queue, SimpleQueue, JoinableQueue from .sharedctypes import RawValue, RawArray, Value, Array -from .synchronize import (Semaphore, BoundedSemaphore, - Lock, RLock, - Condition, Event, Barrier) +from .synchronize import ( + Semaphore, + BoundedSemaphore, + Lock, + RLock, + Condition, + Event, + Barrier +) from .process import current_process, active_children, parent_process from . 
import config + +__all__ = [ + 'cpu_count', + 'get_context', + 'get_all_start_methods', + 'set_start_method', + 'get_start_method', + 'DefaultContext', + 'Pipe', + 'Manager', + 'Pool', + 'Process', + 'Queue', + 'SimpleQueue', + 'JoinableQueue', + 'RawValue', + 'RawArray', + 'Value', + 'Array', + 'Semaphore', + 'BoundedSemaphore', + 'Lock', + 'RLock', + 'Condition', + 'Event', + 'Barrier', + 'current_process', + 'active_children', + 'parent_process', + 'config' +] + + context = CloudContext() diff --git a/lithops/multiprocessing/connection.py b/lithops/multiprocessing/connection.py index edf9883ce..71a54dad0 100644 --- a/lithops/multiprocessing/connection.py +++ b/lithops/multiprocessing/connection.py @@ -355,15 +355,15 @@ def _recv(self, size, read=None): raise NotImplementedError('Connection._recv() on Redis') def _send_bytes(self, buf): - t0 = time.time() + # t0 = time.time() self._write(self._subhandle, buf) - t1 = time.time() + # t1 = time.time() # logger.debug('Redis Pipe send - {} - {} - {} - {}'.format(t0, t1, t1 - t0, len(buf))) def _recv_bytes(self, maxsize=None): - t0 = time.time() + # t0 = time.time() msg = self._read(self._handle) - t1 = time.time() + # t1 = time.time() # logger.debug('Redis Pipe recv - {} - {} - {} - {}'.format(t0, t1, t1 - t0, len(msg))) return msg @@ -474,7 +474,8 @@ def _send_bytes(self, buf): self._req.dial(self._subhandle_addr) # logger.debug('Send %i B to %s', len(buf), self._subhandle_addr) self._req.send(buf) - res = self._req.recv() + self._req.recv() + # res = self._req.recv() # logger.debug(res) def _recv_bytes(self, maxsize=None): diff --git a/lithops/multiprocessing/context.py b/lithops/multiprocessing/context.py index aaaf0d1ea..b2ebb9e98 100644 --- a/lithops/multiprocessing/context.py +++ b/lithops/multiprocessing/context.py @@ -139,7 +139,9 @@ def Array(self, typecode_or_type, size_or_initializer, *, lock=True): def cpu_count(self): lithops_config = lithops.config.default_config() backend = lithops_config['lithops']['backend'] - return lithops_config[backend]['max_workers'] + max_workers = lithops_config[backend]['max_workers'] + worker_processes = lithops_config[backend]['worker_processes'] + return max_workers * worker_processes def get_context(self, method='cloud'): if method not in ['spawn', 'fork', 'forkserver', 'cloud']: diff --git a/lithops/multiprocessing/util.py b/lithops/multiprocessing/util.py index 171ad6bf8..e4fa31ac2 100644 --- a/lithops/multiprocessing/util.py +++ b/lithops/multiprocessing/util.py @@ -33,6 +33,7 @@ # Picklable redis client # + class PicklableRedis(redis.StrictRedis): def __init__(self, *args, **kwargs): self._args = args diff --git a/lithops/plots.py b/lithops/plots.py index e52f004c6..5cdf654ca 100644 --- a/lithops/plots.py +++ b/lithops/plots.py @@ -30,24 +30,26 @@ logger = logging.getLogger(__name__) -def create_timeline(fs, dst): +def create_timeline(fs, dst, figsize=(10, 6)): stats = [f.stats for f in fs] host_job_create_tstamp = min([cm['host_job_create_tstamp'] for cm in stats]) stats_df = pd.DataFrame(stats) total_calls = len(stats_df) - palette = sns.color_palette("deep", 6) + palette = sns.color_palette("deep", 10) - fig = pylab.figure(figsize=(10, 6)) + fig = pylab.figure(figsize=figsize) ax = fig.add_subplot(1, 1, 1) y = np.arange(total_calls) point_size = 10 fields = [('host submit', stats_df.host_submit_tstamp - host_job_create_tstamp), - ('call start', stats_df.worker_start_tstamp - host_job_create_tstamp), - ('call done', stats_df.worker_end_tstamp - host_job_create_tstamp), + # ('worker start', 
stats_df.worker_start_tstamp - host_job_create_tstamp), + ('function start', stats_df.worker_func_start_tstamp - host_job_create_tstamp), + ('function done', stats_df.worker_func_end_tstamp - host_job_create_tstamp), + # ('worker done', stats_df.worker_end_tstamp - host_job_create_tstamp), ('status fetched', stats_df.host_status_done_tstamp - host_job_create_tstamp)] if 'host_result_done_tstamp' in stats_df: @@ -98,7 +100,7 @@ def create_timeline(fs, dst): fig.savefig(dst) -def create_histogram(fs, dst): +def create_histogram(fs, dst, figsize=(10, 6)): stats = [f.stats for f in fs] host_job_create_tstamp = min([cm['host_job_create_tstamp'] for cm in stats]) @@ -128,7 +130,7 @@ def compute_times_rates(time_rates): 'end_tstamp': end_time, 'runtime_calls_hist': runtime_calls_hist} - fig = pylab.figure(figsize=(10, 6)) + fig = pylab.figure(figsize=figsize) ax = fig.add_subplot(1, 1, 1) time_rates = [(cs['worker_start_tstamp'], cs['worker_end_tstamp']) for cs in stats] diff --git a/lithops/retries.py b/lithops/retries.py new file mode 100644 index 000000000..57e7e480e --- /dev/null +++ b/lithops/retries.py @@ -0,0 +1,361 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +from lithops import FunctionExecutor +from lithops.future import ResponseFuture +from lithops.storage.utils import CloudObject +from lithops.wait import ( + ALL_COMPLETED, + ALWAYS, + ANY_COMPLETED, + THREADPOOL_SIZE, + WAIT_DUR_SEC, +) +from six import reraise + + +class RetryingFuture: + """ + A wrapper around `ResponseFuture` that adds retry capabilities. + + This class is used internally by Lithops to handle retry logic for + failed function executions. It allows retrying a map function with + the same input upon failure, up to a specified number of times. + """ + + def __init__( + self, + response_future: ResponseFuture, + map_function: Callable[..., Any], + input: Any, + retries: Optional[int] = None, + **kwargs + ): + """ + Initialize a RetryingFuture. + + :param response_future: The initial ResponseFuture object. + :param map_function: The function to retry on failure. + :param input: The input data for the map function. + :param retries: Maximum number of retry attempts. + :param kwargs: Additional arguments to pass to the map function. + """ + self.response_future = response_future + self.map_function = map_function + self.input = input + self.retries = retries or 0 + self.map_kwargs = kwargs + self.failure_count = 0 + self.cancelled = False + + def _inc_failure_count(self): + """ + Increment the internal failure counter. + """ + self.failure_count += 1 + + def _should_retry(self): + """ + Determine whether another retry attempt should be made. + + :return: True if retry is allowed, False otherwise. + """ + return not self.cancelled and self.failure_count <= self.retries + + def _retry(self, function_executor: FunctionExecutor): + """ + Re-submit the map function using the provided FunctionExecutor. 
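The retry decision made by this class comes down to a failure counter checked against the configured retry budget, unless the future was cancelled. A compact sketch of that bookkeeping (hypothetical names; not the actual class):

```python
class _RetryState:
    """Simplified model of the failure/retry accounting in RetryingFuture."""

    def __init__(self, retries: int = 0):
        self.retries = retries      # maximum number of retry attempts
        self.failure_count = 0      # failures observed so far
        self.cancelled = False      # set by cancel(); blocks further retries

    def record_failure(self) -> bool:
        """Register one failure and report whether another attempt is allowed."""
        self.failure_count += 1
        return not self.cancelled and self.failure_count <= self.retries


state = _RetryState(retries=2)
print(state.record_failure())  # True  -> resubmit (attempt 1 of 2)
print(state.record_failure())  # True  -> resubmit (attempt 2 of 2)
print(state.record_failure())  # False -> retry budget exhausted
```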
+ + :param function_executor: An instance of FunctionExecutor to resubmit the job. + """ + inputs = [self.input] + futures_list = function_executor.map( + self.map_function, inputs, **self.map_kwargs + ) + self.response_future = futures_list[0] + + def cancel(self): + """ + Cancel any future retries. This does not cancel any running tasks. + """ + self.cancelled = True + + @property + def done(self): + """ + Check if the function execution is complete. + + :return: True if the execution is done, False otherwise. + """ + return self.response_future.done + + @property + def error(self): + """ + Get the error from the function execution, if any. + + :return: An exception or error message if an error occurred. + """ + return self.response_future.error + + @property + def _exception(self): + """ + Get the exception tuple (type, value, traceback) from the execution. + + :return: Exception tuple. + """ + return self.response_future._exception + + @property + def stats(self): + """ + Get execution statistics. + + :return: A dictionary containing performance and usage metrics. + """ + return self.response_future.stats + + def status( + self, + throw_except: bool = True, + internal_storage: Any = None, + check_only: bool = False, + ): + """ + Return the current status of the function execution. + + :param throw_except: Whether to raise any captured exception. + :param internal_storage: Optional internal storage reference. + :param check_only: If True, only checks status without updating. + :return: Execution status string. + """ + stat = self.response_future.status( + throw_except=throw_except, + internal_storage=internal_storage, + check_only=check_only, + ) + if self.response_future.error: + reraise(*self.response_future._exception) + return stat + + def result(self, throw_except: bool = True, internal_storage: Any = None): + """ + Get the result of the function execution. + + :param throw_except: Whether to raise any captured exception. + :param internal_storage: Optional internal storage reference. + :return: The result of the executed function. + """ + res = self.response_future.result( + throw_except=throw_except, internal_storage=internal_storage + ) + if self.response_future.error: + reraise(*self.response_future._exception) + return res + + +class RetryingFunctionExecutor: + """ + A wrapper around `FunctionExecutor` that adds automatic retry capabilities to function invocations. + This class allows spawning multiple function activations and handling failures by retrying them + according to the configured retry policy. + + It provides the same interface as `FunctionExecutor` for compatibility, with an extra `retries` parameter + in `map()` to control the number of retries per invocation. + + :param executor: An instance of FunctionExecutor (e.g., Localhost, Serverless, or Standalone) + """ + + def __init__(self, executor: FunctionExecutor): + self.executor = executor + self.config = executor.config + + def __enter__(self): + """ + Context manager entry. Delegates to the inner FunctionExecutor. + """ + self.executor.__enter__() + return self + + def __exit__(self, exc_type, exc_value, traceback): + """ + Context manager exit. Delegates to the inner FunctionExecutor. 
+ """ + self.executor.__exit__(exc_type, exc_value, traceback) + + def map( + self, + map_function: Callable, + map_iterdata: List[Union[List[Any], Tuple[Any, ...], Dict[str, Any]]], + chunksize: Optional[int] = None, + extra_args: Optional[Union[List[Any], Tuple[Any, ...], Dict[str, Any]]] = None, + extra_env: Optional[Dict[str, str]] = None, + runtime_memory: Optional[int] = None, + obj_chunk_size: Optional[int] = None, + obj_chunk_number: Optional[int] = None, + obj_newline: Optional[str] = '\n', + timeout: Optional[int] = None, + include_modules: Optional[List[str]] = [], + exclude_modules: Optional[List[str]] = [], + retries: Optional[int] = None, + ) -> List[RetryingFuture]: + """ + Spawn multiple function activations with automatic retry on failure. + + :param map_function: The function to map over the data. + :param map_iterdata: An iterable of input data (e.g., Python list). + :param chunksize: Split map_iterdata in chunks of this size. One worker per chunk. + :param extra_args: Additional arguments to pass to each function. + :param extra_env: Additional environment variables for the function environment. + :param runtime_memory: Memory (in MB) to allocate per function activation. + :param obj_chunk_size: For file processing. Split each object into chunks of this size (in bytes). + :param obj_chunk_number: For file processing. Number of chunks to split each object into. + :param obj_newline: Newline character for line integrity in file partitioning. + :param timeout: Max time per function activation (in seconds). + :param include_modules: Explicitly pickle these dependencies. + :param exclude_modules: Explicitly exclude these modules from pickling. + :param retries: Number of retries for each function activation upon failure. + + :return: A list of RetryingFuture objects, one for each function activation. + """ + + retries_to_use = ( + retries + if retries is not None + else self.config.get('lithops', {}).get('retries', 0) + ) + + futures_list = self.executor.map( + map_function, + map_iterdata, + chunksize=chunksize, + extra_args=extra_args, + extra_env=extra_env, + runtime_memory=runtime_memory, + obj_chunk_size=obj_chunk_size, + obj_chunk_number=obj_chunk_number, + obj_newline=obj_newline, + timeout=timeout, + include_modules=include_modules, + exclude_modules=exclude_modules, + ) + return [ + RetryingFuture( + f, + map_function=map_function, + input=i, + retries=retries_to_use, + chunksize=chunksize, + extra_args=extra_args, + extra_env=extra_env, + runtime_memory=runtime_memory, + obj_chunk_size=obj_chunk_size, + obj_chunk_number=obj_chunk_number, + obj_newline=obj_newline, + timeout=timeout, + include_modules=include_modules, + exclude_modules=exclude_modules, + ) + for i, f in zip(map_iterdata, futures_list) + ] + + def wait( + self, + fs: List[RetryingFuture], + throw_except: Optional[bool] = True, + return_when: Optional[Any] = ALL_COMPLETED, + download_results: Optional[bool] = False, + timeout: Optional[int] = None, + threadpool_size: Optional[int] = THREADPOOL_SIZE, + wait_dur_sec: Optional[int] = WAIT_DUR_SEC, + show_progressbar: Optional[bool] = True, + ) -> Tuple[List[RetryingFuture], List[RetryingFuture]]: + """ + Wait for a set of futures to complete, retrying any that fail. + + :param fs: List of RetryingFuture objects to wait on. + :param throw_except: Raise exceptions encountered during execution. + :param return_when: Completion policy. One of: ALWAYS, ANY_COMPLETED, or ALL_COMPLETED. + :param download_results: Whether to download results after completion. 
+ :param timeout: Maximum wait time (in seconds). + :param threadpool_size: Number of threads used for polling. + :param wait_dur_sec: Polling interval (in seconds). + :param show_progressbar: Show progress bar for the wait operation. + + :return: A tuple (done, pending) of lists of RetryingFutures. + """ + lookup = {f.response_future: f for f in fs} + + while True: + response_futures = [f.response_future for f in fs] + + done, pending = self.executor.wait( + response_futures, + throw_except=throw_except, + return_when=return_when, + download_results=download_results, + timeout=timeout, + threadpool_size=threadpool_size, + wait_dur_sec=wait_dur_sec, + show_progressbar=show_progressbar, + ) + + retrying_done = [] + retrying_pending = [lookup[response_future] for response_future in pending] + for response_future in done: + retrying_future = lookup[response_future] + if response_future.error: + retrying_future._inc_failure_count() + if retrying_future._should_retry(): + retrying_future._retry(self.executor) + retrying_pending.append(retrying_future) + lookup[retrying_future.response_future] = retrying_future + else: + retrying_done.append(retrying_future) + else: + retrying_done.append(retrying_future) + + if return_when == ALWAYS: + break + elif return_when == ANY_COMPLETED and len(retrying_done) > 0: + break + elif return_when == ALL_COMPLETED and len(retrying_pending) == 0: + break + + return retrying_done, retrying_pending + + def clean( + self, + fs: Optional[Union[ResponseFuture, List[ResponseFuture]]] = None, + cs: Optional[List[CloudObject]] = None, + clean_cloudobjects: Optional[bool] = True, + clean_fn: Optional[bool] = False, + force: Optional[bool] = False + ): + """ + Cleans up temporary files and objects related to this executor, including: + - Function packages + - Serialized input/output data + - Cloud objects (if specified) + + :param fs: List of futures to clean. + :param cs: List of cloudobjects to clean. + :param clean_cloudobjects: Whether to delete all cloudobjects created with this executor. + :param clean_fn: Whether to delete cached functions. + :param force: Force cleanup even for unfinished futures. 
+ """ + self.executor.clean(fs, cs, clean_cloudobjects, clean_fn, force) diff --git a/lithops/scripts/cleaner.py b/lithops/scripts/cleaner.py index a56e7046d..43c06bd0a 100644 --- a/lithops/scripts/cleaner.py +++ b/lithops/scripts/cleaner.py @@ -23,7 +23,7 @@ from lithops.storage import Storage from lithops.storage.utils import clean_bucket -from lithops.constants import JOBS_PREFIX, TEMP_PREFIX, CLEANER_DIR,\ +from lithops.constants import JOBS_PREFIX, TEMP_PREFIX, CLEANER_DIR, \ CLEANER_PID_FILE, CLEANER_LOG_FILE log_file_stream = open(CLEANER_LOG_FILE, 'a') @@ -41,7 +41,8 @@ def clean_executor_jobs(executor_id, executor_data): storage = None - prefix = '/'.join([JOBS_PREFIX, executor_id]) + + logger.debug(f"Cleaning Executor ID: {executor_id}") for file_data in executor_data: file_location = file_data['file_location'] @@ -49,32 +50,21 @@ def clean_executor_jobs(executor_id, executor_data): storage_config = data['storage_config'] clean_cloudobjects = data['clean_cloudobjects'] - if not storage: - storage = Storage(storage_config=storage_config) - logger.info(f'Cleaning jobs {", ".join([job_key for job_key in data["jobs_to_clean"]])}') + logger.debug(f"File location: {file_location}") - objects = storage.list_keys(storage.bucket, prefix) - - objects_to_delete = [ - key for key in objects - if '-'.join(key.split('/')[1].split('-')[0:3]) - in data['jobs_to_clean'] - ] + if not storage: + storage = Storage(storage_config=storage_config) - while objects_to_delete: - storage.delete_objects(storage.bucket, objects_to_delete) - time.sleep(5) - objects = storage.list_keys(storage.bucket, prefix) - objects_to_delete = [ - key for key in objects - if '-'.join(key.split('/')[1].split('-')[0:3]) - in data['jobs_to_clean'] - ] + for job_key in data['jobs_to_clean']: + prefix = '/'.join([JOBS_PREFIX, job_key]) + '/' + logger.debug(f"Cleaning data from {prefix}") + clean_bucket(storage, storage.bucket, prefix) if clean_cloudobjects: for job_key in data['jobs_to_clean']: - prefix = '/'.join([TEMP_PREFIX, job_key]) + prefix = '/'.join([TEMP_PREFIX, job_key]) + '/' + logger.debug(f"Cleaning cloudobjects from {prefix}") clean_bucket(storage, storage.bucket, prefix) if os.path.exists(file_location): @@ -108,10 +98,10 @@ def clean_functions(functions_data): data = functions_data['data'] executor_id = data['fn_to_clean'] - logger.info(f'Going to clean functions from {executor_id}') storage_config = data['storage_config'] storage = Storage(storage_config=storage_config) prefix = '/'.join([JOBS_PREFIX, executor_id]) + '/' + logger.info(f'Cleaning functions from {prefix}') key_list = storage.list_keys(storage.bucket, prefix) storage.delete_objects(storage.bucket, key_list) diff --git a/lithops/scripts/cli.py b/lithops/scripts/cli.py index a9feb8dd6..7af456ebc 100644 --- a/lithops/scripts/cli.py +++ b/lithops/scripts/cli.py @@ -1,5 +1,6 @@ # # (C) Copyright Cloudlab URV 2020 +# (C) Copyright IBM Corp. 2023 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,23 +24,44 @@ import shlex import subprocess as sp from itertools import cycle +from tabulate import tabulate +from datetime import datetime from concurrent.futures import ThreadPoolExecutor import lithops from lithops import Storage from lithops.version import __version__ -from lithops.tests.tests_main import print_test_functions, print_test_groups, run_tests -from lithops.utils import get_mode, setup_lithops_logger, verify_runtime_name, sizeof_fmt -from lithops.config import default_config, extract_storage_config, \ - extract_serverless_config, extract_standalone_config, \ - extract_localhost_config, load_yaml_config -from lithops.constants import CACHE_DIR, LITHOPS_TEMP_DIR, RUNTIMES_PREFIX, \ - JOBS_PREFIX, LOCALHOST, SA_IMAGE_NAME_DEFAULT, SERVERLESS, STANDALONE, LOGS_DIR, FN_LOG_FILE +from lithops.utils import ( + get_mode, + setup_lithops_logger, + verify_runtime_name, + sizeof_fmt +) +from lithops.config import ( + default_config, + extract_storage_config, + extract_serverless_config, + extract_standalone_config, + extract_localhost_config, + load_yaml_config +) +from lithops.constants import ( + CACHE_DIR, + LITHOPS_TEMP_DIR, + RUNTIMES_PREFIX, + JOBS_PREFIX, + LOCALHOST, + SERVERLESS, + STANDALONE, + LOGS_DIR, + FN_LOG_FILE, + STANDALONE_BACKENDS +) from lithops.storage import InternalStorage from lithops.serverless import ServerlessHandler from lithops.storage.utils import clean_bucket -from lithops.standalone.standalone import StandaloneHandler -from lithops.localhost.localhost import LocalhostHandler +from lithops.standalone import StandaloneHandler +from lithops.localhost import LocalhostHandler logger = logging.getLogger(__name__) @@ -108,15 +130,15 @@ def clean(config, backend, storage, debug, region, all): storage = internal_storage.storage runtimes_path = RUNTIMES_PREFIX + '/' + backend jobs_path = JOBS_PREFIX - clean_bucket(storage, storage_config['bucket'], runtimes_path, sleep=1) - clean_bucket(storage, storage_config['bucket'], jobs_path, sleep=1) + clean_bucket(storage, storage.bucket, runtimes_path, sleep=1) + clean_bucket(storage, storage.bucket, jobs_path, sleep=1) # Clean localhost executor temp dirs shutil.rmtree(LITHOPS_TEMP_DIR, ignore_errors=True) # Clean local lithops runtime cache shutil.rmtree(os.path.join(CACHE_DIR, RUNTIMES_PREFIX, backend), ignore_errors=True) - logger.info('All Lithops data cleaned') + logger.info('All Lithops temporary data cleaned') @lithops_cli.command('test') @@ -125,30 +147,39 @@ def clean(config, backend, storage, debug, region, all): @click.option('--storage', '-s', default=None, help='Storage backend') @click.option('--debug', '-d', is_flag=True, help='Debug mode') @click.option('--region', '-r', default=None, help='compute backend region') -@click.option('--test', '-t', default='all', help='Run a specific tester. To avoid running similarly named tests ' - 'you may prefix the tester with its test class, ' - 'e.g. TestClass.test_name. ' - 'Type "-t help" for the complete tests list') -@click.option('--groups', '-g', default=None, help='Run all testers belonging to a specific group.' - ' type "-g help" for groups list') -@click.option('--fail_fast', '-f', is_flag=True, help='Stops test run upon first occurrence of a failed test') -@click.option('--keep_datasets', '-k', is_flag=True, help='keeps datasets in storage after the test run. 
' - 'Meant to serve some use-cases in github workflow.') -def test(test, config, backend, groups, storage, debug, region, fail_fast, keep_datasets): - config = load_yaml_config(config) if config else None +@click.option('--test', '-t', default=None, help='Run a specific test. To avoid running similarly named tests ' + 'you may prefix the tester with its test class, ' + 'e.g. TestAsync::test_call_async' + 'Type "-t help" for the complete tests list') +@click.option('--exitfirst', '-x', is_flag=True, help='Stops test run upon first occurrence of a failed test') +def test(test, config, backend, storage, debug, region, exitfirst): + import pytest - log_level = logging.INFO if not debug else logging.DEBUG - setup_lithops_logger(log_level) - - if groups and test == 'all': # if user specified a group(s) avoid running all tests. - test = '' + dir_path = os.path.dirname(os.path.realpath(__file__)) + tests_path = os.path.abspath(os.path.join(dir_path, '..', 'tests')) if test == 'help': - print_test_functions() - elif groups == 'help': - print_test_groups() + pytest.main([tests_path, "--collect-only"]) else: - run_tests(test, config, groups, backend, storage, region, fail_fast, keep_datasets) + cmd_string = [tests_path, "-v"] + if exitfirst: + cmd_string.extend(["-x"]) + if debug: + cmd_string.extend(["-o", "log_cli=true", "--log-cli-level=DEBUG"]) + if config: + cmd_string.extend(["--config", config]) + if backend: + cmd_string.extend(["--backend", backend]) + if storage: + cmd_string.extend(["--storage", storage]) + if region: + cmd_string.extend(["--region", region]) + if test: + cmd_string.extend(["-k", test]) + + print("Executing lithops tests: pytest " + ' '.join(cmd_string[1:])) + + pytest.main(cmd_string) @lithops_cli.command('hello') @@ -200,14 +231,22 @@ def attach(config, backend, start, debug, region): setup_lithops_logger(log_level) config_ow = set_config_ow(backend=backend, region=region) - config = default_config(config_data=config, config_overwrite=config_ow) + config = default_config(config_data=config, config_overwrite=config_ow, load_storage_config=False) if config['lithops']['mode'] != STANDALONE: - raise Exception('lithops attach method is only available for standalone backends') + raise Exception('lithops attach method is only available for standalone backends. 
' + f'Please use "lithops attach -b {set(STANDALONE_BACKENDS)}"') compute_config = extract_standalone_config(config) compute_handler = StandaloneHandler(compute_config) + + if not compute_handler.is_initialized(): + logger.info("The backend is not initialized") + return compute_handler.init() + if not start and not compute_handler.backend.master.is_ready(): + logger.info(f"{compute_handler.backend.master} is stopped") + return if start: compute_handler.backend.master.start() @@ -343,23 +382,21 @@ def list_bucket(prefix, bucket, backend, debug, config): logger.info('Listing objects in bucket {}'.format(bucket)) objects = storage.list_objects(bucket, prefix=prefix) - if objects: - width = max([len(obj['Key']) for obj in objects]) + objs = [ + { + key: obj[key].strftime("%b %d %Y %H:%M:%S") if key == 'LastModified' else sizeof_fmt(obj[key]) if key == 'Size' else obj[key] + for key in ('Key', 'LastModified', 'Size') + if key in obj + } + for obj in objects + ] - print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width)) - print('-' * width, '\t', '-' * 20, '\t', '-' * 9) - for obj in objects: - key = obj['Key'] - date = obj['LastModified'].strftime("%b %d %Y %H:%M:%S") - size = sizeof_fmt(obj['Size']) - print('{:{width}} \t {} \t {:>9}'.format(key, date, size, width=width)) + if objs[0]: print() - print('Total objects: {}'.format(len(objects))) + print(tabulate(objs, headers="keys")) + print(f'\nTotal objects: {len(objs)}') else: - width = 10 - print('\n{:{width}} \t {} \t\t {:>9}'.format('Key', 'Last modified', 'Size', width=width)) - print('-' * width, '\t', '-' * 20, '\t', '-' * 9) - print('\nThe bucket is empty') + print(f'\nNo information can be listed from bucket \"{bucket}\" using current \"{storage.backend}\" backend') # /---------------------------------------------------------------------------/ @@ -434,8 +471,8 @@ def runtime(ctx): @click.pass_context def build(ctx, name, file, config, backend, debug): """ build a serverless runtime. """ - log_level = logging.INFO if not debug else logging.DEBUG - setup_lithops_logger(log_level) + # log_level = logging.INFO if not debug else logging.DEBUG + setup_lithops_logger(logging.DEBUG) verify_runtime_name(name) @@ -493,12 +530,14 @@ def deploy(name, storage, backend, memory, timeout, config, debug): logger.info('Runtime deployed') + @runtime.command('list') +@click.argument('name', default='all', required=False) @click.option('--config', '-c', default=None, help='path to yaml config file', type=click.Path(exists=True)) @click.option('--backend', '-b', default=None, help='compute backend') @click.option('--storage', '-s', default=None, help='storage backend') @click.option('--debug', '-d', is_flag=True, help='debug mode') -def list_runtimes(config, backend, storage, debug): +def list_runtimes(name, config, backend, storage, debug): """ list all deployed serverless runtime. 
""" log_level = logging.INFO if not debug else logging.DEBUG setup_lithops_logger(log_level) @@ -512,25 +551,13 @@ def list_runtimes(config, backend, storage, debug): compute_config = extract_serverless_config(config) compute_handler = ServerlessHandler(compute_config, None) - runtimes = compute_handler.list_runtimes() - - if runtimes: - width = max([len(runtime[0]) for runtime in runtimes]) - - print('\n{:{width}} \t {} \t {}'.format('Runtime Name', 'Memory Size', 'Lithops Version', width=width)) - print('-' * width, '\t', '-' * 13, '\t', '-' * 17) - for runtime in runtimes: - name = runtime[0] - mem = runtime[1] - ver = runtime[2] if len(runtime) == 3 else 'NaN' - print('{:{width}} \t {} MB \t {}'.format(name, mem, ver, width=width)) - print() - print('Total runtimes: {}'.format(len(runtimes))) - else: - width = 14 - print('\n{:{width}} \t {} \t {}'.format('Runtime Name', 'Memory Size', 'Lithops Version', width=width)) - print('-' * width, '\t', '-' * 13, '\t', '-' * 17) - print('\nNo runtimes deployed') + runtimes = compute_handler.list_runtimes(runtime_name=name) + + headers = ['Runtime Name', 'Memory Size', 'Lithops Version', 'Worker Name'] + + print() + print(tabulate(runtimes, headers=headers)) + print(f'\nTotal runtimes: {len(runtimes)}') @runtime.command('update') @@ -574,6 +601,7 @@ def update(name, config, backend, storage, debug): logger.info('Runtime updated') + @runtime.command('delete') @click.argument('name', required=True) @click.option('--config', '-c', default=None, help='path to yaml config file', type=click.Path(exists=True)) @@ -605,6 +633,8 @@ def delete(name, config, memory, version, backend, storage, debug): runtime_name = runtime_info['runtime_name'] runtimes = compute_handler.list_runtimes(runtime_name) + runtimes_to_delete = [] + for runtime in runtimes: to_delete = True if memory is not None and runtime[1] != int(memory): @@ -612,11 +642,170 @@ def delete(name, config, memory, version, backend, storage, debug): if version is not None and runtime[2] != version: to_delete = False if to_delete: - compute_handler.delete_runtime(runtime[0], runtime[1], runtime[2]) - runtime_key = compute_handler.get_runtime_key(runtime[0], runtime[1], runtime[2]) - internal_storage.delete_runtime_meta(runtime_key) + runtimes_to_delete.append((runtime[0], runtime[1], runtime[2])) + + if not runtimes_to_delete: + logger.info("Runtime not found") + return + + for runtime in runtimes_to_delete: + compute_handler.delete_runtime(runtime[0], runtime[1], runtime[2]) + runtime_key = compute_handler.get_runtime_key(runtime[0], runtime[1], runtime[2]) + internal_storage.delete_runtime_meta(runtime_key) + + logger.info("Runtime deleted") + + +# /---------------------------------------------------------------------------/ +# +# lithops jobs +# +# /---------------------------------------------------------------------------/ + +@click.group('job') +@click.pass_context +def job(ctx): + pass - logger.info('Runtime deleted') + +@job.command('list', context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) +@click.option('--config', '-c', default=None, help='path to yaml config file', type=click.Path(exists=True)) +@click.option('--backend', '-b', default=None, help='compute backend') +@click.option('--region', '-r', default=None, help='compute backend region') +@click.option('--debug', '-d', is_flag=True, help='debug mode') +def list_jobs(config, backend, region, debug): + """ List Standalone Jobs """ + log_level = logging.INFO if not debug else logging.DEBUG + 
setup_lithops_logger(log_level) + + config = load_yaml_config(config) if config else None + config_ow = set_config_ow(backend=backend, region=region) + config = default_config(config_data=config, config_overwrite=config_ow, load_storage_config=False) + + if config['lithops']['mode'] != STANDALONE: + raise Exception('"lithops job list" command is only available for standalone backends. ' + f'Please use "lithops job list -b {set(STANDALONE_BACKENDS)}"') + + compute_config = extract_standalone_config(config) + compute_handler = StandaloneHandler(compute_config) + + if not compute_handler.is_initialized(): + logger.info("The backend is not initialized") + return + + compute_handler.init() + + if not compute_handler.backend.master.is_ready(): + logger.info(f"{compute_handler.backend.master} is stopped") + return + + if not compute_handler._is_master_service_ready(): + logger.info(f"Lithops service is not running in {compute_handler.backend.master}") + return + + logger.info(f'Listing jobs submitted to {compute_handler.backend.master}') + job_list = compute_handler.list_jobs() + + headers = job_list.pop(0) + key_index = headers.index("Submitted") + + try: + import pytz + from tzlocal import get_localzone + local_tz = get_localzone() + + def convert_utc_to_local(utc_timestamp): + utc_time = datetime.strptime(utc_timestamp, '%Y-%m-%d %H:%M:%S %Z') + utc_time = utc_time.replace(tzinfo=pytz.utc) + local_time = utc_time.astimezone(local_tz) + return local_time.strftime('%Y-%m-%d %H:%M:%S %Z') + + for row in job_list: + row[key_index] = convert_utc_to_local(row[key_index]) + except ModuleNotFoundError: + pass + + sorted_data = sorted(job_list, key=lambda x: x[key_index]) + + print() + print(tabulate(sorted_data, headers=headers)) + print(f'\nTotal jobs: {len(job_list)}') + + +# /---------------------------------------------------------------------------/ +# +# lithops workers +# +# /---------------------------------------------------------------------------/ + +@click.group('worker') +@click.pass_context +def worker(ctx): + pass + + +@worker.command('list', context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) +@click.option('--config', '-c', default=None, help='path to yaml config file', type=click.Path(exists=True)) +@click.option('--backend', '-b', default=None, help='compute backend') +@click.option('--region', '-r', default=None, help='compute backend region') +@click.option('--debug', '-d', is_flag=True, help='debug mode') +def list_workers(config, backend, region, debug): + """ List Standalone Jobs """ + log_level = logging.INFO if not debug else logging.DEBUG + setup_lithops_logger(log_level) + + config = load_yaml_config(config) if config else None + config_ow = set_config_ow(backend=backend, region=region) + config = default_config(config_data=config, config_overwrite=config_ow, load_storage_config=False) + + if config['lithops']['mode'] != STANDALONE: + raise Exception('"lithops worker list" command is only available for standalone backends. 
' + f'Please use "lithops worker list -b {set(STANDALONE_BACKENDS)}"') + + compute_config = extract_standalone_config(config) + compute_handler = StandaloneHandler(compute_config) + + if not compute_handler.is_initialized(): + logger.info("The backend is not initialized") + return + + compute_handler.init() + + if not compute_handler.backend.master.is_ready(): + logger.info(f"{compute_handler.backend.master} is stopped") + return + + if not compute_handler._is_master_service_ready(): + logger.info(f"Lithops service is not running in {compute_handler.backend.master}") + return + + logger.info(f'Listing available workers in {compute_handler.backend.master}') + worker_list = compute_handler.list_workers() + + headers = worker_list.pop(0) + key_index = headers.index("Created") + + try: + import pytz + from tzlocal import get_localzone + local_tz = get_localzone() + + def convert_utc_to_local(utc_timestamp): + utc_time = datetime.strptime(utc_timestamp, '%Y-%m-%d %H:%M:%S %Z') + utc_time = utc_time.replace(tzinfo=pytz.utc) + local_time = utc_time.astimezone(local_tz) + return local_time.strftime('%Y-%m-%d %H:%M:%S %Z') + + for row in worker_list: + row[key_index] = convert_utc_to_local(row[key_index]) + except ModuleNotFoundError: + pass + + sorted_data = sorted(worker_list, key=lambda x: x[key_index]) + + print() + print(tabulate(sorted_data, headers=headers)) + print(f'\nTotal workers: {len(worker_list)}') # /---------------------------------------------------------------------------/ @@ -639,28 +828,64 @@ def image(ctx): @click.option('--region', '-r', default=None, help='compute backend region') @click.option('--debug', '-d', is_flag=True, help='debug mode') @click.option('--overwrite', '-o', is_flag=True, help='overwrite the image if it already exists') +@click.option('--include', '-i', multiple=True, help='include source:destination paths', type=str) @click.pass_context -def build_image(ctx, name, file, config, backend, region, debug, overwrite): +def build_image(ctx, name, file, config, backend, region, debug, overwrite, include): """ build a VM image """ setup_lithops_logger(logging.DEBUG) - name = SA_IMAGE_NAME_DEFAULT if not name else name - verify_runtime_name(name) + if name: + verify_runtime_name(name) config = load_yaml_config(config) if config else None config_ow = set_config_ow(backend=backend, region=region) config = default_config(config_data=config, config_overwrite=config_ow, load_storage_config=False) if config['lithops']['mode'] != STANDALONE: - raise Exception('"lithops image build" command is only available for standalone backends') + raise Exception('"lithops image build" command is only available for standalone backends. 
' + f'Please use "lithops image build -b {set(STANDALONE_BACKENDS)}"') + + for src_dst_file in include: + src_file, dst_file = src_dst_file.split(':') + if not os.path.isfile(src_file): + raise FileNotFoundError(f"The file '{src_file}' does not exist") compute_config = extract_standalone_config(config) compute_handler = StandaloneHandler(compute_config) - compute_handler.build_image(name, file, overwrite, ctx.args) + compute_handler.build_image(name, file, overwrite, include, ctx.args) logger.info('VM Image built') +@image.command('delete', context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) +@click.argument('name', required=True) +@click.option('--config', '-c', default=None, help='path to yaml config file', type=click.Path(exists=True)) +@click.option('--backend', '-b', default=None, help='compute backend') +@click.option('--region', '-r', default=None, help='compute backend region') +@click.option('--debug', '-d', is_flag=True, help='debug mode') +@click.pass_context +def delete_image(ctx, name, config, backend, region, debug): + """ Delete a VM image """ + setup_lithops_logger(logging.DEBUG) + + if name: + verify_runtime_name(name) + + config = load_yaml_config(config) if config else None + config_ow = set_config_ow(backend=backend, region=region) + config = default_config(config_data=config, config_overwrite=config_ow, load_storage_config=False) + + if config['lithops']['mode'] != STANDALONE: + raise Exception('"lithops image delete" command is only available for standalone backends. ' + f'Please use "lithops image delete -b {set(STANDALONE_BACKENDS)}"') + + compute_config = extract_standalone_config(config) + compute_handler = StandaloneHandler(compute_config) + compute_handler.delete_image(name) + + logger.info('VM Image deleted') + + @image.command('list', context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) @click.option('--config', '-c', default=None, help='path to yaml config file', type=click.Path(exists=True)) @click.option('--backend', '-b', default=None, help='compute backend') @@ -676,34 +901,26 @@ def list_images(config, backend, region, debug): config = default_config(config_data=config, config_overwrite=config_ow, load_storage_config=False) if config['lithops']['mode'] != STANDALONE: - raise Exception('"lithops image build" command is only available for standalone backends') + raise Exception('"lithops image build" command is only available for standalone backends. 
' + f'Please use "lithops image list -b {set(STANDALONE_BACKENDS)}"') compute_config = extract_standalone_config(config) compute_handler = StandaloneHandler(compute_config) logger.info('Listing all Ubuntu Linux 22.04 VM Images') - images = compute_handler.list_images() + vm_images = compute_handler.list_images() - if images: - width1 = max([len(img[0]) for img in images]) - width2 = max([len(img[1]) for img in images]) - width3 = max([len(img[2]) for img in images]) + headers = ['Image Name', 'Image ID', 'Creation Date'] - print('\n{:{width1}} \t {:{width2}} {:{width3}}'.format('Image Name', 'Image ID', 'Creation Date', width1=width1, width2=width2, width3=width3)) - print('-' * width1, '\t', '-' * width2, ' ', '-' * width3) - for image in images: - print('{:{width1}} \t {:{width2}} {:{width3}}'.format(image[0], image[1], image[2], width1=width1, width2=width2, width3=width3)) - print() - print(f'Total VM images: {len(images)}') - else: - width = 14 - print('\n{:{width}} \t {} {}'.format('Image Name', 'Image ID', 'Creation Date', width=width)) - print('-' * width, '\t', '-' * width, ' ', '-' * width) - print('\nNo VM Images found') + print() + print(tabulate(vm_images, headers=headers)) + print(f'\nTotal VM images: {len(vm_images)}') lithops_cli.add_command(runtime) lithops_cli.add_command(image) +lithops_cli.add_command(job) +lithops_cli.add_command(worker) lithops_cli.add_command(logs) lithops_cli.add_command(storage) diff --git a/lithops/serverless/__init__.py b/lithops/serverless/__init__.py index 356ab9fe4..0ad376cf7 100644 --- a/lithops/serverless/__init__.py +++ b/lithops/serverless/__init__.py @@ -1 +1,3 @@ from .serverless import ServerlessHandler + +__all__ = ['ServerlessHandler'] diff --git a/lithops/serverless/backends/aliyun_fc/__init__.py b/lithops/serverless/backends/aliyun_fc/__init__.py index c67b9d612..ea36ef1a5 100644 --- a/lithops/serverless/backends/aliyun_fc/__init__.py +++ b/lithops/serverless/backends/aliyun_fc/__init__.py @@ -1 +1,3 @@ from .aliyun_fc import AliyunFunctionComputeBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/aliyun_fc/aliyun_fc.py b/lithops/serverless/backends/aliyun_fc/aliyun_fc.py index 8541dfdc4..8728b8e2e 100644 --- a/lithops/serverless/backends/aliyun_fc/aliyun_fc.py +++ b/lithops/serverless/backends/aliyun_fc/aliyun_fc.py @@ -40,7 +40,7 @@ class AliyunFunctionComputeBackend: def __init__(self, afc_config, internal_storage): logger.debug("Creating Aliyun Function Compute client") self.name = 'aliyun_fc' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.config = afc_config self.user_agent = afc_config['user_agent'] @@ -239,9 +239,9 @@ def list_runtimes(self, runtime_name='all'): for function in functions: if function['functionName'].startswith('lithops-worker'): memory = function['memorySize'] - version, name = self._unformat_function_name(function['functionName']) - if runtime_name == name or runtime_name == 'all': - runtimes.append((name, memory, version)) + version, img_name = self._unformat_function_name(function['functionName']) + if runtime_name == img_name or runtime_name == 'all': + runtimes.append((img_name, memory, version, function['functionName'])) return runtimes def invoke(self, runtime_name, memory, payload={}): diff --git a/lithops/serverless/backends/aliyun_fc/config.py b/lithops/serverless/backends/aliyun_fc/config.py index 802260018..3a6123924 100644 --- a/lithops/serverless/backends/aliyun_fc/config.py +++ 
b/lithops/serverless/backends/aliyun_fc/config.py @@ -43,6 +43,7 @@ tblib cloudpickle ps-mem +psutil """ REQ_PARAMS_1 = ('account_id', 'access_key_id', 'access_key_secret') diff --git a/lithops/serverless/backends/aws_batch/__init__.py b/lithops/serverless/backends/aws_batch/__init__.py index 209c20c93..2af7f5120 100644 --- a/lithops/serverless/backends/aws_batch/__init__.py +++ b/lithops/serverless/backends/aws_batch/__init__.py @@ -1,17 +1,3 @@ -# -# Copyright Cloudlab URV 2021 -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - from .aws_batch import AWSBatchBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/aws_batch/aws_batch.py b/lithops/serverless/backends/aws_batch/aws_batch.py index 9c9bb7b86..d0ca74b49 100644 --- a/lithops/serverless/backends/aws_batch/aws_batch.py +++ b/lithops/serverless/backends/aws_batch/aws_batch.py @@ -21,6 +21,7 @@ import logging import subprocess import sys +import botocore import time import boto3 @@ -36,43 +37,56 @@ class AWSBatchBackend: - def __init__(self, aws_batch_config, internal_storage): + def __init__(self, batch_config, internal_storage): """ Initialize AWS Batch Backend """ - logger.debug('Creating AWS Lambda client') + logger.debug('Creating AWS Batch client') self.name = 'aws_batch' - self.type = 'batch' - self.aws_batch_config = aws_batch_config + self.type = utils.BackendType.BATCH.value + self.aws_batch_config = batch_config + self.user_agent = batch_config['user_agent'] + self.region = batch_config['region'] + self.env_type = batch_config['env_type'] + self.namespace = batch_config.get('namespace') + + self.aws_session = boto3.Session( + aws_access_key_id=batch_config.get('access_key_id'), + aws_secret_access_key=batch_config.get('secret_access_key'), + aws_session_token=batch_config.get('session_token'), + region_name=self.region + ) + self.batch_client = self.aws_session.client('batch') - self.user_key = aws_batch_config['access_key_id'][-4:] - self.package = f'lithops_v{__version__.replace(".", "-")}_{self.user_key}' - self.region_name = aws_batch_config['region'] + self.batch_client = self.aws_session.client( + 'batch', config=botocore.client.Config( + user_agent_extra=self.user_agent + ) + ) - self._env_type = self.aws_batch_config['env_type'] - self._queue_name = f'{self.package}_{self._env_type.replace("_", "-")}_queue' - self._compute_env_name = f'{self.package}_{self._env_type.replace("_", "-")}_env' + self.internal_storage = internal_storage - logger.debug('Creating Boto3 AWS Session and Batch Client') - self.aws_session = boto3.Session(aws_access_key_id=aws_batch_config['access_key_id'], - aws_secret_access_key=aws_batch_config['secret_access_key'], - aws_session_token=aws_batch_config.get('session_token'), - region_name=self.region_name) - self.batch_client = self.aws_session.client('batch', region_name=self.region_name) + if 'account_id' not in batch_config or 'user_id' not in batch_config: + sts_client = self.aws_session.client('sts') + identity = 
sts_client.get_caller_identity() - self.internal_storage = internal_storage + self.account_id = batch_config.get('account_id') or identity["Account"] + self.user_id = batch_config.get('user_id') or identity["UserId"] + self.user_key = self.user_id.split(":")[0][-4:].lower() - if 'account_id' in self.aws_batch_config: - self.account_id = self.aws_batch_config['account_id'] - else: - sts_client = self.aws_session.client('sts', region_name=self.region_name) - self.account_id = sts_client.get_caller_identity()["Account"] + self.ecr_client = self.aws_session.client('ecr') + package = f'lithops_v{__version__.replace(".", "")}_{self.user_key}' + self.package = f"{package}_{self.namespace}" if self.namespace else package - self.ecr_client = self.aws_session.client('ecr', region_name=self.region_name) + self._queue_name = f'{self.package}_{self.env_type.replace("_", "-")}_queue' + self._compute_env_name = f'{self.package}_{self.env_type.replace("_", "-")}_env' msg = COMPUTE_CLI_MSG.format('AWS Batch') - logger.info("{} - Region: {}".format(msg, self.region_name)) + if self.namespace: + logger.info(f"{msg} - Region: {self.region} - Namespace: {self.namespace} - Env: {self.env_type}") + else: + logger.info(f"{msg} - Region: {self.region} - Env: {self.env_type}") def _get_default_runtime_image_name(self): python_version = utils.CURRENT_PY_VERSION.replace('.', '') @@ -81,7 +95,7 @@ def _get_default_runtime_image_name(self): def _get_full_image_name(self, runtime_name): full_image_name = runtime_name if ':' in runtime_name else f'{runtime_name}:latest' - registry = f'{self.account_id}.dkr.ecr.{self.region_name}.amazonaws.com' + registry = f'{self.account_id}.dkr.ecr.{self.region}.amazonaws.com' full_image_name = '/'.join([registry, self.package.replace('-', '.'), full_image_name]).lower() repo_name = full_image_name.split('/', 1)[1:].pop().split(':')[0] return full_image_name, registry, repo_name @@ -89,7 +103,7 @@ def _get_full_image_name(self, runtime_name): def _format_jobdef_name(self, runtime_name, runtime_memory, version=__version__): fmt_runtime_name = runtime_name.replace('/', '--').replace(':', '--') package = self.package.replace(__version__.replace(".", "-"), version.replace(".", "-")) - return f'{package}--{self._env_type}--{fmt_runtime_name}--{runtime_memory}mb' + return f'{package}--{self.env_type}--{fmt_runtime_name}--{runtime_memory}mb' def _unformat_jobdef_name(self, jobdef_name): # Default jobdef name is "lithops_v2-7-2_WU5O--FARGATE_SPOT--batch-default-runtime-v39--latest--1024mb" @@ -122,27 +136,20 @@ def _create_compute_env(self): 'securityGroupIds': self.aws_batch_config['security_groups'] } - if self._env_type == 'SPOT': + if self.env_type == 'SPOT': compute_resources_spec['allocationStrategy'] = 'SPOT_CAPACITY_OPTIMIZED' - if self._env_type in {'EC2', 'SPOT'}: + if self.env_type in {'EC2', 'SPOT'}: compute_resources_spec['instanceRole'] = self.aws_batch_config['instance_role'] compute_resources_spec['minvCpus'] = 0 compute_resources_spec['instanceTypes'] = ['optimal'] - if 'service_role' in self.aws_batch_config: - res = self.batch_client.create_compute_environment( - computeEnvironmentName=self._compute_env_name, - type='MANAGED', - computeResources=compute_resources_spec, - serviceRole=self.aws_batch_config['service_role'] - ) - else: - res = self.batch_client.create_compute_environment( - computeEnvironmentName=self._compute_env_name, - type='MANAGED', - computeResources=compute_resources_spec, - ) + res = self.batch_client.create_compute_environment( + 
computeEnvironmentName=self._compute_env_name, + type='MANAGED', + computeResources=compute_resources_spec, + serviceRole=self.aws_batch_config.get('service_role', "") + ) if res['ResponseMetadata']['HTTPStatusCode'] != 200: raise Exception(res) @@ -248,12 +255,12 @@ def _create_job_def(self, runtime_name, runtime_memory): job_def_name = self._format_jobdef_name(runtime_name, runtime_memory) job_def = self._get_job_def(job_def_name) - if self._env_type in {'EC2', 'SPOT'}: + if self.env_type in {'EC2', 'SPOT'}: platform_capabilities = ['EC2'] - elif self._env_type in {'FARGATE', 'FARGATE_SPOT'}: + elif self.env_type in {'FARGATE', 'FARGATE_SPOT'}: platform_capabilities = ['FARGATE'] else: - raise Exception(f'Unknown env type {self._env_type}') + raise Exception(f'Unknown env type {self.env_type}') if job_def is None: logger.debug(f'Creating new Job Definition {job_def_name}') @@ -262,10 +269,11 @@ def _create_job_def(self, runtime_name, runtime_memory): container_properties = { 'image': image_name, 'executionRoleArn': self.aws_batch_config['execution_role'], + 'jobRoleArn': self.aws_batch_config['job_role'], 'resourceRequirements': [ { 'type': 'VCPU', - 'value': str(self.aws_batch_config['container_vcpus']) + 'value': str(self.aws_batch_config['runtime_cpu']) }, { 'type': 'MEMORY', @@ -274,7 +282,7 @@ def _create_job_def(self, runtime_name, runtime_memory): ], } - if self._env_type in {'FARGATE', 'FARGATE_SPOT'}: + if self.env_type in {'FARGATE', 'FARGATE_SPOT'}: container_properties['networkConfiguration'] = { 'assignPublicIp': 'ENABLED' if self.aws_batch_config['assign_public_ip'] else 'DISABLED' } @@ -325,7 +333,7 @@ def _generate_runtime_meta(self, runtime_name, runtime_memory): payload['log_level'] = logger.getEffectiveLevel() logger.debug(f'Submitting get-metadata job for runtime {runtime_name}') - res = self.batch_client.submit_job( + self.batch_client.submit_job( jobName=job_name, jobQueue=self._queue_name, jobDefinition=self._format_jobdef_name(runtime_name, runtime_memory), @@ -380,9 +388,9 @@ def build_runtime(self, runtime_name, runtime_file, extra_args=[]): if runtime_file: assert os.path.isfile(runtime_file), f'Cannot locate "{runtime_file}"' - cmd = f'{docker_path} build -t {full_image_name} -f {runtime_file} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {full_image_name} -f {runtime_file} . ' else: - cmd = f'{docker_path} build -t {full_image_name} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {full_image_name} . 
' cmd = cmd + ' '.join(extra_args) try: @@ -398,7 +406,7 @@ def build_runtime(self, runtime_name, runtime_file, extra_args=[]): try: self.ecr_client.create_repository(repositoryName=repo_name, imageTagMutability='MUTABLE') - except self.ecr_client.exceptions.RepositoryAlreadyExistsException as e: + except self.ecr_client.exceptions.RepositoryAlreadyExistsException: logger.info('Repository {} already exists'.format(repo_name)) logger.debug(f'Pushing runtime {full_image_name} to AWS container registry') @@ -516,19 +524,35 @@ def list_runtimes(self, runtime_name='all'): rt_name, rt_mem, version = self._unformat_jobdef_name(jobdef_name=job_def['jobDefinitionName']) if runtime_name != 'all' and runtime_name != rt_name: continue - runtimes.append((rt_name, rt_mem, version)) + runtimes.append((rt_name, rt_mem, version, job_def['jobDefinitionName'])) return runtimes def invoke(self, runtime_name, runtime_memory, payload): + """ + Invoke a job -- return information about this invocation + """ + executor_id = payload['executor_id'] + job_id = payload['job_id'] total_calls = payload['total_calls'] + max_workers = payload['max_workers'] chunksize = payload['chunksize'] + + # Make sure only max_workers are started total_workers = total_calls // chunksize + (total_calls % chunksize > 0) + if max_workers < total_workers: + chunksize = total_calls // max_workers + (total_calls % max_workers > 0) + total_workers = total_calls // chunksize + (total_calls % chunksize > 0) + payload['chunksize'] = chunksize + + logger.debug( + f'ExecutorID {executor_id} | JobID {job_id} - Required Workers: {total_workers}' + ) job_name = '{}_{}'.format(self._format_jobdef_name(runtime_name, runtime_memory), payload['job_key']) if total_workers > 1: - res = self.batch_client.submit_job( + self.batch_client.submit_job( jobName=job_name, jobQueue=self._queue_name, jobDefinition=self._format_jobdef_name(runtime_name, runtime_memory), @@ -549,7 +573,7 @@ def invoke(self, runtime_name, runtime_memory, payload): } ) else: - res = self.batch_client.submit_job( + self.batch_client.submit_job( jobName=job_name, jobQueue=self._queue_name, jobDefinition=self._format_jobdef_name(runtime_name, runtime_memory), @@ -569,7 +593,7 @@ def invoke(self, runtime_name, runtime_memory, payload): def get_runtime_key(self, runtime_name, runtime_memory, version=__version__): jobdef_name = self._format_jobdef_name(runtime_name, runtime_memory, version) - runtime_key = os.path.join(self.name, version, self.region_name, jobdef_name) + runtime_key = os.path.join(self.name, version, self.region, jobdef_name) return runtime_key def get_runtime_info(self): diff --git a/lithops/serverless/backends/aws_batch/config.py b/lithops/serverless/backends/aws_batch/config.py index de2892810..0346a2b1a 100644 --- a/lithops/serverless/backends/aws_batch/config.py +++ b/lithops/serverless/backends/aws_batch/config.py @@ -22,24 +22,29 @@ ENV_TYPES = {'EC2', 'SPOT', 'FARGATE', 'FARGATE_SPOT'} RUNTIME_ZIP = 'lithops_aws_batch.zip' -AVAILABLE_MEM_FARGATE = [512] + [1024 * i for i in range(1, 31)] -AVAILABLE_CPU_FARGATE = [0.25, 0.5, 1, 2, 4] +# https://docs.aws.amazon.com/batch/latest/APIReference/API_ResourceRequirement.html +AVAILABLE_CPU_MEM_FARGATE = { + 0.25: [512, 1024, 2048], + 0.5: [1024, 2048, 3072, 4096], + 1: [2048, 3072, 4096, 5120, 6144, 7168, 8192], + 2: [4096, 5120, 6144, 7168, 8192, 9216, 10240, 11264, 12288, 13312, 14336, 15360, 16384], + 4: [8192 + 1024 * i for i in range(21)], # Starts at 8192, increments by 1024 up to 30720 + 8: [16384 + 4096 * i for i in 
range(12)], # Starts at 16384, increments by 4096 up to 61440 + 16: [32768 + 8192 * i for i in range(12)] # Starts at 32768, increments by 8192 up to 122880 +} DEFAULT_CONFIG_KEYS = { 'runtime_timeout': 180, # Default: 180 seconds => 3 minutes 'runtime_memory': 1024, # Default memory: 1GB + 'runtime_cpu': 0.5, 'worker_processes': 1, - 'container_vcpus': 0.5, 'env_max_cpus': 10, 'env_type': 'FARGATE_SPOT', 'assign_public_ip': True, 'subnets': [] } -RUNTIME_TIMEOUT_MAX = 7200 # Max. timeout: 7200s == 2h -RUNTIME_MEMORY_MAX = 30720 # Max. memory: 30720 MB - -REQ_PARAMS = ('execution_role', 'instance_role', 'security_groups') +REQ_PARAMS = ('execution_role', 'security_groups') DOCKERFILE_DEFAULT = """ RUN apt-get update && apt-get install -y \ @@ -58,7 +63,8 @@ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil # Copy Lithops proxy and lib to the container image. ENV APP_HOME /lithops @@ -73,15 +79,12 @@ def load_config(config_data): - if 'aws' not in config_data: - raise Exception("'aws' section is mandatory in the configuration") - - if not {'access_key_id', 'secret_access_key'}.issubset(set(config_data['aws'])): - raise Exception("'access_key_id' and 'secret_access_key' are mandatory under the 'aws' section of the configuration") - - if not config_data['aws_batch']: + if 'aws_batch' not in config_data or not config_data['aws_batch']: raise Exception("'aws_batch' section is mandatory in the configuration") + if 'aws' not in config_data: + config_data['aws'] = {} + temp = copy.deepcopy(config_data['aws_batch']) config_data['aws_batch'].update(config_data['aws']) config_data['aws_batch'].update(temp) @@ -95,38 +98,41 @@ def load_config(config_data): if key not in config_data['aws_batch']: config_data['aws_batch'][key] = DEFAULT_CONFIG_KEYS[key] - if config_data['aws_batch']['runtime_memory'] > RUNTIME_MEMORY_MAX: - logger.warning("Memory set to {} - {} exceeds " - "the maximum amount".format(RUNTIME_MEMORY_MAX, config_data['aws_batch']['runtime_memory'])) - config_data['aws_batch']['runtime_memory'] = RUNTIME_MEMORY_MAX - - if config_data['aws_batch']['runtime_timeout'] > RUNTIME_TIMEOUT_MAX: - logger.warning("Timeout set to {} - {} exceeds the " - "maximum amount".format(RUNTIME_TIMEOUT_MAX, config_data['aws_batch']['runtime_timeout'])) - config_data['aws_batch']['runtime_timeout'] = RUNTIME_TIMEOUT_MAX - - config_data['aws_batch']['max_workers'] = config_data['aws_batch']['env_max_cpus'] // config_data['aws_batch']['container_vcpus'] - if config_data['aws_batch']['env_type'] not in ENV_TYPES: raise Exception( - 'AWS Batch env type must be one of {} (is {})'.format(ENV_TYPES, config_data['aws_batch']['env_type'])) - - if config_data['aws_batch']['env_type'] in {'FARGATE, FARGATE_SPOT'}: - if config_data['aws_batch']['container_vcpus'] not in AVAILABLE_CPU_FARGATE: - raise Exception('{} container vcpus is not available for {} environment (choose one of {})'.format( - config_data['aws_batch']['runtime_memory'], config_data['aws_batch']['env_type'], - AVAILABLE_CPU_FARGATE - )) - if config_data['aws_batch']['runtime_memory'] not in AVAILABLE_MEM_FARGATE: - raise Exception('{} runtime memory is not available for {} environment (choose one of {})'.format( - config_data['aws_batch']['runtime_memory'], config_data['aws_batch']['env_type'], - AVAILABLE_MEM_FARGATE - )) + f"AWS Batch env type must be one of {ENV_TYPES} " + f"(is {config_data['aws_batch']['env_type']})" + ) + + # container_vcpus is deprectaded. 
To be removed in a future release + if 'container_vcpus' in config_data['aws_batch']: + config_data['aws_batch']['runtime_cpu'] = config_data['aws_batch'].pop('container_vcpus') + + if config_data['aws_batch']['env_type'] in {'FARGATE', 'FARGATE_SPOT'}: + runtime_memory = config_data['aws_batch']['runtime_memory'] + runtime_cpu = config_data['aws_batch']['runtime_cpu'] + env_type = config_data['aws_batch']['env_type'] + cpu_keys = list(AVAILABLE_CPU_MEM_FARGATE.keys()) + if runtime_cpu not in cpu_keys: + raise Exception( + f"'{runtime_cpu}' runtime cpu is not available for the {env_type} environment " + f"(choose one of {', '.join(map(str, cpu_keys))})" + ) + mem_keys = AVAILABLE_CPU_MEM_FARGATE[runtime_cpu] + if config_data['aws_batch']['runtime_memory'] not in mem_keys: + raise Exception( + f"'{runtime_memory}' runtime memory is not valid for {runtime_cpu} " + f"vCPU and the {env_type} environment (for {runtime_cpu}vCPU " + f"choose one of {', '.join(map(str, mem_keys))})" + ) if config_data['aws_batch']['env_type'] in {'EC2', 'SPOT'}: if 'instance_role' not in config_data['aws_batch']: raise Exception("'instance_role' mandatory for EC2 or SPOT environments") + config_data['aws_batch']['max_workers'] = config_data['aws_batch']['env_max_cpus'] \ + // config_data['aws_batch']['runtime_cpu'] + assert isinstance(config_data['aws_batch']['assign_public_ip'], bool) if 'region_name' in config_data['aws_batch']: diff --git a/lithops/serverless/backends/aws_lambda/__init__.py b/lithops/serverless/backends/aws_lambda/__init__.py index 1be4d981c..a77c55ea6 100644 --- a/lithops/serverless/backends/aws_lambda/__init__.py +++ b/lithops/serverless/backends/aws_lambda/__init__.py @@ -1,17 +1,3 @@ -# -# Copyright Cloudlab URV 2020 -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
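As an aside on the aws_batch config changes above: each Fargate vCPU size now maps to an explicit list of valid memory values, and max_workers is derived from env_max_cpus // runtime_cpu. Below is a minimal standalone sketch of that check; the table excerpt is copied from the hunk above (not exhaustive), and validate_fargate is an illustrative helper, not part of the patch.

AVAILABLE_CPU_MEM_FARGATE = {
    0.25: [512, 1024, 2048],
    0.5: [1024, 2048, 3072, 4096],
    1: [2048, 3072, 4096, 5120, 6144, 7168, 8192],
    2: [4096, 5120, 6144, 7168, 8192, 9216, 10240, 11264, 12288, 13312, 14336, 15360, 16384],
}

def validate_fargate(runtime_cpu, runtime_memory):
    # Reject unknown vCPU sizes first, then memory values not allowed for that size
    if runtime_cpu not in AVAILABLE_CPU_MEM_FARGATE:
        raise Exception(f"'{runtime_cpu}' runtime cpu is not available "
                        f"(choose one of {sorted(AVAILABLE_CPU_MEM_FARGATE)})")
    if runtime_memory not in AVAILABLE_CPU_MEM_FARGATE[runtime_cpu]:
        raise Exception(f"'{runtime_memory}' runtime memory is not valid for {runtime_cpu} vCPU")

validate_fargate(0.5, 1024)          # ok
# validate_fargate(0.5, 512)         # raises: 512 MB is only valid for 0.25 vCPU

env_max_cpus, runtime_cpu = 10, 0.5  # defaults from DEFAULT_CONFIG_KEYS
print(env_max_cpus // runtime_cpu)   # -> 20.0 max workers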
-# - from .aws_lambda import AWSLambdaBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/aws_lambda/aws_lambda.py b/lithops/serverless/backends/aws_lambda/aws_lambda.py index d20bd7a49..e01ec3787 100644 --- a/lithops/serverless/backends/aws_lambda/aws_lambda.py +++ b/lithops/serverless/backends/aws_lambda/aws_lambda.py @@ -17,6 +17,7 @@ import os import logging import boto3 +import hashlib import time import json import zipfile @@ -52,68 +53,54 @@ def __init__(self, lambda_config, internal_storage): logger.debug('Creating AWS Lambda client') self.name = 'aws_lambda' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.lambda_config = lambda_config self.internal_storage = internal_storage self.user_agent = lambda_config['user_agent'] - self.region_name = lambda_config['region'] + self.region = lambda_config['region'] self.role_arn = lambda_config['execution_role'] - - logger.debug('Creating Boto3 AWS Session and Lambda Client') + self.namespace = lambda_config.get('namespace') self.aws_session = boto3.Session( - aws_access_key_id=lambda_config['access_key_id'], - aws_secret_access_key=lambda_config['secret_access_key'], + aws_access_key_id=lambda_config.get('access_key_id'), + aws_secret_access_key=lambda_config.get('secret_access_key'), aws_session_token=lambda_config.get('session_token'), - region_name=self.region_name + region_name=self.region ) self.lambda_client = self.aws_session.client( - 'lambda', region_name=self.region_name, - config=botocore.client.Config( + 'lambda', config=botocore.client.Config( user_agent_extra=self.user_agent ) ) self.credentials = self.aws_session.get_credentials() self.session = URLLib3Session() - self.host = f'lambda.{self.region_name}.amazonaws.com' - - if 'account_id' in self.lambda_config: - self.account_id = self.lambda_config['account_id'] - else: - sts_client = self.aws_session.client('sts', region_name=self.region_name) - self.account_id = sts_client.get_caller_identity()["Account"] + self.host = f'lambda.{self.region}.amazonaws.com' - sts_client = self.aws_session.client('sts', region_name=self.region_name) - caller_id = sts_client.get_caller_identity() + if 'account_id' not in self.lambda_config or 'user_id' not in self.lambda_config: + sts_client = self.aws_session.client('sts') + identity = sts_client.get_caller_identity() - if ":" in caller_id["UserId"]: # SSO user - self.user_key = caller_id["UserId"].split(":")[1] - else: # IAM user - self.user_key = caller_id["UserId"][-4:].lower() + self.account_id = self.lambda_config.get('account_id') or identity["Account"] + self.user_id = self.lambda_config.get('user_id') or identity["UserId"] + self.user_key = self.user_id.split(":")[0][-4:].lower() - self.ecr_client = self.aws_session.client('ecr', region_name=self.region_name) - self.package = f'lithops_v{__version__.replace(".", "-")}_{self.user_key}' + self.ecr_client = self.aws_session.client('ecr') + package = f'lithops_v{__version__.replace(".", "")}_{self.user_key}' + self.package = f"{package}_{self.namespace}" if self.namespace else package msg = COMPUTE_CLI_MSG.format('AWS Lambda') - logger.info(f"{msg} - Region: {self.region_name}") + if self.namespace: + logger.info(f"{msg} - Region: {self.region} - Namespace: {self.namespace}") + else: + logger.info(f"{msg} - Region: {self.region}") def _format_function_name(self, runtime_name, runtime_memory, version=__version__): - runtime_name = runtime_name.replace('/', '__').replace('.', '').replace(':', '--') - package = 
self.package.replace(__version__.replace(".", "-"), version.replace(".", "-")) - runtime_name = package + '__' + runtime_name - - return f'{runtime_name}_{runtime_memory}MB' - - @staticmethod - def _unformat_function_name(function_name): - version, runtime = function_name.split('__', 1) - version = version.replace('lithops_v', '').split('_')[0].replace('-', '.') - runtime = runtime.replace('__', '/') - runtime = runtime.replace('--', ':') - runtime_name, runtime_memory = runtime.rsplit('_', 1) - return version, runtime_name, runtime_memory.replace('MB', '') + name = f'{runtime_name}-{runtime_memory}-{version}' + name_hash = hashlib.sha1(name.encode("utf-8")).hexdigest()[:10] + fn_name = f'lithops-worker-{self.user_key}-{version.replace(".", "")}-{name_hash}' + return f'{self.namespace}-{fn_name}' if self.namespace else fn_name def _format_layer_name(self, runtime_name, version=__version__): package = self.package.replace(__version__.replace(".", ""), version.replace(".", "")) @@ -121,11 +108,11 @@ def _format_layer_name(self, runtime_name, version=__version__): def _get_default_runtime_name(self): py_version = utils.CURRENT_PY_VERSION.replace('.', '') - return f'default-v{py_version}' + return f'default-runtime-v{py_version}' def _is_container_runtime(self, runtime_name): name = runtime_name.split('/', 1)[-1] - return 'default-v' not in name + return 'default-runtime-v' not in name def _format_repo_name(self, runtime_name): if ':' in runtime_name: @@ -197,7 +184,7 @@ def _create_layer(self, runtime_name): @param runtime_name: runtime name from which to create the layer @return: ARN of the created layer """ - logger.info('Creating default lambda layer for runtime {}'.format(runtime_name)) + logger.info(f'Creating lambda layer for runtime {runtime_name}') with zipfile.ZipFile(BUILD_LAYER_FUNCTION_ZIP, 'w') as build_layer_zip: current_location = os.path.dirname(os.path.abspath(__file__)) @@ -226,11 +213,11 @@ def _create_layer(self, runtime_name): # wait until the function is created if resp['ResponseMetadata']['HTTPStatusCode'] not in (200, 201): - msg = 'An error occurred creating/updating action {}: {}'.format(runtime_name, resp) + msg = f'An error occurred creating/updating action {runtime_name}: {resp}' raise Exception(msg) self._wait_for_function_deployed(func_name) - logger.debug('OK --> Created "layer builder" function {}'.format(runtime_name)) + logger.debug(f'OK --> Created "layer builder" function {runtime_name}'.format()) dependencies = [dependency.strip().replace(' ', '') for dependency in config.DEFAULT_REQUIREMENTS] layer_name = self._format_layer_name(runtime_name) @@ -243,9 +230,9 @@ def _create_layer(self, runtime_name): logger.debug('Invoking "layer builder" function') response = self.lambda_client.invoke(FunctionName=func_name, Payload=json.dumps(payload)) if response['ResponseMetadata']['HTTPStatusCode'] == 200: - logger.debug('OK --> Layer {} built'.format(layer_name)) + logger.debug(f'OK --> Layer {layer_name} built') else: - msg = 'An error occurred creating layer {}: {}'.format(layer_name, response) + msg = f'An error occurred creating layer {layer_name}: {response}' raise Exception(msg) finally: os.remove(BUILD_LAYER_FUNCTION_ZIP) @@ -257,10 +244,10 @@ def _create_layer(self, runtime_name): raise # Publish layer from S3 - logger.debug('Creating layer {} ...'.format(layer_name)) + logger.debug(f'Creating layer {layer_name} ...') response = self.lambda_client.publish_layer_version( LayerName=layer_name, - Description='Lithops Function for ' + self.package, + 
Description=f'Lithops layer for v{__version__} and Python v{utils.CURRENT_PY_VERSION}', Content={ 'S3Bucket': self.internal_storage.bucket, 'S3Key': layer_name @@ -284,7 +271,7 @@ def _delete_layer(self, layer_name): Delete a layer @param layer_name: Formatted layer name """ - logger.debug('Deleting lambda layer: {}'.format(layer_name)) + logger.debug(f'Deleting lambda layer: {layer_name}') versions = [] response = self.lambda_client.list_layer_versions(LayerName=layer_name) @@ -300,7 +287,7 @@ def _delete_layer(self, layer_name): VersionNumber=version ) if response['ResponseMetadata']['HTTPStatusCode'] == 204: - logger.debug('OK --> Layer {} version {} deleted'.format(layer_name, version)) + logger.debug(f'OK --> Layer {layer_name} version {version} deleted') def _list_layers(self): """ @@ -350,9 +337,9 @@ def build_runtime(self, runtime_name, runtime_file, extra_args=[]): docker_path = utils.get_docker_path() if runtime_file: assert os.path.isfile(runtime_file), f'Cannot locate "{runtime_file}"' - cmd = f'{docker_path} build -t {runtime_name} -f {runtime_file} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {runtime_name} -f {runtime_file} . ' else: - cmd = f'{docker_path} build -t {runtime_name} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {runtime_name} . ' cmd = cmd + ' '.join(extra_args) try: @@ -361,7 +348,7 @@ def build_runtime(self, runtime_name, runtime_file, extra_args=[]): finally: os.remove(LITHOPS_FUNCTION_ZIP) - registry = f'{self.account_id}.dkr.ecr.{self.region_name}.amazonaws.com' + registry = f'{self.account_id}.dkr.ecr.{self.region}.amazonaws.com' res = self.ecr_client.get_authorization_token() if res['ResponseMetadata']['HTTPStatusCode'] != 200: @@ -417,7 +404,7 @@ def _deploy_default_runtime(self, runtime_name, memory, timeout): Code={ 'ZipFile': code }, - Description='Lithops Worker for ' + self.package, + Description=f'Lithops worker for v{__version__} and Python v{utils.CURRENT_PY_VERSION}', Timeout=timeout, MemorySize=memory, Layers=[layer_arn], @@ -478,7 +465,7 @@ def _deploy_container_runtime(self, runtime_name, memory, timeout): except botocore.exceptions.ClientError: raise Exception(f'Runtime "{runtime_name}" is not deployed to ECR') - registry = f'{self.account_id}.dkr.ecr.{self.region_name}.amazonaws.com' + registry = f'{self.account_id}.dkr.ecr.{self.region}.amazonaws.com' image_uri = f'{registry}/{repo_name}@{image_digest}' env_vars = {t['name']: t['value'] for t in self.lambda_config['env_vars']} @@ -491,7 +478,7 @@ def _deploy_container_runtime(self, runtime_name, memory, timeout): 'ImageUri': image_uri }, PackageType='Image', - Description='Lithops Worker for ' + self.package, + Description=f'Lithops worker for v{__version__} and Python v{utils.CURRENT_PY_VERSION}', Timeout=timeout, MemorySize=memory, VpcConfig={ @@ -504,8 +491,9 @@ def _deploy_container_runtime(self, runtime_name, memory, timeout): for efs_conf in self.lambda_config['efs'] ], Tags={ - 'runtime_name': self.package + '/' + runtime_name, - 'lithops_version': __version__ + 'runtime_name': runtime_name, + 'lithops_version': __version__, + **self.lambda_config.get('user_tags'), }, Architectures=[self.lambda_config['architecture']], EphemeralStorage={ @@ -556,28 +544,9 @@ def delete_runtime(self, runtime_name, runtime_memory, version=__version__): self._delete_function(func_name) - # Check if layer/container image has to also be deleted - if not self.list_runtimes(runtime_name): - runtime_name = runtime_name.split('/', 1)[1] if '/' in runtime_name else runtime_name 
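To make the renaming above concrete: Lambda workers are no longer named after the runtime image; the function name is built from a short SHA-1 hash of the runtime/memory/version triple, and the original runtime name plus the Lithops version travel as function tags, which is what list_runtimes and delete_runtime rely on further down. A minimal sketch follows; the string formats and tag keys mirror the patch, while the concrete values are placeholders.

import hashlib

runtime_name = 'default-runtime-v310'   # placeholder runtime/image name
runtime_memory = 1024
version = '3.0.0'                       # stand-in for lithops.version.__version__
user_key = 'abcd'                       # last 4 chars of the caller's STS UserId
namespace = None                        # optional 'namespace' config key

name = f'{runtime_name}-{runtime_memory}-{version}'
name_hash = hashlib.sha1(name.encode('utf-8')).hexdigest()[:10]
fn_name = f'lithops-worker-{user_key}-{version.replace(".", "")}-{name_hash}'
function_name = f'{namespace}-{fn_name}' if namespace else fn_name

# The original identifiers are recorded as tags so they can be listed later
tags = {
    'runtime_name': runtime_name,
    'lithops_version': version,
}
print(function_name, tags)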
- if self._is_container_runtime(runtime_name): - if ':' in runtime_name: - image, tag = runtime_name.split(':') - else: - image, tag = runtime_name, 'latest' - package = '_'.join(func_name.split('_')[:3]) - repo_name = f"{package}/{image}" - logger.debug(f'Going to delete ECR repository {repo_name} tag {tag}') - try: - self.ecr_client.batch_delete_image(repositoryName=repo_name, imageIds=[{'imageTag': tag}]) - images = self.ecr_client.list_images(repositoryName=repo_name, filter={'tagStatus': 'TAGGED'}) - if not images['imageIds']: - logger.debug(f'Going to delete ECR repository {repo_name}') - self.ecr_client.delete_repository(repositoryName=repo_name, force=True) - except Exception: - pass - else: - layer = self._format_layer_name(runtime_name, version) - self._delete_layer(layer) + if not self._is_container_runtime(runtime_name): + layer = self._format_layer_name(runtime_name, version) + self._delete_layer(layer) def clean(self, **kwargs): """ @@ -585,9 +554,12 @@ def clean(self, **kwargs): """ logger.debug('Deleting all runtimes') + prefix = f'{self.namespace}-lithops-worker-{self.user_key}' \ + if self.namespace else f'lithops-worker-{self.user_key}' + def delete_runtimes(response): for function in response['Functions']: - if function['FunctionName'].startswith('lithops_v') and self.user_key in function['FunctionName']: + if function['FunctionName'].startswith(prefix): self._delete_function(function['FunctionName']) response = self.lambda_client.list_functions(FunctionVersion='ALL') @@ -608,11 +580,20 @@ def list_runtimes(self, runtime_name='all'): """ runtimes = [] + prefix = f'{self.namespace}-lithops-worker-{self.user_key}' \ + if self.namespace else f'lithops-worker-{self.user_key}' + def get_runtimes(response): for function in response['Functions']: - if function['FunctionName'].startswith('lithops_v') and self.user_key in function['FunctionName']: - version, rt_name, rt_memory = self._unformat_function_name(function['FunctionName']) - runtimes.append((rt_name, rt_memory, version)) + if not function['FunctionName'].startswith(prefix): + continue + fn_name = function['FunctionName'] + rt_memory = function['MemorySize'] + function_arn = function['FunctionArn'].replace(':$LATEST', '') + tags_response = self.lambda_client.list_tags(Resource=function_arn)['Tags'] + rt_name = tags_response['runtime_name'] + version = tags_response['lithops_version'] + runtimes.append((rt_name, rt_memory, version, fn_name)) response = self.lambda_client.list_functions(FunctionVersion='ALL') get_runtimes(response) @@ -621,9 +602,7 @@ def get_runtimes(response): get_runtimes(response) if runtime_name != 'all': - if self._is_container_runtime(runtime_name) and ':' not in runtime_name: - runtime_name = runtime_name + ':latest' - runtimes = [tup for tup in runtimes if runtime_name in tup[0]] + runtimes = [tup for tup in runtimes if runtime_name == tup[0]] return runtimes @@ -641,7 +620,7 @@ def invoke(self, runtime_name, runtime_memory, payload): headers = {'Host': self.host, 'X-Amz-Invocation-Type': 'Event', 'User-Agent': self.user_agent} url = f'https://{self.host}/2015-03-31/functions/{function_name}/invocations' request = AWSRequest(method="POST", url=url, data=json.dumps(payload, default=str), headers=headers) - SigV4Auth(self.credentials, "lambda", self.region_name).add_auth(request) + SigV4Auth(self.credentials, "lambda", self.region).add_auth(request) invoked = False while not invoked: @@ -658,10 +637,10 @@ def invoke(self, runtime_name, runtime_memory, payload): raise Exception('Unauthorized - 
Invalid API Key') elif r.status_code == 404: logger.debug(r.text) - raise Exception('Lithops Runtime: {} not deployed'.format(runtime_name)) + raise Exception(f'Lithops Runtime: {runtime_name} not deployed') else: logger.debug(r.text) - raise Exception('Error {}: {}'.format(r.status_code, r.text)) + raise Exception(f'Error {r.status_code}: {r.text}') # response = self.lambda_client.invoke( # FunctionName=function_name, @@ -687,7 +666,7 @@ def get_runtime_key(self, runtime_name, runtime_memory, version=__version__): in order to know which runtimes are installed and which not. """ action_name = self._format_function_name(runtime_name, runtime_memory, version) - runtime_key = os.path.join(self.name, version, self.region_name, action_name) + runtime_key = os.path.join(self.name, version, self.region, action_name) return runtime_key @@ -733,4 +712,4 @@ def _generate_runtime_meta(self, runtime_name, runtime_memory): if 'lithops_version' in result: return result else: - raise Exception('An error occurred: {}'.format(result)) + raise Exception(f'An error occurred: {result}') diff --git a/lithops/serverless/backends/aws_lambda/config.py b/lithops/serverless/backends/aws_lambda/config.py index 7178b4a9f..4e8b45dc1 100644 --- a/lithops/serverless/backends/aws_lambda/config.py +++ b/lithops/serverless/backends/aws_lambda/config.py @@ -27,7 +27,8 @@ 'cloudpickle', 'ps-mem', 'tblib', - 'urllib3<2' + 'urllib3<2', + 'psutil' ] AVAILABLE_PY_RUNTIMES = { @@ -36,7 +37,8 @@ '3.8': 'python3.8', '3.9': 'python3.9', '3.10': 'python3.10', - '3.11': 'python3.11' + '3.11': 'python3.11', + '3.12': 'python3.12' } USER_RUNTIME_PREFIX = 'lithops.user_runtimes' @@ -50,6 +52,7 @@ 'architecture': 'x86_64', 'ephemeral_storage': 512, 'env_vars': {}, + 'user_tags': {}, 'vpc': {'subnets': [], 'security_groups': []}, 'efs': [] } @@ -65,15 +68,13 @@ def load_config(config_data): - if 'aws' not in config_data: - raise Exception("'aws' section is mandatory in the configuration") - - if not {'access_key_id', 'secret_access_key'}.issubset(set(config_data['aws'])): - raise Exception("'access_key_id' and 'secret_access_key' are mandatory under the 'aws' section of the configuration") if not config_data['aws_lambda']: raise Exception("'aws_lambda' section is mandatory in the configuration") + if 'aws' not in config_data: + config_data['aws'] = {} + temp = copy.deepcopy(config_data['aws_lambda']) config_data['aws_lambda'].update(config_data['aws']) config_data['aws_lambda'].update(temp) diff --git a/lithops/serverless/backends/azure_containers/__init__.py b/lithops/serverless/backends/azure_containers/__init__.py index 95b5fd221..8bcbc1582 100644 --- a/lithops/serverless/backends/azure_containers/__init__.py +++ b/lithops/serverless/backends/azure_containers/__init__.py @@ -1 +1,3 @@ from .azure_containers import AzureContainerAppBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/azure_containers/azure_containers.py b/lithops/serverless/backends/azure_containers/azure_containers.py index d6125733a..6463c648d 100644 --- a/lithops/serverless/backends/azure_containers/azure_containers.py +++ b/lithops/serverless/backends/azure_containers/azure_containers.py @@ -39,7 +39,7 @@ class AzureContainerAppBackend: def __init__(self, ac_config, internal_storage): logger.debug("Creating Azure Container Apps client") self.name = 'azure_containers' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.ac_config = ac_config self.internal_storage = internal_storage self.trigger = 
ac_config['trigger'] @@ -115,9 +115,9 @@ def build_runtime(self, runtime_name, dockerfile, extra_args=[]): if dockerfile: assert os.path.isfile(dockerfile), f'Cannot locate "{dockerfile}"' - cmd = f'{docker_path} build -t {runtime_name} -f {dockerfile} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {runtime_name} -f {dockerfile} . ' else: - cmd = f'{docker_path} build -t {runtime_name} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {runtime_name} . ' cmd = cmd + ' '.join(extra_args) try: @@ -127,7 +127,17 @@ def build_runtime(self, runtime_name, dockerfile, extra_args=[]): finally: os.remove(config.FH_ZIP_LOCATION) + docker_user = self.ac_config.get("docker_user") + docker_password = self.ac_config.get("docker_password") + docker_server = self.ac_config.get("docker_server") + logger.debug(f'Pushing runtime {runtime_name} to container registry') + + if docker_user and docker_password: + logger.debug('Container registry credentials found in config. Logging in into the registry') + cmd = f'{docker_path} login -u {docker_user} --password-stdin {docker_server}' + utils.run_command(cmd, input=docker_password) + if utils.is_podman(docker_path): cmd = f'{docker_path} push {runtime_name} --format docker --remove-signatures' else: @@ -172,7 +182,7 @@ def _create_app(self, runtime_name, memory, timeout): ca_temaplate['properties']['template']['scale']['rules'][0]['azureQueue']['queueName'] = containerapp_name ca_temaplate['properties']['template']['scale']['maxReplicas'] = min(self.ac_config['max_workers'], 30) - cmd = f"az containerapp env show -g {self.resource_group} -n {self.environment} --query id" + cmd = f"az containerapp env show -g {self.resource_group} -n {self.environment} --query id --only-show-errors" envorinemnt_id = utils.run_command(cmd, return_result=True) ca_temaplate['properties']['managedEnvironmentId'] = envorinemnt_id @@ -193,20 +203,20 @@ def _create_app(self, runtime_name, memory, timeout): cmd = (f'az containerapp create --name {containerapp_name} ' f'--resource-group {self.resource_group} ' - f'--yaml {config.CA_JSON_LOCATION}') + f'--yaml {config.CA_JSON_LOCATION} --only-show-errors') logger.debug('Deploying Azure Container App') deployed = False retries = 0 - while retries < 15: + while retries < 10: try: - time.sleep(20) utils.run_command(cmd) os.remove(config.CA_JSON_LOCATION) deployed = True break except Exception: + time.sleep(10) retries += 1 if not deployed: @@ -218,7 +228,7 @@ def delete_runtime(self, runtime_name, memory, version=__version__): """ logger.info(f'Deleting runtime: {runtime_name} - {memory}MB') containerapp_name = self._format_containerapp_name(runtime_name, memory, version) - cmd = f'az containerapp delete --name {containerapp_name} --resource-group {self.resource_group} -y' + cmd = f'az containerapp delete --name {containerapp_name} --resource-group {self.resource_group} -y --only-show-errors' utils.run_command(cmd) try: @@ -258,7 +268,7 @@ def clean(self, **kwargs): runtimes = self.list_runtimes() - for runtime_name, runtime_memory, version in runtimes: + for runtime_name, runtime_memory, version, wk_name in runtimes: self.delete_runtime(runtime_name, runtime_memory, version) def _generate_runtime_meta(self, runtime_name, memory): @@ -300,7 +310,7 @@ def list_runtimes(self, runtime_name='all'): logger.debug('Listing all deployed runtimes') runtimes = [] - response = os.popen('az containerapp list --query "[].{Name:name, Tags:tags}\"').read() + response = os.popen('az containerapp list --query "[].{Name:name, Tags:tags}\" 
--only-show-errors').read() response = json.loads(response) for containerapp in response: @@ -310,7 +320,7 @@ def list_runtimes(self, runtime_name='all'): memory = containerapp['Tags']['runtime_memory'] version = containerapp['Tags']['lithops_version'] if runtime_name == containerapp['Name'] or runtime_name == 'all': - runtimes.append((name, memory, version)) + runtimes.append((name, memory, version, containerapp['Name'])) return runtimes diff --git a/lithops/serverless/backends/azure_containers/config.py b/lithops/serverless/backends/azure_containers/config.py index 39b27dd52..57bda0862 100644 --- a/lithops/serverless/backends/azure_containers/config.py +++ b/lithops/serverless/backends/azure_containers/config.py @@ -140,7 +140,8 @@ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil WORKDIR /app COPY lithops_azure_ca.zip . diff --git a/lithops/serverless/backends/azure_containers/entry_point.py b/lithops/serverless/backends/azure_containers/entry_point.py index f16954133..3061381e5 100644 --- a/lithops/serverless/backends/azure_containers/entry_point.py +++ b/lithops/serverless/backends/azure_containers/entry_point.py @@ -31,6 +31,7 @@ queue_name = os.environ['QueueName'] queue = QueueClient.from_connection_string(conn_str=connection_string, queue_name=queue_name) + def get_message(): message = None while not message: @@ -46,7 +47,7 @@ def get_message(): def extract_runtime_metadata(payload): runtime_meta = get_runtime_metadata() internal_storage = InternalStorage(payload['storage_config']) - status_key = payload['containerapp_name']+'.meta' + status_key = payload['containerapp_name'] + '.meta' logger.info(f"Runtime metadata key {status_key}") dmpd_response_status = json.dumps(runtime_meta) internal_storage.put_data(status_key, dmpd_response_status) diff --git a/lithops/serverless/backends/azure_functions/__init__.py b/lithops/serverless/backends/azure_functions/__init__.py index b4dc510b3..05bf7e256 100644 --- a/lithops/serverless/backends/azure_functions/__init__.py +++ b/lithops/serverless/backends/azure_functions/__init__.py @@ -1 +1,3 @@ from .azure_functions import AzureFunctionAppBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/azure_functions/azure_functions.py b/lithops/serverless/backends/azure_functions/azure_functions.py index ef9498ad8..ca639d6df 100644 --- a/lithops/serverless/backends/azure_functions/azure_functions.py +++ b/lithops/serverless/backends/azure_functions/azure_functions.py @@ -44,7 +44,7 @@ class AzureFunctionAppBackend: def __init__(self, af_config, internal_storage): logger.debug("Creating Azure Functions client") self.name = 'azure_functions' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.af_config = af_config self.trigger = af_config['trigger'] self.resource_group = af_config['resource_group'] @@ -230,7 +230,7 @@ def _create_function(self, runtime_name, memory, timeout): time.sleep(10) utils.run_command(cmd) break - except Exception as e: + except Exception: pass time.sleep(10) @@ -295,7 +295,7 @@ def invoke(self, runtime_name, memory=None, payload={}, return_result=False): raise Exception(f'Invocation error: {resp.reason} - {resp_text}') try: resp_text = json.loads(resp_text) - except Exception as e: + except Exception: raise Exception(f'Unable to load runtime metadata: {resp_text}') else: # logger.debug('Invoking calls {}'.format(', '.join(payload['call_ids']))) @@ -328,7 +328,7 @@ def clean(self, **kwargs): runtimes = self.list_runtimes() - for runtime_name, 
runtime_memory, version in runtimes: + for runtime_name, runtime_memory, version, wk_name in runtimes: self.delete_runtime(runtime_name, runtime_memory, version) def _generate_runtime_meta(self, runtime_name, memory): @@ -367,7 +367,7 @@ def list_runtimes(self, runtime_name='all'): name = functionapp['Tags']['runtime_name'] memory = config.DEFAULT_CONFIG_KEYS['runtime_memory'] if runtime_name == functionapp['Name'] or runtime_name == 'all': - runtimes.append((name, memory, version)) + runtimes.append((name, memory, version, functionapp['Name'])) return runtimes diff --git a/lithops/serverless/backends/azure_functions/config.py b/lithops/serverless/backends/azure_functions/config.py index b39fbf0d1..8c5265785 100644 --- a/lithops/serverless/backends/azure_functions/config.py +++ b/lithops/serverless/backends/azure_functions/config.py @@ -138,6 +138,7 @@ cloudpickle ps-mem tblib +psutil """ diff --git a/lithops/serverless/backends/code_engine/__init__.py b/lithops/serverless/backends/code_engine/__init__.py index 19d04ff15..e33a357eb 100644 --- a/lithops/serverless/backends/code_engine/__init__.py +++ b/lithops/serverless/backends/code_engine/__init__.py @@ -1 +1,3 @@ from .code_engine import CodeEngineBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/code_engine/code_engine.py b/lithops/serverless/backends/code_engine/code_engine.py index 8d22e113a..ce5eb59a9 100644 --- a/lithops/serverless/backends/code_engine/code_engine.py +++ b/lithops/serverless/backends/code_engine/code_engine.py @@ -16,6 +16,7 @@ # import os +import re import base64 import hashlib import json @@ -40,6 +41,7 @@ logger = logging.getLogger(__name__) + # Decorator to wrap a function to reinit clients and retry on except. def retry_on_except(func): def decorated_func(*args, **kwargs): @@ -78,7 +80,7 @@ class CodeEngineBackend: def __init__(self, ce_config, internal_storage): logger.debug("Creating IBM Code Engine client") self.name = 'code_engine' - self.type = 'batch' + self.type = utils.BackendType.BATCH.value self.config = ce_config self.internal_storage = internal_storage self.is_lithops_worker = utils.is_lithops_worker() @@ -88,7 +90,7 @@ def __init__(self, ce_config, internal_storage): self.namespace = ce_config.get('namespace') self.region = ce_config['region'] - self.user_key = self.iam_api_key[:4].lower() + self.user_key = re.sub(r'[^a-z0-9\-\.]', '0', self.iam_api_key[:4].lower()) # RFC 1123 compliant self.project_name = ce_config.get('project_name', f'lithops-{self.region}-{self.user_key}') self.project_id = None @@ -224,7 +226,7 @@ def _format_jobdef_name(self, runtime_name, runtime_memory, version=__version__) name = f'{runtime_name}-{runtime_memory}-{version}' name_hash = hashlib.sha1(name.encode("utf-8")).hexdigest()[:10] - return f'lithops-worker-{version.replace(".", "")}-{name_hash}' + return f'lithops-worker-{self.user_key}-{version.replace(".", "")}-{name_hash}' def _get_default_runtime_image_name(self): """ @@ -281,8 +283,10 @@ def _build_default_runtime(self, default_runtime_img_name): """ # Build default runtime using local dokcer dockerfile = "Dockefile.default-ce-runtime" + python_version = utils.CURRENT_PY_VERSION + base_image = "slim-buster" if int(python_version.split('.')[1]) < 13 else "bookworm" with open(dockerfile, 'w') as f: - f.write(f"FROM python:{utils.CURRENT_PY_VERSION}-slim-buster\n") + f.write(f"FROM python:{python_version}-{base_image}\n") f.write(config.DOCKERFILE_DEFAULT) try: self.build_runtime(default_runtime_img_name, 
dockerfile) @@ -338,6 +342,7 @@ def clean(self, all=False): """ Deletes all runtimes from all packages """ + logger.info(f'Cleaning project {self.project_name}') if not self._get_or_create_namespace(create=False): logger.info(f"Project {self.project_name} does not exist") if os.path.exists(self.cache_file): @@ -347,7 +352,7 @@ def clean(self, all=False): self._create_k8s_iam_client() self.clear() runtimes = self.list_runtimes() - for image_name, memory, version in runtimes: + for image_name, memory, version, fn_name in runtimes: self.delete_runtime(image_name, memory, version) logger.debug('Deleting all lithops configmaps') @@ -363,7 +368,7 @@ def clean(self, all=False): if all and os.path.exists(self.cache_file): self._create_code_engine_client() - logger.debug(f"Deleting Code Engine project: {self.project_name}") + logger.info(f"Deleting Code Engine project: {self.project_name}") self.code_engine_service_v2.delete_project(id=self.project_id) os.remove(self.cache_file) @@ -393,14 +398,18 @@ def list_runtimes(self, docker_image_name='all'): for jobdef in jobdefs['items']: try: - if jobdef['metadata']['labels']['type'] == 'lithops-runtime': - version = jobdef['metadata']['labels']['version'].replace('lithops_v', '') - container = jobdef['spec']['template']['containers'][0] - image_name = container['image'] - memory = container['resources']['requests']['memory'].replace('M', '') - memory = int(int(memory) / 1000 * 1024) - if docker_image_name in image_name or docker_image_name == 'all': - runtimes.append((image_name, memory, version)) + if not jobdef['metadata']['name'].startswith(f'lithops-worker-{self.user_key}'): + continue + if not jobdef['metadata']['labels']['type'] == 'lithops-runtime': + continue + fn_name = jobdef['metadata']['name'] + version = jobdef['metadata']['labels']['version'].replace('lithops_v', '') + container = jobdef['spec']['template']['containers'][0] + image_name = container['image'] + memory = container['resources']['requests']['memory'].replace('M', '') + memory = int(int(memory) / 1000 * 1024) + if docker_image_name in image_name or docker_image_name == 'all': + runtimes.append((image_name, memory, version, fn_name)) except Exception: pass @@ -451,7 +460,18 @@ def invoke(self, docker_image_name, runtime_memory, job_payload): total_calls = job_payload['total_calls'] chunksize = job_payload['chunksize'] + max_workers = job_payload['max_workers'] + + # Make sure only max_workers are started total_workers = total_calls // chunksize + (total_calls % chunksize > 0) + if max_workers < total_workers: + chunksize = total_calls // max_workers + (total_calls % max_workers > 0) + total_workers = total_calls // chunksize + (total_calls % chunksize > 0) + job_payload['chunksize'] = chunksize + + logger.debug( + f'ExecutorID {executor_id} | JobID {job_id} - Required Workers: {total_workers}' + ) jobdef_name = self._format_jobdef_name(docker_image_name, runtime_memory) @@ -536,7 +556,7 @@ def _create_container_registry_secret(self): try: self.core_api.delete_namespaced_secret("lithops-regcred", self.namespace) - except ApiException as e: + except ApiException: pass try: @@ -703,8 +723,9 @@ def _generate_runtime_meta(self, docker_image_name, memory): done = False failed = False + failed_message = "" - while not done or failed: + while not done and not failed: try: w = watch.Watch() for event in w.stream(self.custom_api.list_namespaced_custom_object, @@ -715,6 +736,14 @@ def _generate_runtime_meta(self, docker_image_name, memory): failed = 
int(event['object'].get('status')['failed']) done = int(event['object'].get('status')['succeeded']) logger.debug('...') + if failed: + try: + pod_description = self.core_api.read_namespaced_pod( + name=f'{jobrun_name}-1-0', namespace=self.namespace + ) + failed_message = pod_description.status.container_statuses[0].state.terminated.message + except Exception: + pass if done or failed: w.stop() except Exception: @@ -737,7 +766,7 @@ def _generate_runtime_meta(self, docker_image_name, memory): self._delete_config_map(config_map_name) if failed: - raise Exception("Unable to extract Python preinstalled modules from the runtime") + raise Exception(f"Unable to extract Python preinstalled modules from the runtime: {failed_message}") data_key = '/'.join([JOBS_PREFIX, jobdef_name + '.meta']) json_str = self.internal_storage.get_data(key=data_key) diff --git a/lithops/serverless/backends/code_engine/config.py b/lithops/serverless/backends/code_engine/config.py index ebc059ea0..724b72dba 100644 --- a/lithops/serverless/backends/code_engine/config.py +++ b/lithops/serverless/backends/code_engine/config.py @@ -62,7 +62,8 @@ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil ENV PORT 8080 ENV CONCURRENCY 1 diff --git a/lithops/serverless/backends/code_engine/entry_point.py b/lithops/serverless/backends/code_engine/entry_point.py index cd8377936..5fd14228c 100644 --- a/lithops/serverless/backends/code_engine/entry_point.py +++ b/lithops/serverless/backends/code_engine/entry_point.py @@ -22,7 +22,7 @@ import flask import logging from lithops.version import __version__ -from lithops.utils import setup_lithops_logger, b64str_to_dict,\ +from lithops.utils import setup_lithops_logger, b64str_to_dict, \ iterchunks from lithops.worker import function_handler from lithops.worker import function_invoker @@ -96,7 +96,7 @@ def extract_runtime_metadata(payload): runtime_meta = get_runtime_metadata() internal_storage = InternalStorage(payload) - status_key = '/'.join([JOBS_PREFIX, payload['runtime_name']+'.meta']) + status_key = '/'.join([JOBS_PREFIX, payload['runtime_name'] + '.meta']) logger.info(f"Runtime metadata key {status_key}") dmpd_response_status = json.dumps(runtime_meta) internal_storage.put_data(status_key, dmpd_response_status) diff --git a/lithops/serverless/backends/gcp_cloudrun/__init__.py b/lithops/serverless/backends/gcp_cloudrun/__init__.py index 9ae1c4923..ae7707626 100644 --- a/lithops/serverless/backends/gcp_cloudrun/__init__.py +++ b/lithops/serverless/backends/gcp_cloudrun/__init__.py @@ -1 +1,3 @@ from .cloudrun import GCPCloudRunBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/gcp_cloudrun/cloudrun.py b/lithops/serverless/backends/gcp_cloudrun/cloudrun.py index 68f9177f3..758385cd4 100644 --- a/lithops/serverless/backends/gcp_cloudrun/cloudrun.py +++ b/lithops/serverless/backends/gcp_cloudrun/cloudrun.py @@ -33,10 +33,10 @@ from lithops.constants import COMPUTE_CLI_MSG from lithops.version import __version__ -invoke_mutex = Lock() - from . 
import config +invoke_mutex = Lock() + logger = logging.getLogger(__name__) @@ -44,7 +44,7 @@ class GCPCloudRunBackend: def __init__(self, cloudrun_config, internal_storage): self.name = 'gcp_cloudrun' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.cr_config = cloudrun_config self.region = cloudrun_config['region'] self.trigger = cloudrun_config['trigger'] @@ -217,9 +217,9 @@ def build_runtime(self, runtime_name, dockerfile, extra_args=[]): if dockerfile: assert os.path.isfile(dockerfile), f'Cannot locate "{dockerfile}"' - cmd = f'{docker_path} build -t {image_name} -f {dockerfile} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {image_name} -f {dockerfile} . ' else: - cmd = f'{docker_path} build -t {image_name} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {image_name} . ' cmd = cmd + ' '.join(extra_args) try: @@ -372,12 +372,13 @@ def list_runtimes(self, runtime_name='all'): runtimes = [] for item in res['items']: labels = item['spec']['template']['metadata']['labels'] + wk_name = item['metadata']['name'] if labels and 'type' in labels and labels['type'] == 'lithops-runtime': version = labels['lithops-version'].replace('-', '.') container = item['spec']['template']['spec']['containers'][0] memory = container['resources']['limits']['memory'].replace('Mi', '') if runtime_name in container['image'] or runtime_name == 'all': - runtimes.append((container['image'], memory, version)) + runtimes.append((container['image'], memory, version, wk_name)) return runtimes diff --git a/lithops/serverless/backends/gcp_cloudrun/config.py b/lithops/serverless/backends/gcp_cloudrun/config.py index 8ec4a8425..d0340cf45 100644 --- a/lithops/serverless/backends/gcp_cloudrun/config.py +++ b/lithops/serverless/backends/gcp_cloudrun/config.py @@ -69,7 +69,8 @@ google-cloud-pubsub \ google-api-python-client \ gcsfs \ - google-auth + google-auth \ + psutil ENV PORT 8080 diff --git a/lithops/serverless/backends/gcp_functions/__init__.py b/lithops/serverless/backends/gcp_functions/__init__.py index 0e5722cba..8f3d647bc 100644 --- a/lithops/serverless/backends/gcp_functions/__init__.py +++ b/lithops/serverless/backends/gcp_functions/__init__.py @@ -1 +1,3 @@ from .gcp_functions import GCPFunctionsBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/gcp_functions/config.py b/lithops/serverless/backends/gcp_functions/config.py index c0538bab0..4963ee7c0 100644 --- a/lithops/serverless/backends/gcp_functions/config.py +++ b/lithops/serverless/backends/gcp_functions/config.py @@ -29,15 +29,14 @@ RUNTIME_MEMORY_MAX = 8192 # 8GB RUNTIME_MEMORY_OPTIONS = {128, 256, 512, 1024, 2048, 4096, 8192} -RETRIES = 5 -RETRY_SLEEP = 20 - AVAILABLE_PY_RUNTIMES = { '3.7': 'python37', '3.8': 'python38', '3.9': 'python39', '3.10': 'python310', - '3.11': 'python311' + '3.11': 'python311', + '3.12': 'python312', + '3.13': 'python313' } USER_RUNTIMES_PREFIX = 'lithops.user_runtimes' @@ -48,37 +47,30 @@ 'max_workers': 1000, 'worker_processes': 1, 'invoke_pool_threads': 1000, - 'trigger': 'pub/sub' + 'trigger': 'pub/sub', + 'retries': 5, + 'retry_sleep': 10 } REQUIREMENTS_FILE = """ -numpy -scipy -scikit-learn -pandas google-cloud google-cloud-storage google-cloud-pubsub google-auth google-api-python-client -certifi -chardet -docutils -httplib2 -idna -jmespath -kafka-python -lxml -pika -redis -requests +numpy six -urllib3 -virtualenv -PyYAML +requests +redis +pika +scikit-learn +diskcache cloudpickle ps-mem tblib +PyYAML +urllib3 +psutil """ @@ 
-111,8 +103,5 @@ def load_config(config_data=None): if config_data['gcp_functions']['runtime_memory'] > RUNTIME_MEMORY_MAX: config_data['gcp_functions']['runtime_memory'] = RUNTIME_MEMORY_MAX - config_data['gcp_functions']['retries'] = RETRIES - config_data['gcp_functions']['retry_sleep'] = RETRY_SLEEP - if 'region' not in config_data['gcp']: config_data['gcp']['region'] = config_data['gcp_functions']['region'] diff --git a/lithops/serverless/backends/gcp_functions/gcp_functions.py b/lithops/serverless/backends/gcp_functions/gcp_functions.py index 127b24c18..c0f647728 100644 --- a/lithops/serverless/backends/gcp_functions/gcp_functions.py +++ b/lithops/serverless/backends/gcp_functions/gcp_functions.py @@ -37,17 +37,17 @@ from lithops.version import __version__ from lithops.constants import COMPUTE_CLI_MSG, JOBS_PREFIX, TEMP_DIR -invoke_mutex = Lock() - from . import config +invoke_mutex = Lock() + logger = logging.getLogger(__name__) class GCPFunctionsBackend: def __init__(self, gcf_config, internal_storage): self.name = 'gcp_functions' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.gcf_config = gcf_config self.region = gcf_config['region'] self.num_retries = gcf_config['retries'] @@ -143,7 +143,7 @@ def _get_token(self, function_name): if self.credentials_path and os.path.isfile(self.credentials_path): os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = self.credentials_path auth_req = google.auth.transport.requests.Request() - self._api_token = google.oauth2.id_token.fetch_id_token(auth_req, self.function_url) + self._api_token = google.oauth2.id_token.fetch_id_token(auth_req, self._function_url) invoke_mutex.release() @@ -176,7 +176,7 @@ def _wait_function_deleted(self, function_location): time.sleep(self.retry_sleep) else: raise Exception(f'Unknown status: {response["status"]}') - except Exception as e: + except Exception: logger.debug('Function status is DELETED') break @@ -244,26 +244,35 @@ def _create_function(self, runtime_name, memory, timeout=60): 'failurePolicy': {} } - response = self._api_resource.projects().locations().functions().create( - location=self._default_location, - body=cloud_function - ).execute(num_retries=self.num_retries) + logger.info(f'Deploying function {function_location}') + for attempt in range(self.num_retries): + try: + operation = self._api_resource.projects().locations().functions().create( + location=self._default_location, + body=cloud_function + ).execute() + break + except Exception as e: + if attempt < self.num_retries - 1: + time.sleep(self.retry_sleep) + else: + raise Exception(f"Failed to create Cloud Function after {self.num_retries} attempts.") from e # Wait until the function is completely deployed logger.info('Waiting for the function to be deployed') + operation_name = operation['name'] while True: - response = self._api_resource.projects().locations().functions().get( - name=function_location + op_status = self._api_resource.operations().get( + name=operation_name ).execute(num_retries=self.num_retries) - logger.debug(f'Function status is {response["status"]}') - if response['status'] == 'ACTIVE': + if op_status.get('done'): + if 'error' in op_status: + raise Exception(f'Error while deploying Cloud Function: {op_status["error"]}') + logger.info("Deployment completed successfully.") break - elif response['status'] == 'OFFLINE': - raise Exception('Error while deploying Cloud Function') - elif response['status'] == 'DEPLOY_IN_PROGRESS': - time.sleep(self.retry_sleep) else: - raise Exception(f"Unknown status 
{response['status']}") + logger.debug("Deployment in progress, waiting...") + time.sleep(self.retry_sleep) def build_runtime(self, runtime_name, requirements_file, extra_args=[]): if not requirements_file: @@ -351,7 +360,7 @@ def delete_runtime(self, runtime_name, runtime_memory, version=__version__): def clean(self, **kwargs): logger.debug('Going to delete all deployed runtimes') runtimes = self.list_runtimes() - for runtime_name, runtime_memory, version in runtimes: + for runtime_name, runtime_memory, version, wk_name in runtimes: self.delete_runtime(runtime_name, runtime_memory, version) def list_runtimes(self, runtime_name='all'): @@ -364,11 +373,12 @@ def list_runtimes(self, runtime_name='all'): for func in response.get('functions', []): if func['labels'] and 'type' in func['labels'] \ and func['labels']['type'] == 'lithops-runtime': + fn_name = func['name'].rsplit('/', 1)[-1] version = func['labels']['lithops_version'].replace('-', '.') - name = func['labels']['runtime_name'] + rt_name = func['labels']['runtime_name'] memory = func['availableMemoryMb'] - if runtime_name == name or runtime_name == 'all': - runtimes.append((name, memory, version)) + if runtime_name == rt_name or runtime_name == 'all': + runtimes.append((rt_name, memory, version, fn_name)) return runtimes diff --git a/lithops/serverless/backends/ibm_cf/__init__.py b/lithops/serverless/backends/ibm_cf/__init__.py index 17474c5b1..1a04cedf0 100644 --- a/lithops/serverless/backends/ibm_cf/__init__.py +++ b/lithops/serverless/backends/ibm_cf/__init__.py @@ -1 +1,3 @@ from .ibm_cf import IBMCloudFunctionsBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/ibm_cf/config.py b/lithops/serverless/backends/ibm_cf/config.py index bd33f64d8..2e4a6b184 100644 --- a/lithops/serverless/backends/ibm_cf/config.py +++ b/lithops/serverless/backends/ibm_cf/config.py @@ -23,7 +23,8 @@ '3.8': 'docker.io/lithopscloud/ibmcf-python-v38', '3.9': 'docker.io/lithopscloud/ibmcf-python-v39', '3.10': 'docker.io/lithopscloud/ibmcf-python-v310', - '3.11': 'docker.io/lithopscloud/ibmcf-python-v311' + '3.11': 'docker.io/lithopscloud/ibmcf-python-v311', + '3.12': 'docker.io/lithopscloud/ibmcf-python-v312' } DEFAULT_CONFIG_KEYS = { diff --git a/lithops/serverless/backends/ibm_cf/ibm_cf.py b/lithops/serverless/backends/ibm_cf/ibm_cf.py index cc0395307..89e488ad0 100644 --- a/lithops/serverless/backends/ibm_cf/ibm_cf.py +++ b/lithops/serverless/backends/ibm_cf/ibm_cf.py @@ -16,7 +16,6 @@ import os import logging -import shutil from threading import Lock from lithops import utils @@ -40,7 +39,7 @@ class IBMCloudFunctionsBackend: def __init__(self, cf_config, internal_storage): logger.debug("Creating IBM Cloud Functions client") self.name = 'ibm_cf' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.config = cf_config self.is_lithops_worker = utils.is_lithops_worker() @@ -129,7 +128,7 @@ def _get_or_create_namespace(self, create=True): response = self.cf_client.list_namespaces(self.resource_group_id) if 'namespaces' in response: for namespace in response['namespaces']: - if namespace['name'] == self.namespace_name: + if 'name' in namespace and namespace['name'] == self.namespace_name: logger.debug(f"Found Cloud Functions namespace: {self.namespace_name}") self.namespace_id = namespace['id'] self.config['namespace_id'] = self.namespace_id @@ -290,7 +289,7 @@ def list_runtimes(self, docker_image_name='all'): for action in actions: version, image_name, memory = 
self._unformat_function_name(action['name']) if docker_image_name == image_name or docker_image_name == 'all': - runtimes.append((image_name, memory, version)) + runtimes.append((image_name, memory, version, action['name'])) return runtimes def pre_invoke(self, docker_image_name, runtime_memory): diff --git a/lithops/serverless/backends/k8s/__init__.py b/lithops/serverless/backends/k8s/__init__.py index af99dd3dd..29747aeb8 100644 --- a/lithops/serverless/backends/k8s/__init__.py +++ b/lithops/serverless/backends/k8s/__init__.py @@ -1 +1,3 @@ from .k8s import KubernetesBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/k8s/config.py b/lithops/serverless/backends/k8s/config.py index 359963a57..ed6f4a6e1 100644 --- a/lithops/serverless/backends/k8s/config.py +++ b/lithops/serverless/backends/k8s/config.py @@ -18,6 +18,7 @@ DEFAULT_CONFIG_KEYS = { 'runtime_timeout': 600, # Default: 10 minutes + 'master_timeout': 600, # Default: 10 minutes 'runtime_memory': 512, # Default memory: 512 MB 'runtime_cpu': 1, # 1 vCPU 'max_workers': 100, @@ -53,7 +54,8 @@ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil ENV PYTHONUNBUFFERED TRUE @@ -114,6 +116,27 @@ - name: lithops-regcred """ +POD = """ +apiVersion: v1 +kind: Pod +metadata: + name: lithops-worker +spec: + containers: + - name: "lithops-worker" + image: "" + command: ["python3"] + args: + - "/lithops/lithopsentry.py" + - "--" + - "--" + resources: + requests: + cpu: '1' + memory: '512Mi' +""" + + def load_config(config_data): for key in DEFAULT_CONFIG_KEYS: if key not in config_data['k8s']: @@ -124,3 +147,6 @@ def load_config(config_data): registry = config_data['k8s']['docker_server'] if runtime.count('/') == 1 and registry not in runtime: config_data['k8s']['runtime'] = f'{registry}/{runtime}' + + if config_data['k8s'].get('rabbitmq_executor', False): + config_data['k8s']['amqp_url'] = config_data['rabbitmq']['amqp_url'] diff --git a/lithops/serverless/backends/k8s/entry_point.py b/lithops/serverless/backends/k8s/entry_point.py index 29a1688d8..06eb0d45e 100644 --- a/lithops/serverless/backends/k8s/entry_point.py +++ b/lithops/serverless/backends/k8s/entry_point.py @@ -14,6 +14,7 @@ # limitations under the License. 
# +import pika import os import sys import uuid @@ -23,6 +24,7 @@ import time import requests from functools import partial +from multiprocessing import Value, Process from lithops.version import __version__ from lithops.utils import setup_lithops_logger, b64str_to_dict @@ -42,8 +44,6 @@ @proxy.route('/get-range///', methods=['GET']) def get_range(jobkey, total_calls, chunksize): - global JOB_INDEXES - range_start = 0 if jobkey not in JOB_INDEXES else JOB_INDEXES[jobkey] range_end = min(range_start + int(chunksize), int(total_calls)) JOB_INDEXES[jobkey] = range_end @@ -57,9 +57,9 @@ def get_range(jobkey, total_calls, chunksize): def run_master_server(): # Start Redis Server in the background - logger.info("Starting redis server in Master Pod") - os.system("redis-server --bind 0.0.0.0 --daemonize yes") - logger.info("Redis server started") + # logger.info("Starting redis server in Master Pod") + # os.system("redis-server --bind 0.0.0.0 --daemonize yes") + # logger.info("Redis server started") proxy.logger.setLevel(logging.DEBUG) proxy.run(debug=True, host='0.0.0.0', port=config.MASTER_PORT, use_reloader=False) @@ -77,7 +77,7 @@ def extract_runtime_meta(payload): internal_storage.put_data(status_key, dmpd_response_status) -def run_job(payload): +def run_job_k8s(payload): logger.info(f"Lithops v{__version__} - Starting kubernetes execution") os.environ['__LITHOPS_ACTIVATION_ID'] = str(uuid.uuid4()).replace('-', '')[:12] @@ -85,12 +85,8 @@ def run_job(payload): total_calls = payload['total_calls'] job_key = payload['job_key'] - worker_processes = payload['worker_processes'] chunksize = payload['chunksize'] - # Optimize chunksize to the number of processess if necessary - chunksize = worker_processes if worker_processes > chunksize else chunksize - call_ids = payload['call_ids'] data_byte_ranges = payload['data_byte_ranges'] @@ -104,7 +100,7 @@ def run_job(payload): try: server = f'http://{master_ip}:{config.MASTER_PORT}' url = f'{server}/get-range/{job_key}/{total_calls}/{chunksize}' - res = requests.get(url) + res = requests.get(url, timeout=0.1) call_ids_range = res.text # for example: 0-5 except Exception: time.sleep(0.1) @@ -123,6 +119,87 @@ def run_job(payload): logger.info("Finishing kubernetes execution") +def run_job_k8s_rabbitmq(payload): + logger.info(f"Lithops v{__version__} - Starting kubernetes execution") + + act_id = str(uuid.uuid4()).replace('-', '')[:12] + os.environ['__LITHOPS_ACTIVATION_ID'] = act_id + os.environ['__LITHOPS_BACKEND'] = 'k8s_rabbitmq' + + function_handler(payload) + with running_jobs.get_lock(): + running_jobs.value += len(payload['call_ids']) + + logger.info("Finishing kubernetes execution") + + +def callback_work_queue(ch, method, properties, body): + """Callback to receive the payload and run the jobs""" + logger.info("Call from lithops received.") + + message = json.loads(body) + tasks = message['total_calls'] + + # If there are more tasks than cpus in the pod, we need to send a new message + if tasks <= running_jobs.value: + processes_to_start = tasks + else: + if running_jobs.value == 0: + logger.info("All cpus are busy. 
Waiting for a cpu to be free") + ch.basic_nack(delivery_tag=method.delivery_tag) + time.sleep(0.5) + return + + processes_to_start = running_jobs.value + + message_to_send = message.copy() + message_to_send['total_calls'] = tasks - running_jobs.value + message_to_send['call_ids'] = message_to_send['call_ids'][running_jobs.value:] + message_to_send['data_byte_ranges'] = message_to_send['data_byte_ranges'][running_jobs.value:] + + message['total_calls'] = running_jobs.value + message['call_ids'] = message['call_ids'][:running_jobs.value] + message['data_byte_ranges'] = message['data_byte_ranges'][:running_jobs.value] + + ch.basic_publish( + exchange='', + routing_key='task_queue', + body=json.dumps(message_to_send), + properties=pika.BasicProperties( + delivery_mode=pika.spec.PERSISTENT_DELIVERY_MODE + )) + + logger.info(f"Starting {processes_to_start} processes") + + message['worker_processes'] = running_jobs.value + with running_jobs.get_lock(): + running_jobs.value -= processes_to_start + + Process(target=run_job_k8s_rabbitmq, args=(message,)).start() + + ch.basic_ack(delivery_tag=method.delivery_tag) + + +def start_rabbitmq_listening(payload): + global running_jobs + + # Connect to rabbitmq + params = pika.URLParameters(payload['amqp_url']) + connection = pika.BlockingConnection(params) + channel = connection.channel() + channel.queue_declare(queue='task_queue', durable=True) + channel.basic_qos(prefetch_count=1) + + # Shared variable to track completed jobs + running_jobs = Value('i', payload['cpus_pod']) + + # Start listening to the new job + channel.basic_consume(queue='task_queue', on_message_callback=callback_work_queue) + + logger.info("Listening to rabbitmq...") + channel.start_consuming() + + if __name__ == '__main__': action = sys.argv[1] encoded_payload = sys.argv[2] @@ -132,8 +209,9 @@ def run_job(payload): switcher = { 'get_metadata': partial(extract_runtime_meta, payload), - 'run_job': partial(run_job, payload), - 'run_master': run_master_server + 'run_job': partial(run_job_k8s, payload), + 'run_master': run_master_server, + 'start_rabbitmq': partial(start_rabbitmq_listening, payload) } func = switcher.get(action, lambda: "Invalid command") diff --git a/lithops/serverless/backends/k8s/k8s.py b/lithops/serverless/backends/k8s/k8s.py index 23cfa435b..d0f726840 100644 --- a/lithops/serverless/backends/k8s/k8s.py +++ b/lithops/serverless/backends/k8s/k8s.py @@ -15,6 +15,8 @@ # import os +import re +import pika import base64 import hashlib import json @@ -48,7 +50,7 @@ class KubernetesBackend: def __init__(self, k8s_config, internal_storage): logger.debug("Creating Kubernetes client") self.name = 'k8s' - self.type = 'batch' + self.type = utils.BackendType.BATCH.value self.k8s_config = k8s_config self.internal_storage = internal_storage @@ -59,6 +61,7 @@ def __init__(self, k8s_config, internal_storage): self.cluster = k8s_config.get('cluster', 'default') self.user = k8s_config.get('user', 'default') self.master_name = k8s_config.get('master_name', config.MASTER_NAME) + self.rabbitmq_executor = self.k8s_config.get('rabbitmq_executor', False) if os.path.exists(self.kubecfg_path): logger.debug(f"Loading kubeconfig file: {self.kubecfg_path}") @@ -70,7 +73,8 @@ def __init__(self, k8s_config, internal_storage): ctx_context = current_context.get('context') self.namespace = ctx_context.get('namespace') or self.namespace self.cluster = ctx_context.get('cluster') or self.cluster - self.user = ctx_context.get('user') or self.user + ctx_user = ctx_context.get('user') + self.user = 
hashlib.sha1(ctx_user.encode()).hexdigest()[:10] if ctx_user else self.user logger.debug(f"Using kubeconfig context: {ctx_name} - cluster: {self.cluster}") self.is_incluster = False else: @@ -79,8 +83,7 @@ self.is_incluster = True if self.master_name == config.MASTER_NAME: - user_hash = hashlib.sha1(self.user.encode()).hexdigest()[:6] - self.master_name = f'{config.MASTER_NAME}-{user_hash}' + self.master_name = f'{config.MASTER_NAME}-{self.user}' self.k8s_config['namespace'] = self.namespace self.k8s_config['cluster'] = self.cluster @@ -90,13 +93,26 @@ self.batch_api = client.BatchV1Api() self.core_api = client.CoreV1Api() + if self.rabbitmq_executor: + self.amqp_url = self.k8s_config['amqp_url'] + + # Init rabbitmq + params = pika.URLParameters(self.amqp_url) + self.connection = pika.BlockingConnection(params) + self.channel = self.connection.channel() + self.channel.queue_declare(queue='task_queue', durable=True) + + # Define some needed variables + self._get_nodes() + self.image = "" + self.jobs = [] # list to store executed jobs (job_keys) msg = COMPUTE_CLI_MSG.format('Kubernetes') logger.info(f"{msg} - Namespace: {self.namespace}") def _format_job_name(self, runtime_name, runtime_memory, version=__version__): - name = f'{runtime_name}-{runtime_memory}-{version}' + name = f'{runtime_name}-{runtime_memory}-{version}-{self.user}' name_hash = hashlib.sha1(name.encode()).hexdigest()[:10] return f'lithops-worker-{version.replace(".", "")}-{name_hash}' @@ -119,9 +135,9 @@ def build_runtime(self, docker_image_name, dockerfile, extra_args=[]): if dockerfile: assert os.path.isfile(dockerfile), f'Cannot locate "{dockerfile}"' - cmd = f'{docker_path} build -t {docker_image_name} -f {dockerfile} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {docker_image_name} -f {dockerfile} . ' else: - cmd = f'{docker_path} build -t {docker_image_name} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {docker_image_name} . 
' cmd = cmd + ' '.join(extra_args) try: @@ -201,7 +217,7 @@ def _create_container_registry_secret(self): try: self.core_api.delete_namespaced_secret("lithops-regcred", self.namespace) - except ApiException as e: + except ApiException: pass try: @@ -240,6 +256,7 @@ def clean(self, all=False): logger.debug('Cleaning lithops resources in kubernetes') try: + self._delete_workers() jobs = self.batch_api.list_namespaced_job( namespace=self.namespace, label_selector=f'user={self.user}' @@ -260,6 +277,9 @@ def clean(self, all=False): except ApiException: pass + if self.rabbitmq_executor: + self.channel.queue_delete(queue='task_queue') + def clear(self, job_keys=None): """ Delete only completed jobs @@ -291,6 +311,125 @@ def list_runtimes(self, docker_image_name='all'): logger.debug('Note that this backend does not manage runtimes') return [] + def _create_pod(self, pod, pod_name, cpu, memory): + pod["metadata"]["name"] = f"lithops-pod-{pod_name}" + node_name = re.sub(r'-\d+$', '', pod_name) + pod["spec"]["nodeName"] = node_name + pod["spec"]["containers"][0]["image"] = self.image + pod["spec"]["containers"][0]["resources"]["requests"]["cpu"] = str(cpu) + pod["spec"]["containers"][0]["resources"]["requests"]["memory"] = memory + pod["metadata"]["labels"] = {"app": "lithops-pod"} + + payload = { + 'log_level': 'DEBUG', + 'amqp_url': self.amqp_url, + 'cpus_pod': cpu, + } + + pod["spec"]["containers"][0]["args"][1] = "start_rabbitmq" + pod["spec"]["containers"][0]["args"][2] = utils.dict_to_b64str(payload) + + self.core_api.create_namespaced_pod(body=pod, namespace=self.namespace) + + def _get_nodes(self): + self.nodes = [] + list_all_nodes = self.core_api.list_node() + for node in list_all_nodes.items: + # If the node is tainted, skip it + if node.spec.taints: + continue + + # Check if the CPU is in millicores + if isinstance(node.status.allocatable['cpu'], str) and 'm' in node.status.allocatable['cpu']: + # Extract the number part and convert it to an integer + number_match = re.search(r'\d+', node.status.allocatable['cpu']) + if number_match: + number = int(number_match.group()) + + # Round to the nearest whole number of CPUs - 1 + cpu_info = round(number / 1000) - 1 + + if cpu_info < 1: + cpu_info = 0 + else: + # Handle the case where the CPU is in millicores but no number is found + cpu_info = 0 + else: + # CPU is not in millicores + cpu_info = node.status.allocatable['cpu'] + + self.nodes.append({ + "name": node.metadata.name, + "cpu": cpu_info, + "memory": node.status.allocatable['memory'] + }) + + def _create_workers(self, runtime_memory): + default_pod_config = yaml.load(config.POD, Loader=yaml.loader.SafeLoader) + granularity = self.k8s_config['worker_processes'] + cluster_info_cpu = {} + cluster_info_mem = {} + num_cpus_cluster = 0 + + # If the unit is not specified, assume it is in MB + try: + mem_num, mem_uni = re.match(r'(\d+)(\D*)', runtime_memory).groups() + except TypeError: + mem_num = runtime_memory + mem_uni = 'M' + + if granularity <= 1: + granularity = False + + for node in self.nodes: + cpus_node = int(float(node["cpu"]) * 0.9) + + if granularity: + times, res = divmod(cpus_node, granularity) + + for i in range(times): + cluster_info_cpu[f"{node['name']}-{i}"] = granularity + cluster_info_mem[f"{node['name']}-{i}"] = f"{mem_num}{mem_uni}" + num_cpus_cluster += granularity + if res != 0: + cluster_info_cpu[f"{node['name']}-{times}"] = res + cluster_info_mem[f"{node['name']}-{times}"] = f"{mem_num}{mem_uni}" + num_cpus_cluster += res + else: + cluster_info_cpu[node["name"] + "-0"] 
= cpus_node + num_cpus_cluster += cpus_node + + # If runtime_memory is not defined in the config, use 80% of the node memory + if runtime_memory == 512: + mem_num, mem_uni = re.match(r'(\d+)(\D*)', node["memory"]).groups() + mem_num = int(float(mem_num) * 0.8) + cluster_info_mem[node["name"] + "-0"] = f"{mem_num}{mem_uni}" + else: + cluster_info_mem[node["name"] + "-0"] = f"{mem_num}{mem_uni}" + + if num_cpus_cluster == 0: + raise ValueError("Total CPUs of the cluster cannot be 0") + + # Create all the pods + for pod_name in cluster_info_cpu.keys(): + self._create_pod(default_pod_config, pod_name, cluster_info_cpu[pod_name], cluster_info_mem[pod_name]) + + logger.info(f"Total cpus of the cluster: {num_cpus_cluster}") + + def _delete_workers(self): + list_pods = self.core_api.list_namespaced_pod(self.namespace, label_selector="app=lithops-pod") + for pod in list_pods.items: + self.core_api.delete_namespaced_pod(pod.metadata.name, self.namespace) + + # Wait until all pods are deleted + while True: + list_pods = self.core_api.list_namespaced_pod(self.namespace, label_selector="app=lithops-pod") + + if not list_pods.items: + break # All pods are deleted + + logger.info('All pods are deleted.') + def _start_master(self, docker_image_name): master_pod = self.core_api.list_namespaced_pod( @@ -309,7 +448,7 @@ def _start_master(self, docker_image_name): propagation_policy='Background' ) time.sleep(2) - except ApiException as e: + except ApiException: pass master_res = yaml.safe_load(config.JOB_DEFAULT) @@ -317,6 +456,7 @@ def _start_master(self, docker_image_name): master_res['metadata']['namespace'] = self.namespace master_res['metadata']['labels']['version'] = 'lithops_v' + __version__ master_res['metadata']['labels']['user'] = self.user + master_res['spec']['activeDeadlineSeconds'] = self.k8s_config['master_timeout'] container = master_res['spec']['template']['spec']['containers'][0] container['image'] = docker_image_name @@ -345,62 +485,196 @@ def _start_master(self, docker_image_name): w.stop() return event['object'].status.pod_ip + def convert_memory_units(self, mem_num, mem_uni): + mem_num = int(mem_num) + + if 'i' in mem_uni: + mem_num *= 1024 + mem_uni = mem_uni[:-1] + if 'K' in mem_uni: + mem_num = mem_num / (1024 if 'i' in mem_uni else 1000) + elif 'G' in mem_uni: + mem_num = mem_num * (1024 if 'i' in mem_uni else 1000) + + return mem_num, 'M' + + # Detect if granularity, memory or runtime image changed or not + def _has_config_changed(self, runtime_mem): + config_granularity = False if self.k8s_config['worker_processes'] <= 1 else self.k8s_config['worker_processes'] + config_memory = self.k8s_config['runtime_memory'] if self.k8s_config['runtime_memory'] != 512 else False + + self.current_runtime = "" + + list_pods = self.core_api.list_namespaced_pod(self.namespace, label_selector="app=lithops-pod") + + for pod in list_pods.items: + pod_name = pod.metadata.name + + # Get the node info where the pod is running + node_info = next((node for node in self.nodes if node["name"] == pod.spec.node_name), False) + if not node_info: + return True + + # Get the pod info + self.current_runtime = pod.spec.containers[0].image + pod_resource_cpu = int(pod.spec.containers[0].resources.requests.get('cpu', '0m')) + pod_resource_memory = pod.spec.containers[0].resources.requests.get('memory', '0Mi') + + multiples_pods_per_node = re.search(r'-\d+(? 
0)) + job_key = job_payload['job_key'] + self.jobs.append(job_key) - activation_id = f'lithops-{job_key.lower()}' + # Send packages of tasks to the queue + granularity = max(1, job_payload['total_calls'] // len(self.nodes) + if self.k8s_config['worker_processes'] <= 1 else self.k8s_config['worker_processes']) - job_res = yaml.safe_load(config.JOB_DEFAULT) - job_res['metadata']['name'] = activation_id - job_res['metadata']['namespace'] = self.namespace - job_res['metadata']['labels']['version'] = 'lithops_v' + __version__ - job_res['metadata']['labels']['user'] = self.user + times, res = divmod(job_payload['total_calls'], granularity) - job_res['spec']['activeDeadlineSeconds'] = self.k8s_config['runtime_timeout'] - job_res['spec']['parallelism'] = total_workers + for i in range(times + (1 if res != 0 else 0)): + num_tasks = granularity if i < times else res + payload_edited = job_payload.copy() - container = job_res['spec']['template']['spec']['containers'][0] - container['image'] = docker_image_name - if not docker_image_name.endswith(':latest'): - container['imagePullPolicy'] = 'IfNotPresent' + start_index = i * granularity + end_index = start_index + num_tasks - container['env'][0]['value'] = 'run_job' - container['env'][1]['value'] = utils.dict_to_b64str(job_payload) - container['env'][2]['value'] = master_ip + payload_edited['call_ids'] = payload_edited['call_ids'][start_index:end_index] + payload_edited['data_byte_ranges'] = payload_edited['data_byte_ranges'][start_index:end_index] + payload_edited['total_calls'] = num_tasks - container['resources']['requests']['memory'] = f'{runtime_memory}Mi' - container['resources']['requests']['cpu'] = str(self.k8s_config['runtime_cpu']) - container['resources']['limits']['memory'] = f'{runtime_memory}Mi' - container['resources']['limits']['cpu'] = str(self.k8s_config['runtime_cpu']) + self.channel.basic_publish( + exchange='', + routing_key='task_queue', + body=json.dumps(payload_edited), + properties=pika.BasicProperties( + delivery_mode=pika.spec.PERSISTENT_DELIVERY_MODE + )) - logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Going ' - f'to run {total_calls} activations in {total_workers} workers') + activation_id = f'lithops-{job_key.lower()}' + else: + master_ip = self._start_master(docker_image_name) - if not all(key in self.k8s_config for key in ["docker_user", "docker_password"]): - del job_res['spec']['template']['spec']['imagePullSecrets'] + max_workers = job_payload['max_workers'] + executor_id = job_payload['executor_id'] + job_id = job_payload['job_id'] - try: - self.batch_api.create_namespaced_job( - namespace=self.namespace, - body=job_res + job_key = job_payload['job_key'] + self.jobs.append(job_key) + + total_calls = job_payload['total_calls'] + chunksize = job_payload['chunksize'] + total_workers = min(max_workers, total_calls // chunksize + (total_calls % chunksize > 0)) + + logger.debug( + f'ExecutorID {executor_id} | JobID {job_id} - Required Workers: {total_workers}' ) - except ApiException as e: - raise e + + activation_id = f'lithops-{job_key.lower()}' + + job_res = yaml.safe_load(config.JOB_DEFAULT) + job_res['metadata']['name'] = activation_id + job_res['metadata']['namespace'] = self.namespace + job_res['metadata']['labels']['version'] = 'lithops_v' + __version__ + job_res['metadata']['labels']['user'] = self.user + + job_res['spec']['activeDeadlineSeconds'] = self.k8s_config['runtime_timeout'] + job_res['spec']['parallelism'] = total_workers + + container = job_res['spec']['template']['spec']['containers'][0] 
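+            # Configure the worker container that each Job pod runs: runtime image, entrypoint action and payload env vars, and CPU/memory requests and limits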
+ container['image'] = docker_image_name + if not docker_image_name.endswith(':latest'): + container['imagePullPolicy'] = 'IfNotPresent' + + container['env'][0]['value'] = 'run_job' + container['env'][1]['value'] = utils.dict_to_b64str(job_payload) + container['env'][2]['value'] = master_ip + + container['resources']['requests']['memory'] = f'{runtime_memory}Mi' + container['resources']['requests']['cpu'] = str(self.k8s_config['runtime_cpu']) + container['resources']['limits']['memory'] = f'{runtime_memory}Mi' + container['resources']['limits']['cpu'] = str(self.k8s_config['runtime_cpu']) + + logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Going ' + f'to run {total_calls} activations in {total_workers} workers') + + if not all(key in self.k8s_config for key in ["docker_user", "docker_password"]): + del job_res['spec']['template']['spec']['imagePullSecrets'] + + try: + self.batch_api.create_namespaced_job( + namespace=self.namespace, + body=job_res + ) + except ApiException as e: + raise e return activation_id @@ -435,7 +709,7 @@ def _generate_runtime_meta(self, docker_image_name): name=meta_job_name, propagation_policy='Background' ) - except ApiException as e: + except ApiException: pass try: @@ -459,7 +733,7 @@ def _generate_runtime_meta(self, docker_image_name): failed = event['object'].status.failed done = event['object'].status.succeeded logger.debug('...') - except Exception as e: + except Exception: pass w.stop() @@ -472,7 +746,7 @@ def _generate_runtime_meta(self, docker_image_name): name=meta_job_name, propagation_policy='Background' ) - except ApiException as e: + except ApiException: pass if failed: @@ -495,8 +769,7 @@ def get_runtime_key(self, docker_image_name, runtime_memory, version=__version__ in order to know which runtimes are installed and which not. 
""" jobdef_name = self._format_job_name(docker_image_name, 256, version) - user_hash = hashlib.sha1(self.user.encode()).hexdigest()[:6] - user_data = os.path.join(self.cluster, self.namespace, user_hash) + user_data = os.path.join(self.cluster, self.namespace, self.user) runtime_key = os.path.join(self.name, version, user_data, jobdef_name) return runtime_key diff --git a/lithops/serverless/backends/knative/__init__.py b/lithops/serverless/backends/knative/__init__.py index a8bc716ec..fc49c57fa 100644 --- a/lithops/serverless/backends/knative/__init__.py +++ b/lithops/serverless/backends/knative/__init__.py @@ -1 +1,3 @@ from .knative import KnativeServingBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/knative/config.py b/lithops/serverless/backends/knative/config.py index 0a40eccbd..755f85cd6 100644 --- a/lithops/serverless/backends/knative/config.py +++ b/lithops/serverless/backends/knative/config.py @@ -61,7 +61,8 @@ cloudpickle \ paramiko \ ps-mem \ - tblib + tblib \ + psutil ENV PORT 8080 ENV CONCURRENCY 1 diff --git a/lithops/serverless/backends/knative/knative.py b/lithops/serverless/backends/knative/knative.py index 263e494d3..03cfa67a5 100644 --- a/lithops/serverless/backends/knative/knative.py +++ b/lithops/serverless/backends/knative/knative.py @@ -50,7 +50,7 @@ class KnativeServingBackend: def __init__(self, knative_config, internal_storage): self.name = 'knative' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.kn_config = knative_config self.ingress_endpoint = self.kn_config.get('ingress_endpoint') self.kubecfg_path = self.kn_config.get('kubecfg_path') @@ -92,7 +92,7 @@ def __init__(self, knative_config, internal_storage): ip = None ingress = self.core_api.read_namespaced_service(service, namespace) http_port = list(filter(lambda port: port.port == 80, ingress.spec.ports))[0].node_port - https_port = list(filter(lambda port: port.port == 443, ingress.spec.ports))[0].node_port + # https_port = list(filter(lambda port: port.port == 443, ingress.spec.ports))[0].node_port if ingress.status.load_balancer.ingress is not None: # get loadbalancer ip ip = ingress.status.load_balancer.ingress[0].ip @@ -113,7 +113,7 @@ def __init__(self, knative_config, internal_storage): self.ingress_endpoint = f'http://{ip}:{http_port}' self.kn_config['ingress_endpoint'] = self.ingress_endpoint logger.debug(f"Ingress endpoint set to {self.ingress_endpoint}") - except Exception as e: + except Exception: pass if 'service_host_suffix' not in self.kn_config: @@ -420,7 +420,7 @@ def _create_container_registry_secret(self): try: self.core_api.delete_namespaced_secret("lithops-regcred", self.namespace) - except ApiException as e: + except ApiException: pass try: @@ -560,9 +560,9 @@ def build_runtime(self, runtime_name, dockerfile, extra_args=[]): if dockerfile: assert os.path.isfile(dockerfile), f'Cannot locate "{dockerfile}"' - cmd = f'{docker_path} build -t {runtime_name} -f {dockerfile} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {runtime_name} -f {dockerfile} . ' else: - cmd = f'{docker_path} build -t {runtime_name} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {runtime_name} . 
' cmd = cmd + ' '.join(extra_args) try: @@ -610,13 +610,13 @@ def clean(self, **kwargs): Deletes all runtimes deployed in knative """ runtimes = self.list_runtimes() - for img_name, memory, version in runtimes: + for img_name, memory, version, wk_name in runtimes: self.delete_runtime(img_name, memory, version) def list_runtimes(self, runtime_name='all'): """ List all the runtimes deployed in knative - return: list of tuples [runtime_name, memory, version] + return: list of tuples [runtime_name, memory, version, worker_name] """ knative_services = self.custom_api.list_namespaced_custom_object( group=config.DEFAULT_GROUP, @@ -630,13 +630,14 @@ def list_runtimes(self, runtime_name='all'): try: template = service['spec']['template'] labels = template['metadata']['labels'] + wk_name = service['metadata']['name'] if labels and 'type' in labels and labels['type'] == 'lithops-runtime': version = labels['lithops-version'].replace('-', '.') container = template['spec']['containers'][0] memory = container['resources']['requests']['memory'].replace('Mi', '') memory = int(memory.replace('Gi', '')) * 1024 if 'Gi' in memory else memory if runtime_name in container['image'] or runtime_name == 'all': - runtimes.append((container['image'], memory, version)) + runtimes.append((container['image'], memory, version, wk_name)) except Exception: # It is not a lithops runtime pass diff --git a/lithops/serverless/backends/openwhisk/__init__.py b/lithops/serverless/backends/openwhisk/__init__.py index 7512a3baf..d284e9d13 100644 --- a/lithops/serverless/backends/openwhisk/__init__.py +++ b/lithops/serverless/backends/openwhisk/__init__.py @@ -1 +1,3 @@ from .openwhisk import OpenWhiskBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/openwhisk/config.py b/lithops/serverless/backends/openwhisk/config.py index 5df4cc8df..d182fd821 100644 --- a/lithops/serverless/backends/openwhisk/config.py +++ b/lithops/serverless/backends/openwhisk/config.py @@ -17,12 +17,13 @@ import os AVAILABLE_PY_RUNTIMES = { - '3.6': 'docker.io/lithopscloud/ibmcf-python-v36', - '3.7': 'docker.io/lithopscloud/ibmcf-python-v37', - '3.8': 'docker.io/lithopscloud/ibmcf-python-v38', - '3.9': 'docker.io/lithopscloud/ibmcf-python-v39', - '3.10': 'docker.io/lithopscloud/ibmcf-python-v310', - '3.11': 'docker.io/lithopscloud/ibmcf-python-v311' + '3.6': 'docker.io/lithopscloud/openwhisk-python-v36', + '3.7': 'docker.io/lithopscloud/openwhisk-python-v37', + '3.8': 'docker.io/lithopscloud/openwhisk-python-v38', + '3.9': 'docker.io/lithopscloud/openwhisk-python-v39', + '3.10': 'docker.io/lithopscloud/openwhisk-python-v310', + '3.11': 'docker.io/lithopscloud/openwhisk-python-v311', + '3.12': 'docker.io/lithopscloud/openwhisk-python-v312' } DEFAULT_CONFIG_KEYS = { diff --git a/lithops/serverless/backends/openwhisk/openwhisk.py b/lithops/serverless/backends/openwhisk/openwhisk.py index b1fb4f679..c23693e55 100644 --- a/lithops/serverless/backends/openwhisk/openwhisk.py +++ b/lithops/serverless/backends/openwhisk/openwhisk.py @@ -35,7 +35,7 @@ class OpenWhiskBackend: def __init__(self, ow_config, internal_storage): logger.debug("Creating OpenWhisk client") self.name = 'openwhisk' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.ow_config = ow_config self.is_lithops_worker = utils.is_lithops_worker() @@ -70,8 +70,34 @@ def _format_function_name(self, runtime_name, runtime_memory, version=__version_ def _unformat_function_name(self, action_name): runtime_name, memory, version = 
action_name.rsplit('_', 2) - image_name = runtime_name.replace('_', '/', 2) - image_name = image_name.replace('_', ':', -1) + image_name_parts = runtime_name.split('_') + + domain = image_name_parts[0] + + try: + port = int(image_name_parts[1]) + namespace = image_name_parts[2] + name = image_name_parts[3] + last_id = 4 + except Exception: + port = None + namespace = image_name_parts[1] + name = image_name_parts[2] + last_id = 3 + + try: + tag = image_name_parts[last_id] + except Exception: + tag = None + + if port: + image_name = f'{domain}:{port}/{namespace}/{name}' + else: + image_name = f'{domain}/{namespace}/{name}' + + if tag: + image_name = f'{image_name}:{tag}' + return version, image_name, int(memory.replace('MB', '')) def _get_default_runtime_image_name(self): @@ -176,7 +202,7 @@ def list_runtimes(self, docker_image_name='all'): for action in actions: version, image_name, memory = self._unformat_function_name(action['name']) if docker_image_name == image_name or docker_image_name == 'all': - runtimes.append((image_name, memory, version)) + runtimes.append((image_name, memory, version, action['name'])) return runtimes def invoke(self, docker_image_name, runtime_memory, payload): diff --git a/lithops/serverless/backends/oracle_f/__init__.py b/lithops/serverless/backends/oracle_f/__init__.py index 45c648292..be4ba3281 100644 --- a/lithops/serverless/backends/oracle_f/__init__.py +++ b/lithops/serverless/backends/oracle_f/__init__.py @@ -1 +1,3 @@ from .oracle_f import OracleCloudFunctionsBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/oracle_f/config.py b/lithops/serverless/backends/oracle_f/config.py index 577f5b8cd..b667afbef 100644 --- a/lithops/serverless/backends/oracle_f/config.py +++ b/lithops/serverless/backends/oracle_f/config.py @@ -15,7 +15,6 @@ import os import copy -from lithops.constants import TEMP_DIR DEFAULT_CONFIG_KEYS = { @@ -53,7 +52,8 @@ cloudpickle \ ps-mem \ tblib \ - oci + oci \ + psutil ARG FUNCTION_DIR="/function" @@ -76,12 +76,12 @@ ENTRYPOINT ["/usr/local/bin/fdk", "handler/entry_point.py", "handler"] """ -AVAILABLE_PY_RUNTIMES = ['3.6', '3.7', '3.8', '3.9'] +AVAILABLE_PY_RUNTIMES = ['3.6', '3.7', '3.8', '3.9', '3.11'] REQ_PARAMS_1 = ('compartment_id', 'user', 'key_file', 'region', 'tenancy', 'fingerprint') - REQ_PARAMS_2 = ('subnet_id', ) + def load_config(config_data=None): if 'oracle' not in config_data: raise Exception("'oracle' section is mandatory in the configuration") diff --git a/lithops/serverless/backends/oracle_f/oracle_f.py b/lithops/serverless/backends/oracle_f/oracle_f.py index df0bd941e..637b5ffb0 100644 --- a/lithops/serverless/backends/oracle_f/oracle_f.py +++ b/lithops/serverless/backends/oracle_f/oracle_f.py @@ -38,7 +38,7 @@ class OracleCloudFunctionsBackend: def __init__(self, oci_config, internal_storage): self.name = 'oracle_f' - self.type = 'faas' + self.type = utils.BackendType.FAAS.value self.config = oci_config self.user = oci_config['user'] @@ -83,7 +83,7 @@ def _get_namespace(self): raise Exception(f"An error occurred: ({response.status}) {response.data}") def _format_function_name(self, runtime_name, runtime_memory, version=__version__): - name = f'{runtime_name}-{runtime_memory}-{version}' + name = f'{runtime_name}-{runtime_memory}-{version}-{self.user}' name_hash = hashlib.sha1(name.encode("utf-8")).hexdigest()[:10] return f'lithops-worker-{runtime_name.split("/")[-1]}-v{version}-{name_hash}' @@ -200,9 +200,9 @@ def build_runtime(self, runtime_name, dockerfile, 
extra_args=[]): # Build the Docker image if dockerfile: assert os.path.isfile(dockerfile), f'Cannot locate "{dockerfile}"' - cmd = f'{docker_path} build -t {image_name} -f {dockerfile} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {image_name} -f {dockerfile} . ' else: - cmd = f'{docker_path} build -t {image_name} . ' + cmd = f'{docker_path} build --platform=linux/amd64 -t {image_name} . ' cmd = cmd + ' '.join(extra_args) try: @@ -353,7 +353,7 @@ def list_runtimes(self, runtime_name='all'): image_name = function.image version = function.freeform_tags['lithops_version'] if runtime_name == 'all' or self._format_image_name(runtime_name) == image_name: - runtimes.append((image_name, memory, version)) + runtimes.append((image_name, memory, version, function.display_name)) return runtimes diff --git a/lithops/serverless/backends/singularity/__init__.py b/lithops/serverless/backends/singularity/__init__.py new file mode 100644 index 000000000..3396168f0 --- /dev/null +++ b/lithops/serverless/backends/singularity/__init__.py @@ -0,0 +1,3 @@ +from .singularity import SingularityBackend as ServerlessBackend + +__all__ = ['ServerlessBackend'] diff --git a/lithops/serverless/backends/singularity/config.py b/lithops/serverless/backends/singularity/config.py new file mode 100644 index 000000000..3bb933d90 --- /dev/null +++ b/lithops/serverless/backends/singularity/config.py @@ -0,0 +1,78 @@ +# +# (C) Copyright Cloudlab URV 2024 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import shutil + +FH_ZIP_LOCATION = os.path.join(os.getcwd(), 'lithops_singularity.zip') + +DEFAULT_CONFIG_KEYS = { + 'runtime_timeout': 600, # Default: 10 minutes + 'runtime_memory': 512, # Default memory: 512 MB + 'max_workers': 100, + 'worker_processes': 1, +} + + +SINGULARITYFILE_DEFAULT = """ +%post + apt-get update && apt-get install -y \ + zip \ + && rm -rf /var/lib/apt/lists/* + + pip install --upgrade setuptools six pip \ + && pip install --no-cache-dir \ + boto3 \ + pika \ + flask \ + gevent \ + redis \ + requests \ + PyYAML \ + numpy \ + cloudpickle \ + ps-mem \ + tblib \ + psutil + +%files + lithops_singularity.zip /lithops/lithops_singularity.zip + +%post + cd /lithops + unzip lithops_singularity.zip && rm lithops_singularity.zip + +%runscript + python3 /lithops/lithopsentry.py $AMQP_URL +""" + + +def get_singularity_path(): + singularity_path = shutil.which('singularity') + if not singularity_path: + raise Exception('singularity command not found. 
Install singularity') + return singularity_path + + +def load_config(config_data): + for key in DEFAULT_CONFIG_KEYS: + if key not in config_data['singularity']: + config_data['singularity'][key] = DEFAULT_CONFIG_KEYS[key] + + if 'rabbitmq' not in config_data: + raise Exception('RabbitMQ configuration is needed in this backend') + else: + config_data['singularity']['amqp_url'] = config_data['rabbitmq'].get('amqp_url', False) diff --git a/lithops/serverless/backends/singularity/entry_point.py b/lithops/serverless/backends/singularity/entry_point.py new file mode 100644 index 000000000..22d3aeec6 --- /dev/null +++ b/lithops/serverless/backends/singularity/entry_point.py @@ -0,0 +1,142 @@ +# +# (C) Copyright Cloudlab URV 2024 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pika +import os +import sys +import time +import uuid +import json +import logging +from multiprocessing import Value, cpu_count +from threading import Thread + +from lithops.version import __version__ +from lithops.utils import setup_lithops_logger, b64str_to_dict, dict_to_b64str +from lithops.worker import function_handler +from lithops.worker.utils import get_runtime_metadata +from lithops.constants import JOBS_PREFIX +from lithops.storage.storage import InternalStorage + +logger = logging.getLogger('lithops.worker') + + +def extract_runtime_meta(payload): + logger.info(f"Lithops v{__version__} - Generating metadata") + + runtime_meta = get_runtime_metadata() + + internal_storage = InternalStorage(payload) + status_key = '/'.join([JOBS_PREFIX, payload['runtime_name'] + '.meta']) + dmpd_response_status = json.dumps(runtime_meta) + internal_storage.put_data(status_key, dmpd_response_status) + logger.info(f"Runtime metadata key {status_key}") + + +def run_job_k8s_rabbitmq(payload): + logger.info(f"Lithops v{__version__} - Starting singularity execution") + + act_id = str(uuid.uuid4()).replace('-', '')[:12] + os.environ['__LITHOPS_ACTIVATION_ID'] = act_id + os.environ['__LITHOPS_BACKEND'] = 'singularity' + + function_handler(payload) + with running_jobs.get_lock(): + running_jobs.value += len(payload['call_ids']) + + logger.info("Finishing singularity execution") + + +def manage_work_queue(ch, method, payload): + """Callback to receive the payload and run the jobs""" + logger.info("Call from lithops received.") + + message = payload + tasks = message['total_calls'] + + # If there are more tasks than cpus in the pod, we need to send a new message + if tasks <= running_jobs.value: + processes_to_start = tasks + else: + if running_jobs.value == 0: + logger.info("All cpus are busy. 
Waiting for a cpu to be free") + ch.basic_nack(delivery_tag=method.delivery_tag) + time.sleep(0.5) + return + + processes_to_start = running_jobs.value + + message_to_send = message.copy() + message_to_send['total_calls'] = tasks - running_jobs.value + message_to_send['call_ids'] = message_to_send['call_ids'][running_jobs.value:] + message_to_send['data_byte_ranges'] = message_to_send['data_byte_ranges'][running_jobs.value:] + message_to_send = {'action': 'send_task', 'payload': dict_to_b64str(message_to_send)} + message['call_ids'] = message['call_ids'][:running_jobs.value] + message['data_byte_ranges'] = message['data_byte_ranges'][:running_jobs.value] + + ch.basic_publish( + exchange='', + routing_key='task_queue', + body=json.dumps(message_to_send), + properties=pika.BasicProperties( + delivery_mode=pika.spec.PERSISTENT_DELIVERY_MODE + )) + + logger.info(f"Starting {processes_to_start} processes") + + message['worker_processes'] = running_jobs.value + with running_jobs.get_lock(): + running_jobs.value -= processes_to_start + + Thread(target=run_job_k8s_rabbitmq, args=([message])).start() + + ch.basic_ack(delivery_tag=method.delivery_tag) + + +def actions_switcher(ch, method, properties, body): + message = json.loads(body) + action = message['action'] + encoded_payload = message['payload'] + + payload = b64str_to_dict(encoded_payload) + setup_lithops_logger(payload.get('log_level', 'INFO')) + + logger.info(f"Action {action} received from lithops.") + + if action == 'get_metadata': + extract_runtime_meta(payload) + ch.basic_ack(delivery_tag=method.delivery_tag) + + elif action == 'send_task': + manage_work_queue(ch, method, payload) + + +if __name__ == '__main__': + # Shared variable to track completed jobs + running_jobs = Value('i', cpu_count()) + + # Connect to rabbitmq + params = pika.URLParameters(sys.argv[1]) + connection = pika.BlockingConnection(params) + channel = connection.channel() + channel.queue_declare(queue='task_queue', durable=True) + channel.basic_qos(prefetch_count=1) + + # Start listening to the new job + channel.basic_consume(queue='task_queue', on_message_callback=actions_switcher) + + logger.info("Listening to rabbitmq...") + channel.start_consuming() diff --git a/lithops/serverless/backends/singularity/singularity.py b/lithops/serverless/backends/singularity/singularity.py new file mode 100644 index 000000000..10c62882b --- /dev/null +++ b/lithops/serverless/backends/singularity/singularity.py @@ -0,0 +1,259 @@ +# +# (C) Copyright Cloudlab URV 2024 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import pika +import hashlib +import json +import logging +import copy +import time + +from lithops import utils +from lithops.version import __version__ +from lithops.constants import COMPUTE_CLI_MSG, JOBS_PREFIX + +from . import config + + +logger = logging.getLogger(__name__) + + +class SingularityBackend: + """ + A wrap-up around Singularity backend. 
+ """ + + def __init__(self, singularity_config, internal_storage): + logger.debug("Creating Singularity client") + self.name = 'singularity' + self.type = utils.BackendType.BATCH.value + self.singularity_config = singularity_config + self.internal_storage = internal_storage + self.amqp_url = self.singularity_config['amqp_url'] + + # Init rabbitmq + params = pika.URLParameters(self.amqp_url) + self.connection = pika.BlockingConnection(params) + self.channel = self.connection.channel() + self.channel.queue_declare(queue='task_queue', durable=True) + + msg = COMPUTE_CLI_MSG.format('Singularity') + logger.info(f"{msg}") + + def _format_job_name(self, runtime_name, runtime_memory, version=__version__): + name = f'{runtime_name}-{runtime_memory}-{version}' + name_hash = hashlib.sha1(name.encode()).hexdigest()[:10] + + return f'lithops-worker-{version.replace(".", "")}-{name_hash}' + + def _get_default_runtime_image_name(self): + """ + Generates the default runtime image name + """ + py_version = utils.CURRENT_PY_VERSION.replace('.', '') + return f'singularity-runtime-v{py_version}' + + def build_runtime(self, singularity_image_name, singularityfile, extra_args=[]): + """ + Builds a new runtime from a Singularity file and pushes it to the registry + """ + logger.info(f'Building runtime {singularity_image_name} from {singularityfile or "Singularity"}') + + singularity_path = config.get_singularity_path() + + sif_path = self.singularity_config.get('sif_path', '/tmp/') + # Check if sif_path ends with /, if not, put it + sif_path = sif_path if sif_path.endswith('/') else sif_path + '/' + singularity_image_path = f'{sif_path}{singularity_image_name}.sif' + + if singularityfile: + assert os.path.isfile(singularityfile), f'Cannot locate "{singularityfile}"' + cmd = f'{singularity_path} build ' + ' '.join(extra_args) + f' {singularity_image_path} {singularityfile}' + else: + default_singularityfile = self._create_default_runtime() + cmd = f'{singularity_path} build ' + ' '.join(extra_args) + f' {singularity_image_path} {default_singularityfile}' + + try: + entry_point = os.path.join(os.path.dirname(__file__), 'entry_point.py') + utils.create_handler_zip(config.FH_ZIP_LOCATION, entry_point, 'lithopsentry.py') + utils.run_command(cmd) + finally: + os.remove(config.FH_ZIP_LOCATION) + + if not singularityfile: + os.remove(default_singularityfile) + + logger.debug('Building done!') + + def _create_default_runtime(self): + """ + Builds the default runtime + """ + # Build default runtime using local dokcer + singularityfile = 'singularity_template.def' + + with open(singularityfile, 'w') as f: + f.write("Bootstrap: docker\n") + f.write(f"From: python:{utils.CURRENT_PY_VERSION}-slim-buster\n") + f.write(config.SINGULARITYFILE_DEFAULT) + + return singularityfile + + def deploy_runtime(self, singularity_image_name, memory, timeout): + """ + Deploys a new runtime + """ + try: + default_image_name = self._get_default_runtime_image_name() + except Exception: + default_image_name = None + + if singularity_image_name == default_image_name: + self.build_runtime(singularity_image_name, None) + + logger.info(f"Deploying runtime: {singularity_image_name}") + runtime_meta = self._generate_runtime_meta(singularity_image_name) + + return runtime_meta + + def delete_runtime(self, singularity_image_name, memory, version): + """ + Deletes a runtime + """ + pass + + def clean(self, all=False): + """ + Deletes all jobs + """ + logger.debug('Cleaning RabbitMQ queues') + self.channel.queue_delete(queue='task_queue') + + def 
list_runtimes(self, singularity_image_name='all'): + """ + List all the runtimes + return: list of tuples (singularity_image_name, memory) + """ + logger.debug('Listing runtimes') + logger.debug('Note that this backend does not manage runtimes') + return [] + + def invoke(self, singularity_image_name, runtime_memory, job_payload): + """ + Invoke -- return information about this invocation + For array jobs only remote_invocator is allowed + """ + job_key = job_payload['job_key'] + granularity = self.singularity_config['worker_processes'] + times, res = divmod(job_payload['total_calls'], granularity) + + for i in range(times + (1 if res != 0 else 0)): + num_tasks = granularity if i < times else res + payload_edited = job_payload.copy() + + start_index = i * granularity + end_index = start_index + num_tasks + + payload_edited['call_ids'] = payload_edited['call_ids'][start_index:end_index] + payload_edited['data_byte_ranges'] = payload_edited['data_byte_ranges'][start_index:end_index] + payload_edited['total_calls'] = num_tasks + + message = { + 'action': 'send_task', + 'payload': utils.dict_to_b64str(payload_edited) + } + + self.channel.basic_publish( + exchange='', + routing_key='task_queue', + body=json.dumps(message), + properties=pika.BasicProperties( + delivery_mode=pika.spec.PERSISTENT_DELIVERY_MODE + )) + + activation_id = f'lithops-{job_key.lower()}' + + return activation_id + + def _generate_runtime_meta(self, singularity_image_name): + runtime_name = self._format_job_name(singularity_image_name, 128) + + logger.info(f"Extracting metadata from: {singularity_image_name}") + + payload = copy.deepcopy(self.internal_storage.storage.config) + payload['runtime_name'] = runtime_name + payload['log_level'] = logger.getEffectiveLevel() + encoded_payload = utils.dict_to_b64str(payload) + + message = { + 'action': 'get_metadata', + 'payload': encoded_payload + } + + # Send message to RabbitMQ + self.channel.basic_publish( + exchange='', + routing_key='task_queue', + body=json.dumps(message), + properties=pika.BasicProperties( + delivery_mode=pika.spec.PERSISTENT_DELIVERY_MODE + )) + + logger.debug("Waiting for runtime metadata") + + for i in range(0, 300): + try: + data_key = '/'.join([JOBS_PREFIX, runtime_name + '.meta']) + json_str = self.internal_storage.get_data(key=data_key) + runtime_meta = json.loads(json_str.decode("ascii")) + self.internal_storage.del_data(key=data_key) + break + except Exception: + time.sleep(2) + + if not runtime_meta or 'preinstalls' not in runtime_meta: + raise Exception(f'Failed getting runtime metadata: {runtime_meta}') + + return runtime_meta + + def get_runtime_key(self, singularity_image_name, runtime_memory, version=__version__): + """ + Method that creates and returns the runtime key. + Runtime keys are used to uniquely identify runtimes within the storage, + in order to know which runtimes are installed and which not. 
+ """ + jobdef_name = self._format_job_name(singularity_image_name, 256, version) + runtime_key = os.path.join(self.name, version, jobdef_name) + + return runtime_key + + def get_runtime_info(self): + """ + Method that returns all the relevant information about the runtime set + in config + """ + if 'runtime' not in self.singularity_config or self.singularity_config['runtime'] == 'default': + self.singularity_config['runtime'] = self._get_default_runtime_image_name() + + runtime_info = { + 'runtime_name': self.singularity_config['runtime'], + 'runtime_memory': self.singularity_config['runtime_memory'], + 'runtime_timeout': self.singularity_config['runtime_timeout'], + 'max_workers': self.singularity_config['max_workers'], + } + + return runtime_info diff --git a/lithops/serverless/serverless.py b/lithops/serverless/serverless.py index 355770787..4f270cd47 100644 --- a/lithops/serverless/serverless.py +++ b/lithops/serverless/serverless.py @@ -95,7 +95,7 @@ def clean(self, **kwargs): """ self.backend.clean(**kwargs) - def clear(self, job_keys=None): + def clear(self, job_keys=None, exception=None): """ Wrapper method to clear the compute backend """ diff --git a/lithops/standalone/__init__.py b/lithops/standalone/__init__.py index 920733ec4..c0f4f6d71 100644 --- a/lithops/standalone/__init__.py +++ b/lithops/standalone/__init__.py @@ -1 +1,4 @@ from .standalone import StandaloneHandler +from .utils import LithopsValidationError + +__all__ = ['StandaloneHandler', LithopsValidationError] diff --git a/lithops/standalone/backends/aws_ec2/__init__.py b/lithops/standalone/backends/aws_ec2/__init__.py index f440093f8..cd2e11568 100644 --- a/lithops/standalone/backends/aws_ec2/__init__.py +++ b/lithops/standalone/backends/aws_ec2/__init__.py @@ -1 +1,3 @@ from .aws_ec2 import AWSEC2Backend as StandaloneBackend + +__all__ = ['StandaloneBackend'] diff --git a/lithops/standalone/backends/aws_ec2/aws_ec2.py b/lithops/standalone/backends/aws_ec2/aws_ec2.py index 25630c829..4a5eb2ec0 100644 --- a/lithops/standalone/backends/aws_ec2/aws_ec2.py +++ b/lithops/standalone/backends/aws_ec2/aws_ec2.py @@ -20,19 +20,18 @@ import uuid import logging import base64 +import boto3 +import botocore from botocore.exceptions import ClientError from datetime import datetime from concurrent.futures import ThreadPoolExecutor -import boto3 -import botocore - from lithops.version import __version__ from lithops.util.ssh_client import SSHClient -from lithops.constants import COMPUTE_CLI_MSG, CACHE_DIR, SA_IMAGE_NAME_DEFAULT +from lithops.constants import COMPUTE_CLI_MSG, CACHE_DIR from lithops.config import load_yaml_config, dump_yaml_config -from lithops.standalone.utils import CLOUD_CONFIG_WORKER, CLOUD_CONFIG_WORKER_PK, ExecMode, get_host_setup_script -from lithops.standalone.standalone import LithopsValidationError +from lithops.standalone.utils import CLOUD_CONFIG_WORKER, CLOUD_CONFIG_WORKER_PK, StandaloneMode, get_host_setup_script +from lithops.standalone import LithopsValidationError logger = logging.getLogger(__name__) @@ -43,6 +42,9 @@ DEFAULT_UBUNTU_IMAGE_VERSION = DEFAULT_UBUNTU_IMAGE.replace('*', '202306*') DEFAULT_UBUNTU_ACCOUNT_ID = '099720109477' +DEFAULT_LITHOPS_IMAGE_NAME = 'lithops-ubuntu-jammy-22.04-amd64-server' + + def b64s(string): """ Base-64 encode a string and return a string @@ -52,40 +54,58 @@ def b64s(string): class AWSEC2Backend: - def __init__(self, config, mode): + def __init__(self, ec2_config, mode): logger.debug("Creating AWS EC2 client") self.name = 'aws_ec2' - self.config = config + 
self.config = ec2_config self.mode = mode self.region_name = self.config['region'] + + suffix = 'vm' if self.mode == StandaloneMode.CONSUME.value else 'vpc' self.cache_dir = os.path.join(CACHE_DIR, self.name) - self.cache_file = os.path.join(self.cache_dir, self.region_name + '_data') + self.cache_file = os.path.join(self.cache_dir, f'{self.region_name}_{suffix}_data') + self.vpc_data_type = 'provided' if 'vpc_id' in self.config else 'created' self.ssh_data_type = 'provided' if 'ssh_key_name' in self.config else 'created' self.ec2_data = {} self.vpc_name = None self.vpc_key = None - self.user_key = self.config['access_key_id'][-4:].lower() - client_config = botocore.client.Config( - user_agent_extra=self.config['user_agent'] - ) + self.instance_types = {} - self.ec2_client = boto3.client( - 'ec2', aws_access_key_id=self.config['access_key_id'], - aws_secret_access_key=self.config['secret_access_key'], - aws_session_token=self.config.get('session_token'), - config=client_config, + self.aws_session = boto3.Session( + aws_access_key_id=ec2_config.get('access_key_id'), + aws_secret_access_key=ec2_config.get('secret_access_key'), + aws_session_token=ec2_config.get('session_token'), region_name=self.region_name ) + self.ec2_client = self.aws_session.client( + 'ec2', config=botocore.client.Config( + user_agent_extra=self.config['user_agent'] + ) + ) + + if 'user_id' not in self.config: + sts_client = self.aws_session.client('sts') + identity = sts_client.get_caller_identity() + + self.user_id = self.config.get('user_id') or identity["UserId"] + self.user_key = self.user_id.split(":")[0][-4:].lower() + self.master = None self.workers = [] msg = COMPUTE_CLI_MSG.format('AWS EC2') logger.info(f"{msg} - Region: {self.region_name}") + def is_initialized(self): + """ + Checks if the backend is initialized + """ + return os.path.isfile(self.cache_file) + def _load_ec2_data(self): """ Loads EC2 data from local cache @@ -105,6 +125,13 @@ def _dump_ec2_data(self): """ dump_yaml_config(self.cache_file, self.ec2_data) + def _delete_vpc_data(self): + """ + Deletes the vpc data file + """ + if os.path.exists(self.cache_file): + os.remove(self.cache_file) + def _create_vpc(self): """ Creates a new VPC @@ -113,15 +140,16 @@ def _create_vpc(self): return if 'vpc_id' in self.ec2_data: + logger.debug(f'Using VPC {self.ec2_data["vpc_name"]}') vpcs_info = self.ec2_client.describe_vpcs(VpcIds=[self.ec2_data['vpc_id']]) if len(vpcs_info) > 0: self.config['vpc_id'] = self.ec2_data['vpc_id'] return self.vpc_name = self.config.get('vpc_name', f'lithops-vpc-{self.user_key}-{str(uuid.uuid4())[-6:]}') - logger.debug(f'Setting VPC name to: {self.vpc_name}') + logger.debug(f'Setting VPC name to {self.vpc_name}') - assert re.match("^[a-z0-9-:-]*$", self.vpc_name),\ + assert re.match("^[a-z0-9-:-]*$", self.vpc_name), \ f'VPC name "{self.vpc_name}" not valid' filter = [{'Name': 'tag:Name', 'Values': [self.vpc_name]}] @@ -131,12 +159,50 @@ def _create_vpc(self): if 'vpc_id' not in self.config: logger.debug(f'Creating VPC {self.vpc_name}') - response = self.ec2_client.create_vpc(CidrBlock='192.168.0.0/16') + response = self.ec2_client.create_vpc(CidrBlock='10.0.0.0/16') tags = [{"Key": "Name", "Value": self.vpc_name}] self.ec2_client.create_tags(Resources=[response['Vpc']['VpcId']], Tags=tags) self.config['vpc_id'] = response['Vpc']['VpcId'] + def _create_subnets(self): + """ + Creates a public and a private subnets + """ + if 'public_subnet_id' in self.config: + return + + if 'public_subnet_id' in self.ec2_data: + sg_info = 
self.ec2_client.describe_subnets( + SubnetIds=[self.ec2_data['public_subnet_id']] + ) + if len(sg_info) > 0: + self.config['public_subnet_id'] = self.ec2_data['public_subnet_id'] + + if 'public_subnet_id' not in self.config: + logger.debug(f'Creating new public subnet in VPC {self.vpc_name}') + response = self.ec2_client.create_subnet( + CidrBlock=self.config['public_subnet_cidr_block'], + VpcId=self.config['vpc_id'], + ) + public_subnet_id = response['Subnet']['SubnetId'] + self.config['public_subnet_id'] = public_subnet_id + + # if 'private_subnet_id' in self.ec2_data: + # sg_info = self.ec2_client.describe_subnets( + # SubnetIds=[self.ec2_data['private_subnet_id']] + # ) + # if len(sg_info) > 0: + # self.config['private_subnet_id'] = self.ec2_data['private_subnet_id'] + # + # if 'private_subnet_id' not in self.config: + # logger.debug(f'Creating new private subnet in VPC {self.vpc_name}') + # response = self.ec2_client.create_subnet( + # CidrBlock='10.0.2.0/24', VpcId=self.config['vpc_id'] + # ) + # private_subnet_id = response['Subnet']['SubnetId'] + # self.config['private_subnet_id'] = private_subnet_id + def _create_internet_gateway(self): """ Creates a new internet gateway @@ -145,7 +211,9 @@ def _create_internet_gateway(self): return if 'internet_gateway_id' in self.ec2_data: - ig_info = self.ec2_client.describe_internet_gateways(InternetGatewayIds=[self.ec2_data['internet_gateway_id']]) + ig_info = self.ec2_client.describe_internet_gateways( + InternetGatewayIds=[self.ec2_data['internet_gateway_id']] + ) if len(ig_info) > 0: self.config['internet_gateway_id'] = self.ec2_data['internet_gateway_id'] return @@ -157,46 +225,119 @@ def _create_internet_gateway(self): if 'internet_gateway_id' not in self.config: # Create and Attach the Internet Gateway + logger.debug(f'Creating Internet Gateway in VPC {self.vpc_name}') response = self.ec2_client.create_internet_gateway() internet_gateway_id = response['InternetGateway']['InternetGatewayId'] - self.ec2_client.attach_internet_gateway(VpcId=self.config['vpc_id'], InternetGatewayId=internet_gateway_id) + self.ec2_client.attach_internet_gateway( + VpcId=self.config['vpc_id'], InternetGatewayId=internet_gateway_id + ) self.config['internet_gateway_id'] = internet_gateway_id - # Create a public route to Internet Gateway - response = self.ec2_client.describe_route_tables() - for rt in response['RouteTables']: - if rt['VpcId'] == self.config['vpc_id']: - route_table_id = rt['RouteTableId'] - self.ec2_client.create_route( - DestinationCidrBlock='0.0.0.0/0', - GatewayId=internet_gateway_id, - RouteTableId=route_table_id + def _create_nat_gateway(self): + """ + Creates a new internet gateway + """ + if 'nat_gateway_id' in self.config: + return + + if 'nat_gateway_id' in self.ec2_data: + ig_info = self.ec2_client.describe_nat_gateways( + NatGatewayIds=[self.ec2_data['nat_gateway_id']] + ) + if len(ig_info) > 0: + self.config['nat_gateway_id'] = self.ec2_data['nat_gateway_id'] + return + + response = self.ec2_client.describe_nat_gateways() + for ng in response['NatGateways']: + if ng['SubnetId'] == self.config['public_subnet_id']: + self.config['nat_gateway_id'] = ng['NatGatewayId'] + + if 'nat_gateway_id' not in self.config: + logger.debug(f'Creating NAT Gateway in VPC {self.vpc_name}') + # Create an Elastic IP address for the NAT Gateway + # Create the NAT gateway can take up to 2 minutes + # TODO: Reuse Elastic IP adress if available + eip_resp = self.ec2_client.allocate_address(Domain='vpc') + allocation_id = eip_resp['AllocationId'] + + # Create 
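# --- Editor's illustrative sketch (not part of the patch) -------------------
# How the new NAT gateway step works: allocate an Elastic IP, create the NAT
# gateway in the public subnet, and block until it reports 'available'. The
# subnet id in the usage note is a placeholder.
import boto3

def create_nat_gateway(ec2_client, public_subnet_id):
    """Allocate an EIP, create a NAT gateway and wait until it is available."""
    allocation_id = ec2_client.allocate_address(Domain='vpc')['AllocationId']
    nat_gw_id = ec2_client.create_nat_gateway(
        SubnetId=public_subnet_id, AllocationId=allocation_id
    )['NatGateway']['NatGatewayId']
    ec2_client.get_waiter('nat_gateway_available').wait(
        NatGatewayIds=[nat_gw_id], WaiterConfig={'Delay': 5, 'MaxAttempts': 40}
    )
    return nat_gw_id

# Usage (placeholder subnet id):
# ec2 = boto3.client('ec2', region_name='us-east-1')
# create_nat_gateway(ec2, 'subnet-0123456789abcdef0')
# ---------------------------------------------------------------------------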
a NAT Gateway + nat_gateway_resp = self.ec2_client.create_nat_gateway( + SubnetId=self.config['public_subnet_id'], + AllocationId=allocation_id + ) + nat_gateway_id = nat_gateway_resp['NatGateway']['NatGatewayId'] + self.config['nat_gateway_id'] = nat_gateway_id + + self.ec2_client.get_waiter('nat_gateway_available').wait( + NatGatewayIds=[nat_gateway_id], + WaiterConfig={'Delay': 5, 'MaxAttempts': 40} ) - def _create_subnet(self): + def _create_routing_tables(self): """ - Creates a new Subnet + Creates the routing tables """ - if 'subnet_id' in self.config: + if 'public_rtb_id' in self.config: return - if 'subnet_id' in self.ec2_data: - sg_info = self.ec2_client.describe_subnets(SubnetIds=[self.ec2_data['subnet_id']]) + if 'public_rtb_id' in self.ec2_data: + sg_info = self.ec2_client.describe_route_tables( + RouteTableIds=[self.ec2_data['public_rtb_id']] + ) if len(sg_info) > 0: - self.config['subnet_id'] = self.ec2_data['subnet_id'] - return + self.config['public_rtb_id'] = self.ec2_data['public_rtb_id'] - response = self.ec2_client.describe_subnets() - for subnet in response['Subnets']: - if subnet['VpcId'] == self.config['vpc_id']: - subnet_id = subnet['SubnetId'] - self.config['subnet_id'] = subnet_id - - if 'subnet_id' not in self.config: - logger.debug(f'Creating new subnet in VPC {self.vpc_name}') - response = self.ec2_client.create_subnet(CidrBlock='192.168.0.0/16', VpcId=self.config['vpc_id']) - subnet_id = response['Subnet']['SubnetId'] - self.config['subnet_id'] = subnet_id + if 'public_rtb_id' not in self.config: + logger.debug(f'Creating public routing table in VPC {self.vpc_name}') + # The default RT is the public RT + response = self.ec2_client.describe_route_tables() + for rt in response['RouteTables']: + if rt['VpcId'] == self.config['vpc_id']: + publ_route_table_id = rt['RouteTableId'] + self.ec2_client.create_tags( + Resources=[publ_route_table_id], + Tags=[{'Key': 'Name', 'Value': f'{self.vpc_name}-publ'}] + ) + self.ec2_client.associate_route_table( + RouteTableId=publ_route_table_id, + SubnetId=self.config['public_subnet_id'] + ) + self.ec2_client.create_route( + RouteTableId=publ_route_table_id, + DestinationCidrBlock='0.0.0.0/0', + GatewayId=self.config['internet_gateway_id'] + ) + self.config['public_rtb_id'] = publ_route_table_id + + # if 'private_rtb_id' in self.ec2_data: + # sg_info = self.ec2_client.describe_route_tables( + # RouteTableIds=[self.ec2_data['private_rtb_id']] + # ) + # if len(sg_info) > 0: + # self.config['private_rtb_id'] = self.ec2_data['private_rtb_id'] + # + # if 'private_rtb_id' not in self.config: + # logger.debug(f'Creating private routing table in VPC {self.vpc_name}') + # # Create private RT + # priv_route_table_resp = self.ec2_client.create_route_table( + # VpcId=self.config['vpc_id'] + # ) + # priv_route_table_id = priv_route_table_resp['RouteTable']['RouteTableId'] + # self.ec2_client.create_tags( + # Resources=[priv_route_table_id], + # Tags=[{'Key': 'Name', 'Value': f'{self.vpc_name}-priv'}] + # ) + # self.ec2_client.associate_route_table( + # RouteTableId=priv_route_table_id, + # SubnetId=self.config['private_subnet_id'] + # ) + # self.ec2_client.create_route( + # RouteTableId=priv_route_table_id, + # DestinationCidrBlock='0.0.0.0/0', + # GatewayId=self.config['nat_gateway_id'] + # ) + # self.config['private_rtb_id'] = priv_route_table_id def _create_security_group(self): """ @@ -206,7 +347,9 @@ def _create_security_group(self): return if 'security_group_id' in self.ec2_data: - sg_info = 
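# --- Editor's illustrative sketch (not part of the patch) -------------------
# The public routing table above is the VPC's default route table, which gets
# tagged, associated with the public subnet, and given a 0.0.0.0/0 route via
# the internet gateway. A condensed version of that wiring (ids are placeholders):
import boto3

def wire_public_route_table(ec2_client, vpc_id, public_subnet_id, igw_id, name):
    """Tag the VPC's main route table and route 0.0.0.0/0 through the IGW."""
    for rt in ec2_client.describe_route_tables()['RouteTables']:
        if rt['VpcId'] != vpc_id:
            continue
        rtb_id = rt['RouteTableId']
        ec2_client.create_tags(
            Resources=[rtb_id], Tags=[{'Key': 'Name', 'Value': f'{name}-publ'}]
        )
        ec2_client.associate_route_table(RouteTableId=rtb_id, SubnetId=public_subnet_id)
        ec2_client.create_route(
            RouteTableId=rtb_id, DestinationCidrBlock='0.0.0.0/0', GatewayId=igw_id
        )
        return rtb_id
# ---------------------------------------------------------------------------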
self.ec2_client.describe_security_groups(GroupIds=[self.ec2_data['security_group_id']]) + sg_info = self.ec2_client.describe_security_groups( + GroupIds=[self.ec2_data['security_group_id']] + ) if len(sg_info) > 0: self.config['security_group_id'] = self.ec2_data['security_group_id'] return @@ -217,7 +360,7 @@ def _create_security_group(self): self.config['security_group_id'] = sg['GroupId'] if 'security_group_id' not in self.config: - logger.debug(f'Creating new security group in VPC {self.vpc_name}') + logger.debug(f'Creating Security Group in VPC {self.vpc_name}') response = self.ec2_client.create_security_group( GroupName=self.vpc_name, Description=self.vpc_name, @@ -230,7 +373,15 @@ def _create_security_group(self): {'IpProtocol': 'tcp', 'FromPort': 8080, 'ToPort': 8080, - 'IpRanges': [{'CidrIp': '192.168.0.0/16'}]}, + 'IpRanges': [{'CidrIp': '10.0.0.0/16'}]}, + {'IpProtocol': 'tcp', + 'FromPort': 8081, + 'ToPort': 8081, + 'IpRanges': [{'CidrIp': '10.0.0.0/16'}]}, + {'IpProtocol': 'tcp', + 'FromPort': 6379, + 'ToPort': 6379, + 'IpRanges': [{'CidrIp': '10.0.0.0/16'}]}, {'IpProtocol': 'tcp', 'FromPort': 22, 'ToPort': 22, @@ -293,12 +444,12 @@ def _request_image_id(self): response = self.ec2_client.describe_images(Filters=[ { 'Name': 'name', - 'Values': [SA_IMAGE_NAME_DEFAULT] + 'Values': [DEFAULT_LITHOPS_IMAGE_NAME] }]) for image in response['Images']: - if image['Name'] == SA_IMAGE_NAME_DEFAULT: - logger.debug(f"Found default AMI: {SA_IMAGE_NAME_DEFAULT}") + if image['Name'] == DEFAULT_LITHOPS_IMAGE_NAME: + logger.debug(f"Found default AMI: {DEFAULT_LITHOPS_IMAGE_NAME}") self.config['target_ami'] = image['ImageId'] break @@ -317,7 +468,7 @@ def _create_master_instance(self): """ name = self.config.get('master_name') or f'lithops-master-{self.vpc_key}' self.master = EC2Instance(name, self.config, self.ec2_client, public=True) - self.master.instance_id = self.config['instance_id'] if self.mode == ExecMode.CONSUME.value else None + self.master.instance_id = self.config['instance_id'] if self.mode == StandaloneMode.CONSUME.value else None self.master.instance_type = self.config['master_instance_type'] self.master.delete_on_dismantle = False self.master.ssh_credentials.pop('password') @@ -341,6 +492,35 @@ def _request_spot_price(self): self.config["spot_price"] = max(spot_prices) logger.debug(f'Current spot instance price for {wit} is ${self.config["spot_price"]}') + def _get_all_instance_types(self): + """ + Gets all instance types and their CPU COUNT + """ + if 'instance_types' in self.ec2_data: + self.instance_types = self.ec2_data['instance_types'] + return + + instances = {} + next_token = None + + while True: + if next_token: + response = self.ec2_client.describe_instance_types(NextToken=next_token) + else: + response = self.ec2_client.describe_instance_types() + + for instance_type in response['InstanceTypes']: + instance_name = instance_type['InstanceType'] + cpu_count = instance_type['VCpuInfo']['DefaultVCpus'] + instances[instance_name] = cpu_count + + next_token = response.get('NextToken') + + if not next_token: + break + + self.instance_types = instances + def init(self): """ Initialize the backend by defining the Master VM @@ -348,31 +528,31 @@ def init(self): logger.debug(f'Initializing AWS EC2 backend ({self.mode} mode)') self._load_ec2_data() - if self.mode != self.ec2_data.get('mode'): - self.ec2_data = {} - if self.mode == ExecMode.CONSUME.value: + if self.mode == StandaloneMode.CONSUME.value: ins_id = self.config['instance_id'] if not self.ec2_data or ins_id != 
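# --- Editor's illustrative sketch (not part of the patch) -------------------
# _get_all_instance_types() pages through describe_instance_types with
# NextToken and caches an {instance_type: vCPU count} map. The same result can
# be obtained with boto3's built-in paginator:
import boto3

def get_instance_type_cpus(region_name='us-east-1'):
    """Return a dict mapping EC2 instance type name to its default vCPU count."""
    ec2 = boto3.client('ec2', region_name=region_name)
    cpus = {}
    for page in ec2.get_paginator('describe_instance_types').paginate():
        for itype in page['InstanceTypes']:
            cpus[itype['InstanceType']] = itype['VCpuInfo']['DefaultVCpus']
    return cpus

# e.g. get_instance_type_cpus()['t3.medium'] -> 2
# ---------------------------------------------------------------------------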
self.ec2_data.get('instance_id'): instances = self.ec2_client.describe_instances(InstanceIds=[ins_id]) instance_data = instances['Reservations'][0]['Instances'][0] - self.config['master_name'] = 'lithops-consume' + master_name = 'lithops-consume' for tag in instance_data['Tags']: if tag['Key'] == 'Name': - self.config['master_name'] = tag['Value'] + master_name = tag['Value'] + self.ec2_data = { + 'mode': self.mode, + 'vpc_data_type': 'provided', + 'ssh_data_type': 'provided', + 'master_name': master_name, + 'master_id': self.config['instance_id'], + 'instance_type': instance_data['InstanceType'] + } # Create the master VM instance + self.config['master_name'] = self.ec2_data['master_name'] + self.config['master_instance_type'] = self.ec2_data['instance_type'] self._create_master_instance() - self.ec2_data = { - 'mode': self.mode, - 'vpc_data_type': 'provided', - 'ssh_data_type': 'provided', - 'master_name': self.master.name, - 'master_id': self.master.instance_id - } - - elif self.mode in [ExecMode.CREATE.value, ExecMode.REUSE.value]: + elif self.mode in [StandaloneMode.CREATE.value, StandaloneMode.REUSE.value]: # Create the VPC if not exists self._create_vpc() @@ -380,10 +560,14 @@ def init(self): # Set the suffix used for the VPC resources self.vpc_key = self.config['vpc_id'][-6:] + # Create the Subnet if not exists + self._create_subnets() # Create the internet gateway if not exists self. _create_internet_gateway() - # Create the Subnet if not exists - self._create_subnet() + # Create the NAT gateway + # self._create_nat_gateway() + # Create routing tables + self._create_routing_tables() # Create the security group if not exists self._create_security_group() # Create the ssh key pair if not exists @@ -392,6 +576,8 @@ def init(self): self._request_image_id() # Request SPOT price self._request_spot_price() + # Request instance types + self._get_all_instance_types() # Create the master VM instance self._create_master_instance() @@ -404,41 +590,57 @@ def init(self): 'master_id': self.vpc_key, 'vpc_name': self.vpc_name, 'vpc_id': self.config['vpc_id'], - 'iam_role': self.config['iam_role'], + 'instance_role': self.config['instance_role'], 'target_ami': self.config['target_ami'], 'ssh_key_name': self.config['ssh_key_name'], 'ssh_key_filename': self.config['ssh_key_filename'], - 'subnet_id': self.config['subnet_id'], + 'public_subnet_id': self.config['public_subnet_id'], + # 'private_subnet_id': self.config['private_subnet_id'], 'security_group_id': self.config['security_group_id'], - 'internet_gateway_id': self.config['internet_gateway_id'] + 'internet_gateway_id': self.config['internet_gateway_id'], + # 'nat_gateway_id': self.config['nat_gateway_id'], + # 'private_rtb_id': self.config['private_rtb_id'], + 'public_rtb_id': self.config['public_rtb_id'], + 'instance_types': self.instance_types } self._dump_ec2_data() - def build_image(self, image_name, script_file, overwrite, extra_args=[]): + def build_image(self, image_name, script_file, overwrite, include, extra_args=[]): """ Builds a new VM Image """ + image_name = image_name or DEFAULT_LITHOPS_IMAGE_NAME + images = self.ec2_client.describe_images(Filters=[ { 'Name': 'name', 'Values': [image_name] }])['Images'] + if len(images) > 0: image_id = images[0]['ImageId'] if overwrite: - logger.debug(f"Deleting existing VM Image '{image_name}'") - self.ec2_client.deregister_image(ImageId=image_id) - while len(self.ec2_client.describe_images(Filters=[{'Name': 'name', 'Values': [image_name]}])['Images']) > 0: - time.sleep(2) + 
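# --- Editor's illustrative sketch (not part of the patch) -------------------
# In consume mode the backend now records the master VM's Name tag and its
# instance type in the cached data instead of only the name. The lookup it
# performs boils down to (instance id is a placeholder):
import boto3

def describe_master(ec2_client, instance_id):
    """Return (name_tag, instance_type) for an existing EC2 instance."""
    reservations = ec2_client.describe_instances(InstanceIds=[instance_id])
    instance = reservations['Reservations'][0]['Instances'][0]
    name = next(
        (t['Value'] for t in instance.get('Tags', []) if t['Key'] == 'Name'),
        'lithops-consume'
    )
    return name, instance['InstanceType']

# ec2 = boto3.client('ec2', region_name='us-east-1')
# describe_master(ec2, 'i-0123456789abcdef0')
# ---------------------------------------------------------------------------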
self.delete_image(image_name) else: raise Exception(f"The image with name '{image_name}' already exists with ID: '{image_id}'." " Use '--overwrite' or '-o' if you want ot overwrite it") - initial_vpc_data = self._load_ec2_data() + is_initialized = self.is_initialized() self.init() - build_vm = EC2Instance(image_name, self.config, self.ec2_client, public=True) + try: + del self.config['target_ami'] + except Exception: + pass + try: + del self.ec2_data['target_ami'] + except Exception: + pass + + self._request_image_id() + + build_vm = EC2Instance('building-image-' + image_name, self.config, self.ec2_client, public=True) build_vm.delete_on_dismantle = False build_vm.create() build_vm.wait_ready() @@ -447,18 +649,24 @@ def build_image(self, image_name, script_file, overwrite, extra_args=[]): remote_script = "/tmp/install_lithops.sh" script = get_host_setup_script() build_vm.get_ssh_client().upload_data_to_file(script, remote_script) - logger.debug("Executing installation script. Be patient, this process can take up to 3 minutes") + logger.debug("Executing Lithops installation script. Be patient, this process can take up to 3 minutes") build_vm.get_ssh_client().run_remote_command(f"chmod 777 {remote_script}; sudo {remote_script}; rm {remote_script};") - logger.debug("Installation script finsihed") + logger.debug("Lithops installation script finsihed") + + for src_dst_file in include: + src_file, dst_file = src_dst_file.split(':') + if os.path.isfile(src_file): + logger.debug(f"Uploading local file '{src_file}' to VM image in '{dst_file}'") + build_vm.get_ssh_client().upload_local_file(src_file, dst_file) if script_file: script = os.path.expanduser(script_file) - logger.debug(f"Uploading user script {script_file} to {build_vm}") + logger.debug(f"Uploading user script '{script_file}' to {build_vm}") remote_script = "/tmp/install_user_lithops.sh" build_vm.get_ssh_client().upload_local_file(script, remote_script) - logger.debug("Executing user script. Be patient, this process can take long") + logger.debug(f"Executing user script '{script_file}'") build_vm.get_ssh_client().run_remote_command(f"chmod 777 {remote_script}; sudo {remote_script}; rm {remote_script};") - logger.debug("User script finsihed") + logger.debug(f"User script '{script_file}' finsihed") build_vm_id = build_vm.get_instance_id() @@ -471,6 +679,7 @@ def build_image(self, image_name, script_file, overwrite, extra_args=[]): Description='Lithops Image' ) + logger.debug("Starting VM image creation") logger.debug("Be patient, VM imaging can take up to 5 minutes") while True: @@ -481,13 +690,35 @@ def build_image(self, image_name, script_file, overwrite, extra_args=[]): break time.sleep(20) - build_vm.delete() - - if not initial_vpc_data: - self.clean(all) + if not is_initialized: + while not self.clean(all=True): + time.sleep(5) + else: + build_vm.delete() logger.info(f"VM Image created. 
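# --- Editor's illustrative sketch (not part of the patch) -------------------
# build_image() snapshots the build VM with create_image and then polls until
# the AMI reaches the 'available' state. boto3 also ships an 'image_available'
# waiter that could replace the manual polling loop; a sketch of that variant
# (ids are placeholders):
import boto3

def snapshot_instance(ec2_client, instance_id, image_name):
    """Create an AMI from a running instance and wait until it is available."""
    image_id = ec2_client.create_image(
        InstanceId=instance_id, Name=image_name, Description='Lithops Image'
    )['ImageId']
    ec2_client.get_waiter('image_available').wait(
        ImageIds=[image_id], WaiterConfig={'Delay': 20, 'MaxAttempts': 30}
    )
    return image_id
# ---------------------------------------------------------------------------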
Image ID: {images[0]['ImageId']}") + def delete_image(self, image_name): + """ + Deletes a VM Image + """ + def list_images(): + return self.ec2_client.describe_images(Filters=[ + { + 'Name': 'name', + 'Values': [image_name] + }])['Images'] + + images = list_images() + + if len(images) > 0: + image_id = images[0]['ImageId'] + logger.debug(f"Deleting existing VM Image '{image_name}'") + self.ec2_client.deregister_image(ImageId=image_id) + while len(list_images()) > 0: + time.sleep(2) + logger.debug(f"VM Image '{image_name}' successfully deleted") + def list_images(self): """ List VM Images @@ -521,7 +752,7 @@ def _delete_vm_instances(self, all=False): if self.vpc_name else 'Deleting all Lithops worker VMs') logger.info(msg) - vms_prefixes = ('lithops-worker', 'lithops-master') if all else ('lithops-worker',) + vms_prefixes = ('lithops-worker', 'lithops-master', 'building-image') if all else ('lithops-worker',) ins_to_delete = [] response = self.ec2_client.describe_instances() @@ -532,13 +763,13 @@ def _delete_vm_instances(self, all=False): for tag in ins['Tags']: if tag['Key'] == 'Name' and tag['Value'].startswith(vms_prefixes): ins_to_delete.append(ins['InstanceId']) - logger.debug(f"Going to delete VM instance {tag['Value']}") + logger.debug(f"Going to delete VM instance {tag['Value']} ({ins['InstanceId']})") if ins_to_delete: self.ec2_client.terminate_instances(InstanceIds=ins_to_delete) master_pk = os.path.join(self.cache_dir, f"{self.ec2_data['master_name']}-id_rsa.pub") - if os.path.isfile(master_pk): + if all and os.path.isfile(master_pk): os.remove(master_pk) if self.ec2_data['vpc_data_type'] == 'provided': @@ -570,6 +801,7 @@ def _delete_vpc(self): total_correct = 0 + # Security Group try: logger.debug(f"Deleting security group {self.ec2_data['security_group_id']}") self.ec2_client.delete_security_group( @@ -577,46 +809,87 @@ def _delete_vpc(self): ) total_correct += 1 except ClientError as e: - if e.response['ResponseMetadata']['HTTPStatusCode'] == 400: + if e.response['ResponseMetadata']['HTTPStatusCode'] == 400 and \ + 'does not exist' in e.response['Error']['Message']: total_correct += 1 logger.debug(e.response['Error']['Message']) + + # NAT Gateway + # try: + # logger.debug(f"Deleting nat gateway {self.ec2_data['nat_gateway_id']}") + # self.ec2_client.delete_nat_gateway( + # NatGatewayId=self.ec2_data['nat_gateway_id'] + # ) + # self.ec2_client.get_waiter('nat_gateway_deleted').wait( + # NatGatewayIds=[self.ec2_data['nat_gateway_id']], + # WaiterConfig={'Delay': 5, 'MaxAttempts': 40} + # ) + # total_correct += 1 + # except ClientError as e: + # if e.response['ResponseMetadata']['HTTPStatusCode'] == 400 and \ + # 'does not exist' in e.response['Error']['Message']: + # total_correct += 1 + # logger.debug(e.response['Error']['Message']) + + # Subnets try: - logger.debug(f"Deleting {self.ec2_data['subnet_id']}") - self.ec2_client.delete_subnet(SubnetId=self.ec2_data['subnet_id']) + logger.debug(f"Deleting public {self.ec2_data['public_subnet_id']}") + self.ec2_client.delete_subnet(SubnetId=self.ec2_data['public_subnet_id']) total_correct += 1 except ClientError as e: - if e.response['ResponseMetadata']['HTTPStatusCode'] == 400: + if e.response['ResponseMetadata']['HTTPStatusCode'] == 400 and \ + 'does not exist' in e.response['Error']['Message']: total_correct += 1 logger.debug(e.response['Error']['Message']) + # try: + # logger.debug(f"Deleting private {self.ec2_data['private_subnet_id']}") + # self.ec2_client.delete_subnet(SubnetId=self.ec2_data['private_subnet_id']) + # 
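# --- Editor's illustrative sketch (not part of the patch) -------------------
# _delete_vm_instances() now also matches the 'building-image' prefix so that
# leftover image-build VMs are cleaned up with everything else. The selection
# logic is essentially the following (simplified; the state filter is an
# editor's assumption to avoid re-terminating already-terminated instances):
import boto3

def find_lithops_instances(ec2_client, include_master=False):
    """Return ids of Lithops VMs selected by their Name tag prefix."""
    prefixes = ('lithops-worker', 'lithops-master', 'building-image') \
        if include_master else ('lithops-worker',)
    ids = []
    for reservation in ec2_client.describe_instances()['Reservations']:
        for ins in reservation['Instances']:
            if ins['State']['Name'] in ('terminated', 'shutting-down'):
                continue
            for tag in ins.get('Tags', []):
                if tag['Key'] == 'Name' and tag['Value'].startswith(prefixes):
                    ids.append(ins['InstanceId'])
    return ids

# ec2_client.terminate_instances(InstanceIds=find_lithops_instances(ec2_client, True))
# ---------------------------------------------------------------------------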
total_correct += 1 + # except ClientError as e: + # if e.response['ResponseMetadata']['HTTPStatusCode'] == 400 and \ + # 'does not exist' in e.response['Error']['Message']: + # total_correct += 1 + # logger.debug(e.response['Error']['Message']) + + # Internet gateway try: - logger.debug(f"Deleting internet gateway {self.ec2_data['internet_gateway_id']}") + logger.debug(f"Detaching internet gateway {self.ec2_data['internet_gateway_id']}") self.ec2_client.detach_internet_gateway( InternetGatewayId=self.ec2_data['internet_gateway_id'], VpcId=self.ec2_data['vpc_id']) total_correct += 1 except ClientError as e: - if e.response['ResponseMetadata']['HTTPStatusCode'] == 400: + if e.response['ResponseMetadata']['HTTPStatusCode'] == 400 and \ + 'does not exist' in e.response['Error']['Message']: total_correct += 1 logger.debug(e.response['Error']['Message']) try: + logger.debug(f"Deleting internet gateway {self.ec2_data['internet_gateway_id']}") self.ec2_client.delete_internet_gateway( InternetGatewayId=self.ec2_data['internet_gateway_id'] ) total_correct += 1 except ClientError as e: - if e.response['ResponseMetadata']['HTTPStatusCode'] == 400: + if e.response['ResponseMetadata']['HTTPStatusCode'] == 400 and \ + 'does not exist' in e.response['Error']['Message']: total_correct += 1 logger.debug(e.response['Error']['Message']) + + # VPC try: logger.debug(f"Deleting VPC {self.ec2_data['vpc_id']}") self.ec2_client.delete_vpc(VpcId=self.ec2_data['vpc_id']) total_correct += 1 except ClientError as e: - if e.response['ResponseMetadata']['HTTPStatusCode'] == 400: + if e.response['ResponseMetadata']['HTTPStatusCode'] == 400 and \ + 'does not exist' in e.response['Error']['Message']: total_correct += 1 logger.debug(e.response['Error']['Message']) - assert total_correct == 5, "Couldn't delete all the VPC resources, try againg in a few seconds" + if total_correct < 5: + logger.error("Couldn't delete all the VPC resources, try againg in a few seconds") + + return total_correct == 5 def _delete_ssh_key(self): """ @@ -645,20 +918,21 @@ def clean(self, all=False): """ logger.info('Cleaning AWS EC2 resources') - self._load_ec2_data() - if not self.ec2_data: - return + return True - if self.ec2_data['mode'] == ExecMode.CONSUME.value: - if os.path.exists(self.cache_file): - os.remove(self.cache_file) + if self.mode == StandaloneMode.CONSUME.value: + self._delete_vpc_data() + return True else: self._delete_vm_instances(all=all) - self._delete_vpc() if all else None - self._delete_ssh_key() if all else None - if all and os.path.exists(self.cache_file): - os.remove(self.cache_file) + if all: + if self._delete_vpc(): + self._delete_ssh_key() + self._delete_vpc_data() + return True + else: + return False def clear(self, job_keys=None): """ @@ -676,8 +950,7 @@ def dismantle(self, include_master=True): ex.map(lambda worker: worker.stop(), self.workers) self.workers = [] - if include_master or self.mode == ExecMode.CONSUME.value: - # in consume mode master VM is a worker + if include_master: self.master.stop() def get_instance(self, name, **kwargs): @@ -688,11 +961,23 @@ def get_instance(self, name, **kwargs): instance = EC2Instance(name, self.config, self.ec2_client) for key in kwargs: - if hasattr(instance, key): + if hasattr(instance, key) and kwargs[key] is not None: setattr(instance, key, kwargs[key]) return instance + def get_worker_instance_type(self): + """ + Return the worker instance type + """ + return self.config['worker_instance_type'] + + def get_worker_cpu_count(self): + """ + Returns the number of CPUs in 
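# --- Editor's illustrative sketch (not part of the patch) -------------------
# The teardown above treats "does not exist" errors as success and only counts
# other 400 responses as failures, so clean() can simply be retried until all
# five VPC resources are gone. The pattern, factored out:
import boto3
from botocore.exceptions import ClientError

def delete_quietly(delete_fn, **kwargs):
    """Run an EC2 delete call; return True if it succeeded or was already gone."""
    try:
        delete_fn(**kwargs)
        return True
    except ClientError as e:
        already_gone = 'does not exist' in e.response['Error']['Message']
        return e.response['ResponseMetadata']['HTTPStatusCode'] == 400 and already_gone

# ec2 = boto3.client('ec2', region_name='us-east-1')
# ok = delete_quietly(ec2.delete_vpc, VpcId='vpc-0123456789abcdef0')
# ---------------------------------------------------------------------------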
the worker instance type + """ + return self.instance_types[self.config['worker_instance_type']] + def create_worker(self, name): """ Creates a new worker VM instance @@ -706,9 +991,10 @@ def create_worker(self, name): with open(pub_key, 'r') as pk: pk_data = pk.read().strip() user_data = CLOUD_CONFIG_WORKER_PK.format(user, pk_data) - worker.ssh_credentials['key_filename'] = '~/.ssh/id_rsa' + worker.ssh_credentials['key_filename'] = '~/.ssh/lithops_id_rsa' worker.ssh_credentials.pop('password') else: + logger.error(f'Unable to locate {pub_key}') worker.ssh_credentials.pop('key_filename') token = worker.ssh_credentials['password'] user_data = CLOUD_CONFIG_WORKER.format(user, token) @@ -751,6 +1037,8 @@ def __init__(self, name, ec2_config, ec2_client=None, public=False): self.fast_io = self.config.get('fast_io', False) self.home_dir = '/home/ubuntu' + self.runtime_name = None + self.ssh_credentials = { 'username': self.config['ssh_username'], 'password': self.config['ssh_password'], @@ -844,7 +1132,7 @@ def wait_ready(self, timeout=INSTANCE_STX_TIMEOUT): def is_stopped(self): """ - Checks if the VM instance is stoped + Checks if the VM instance is stopped """ state = self.get_instance_data()['State'] if state['Name'] == 'stopped': @@ -853,7 +1141,7 @@ def is_stopped(self): def wait_stopped(self, timeout=INSTANCE_STX_TIMEOUT): """ - Waits until the VM instance is stoped + Waits until the VM instance is stopped """ logger.debug(f'Waiting {self} to become stopped') @@ -870,26 +1158,34 @@ def _create_instance(self, user_data=None): """ Creates a new VM instance """ - if self.fast_io: - BlockDeviceMappings = [ - { - 'DeviceName': '/dev/xvda', - 'Ebs': { - 'VolumeSize': 100, - 'DeleteOnTermination': True, - 'VolumeType': 'gp2', - # 'Iops' : 10000, - }, - }, - ] - else: - BlockDeviceMappings = None + ebs_volumes = self.config.get('ebs_volumes', []) + BlockDeviceMappings = [] + + for volume in ebs_volumes: + ebs_config = volume.get('ebs', {}) + block_device = { + 'DeviceName': volume['device_name'], + 'Ebs': { + 'VolumeSize': ebs_config.get('volume_size', 8), # Default 8 GiB + 'VolumeType': ebs_config.get('volume_type', 'gp2'), + 'DeleteOnTermination': ebs_config.get('delete_on_termination', True), + 'Encrypted': ebs_config.get('encrypted', False), + } + } + if 'iops' in ebs_config: + block_device['Ebs']['Iops'] = ebs_config['iops'] + if 'throughput' in ebs_config: + block_device['Ebs']['Throughput'] = ebs_config['throughput'] + if 'kms_key_id' in ebs_config: + block_device['Ebs']['KmsKeyId'] = ebs_config['kms_key_id'] + + BlockDeviceMappings.append(block_device) LaunchSpecification = { "ImageId": self.config['target_ami'], "InstanceType": self.instance_type, "EbsOptimized": False, - "IamInstanceProfile": {'Name': self.config['iam_role']}, + "IamInstanceProfile": {'Name': self.config['instance_role']}, "Monitoring": {'Enabled': False}, 'KeyName': self.config['ssh_key_name'] } @@ -897,7 +1193,7 @@ def _create_instance(self, user_data=None): LaunchSpecification['NetworkInterfaces'] = [{ 'AssociatePublicIpAddress': True, 'DeviceIndex': 0, - 'SubnetId': self.config['subnet_id'], + 'SubnetId': self.config['public_subnet_id'], 'Groups': [self.config['security_group_id']] }] @@ -1055,7 +1351,7 @@ def start(self): """ Starts the VM instance """ - logger.info(f"Starting VM instance {self.name}") + logger.info(f"Starting VM instance {self.name} ({self.instance_id})") try: self.ec2_client.start_instances(InstanceIds=[self.instance_id]) @@ -1072,23 +1368,28 @@ def _delete_instance(self): """ Deletes the VM 
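# --- Editor's illustrative sketch (not part of the patch) -------------------
# _create_instance() now builds BlockDeviceMappings from an 'ebs_volumes' list
# in the backend config instead of the old fixed 'fast_io' volume. A standalone
# version of that translation, with a hypothetical config entry for illustration:
def build_block_device_mappings(ebs_volumes):
    """Translate Lithops 'ebs_volumes' config entries into EC2 API mappings."""
    mappings = []
    for volume in ebs_volumes:
        ebs = volume.get('ebs', {})
        device = {
            'DeviceName': volume['device_name'],
            'Ebs': {
                'VolumeSize': ebs.get('volume_size', 8),        # default 8 GiB
                'VolumeType': ebs.get('volume_type', 'gp2'),
                'DeleteOnTermination': ebs.get('delete_on_termination', True),
                'Encrypted': ebs.get('encrypted', False),
            }
        }
        for key, api_key in (('iops', 'Iops'), ('throughput', 'Throughput'),
                             ('kms_key_id', 'KmsKeyId')):
            if key in ebs:
                device['Ebs'][api_key] = ebs[key]
        mappings.append(device)
    return mappings

# Hypothetical config entry:
# ebs_volumes = [{'device_name': '/dev/xvda',
#                 'ebs': {'volume_size': 100, 'volume_type': 'gp3', 'iops': 3000}}]
# build_block_device_mappings(ebs_volumes)
# ---------------------------------------------------------------------------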
instance and the associated volume """ - logger.debug(f"Deleting VM instance {self.name}") + logger.debug(f"Deleting VM instance {self.name} ({self.instance_id})") self.ec2_client.terminate_instances(InstanceIds=[self.instance_id]) self.instance_data = None self.instance_id = None self.private_ip = None - self.public_ip = None + self.public_ip = '0.0.0.0' self.del_ssh_client() def _stop_instance(self): """ Stops the VM instance """ - logger.debug(f"Stopping VM instance {self.name}") + logger.debug(f"Stopping VM instance {self.name} ({self.instance_id})") self.ec2_client.stop_instances(InstanceIds=[self.instance_id]) + self.instance_data = None + self.private_ip = None + self.public_ip = '0.0.0.0' + self.del_ssh_client() + def stop(self): """ Stops the VM instance diff --git a/lithops/standalone/backends/aws_ec2/config.py b/lithops/standalone/backends/aws_ec2/config.py index e134ec0a2..f1255532d 100644 --- a/lithops/standalone/backends/aws_ec2/config.py +++ b/lithops/standalone/backends/aws_ec2/config.py @@ -20,32 +20,30 @@ from lithops.constants import SA_DEFAULT_CONFIG_KEYS DEFAULT_CONFIG_KEYS = { - 'master_instance_type': 't2.micro', - 'worker_instance_type': 't2.medium', + 'master_instance_type': 't3.micro', + 'worker_instance_type': 't3.medium', 'ssh_username': 'ubuntu', 'ssh_password': str(uuid.uuid4()), 'ssh_key_filename': '~/.ssh/id_rsa', 'request_spot_instances': True, 'delete_on_dismantle': True, 'max_workers': 100, - 'worker_processes': 2 + 'worker_processes': 'AUTO', + 'public_subnet_cidr_block': '10.0.1.0/24' } REQ_PARAMS_1 = ('instance_id',) -REQ_PARAMS_2 = ('iam_role',) +REQ_PARAMS_2 = ('instance_role',) def load_config(config_data): - if 'aws' not in config_data: - raise Exception("'aws' section is mandatory in the configuration") - - if not {'access_key_id', 'secret_access_key'}.issubset(set(config_data['aws'])): - raise Exception("'access_key_id' and 'secret_access_key' are mandatory under the 'aws' section of the configuration") - if not config_data['aws_ec2']: raise Exception("'aws_ec2' section is mandatory in the configuration") + if 'aws' not in config_data: + config_data['aws'] = {} + temp = copy.deepcopy(config_data['aws_ec2']) config_data['aws_ec2'].update(config_data['aws']) config_data['aws_ec2'].update(temp) @@ -69,6 +67,10 @@ def load_config(config_data): else: params_to_check = REQ_PARAMS_2 + # iam_role is deprectaded. 
To be removed in a future release + if 'iam_role' in config_data['aws_ec2']: + config_data['aws_ec2']['instance_role'] = config_data['aws_ec2'].pop('iam_role') + for param in params_to_check: if param not in config_data['aws_ec2']: msg = f"'{param}' is mandatory in the 'aws_ec2' section of the configuration" diff --git a/lithops/standalone/backends/azure_vms/__init__.py b/lithops/standalone/backends/azure_vms/__init__.py index 707355ba2..11f03e459 100644 --- a/lithops/standalone/backends/azure_vms/__init__.py +++ b/lithops/standalone/backends/azure_vms/__init__.py @@ -1 +1,3 @@ from .azure_vms import AzureVMSBackend as StandaloneBackend + +__all__ = ['StandaloneBackend'] diff --git a/lithops/standalone/backends/azure_vms/azure_vms.py b/lithops/standalone/backends/azure_vms/azure_vms.py index 6f5f88cab..ba4da1d1a 100644 --- a/lithops/standalone/backends/azure_vms/azure_vms.py +++ b/lithops/standalone/backends/azure_vms/azure_vms.py @@ -28,10 +28,10 @@ from lithops.version import __version__ from lithops.util.ssh_client import SSHClient -from lithops.constants import COMPUTE_CLI_MSG, CACHE_DIR, SA_DATA_FILE +from lithops.constants import COMPUTE_CLI_MSG, CACHE_DIR, SA_CONFIG_FILE from lithops.config import load_yaml_config, dump_yaml_config -from lithops.standalone.utils import ExecMode -from lithops.standalone.standalone import LithopsValidationError +from lithops.standalone.utils import StandaloneMode +from lithops.standalone import LithopsValidationError logger = logging.getLogger(__name__) @@ -55,11 +55,14 @@ def __init__(self, config, mode): self.config = config self.mode = mode self.location = config['region'] - self.cache_dir = os.path.join(CACHE_DIR, self.name) - self.cache_file = os.path.join(self.cache_dir, self.location + '_data') + self.vnet_data_type = 'provided' if 'vnet_name' in self.config else 'created' self.ssh_data_type = 'provided' if 'ssh_key_filename' in config else 'created' + suffix = 'vm' if self.mode == StandaloneMode.CONSUME.value else 'vpc' + self.cache_dir = os.path.join(CACHE_DIR, self.name) + self.cache_file = os.path.join(self.cache_dir, f'{self.location}_{suffix}_data') + self.azure_data = None self.vnet_name = None self.vnet_key = None @@ -71,10 +74,17 @@ def __init__(self, config, mode): self.master = None self.workers = [] + self.instance_types = {} msg = COMPUTE_CLI_MSG.format('Azure Virtual Machines') logger.info(f"{msg} - Region: {self.location}") + def is_initialized(self): + """ + Checks if the backend is initialized + """ + return os.path.isfile(self.cache_file) + def _load_azure_vms_data(self): """ Loads Azure data from local cache @@ -94,6 +104,13 @@ def _dump_azure_vms_data(self): """ dump_yaml_config(self.cache_file, self.azure_data) + def _delete_vpc_data(self): + """ + Deletes the vpc data file + """ + if os.path.exists(self.cache_file): + os.remove(self.cache_file) + def _create_vnet(self): """ Creates a new Virtual Network @@ -112,7 +129,7 @@ def _create_vnet(self): self.vnet_name = self.config.get('vnet_name', f'lithops-vnet-{str(uuid.uuid4())[-6:]}') logger.debug(f'Setting virtual network name to: {self.vnet_name}') - assert re.match("^[a-z0-9-:-]*$", self.vnet_name),\ + assert re.match("^[a-z0-9-:-]*$", self.vnet_name), \ f'Virtual network name "{self.vnet_name}" not valid' vnets_info = list(self.network_client.virtual_networks.list(self.config['resource_group'])) @@ -209,22 +226,57 @@ def _create_security_group(self): pass if 'security_group_id' not in self.config: - nsg_rule = { - "name": "allow-ssh", - "protocol": "Tcp", - 
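# --- Editor's illustrative sketch (not part of the patch) -------------------
# The aws_ec2 config loader above keeps backward compatibility: a legacy
# 'iam_role' key is renamed to the new 'instance_role' key before the
# mandatory-parameter check runs. In isolation (with an extra guard, added here
# for illustration, so an explicit 'instance_role' wins over the legacy key):
def normalize_ec2_section(ec2_section):
    """Rename the deprecated 'iam_role' key to 'instance_role' in place."""
    if 'iam_role' in ec2_section and 'instance_role' not in ec2_section:
        ec2_section['instance_role'] = ec2_section.pop('iam_role')
    return ec2_section

# normalize_ec2_section({'iam_role': 'lithops-role'})
# -> {'instance_role': 'lithops-role'}
# ---------------------------------------------------------------------------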
"sourcePortRange": "*", - "destinationPortRange": "22", - "sourceAddressPrefix": "*", - "destinationAddressPrefix": "*", - "access": "Allow", - "direction": "Inbound", - "priority": 100 - } + nsg_rules = [ + { + "name": "allow-ssh", + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRange": "22", + "sourceAddressPrefix": "*", + "destinationAddressPrefix": "*", + "access": "Allow", + "direction": "Inbound", + "priority": 100 + }, + { + "name": "allow-master-port-8080", + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRange": "8080", + "sourceAddressPrefix": "10.0.0.0/24", + "destinationAddressPrefix": "*", + "access": "Allow", + "direction": "Inbound", + "priority": 101 + }, + { + "name": "allow-worker-port-8081", + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRange": "8081", + "sourceAddressPrefix": "10.0.0.0/24", + "destinationAddressPrefix": "*", + "access": "Allow", + "direction": "Inbound", + "priority": 102 + }, + { + "name": "allow-redis-port-6379", + "protocol": "Tcp", + "sourcePortRange": "*", + "destinationPortRange": "6379", + "sourceAddressPrefix": "10.0.0.0/24", + "destinationAddressPrefix": "*", + "access": "Allow", + "direction": "Inbound", + "priority": 103 + } + ] # Define the network security group to contain the rule network_security_group = { "location": self.location, - "securityRules": [nsg_rule] + "securityRules": nsg_rules } # Create or update the network security group @@ -245,7 +297,7 @@ def _create_master_floating_ip(self): def get_floating_ip(fip_name): try: - fip_info = self.network_client.network_security_groups.get( + fip_info = self.network_client.public_ip_addresses.get( self.config['resource_group'], fip_name ) self.config['floating_ip'] = fip_info.ip_address @@ -302,14 +354,33 @@ def _create_ssh_key(self): self.config['ssh_key_filename'] = key_filename + def _get_all_instance_types(self): + """ + Get all virtual machine sizes in the specified location + """ + if 'instance_types' in self.azure_data: + self.instance_types = self.azure_data['instance_types'] + return + + vm_sizes = self.compute_client.virtual_machine_sizes.list(self.location) + + instances = {} + + for vm_size in vm_sizes: + instance_name = vm_size.name + cpu_count = vm_size.number_of_cores + instances[instance_name] = cpu_count + + self.instance_types = instances + def _create_master_instance(self): """ Creates the master VM insatnce """ name = self.config.get('master_name') or f'lithops-master-{self.vnet_key}' self.master = VMInstance(name, self.config, self.compute_client, public=True) - self.master.name = self.config['instance_name'] if self.mode == ExecMode.CONSUME.value else name - self.master.public_ip = self.config['floating_ip'] + self.master.name = self.config['instance_name'] if self.mode == StandaloneMode.CONSUME.value else name + self.master.public_ip = self.config['floating_ip'] if self.mode != StandaloneMode.CONSUME.value else '0.0.0.0' self.master.instance_type = self.config['master_instance_type'] self.master.delete_on_dismantle = False self.master.ssh_credentials.pop('password') @@ -323,35 +394,31 @@ def init(self): logger.debug(f'Initializing Azure Virtual Machines backend ({self.mode} mode)') self._load_azure_vms_data() - if self.mode != self.azure_data.get('mode'): - self.azure_data = {} - if self.mode == ExecMode.CONSUME.value: + if self.mode == StandaloneMode.CONSUME.value: instance_name = self.config['instance_name'] if not self.azure_data or instance_name != self.azure_data.get('instance_name'): try: - 
self.compute_client.virtual_machines.get( + instance_data = self.compute_client.virtual_machines.get( self.config['resource_group'], instance_name ) except ResourceNotFoundError: - raise Exception(f"VM Instance {instance_name} does not exists") + raise Exception(f"VM Instance {instance_name} does not exist") + + self.azure_data = { + 'mode': self.mode, + 'vnet_data_type': 'provided', + 'ssh_data_type': 'provided', + 'instance_name': self.config['instance_name'], + 'master_id': instance_data.vm_id, + 'instance_type': instance_data.hardware_profile.vm_size + } # Create the master VM instance + self.config['master_instance_type'] = self.azure_data['instance_type'] self._create_master_instance() - # Make sure that the ssh key is provided - self.config['ssh_key_filename'] = self.config.get('ssh_key_filename', '~/.ssh/id_rsa') - - self.azure_data = { - 'mode': self.mode, - 'vnet_data_type': 'provided', - 'ssh_data_type': 'provided', - 'instance_name': self.config['instance_name'], - 'master_id': self.config['instance_id'], - 'ssh_key_filename': self.config['ssh_key_filename'], - } - - elif self.mode in [ExecMode.CREATE.value, ExecMode.REUSE.value]: + elif self.mode in [StandaloneMode.CREATE.value, StandaloneMode.REUSE.value]: # Create the Virtual Netowrk if not exists self._create_vnet() @@ -367,6 +434,8 @@ def init(self): self._create_master_floating_ip() # Create the ssh key pair if not exists self._create_ssh_key() + # Request instance types + self._get_all_instance_types() # Create the master VM instance self._create_master_instance() @@ -385,12 +454,13 @@ def init(self): 'security_group_id': self.config['security_group_id'], 'security_group_name': self.config['security_group_name'], 'floating_ip_id': self.config['floating_ip_id'], - 'floating_ip_name': self.config['floating_ip_name'] + 'floating_ip_name': self.config['floating_ip_name'], + 'instance_types': self.instance_types } self._dump_azure_vms_data() - def build_image(self, image_name, script_file, overwrite, extra_args=[]): + def build_image(self, image_name, script_file, overwrite, include, extra_args=[]): """ Builds a new VM Image """ @@ -461,7 +531,7 @@ def delete_instance(instance): [fut.result() for fut in futures] master_pk = os.path.join(self.cache_dir, f"{self.azure_data['master_name']}-id_rsa.pub") - if os.path.isfile(master_pk): + if all and os.path.isfile(master_pk): os.remove(master_pk) if self.azure_data['vnet_data_type'] == 'provided': @@ -525,20 +595,16 @@ def clean(self, all=False): """ logger.info('Cleaning Azure Virtual Machines resources') - self._load_azure_vms_data() - if not self.azure_data: return - if self.azure_data['mode'] == ExecMode.CONSUME.value: - if os.path.exists(self.cache_file): - os.remove(self.cache_file) + if self.azure_data['mode'] == StandaloneMode.CONSUME.value: + self._delete_vpc_data() else: self._delete_vm_instances(all=all) self._delete_vnet_and_subnet() if all else None self._delete_ssh_key() if all else None - if all and os.path.exists(self.cache_file): - os.remove(self.cache_file) + self._delete_vpc_data() if all else None def clear(self, job_keys=None): """ @@ -556,8 +622,7 @@ def dismantle(self, include_master=True): ex.map(lambda worker: worker.stop(), self.workers) self.workers = [] - if include_master or self.mode == ExecMode.CONSUME.value: - # in consume mode master VM is a worker + if include_master: self.master.stop() def get_instance(self, name, **kwargs): @@ -568,11 +633,23 @@ def get_instance(self, name, **kwargs): instance = VMInstance(name, self.config, self.compute_client) 
for key in kwargs: - if hasattr(instance, key): + if hasattr(instance, key) and kwargs[key] is not None: setattr(instance, key, kwargs[key]) return instance + def get_worker_instance_type(self): + """ + Return the worker profile name + """ + return self.config['worker_instance_type'] + + def get_worker_cpu_count(self): + """ + Returns the number of CPUs in the worker instance type + """ + return self.instance_types[self.config['worker_instance_type']] + def create_worker(self, name): """ Creates a new worker VM instance @@ -582,7 +659,7 @@ def create_worker(self, name): worker.ssh_credentials['key_filename'] = ssh_key worker.ssh_credentials.pop('password') worker.create() - worker.ssh_credentials['key_filename'] = '~/.ssh/id_rsa' + worker.ssh_credentials['key_filename'] = '~/.ssh/lithops_id_rsa' self.workers.append(worker) def get_runtime_key(self, runtime_name, version=__version__): @@ -813,6 +890,9 @@ def _create_instance(self, user_data=None): vm_parameters ) + self.instance_data = poller.result() + self.instance_id = self.instance_data.vm_id + return self.instance_data def get_instance_data(self): @@ -925,8 +1005,6 @@ def _delete_instance(self): self.get_instance_data() - logger.debug(f"Going to delete VM instance {self.name}") - poller = self.compute_client.virtual_machines.begin_delete( self.config['resource_group'], self.name, force_deletion=True ) @@ -960,7 +1038,7 @@ def _stop_instance(self): self.config['resource_group'], self.name ) except Exception: - if os.path.isfile(SA_DATA_FILE): + if os.path.isfile(SA_CONFIG_FILE): os.system("shutdown -h now") def stop(self): diff --git a/lithops/standalone/backends/azure_vms/config.py b/lithops/standalone/backends/azure_vms/config.py index 458d8c8f1..76e8f4961 100644 --- a/lithops/standalone/backends/azure_vms/config.py +++ b/lithops/standalone/backends/azure_vms/config.py @@ -27,7 +27,7 @@ 'request_spot_instances': True, 'delete_on_dismantle': False, 'max_workers': 100, - 'worker_processes': 2 + 'worker_processes': 'AUTO' } REQ_PARAMS_1 = ('resource_group', 'subscription_id', 'region') diff --git a/lithops/standalone/backends/ibm_vpc/__init__.py b/lithops/standalone/backends/ibm_vpc/__init__.py index 1214040e5..26d84e7dc 100644 --- a/lithops/standalone/backends/ibm_vpc/__init__.py +++ b/lithops/standalone/backends/ibm_vpc/__init__.py @@ -1 +1,3 @@ from .ibm_vpc import IBMVPCBackend as StandaloneBackend + +__all__ = ['StandaloneBackend'] diff --git a/lithops/standalone/backends/ibm_vpc/config.py b/lithops/standalone/backends/ibm_vpc/config.py index 906745443..e8333e35e 100644 --- a/lithops/standalone/backends/ibm_vpc/config.py +++ b/lithops/standalone/backends/ibm_vpc/config.py @@ -35,15 +35,17 @@ 'ssh_password': str(uuid.uuid4()), 'ssh_key_filename': '~/.ssh/id_rsa', 'delete_on_dismantle': True, + 'verify_resources': True, 'max_workers': 100, - 'worker_processes': 2, - 'boot_volume_capacity': 100 + 'boot_volume_capacity': 100, + 'worker_processes': 'AUTO' } VPC_ENDPOINT = "https://{}.iaas.cloud.ibm.com" REGIONS = ["jp-tok", "jp-osa", "au-syd", "eu-gb", "eu-de", "eu-es", "us-south", "us-east", "br-sao", "ca-tor"] + def load_config(config_data): if 'ibm' in config_data and config_data['ibm'] is not None: @@ -75,19 +77,26 @@ def load_config(config_data): msg = f"'{param}' is mandatory in 'ibm_vpc' section of the configuration" raise Exception(msg) - if "profile_name" in config_data['ibm_vpc']: - config_data['ibm_vpc']['worker_profile_name'] = config_data['ibm_vpc']['profile_name'] - if "region" not in config_data['ibm_vpc'] and "endpoint" 
not in config_data['ibm_vpc']: msg = "'region' or 'endpoint' parameter is mandatory in 'ibm_vpc' section of the configuration" raise Exception(msg) if 'endpoint' in config_data['ibm_vpc']: endpoint = config_data['ibm_vpc']['endpoint'] - config_data['ibm_vpc']['region'] = endpoint.split('//')[1].split('.')[0] + region = endpoint.split('//')[1].split('.')[0] + config_data['ibm_vpc']['region'] = region + config_data['ibm_vpc']['zone'] = region + '-1' elif "region" in config_data['ibm_vpc']: region = config_data['ibm_vpc']['region'] + + if region.count('-') == 2: + config_data['ibm_vpc']['zone'] = region + region = region.rsplit('-', 1)[0] + config_data['ibm_vpc']['region'] = region + else: + config_data['ibm_vpc']['zone'] = region + '-1' + if region not in REGIONS: msg = f"'region' conig parameter in 'ibm_vpc' section must be one of {REGIONS}" raise Exception(msg) diff --git a/lithops/standalone/backends/ibm_vpc/ibm_vpc.py b/lithops/standalone/backends/ibm_vpc/ibm_vpc.py index 54419a73e..fc6466893 100644 --- a/lithops/standalone/backends/ibm_vpc/ibm_vpc.py +++ b/lithops/standalone/backends/ibm_vpc/ibm_vpc.py @@ -15,14 +15,13 @@ # limitations under the License. # -import functools -import inspect import re import os -import paramiko import time import logging import uuid +import functools +import inspect from datetime import datetime from ibm_vpc import VpcV1 from ibm_cloud_sdk_core.authenticators import IAMAuthenticator @@ -31,16 +30,23 @@ from lithops.version import __version__ from lithops.util.ssh_client import SSHClient -from lithops.constants import COMPUTE_CLI_MSG, CACHE_DIR, SA_IMAGE_NAME_DEFAULT +from lithops.constants import COMPUTE_CLI_MSG, CACHE_DIR from lithops.config import load_yaml_config, dump_yaml_config -from lithops.standalone.utils import CLOUD_CONFIG_WORKER, CLOUD_CONFIG_WORKER_PK, ExecMode, get_host_setup_script -from lithops.standalone.standalone import LithopsValidationError +from lithops.standalone.utils import ( + CLOUD_CONFIG_WORKER, + CLOUD_CONFIG_WORKER_PK, + StandaloneMode, + get_host_setup_script, + LithopsValidationError +) logger = logging.getLogger(__name__) INSTANCE_START_TIMEOUT = 180 VPC_API_VERSION = '2021-09-21' +DEFAULT_LITHOPS_IMAGE_NAME = 'lithops-ubuntu-22-04-3-minimal-amd64-1' + class IBMVPCBackend: @@ -59,9 +65,12 @@ def __init__(self, config, mode): self.endpoint = self.config['endpoint'] self.region = self.config['region'] + self.zone = self.config['zone'] + self.verify_resources = self.config['verify_resources'] + + suffix = 'vm' if self.mode == StandaloneMode.CONSUME.value else 'vpc' self.cache_dir = os.path.join(CACHE_DIR, self.name) - self.cache_file = os.path.join(self.cache_dir, self.region + '_data') - self.custom_image = self.config.get('custom_lithops_image') + self.cache_file = os.path.join(self.cache_dir, f'{self.zone}_{suffix}_data') logger.debug(f'Setting VPC endpoint to: {self.endpoint}') @@ -73,14 +82,20 @@ def __init__(self, config, mode): self.vpc_cli = VpcV1(VPC_API_VERSION, authenticator=authenticator) self.vpc_cli.set_service_url(self.config['endpoint'] + '/v1') - user_agent_string = 'ibm_vpc_{}'.format(self.config['user_agent']) + user_agent_string = f"ibm_vpc_{self.config['user_agent']}" self.vpc_cli._set_user_agent_header(user_agent_string) # decorate instance public methods with except/retry logic decorate_instance(self.vpc_cli, vpc_retry_on_except) msg = COMPUTE_CLI_MSG.format('IBM VPC') - logger.info(f"{msg} - Region: {self.region}") + logger.info(f"{msg} - Region: {self.region} - Zone: {self.zone}") + + def 
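# --- Editor's illustrative sketch (not part of the patch) -------------------
# The ibm_vpc config loader now accepts either an endpoint, a region, or a
# full zone name, and always ends up with both 'region' and 'zone' set. The
# derivation rules, condensed:
def derive_region_and_zone(endpoint=None, region=None):
    """Return (region, zone) following the ibm_vpc config conventions."""
    if endpoint:
        region = endpoint.split('//')[1].split('.')[0]
        return region, region + '-1'
    if region and region.count('-') == 2:      # e.g. 'us-south-3' is really a zone
        return region.rsplit('-', 1)[0], region
    return region, region + '-1'               # e.g. 'eu-de' -> zone 'eu-de-1'

# derive_region_and_zone(endpoint='https://eu-de.iaas.cloud.ibm.com')
# -> ('eu-de', 'eu-de-1')
# derive_region_and_zone(region='us-south-3') -> ('us-south', 'us-south-3')
# ---------------------------------------------------------------------------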
is_initialized(self): + """ + Checks if the backend is initialized + """ + return os.path.isfile(self.cache_file) def _load_vpc_data(self): """ @@ -103,6 +118,13 @@ def _dump_vpc_data(self): """ dump_yaml_config(self.cache_file, self.vpc_data) + def _delete_vpc_data(self): + """ + Deletes the vpc data file + """ + if os.path.exists(self.cache_file): + os.remove(self.cache_file) + def _create_vpc(self): """ Creates a new VPC @@ -111,21 +133,23 @@ def _create_vpc(self): return if 'vpc_id' in self.vpc_data: + logger.debug(f'Using VPC {self.vpc_data["vpc_name"]}') try: - self.vpc_cli.get_vpc(self.vpc_data['vpc_id']) + self.vpc_cli.get_vpc(self.vpc_data['vpc_id']) if self.verify_resources else None self.config['vpc_id'] = self.vpc_data['vpc_id'] self.config['security_group_id'] = self.vpc_data['security_group_id'] return - except ApiException: - pass + except ApiException as e: + logger.error(f"Unable to find VPC {self.vpc_data['vpc_name']}") + raise e vpc_info = None iam_id = self.iam_api_key[:4].lower() self.vpc_name = self.config.get('vpc_name', f'lithops-vpc-{iam_id}-{str(uuid.uuid4())[-6:]}') - logger.debug(f'Setting VPC name to: {self.vpc_name}') + logger.debug(f'Setting VPC name to {self.vpc_name}') - assert re.match("^[a-z0-9-:-]*$", self.vpc_name),\ + assert re.match("^[a-z0-9-:-]*$", self.vpc_name), \ f'VPC name "{self.vpc_name}" not valid' vpcs_info = self.vpc_cli.list_vpcs().get_result() @@ -187,7 +211,7 @@ def _create_ssh_key(self): if 'ssh_key_id' in self.vpc_data: try: - self.vpc_cli.get_key(self.vpc_data['ssh_key_id']) + self.vpc_cli.get_key(self.vpc_data['ssh_key_id']) if self.verify_resources else None self.config['ssh_key_id'] = self.vpc_data['ssh_key_id'] self.config['ssh_key_filename'] = self.vpc_data['ssh_key_filename'] return @@ -250,7 +274,7 @@ def _create_subnet(self): if 'subnet_id' in self.vpc_data: try: - self.vpc_cli.get_subnet(self.vpc_data['subnet_id']) + self.vpc_cli.get_subnet(self.vpc_data['subnet_id']) if self.verify_resources else None self.config['subnet_id'] = self.vpc_data['subnet_id'] self.config['zone_name'] = self.vpc_data['zone_name'] return @@ -268,12 +292,12 @@ def _create_subnet(self): if not subnet_data: logger.debug(f'Creating Subnet {subnet_name}') subnet_prototype = {} - subnet_prototype['zone'] = {'name': self.region + '-1'} + subnet_prototype['zone'] = {'name': self.zone} subnet_prototype['ip_version'] = 'ipv4' subnet_prototype['name'] = subnet_name subnet_prototype['resource_group'] = {'id': self.config['resource_group_id']} subnet_prototype['vpc'] = {'id': self.config['vpc_id']} - subnet_prototype['total_ipv4_address_count'] = 256 + subnet_prototype['total_ipv4_address_count'] = 8192 response = self.vpc_cli.create_subnet(subnet_prototype) subnet_data = response.result @@ -290,7 +314,7 @@ def _create_gateway(self): if 'gateway_id' in self.vpc_data: try: - self.vpc_cli.get_public_gateway(self.vpc_data['gateway_id']) + self.vpc_cli.get_public_gateway(self.vpc_data['gateway_id']) if self.verify_resources else None self.config['gateway_id'] = self.vpc_data['gateway_id'] return except ApiException: @@ -330,7 +354,8 @@ def _get_or_create_floating_ip(self): floating_ips_info = self.vpc_cli.list_floating_ips().get_result() for fip in floating_ips_info['floating_ips']: - if fip['name'].startswith("lithops-recyclable") and 'target' not in fip: + if fip['name'].startswith("lithops-recyclable") and 'target' not in fip \ + and fip['zone']['name'] == self.config['zone_name']: fip_data = fip if not fip_data: @@ -354,7 +379,7 @@ def 
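# --- Editor's illustrative sketch (not part of the patch) -------------------
# _get_or_create_floating_ip() now only recycles a floating IP that sits in
# the configured zone, not just any unattached 'lithops-recyclable' IP. The
# selection step alone, using the ibm_vpc SDK (the API key is a placeholder):
from ibm_vpc import VpcV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

def find_recyclable_fip(iam_api_key, endpoint, zone_name):
    """Return an unattached lithops-recyclable floating IP in the zone, or None."""
    vpc_cli = VpcV1('2021-09-21', authenticator=IAMAuthenticator(iam_api_key))
    vpc_cli.set_service_url(endpoint + '/v1')
    for fip in vpc_cli.list_floating_ips().get_result()['floating_ips']:
        if fip['name'].startswith('lithops-recyclable') \
                and 'target' not in fip and fip['zone']['name'] == zone_name:
            return fip
    return None

# find_recyclable_fip('<IAM_API_KEY>', 'https://eu-de.iaas.cloud.ibm.com', 'eu-de-1')
# ---------------------------------------------------------------------------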
_create_master_floating_ip(self): if 'floating_ip_id' in self.vpc_data: try: - self.vpc_cli.get_floating_ip(self.vpc_data['floating_ip_id']) + self.vpc_cli.get_floating_ip(self.vpc_data['floating_ip_id']) if self.verify_resources else None self.config['floating_ip'] = self.vpc_data['floating_ip'] self.config['floating_ip_id'] = self.vpc_data['floating_ip_id'] return @@ -373,6 +398,10 @@ def _request_image_id(self): if 'image_id' in self.config: return + if 'image_id' in self.vpc_data and not self.verify_resources: + self.config['image_id'] = self.vpc_data['image_id'] + return + images_def = self.vpc_cli.list_images().result['images'] images_user = self.vpc_cli.list_images(resource_group_id=self.config['resource_group_id']).result['images'] images_def.extend(images_user) @@ -386,8 +415,8 @@ def _request_image_id(self): if 'image_id' not in self.config: for image in images_def: - if image['name'] == SA_IMAGE_NAME_DEFAULT: - logger.debug(f"Found default VM image: {SA_IMAGE_NAME_DEFAULT}") + if image['name'] == DEFAULT_LITHOPS_IMAGE_NAME: + logger.debug(f"Found default VM image: {DEFAULT_LITHOPS_IMAGE_NAME}") self.config['image_id'] = image['id'] break @@ -405,8 +434,8 @@ def _create_master_instance(self): name = self.config.get('master_name') or f'lithops-master-{self.vpc_key}' self.master = IBMVPCInstance(name, self.config, self.vpc_cli, public=True) self.master.public_ip = self.config['floating_ip'] - self.master.instance_id = self.config['instance_id'] if self.mode == ExecMode.CONSUME.value else None - self.master.profile_name = self.config['master_profile_name'] + self.master.instance_id = self.config['instance_id'] if self.mode == StandaloneMode.CONSUME.value else None + self.master.instance_type = self.config['master_profile_name'] self.master.delete_on_dismantle = False self.master.ssh_credentials.pop('password') @@ -417,29 +446,27 @@ def init(self): logger.debug(f'Initializing IBM VPC backend ({self.mode} mode)') self._load_vpc_data() - if self.mode != self.vpc_data.get('mode'): - self.vpc_data = {} - if self.mode == ExecMode.CONSUME.value: + if self.mode == StandaloneMode.CONSUME.value: ins_id = self.config['instance_id'] - if not self.vpc_data or ins_id != self.vpc_data.get('instance_id'): - name = self.vpc_cli.get_instance(ins_id).get_result()['name'] - self.config['master_name'] = name + if not self.vpc_data or ins_id != self.vpc_data.get('master_id'): + master_name = self.vpc_cli.get_instance(ins_id).get_result()['name'] + self.vpc_data = { + 'mode': self.mode, + 'vpc_data_type': 'provided', + 'ssh_data_type': 'provided', + 'master_name': master_name, + 'master_id': self.config['instance_id'], + 'floating_ip': self.config['floating_ip'], + 'instance_type': 'unknown' + } # Create the master VM instance + self.config['master_name'] = self.vpc_data['master_name'] self._create_master_instance() - self.vpc_data = { - 'mode': self.mode, - 'vpc_data_type': 'provided', - 'ssh_data_type': 'provided', - 'master_name': self.master.name, - 'master_id': self.master.instance_id, - 'floating_ip': self.master.public_ip - } - - elif self.mode in [ExecMode.CREATE.value, ExecMode.REUSE.value]: + elif self.mode in [StandaloneMode.CREATE.value, StandaloneMode.REUSE.value]: # Create the VPC if not exists self._create_vpc() @@ -482,33 +509,46 @@ def init(self): self._dump_vpc_data() - def build_image(self, image_name, script_file, overwrite, extra_args=[]): + def build_image(self, image_name, script_file, overwrite, include, extra_args=[]): """ Builds a new VM Image """ - images = 
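# --- Editor's illustrative sketch (not part of the patch) -------------------
# The new 'verify_resources' flag lets the IBM VPC backend skip the extra GET
# request per cached resource (VPC, key, subnet, gateway, floating IP, image)
# when it trusts its local cache. The guard used throughout boils down to this
# generic helper (an editor's abstraction, not a function in the patch):
def reuse_cached_resource(cached_id, verify, verify_fn):
    """Return cached_id, optionally verifying it still exists via verify_fn."""
    if cached_id is None:
        return None
    if verify:
        verify_fn(cached_id)   # raises (e.g. ApiException) if the resource is gone
    return cached_id

# Example with the ibm_vpc client from the surrounding code:
# vpc_id = reuse_cached_resource(vpc_data.get('vpc_id'), verify_resources,
#                                lambda _id: vpc_cli.get_vpc(_id))
# ---------------------------------------------------------------------------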
self.vpc_cli.list_images(name=image_name, resource_group_id=self.config['resource_group_id']).result['images'] + image_name = image_name or DEFAULT_LITHOPS_IMAGE_NAME + + images = self.vpc_cli.list_images( + name=image_name, + resource_group_id=self.config['resource_group_id'] + ).result['images'] + if len(images) > 0: image_id = images[0]['id'] if overwrite: - logger.debug(f"Deleting existing VM Image '{image_name}'") - self.vpc_cli.delete_image(id=image_id) - while len(self.vpc_cli.list_images(name=image_name, resource_group_id=self.config['resource_group_id']).result['images']) > 0: - time.sleep(2) + self.delete_image(image_name) else: raise Exception(f"The image with name '{image_name}' already exists with ID: '{image_id}'." " Use '--overwrite' or '-o' if you want ot overwrite it") - initial_vpc_data = self._load_vpc_data() - + is_initialized = self.is_initialized() self.init() + try: + del self.config['image_id'] + except Exception: + pass + try: + del self.vpc_data['image_id'] + except Exception: + pass + + self._request_image_id() + fip, fip_id = self._get_or_create_floating_ip() self.config['floating_ip'] = fip self.config['floating_ip_id'] = fip_id - build_vm = IBMVPCInstance(image_name, self.config, self.vpc_cli, public=True) + build_vm = IBMVPCInstance('building-image-' + image_name, self.config, self.vpc_cli, public=True) build_vm.public_ip = self.config['floating_ip'] - build_vm.profile_name = self.config['master_profile_name'] + build_vm.instance_type = self.config['master_profile_name'] build_vm.delete_on_dismantle = False build_vm.create() build_vm.wait_ready() @@ -517,18 +557,24 @@ def build_image(self, image_name, script_file, overwrite, extra_args=[]): remote_script = "/tmp/install_lithops.sh" script = get_host_setup_script() build_vm.get_ssh_client().upload_data_to_file(script, remote_script) - logger.debug("Executing installation script. Be patient, this process can take up to 3 minutes") + logger.debug("Executing Lithops installation script. Be patient, this process can take up to 3 minutes") build_vm.get_ssh_client().run_remote_command(f"chmod 777 {remote_script}; sudo {remote_script}; rm {remote_script};") - logger.debug("Installation script finsihed") + logger.debug("Lithops installation script finsihed") + + for src_dst_file in include: + src_file, dst_file = src_dst_file.split(':') + if os.path.isfile(src_file): + logger.debug(f"Uploading local file '{src_file}' to VM image in '{dst_file}'") + build_vm.get_ssh_client().upload_local_file(src_file, dst_file) if script_file: script = os.path.expanduser(script_file) - logger.debug(f"Uploading user script {script_file} to {build_vm}") + logger.debug(f"Uploading user script '{script_file}' to {build_vm}") remote_script = "/tmp/install_user_lithops.sh" build_vm.get_ssh_client().upload_local_file(script, remote_script) - logger.debug("Executing user script. 
Be patient, this process can take long") + logger.debug(f"Executing user script '{script_file}'") build_vm.get_ssh_client().run_remote_command(f"chmod 777 {remote_script}; sudo {remote_script}; rm {remote_script};") - logger.debug("User script finsihed") + logger.debug(f"User script '{script_file}' finsihed") build_vm.stop() build_vm.wait_stopped() @@ -542,6 +588,7 @@ def build_image(self, image_name, script_file, overwrite, extra_args=[]): image_prototype['resource_group'] = {'id': self.config['resource_group_id']} self.vpc_cli.create_image(image_prototype) + logger.debug("Starting VM image creation") logger.debug("Be patient, VM imaging can take up to 6 minutes") while True: @@ -552,13 +599,33 @@ def build_image(self, image_name, script_file, overwrite, extra_args=[]): break time.sleep(30) - build_vm.delete() - - if not initial_vpc_data: - self.clean(all) + if not is_initialized: + self.clean(all=True) + else: + build_vm.delete() logger.info(f"VM Image created. Image ID: {images[0]['id']}") + def delete_image(self, image_name): + """ + Deletes a VM Image + """ + def list_images(): + return self.vpc_cli.list_images( + name=image_name, + resource_group_id=self.config['resource_group_id'] + ).result['images'] + + images = list_images() + + if len(images) > 0: + image_id = images[0]['id'] + logger.debug(f"Deleting VM Image '{image_name}'") + self.vpc_cli.delete_image(id=image_id) + while len(list_images()) > 0: + time.sleep(2) + logger.debug(f"VM Image '{image_name}' successfully deleted") + def list_images(self): """ List VM Images @@ -600,7 +667,7 @@ def delete_instance(instance_info): else: raise err - vms_prefixes = ('lithops-worker', 'lithops-master') if all else ('lithops-worker',) + vms_prefixes = ('lithops-worker', 'lithops-master', 'building-image') if all else ('lithops-worker',) def get_instances(): instances = set() @@ -626,7 +693,7 @@ def get_instances(): break master_pk = os.path.join(self.cache_dir, f"{self.vpc_data['master_name']}-id_rsa.pub") - if os.path.isfile(master_pk): + if all and os.path.isfile(master_pk): os.remove(master_pk) if self.vpc_data['vpc_data_type'] == 'provided': @@ -730,20 +797,16 @@ def clean(self, all=False): """ logger.info('Cleaning IBM VPC resources') - self._load_vpc_data() - if not self.vpc_data: return - if self.vpc_data['mode'] == ExecMode.CONSUME.value: - if os.path.exists(self.cache_file): - os.remove(self.cache_file) + if self.mode == StandaloneMode.CONSUME.value: + self._delete_vpc_data() else: self._delete_vm_instances(all=all) self._delete_vpc() if all else None self._delete_ssh_key() if all else None - if all and os.path.exists(self.cache_file): - os.remove(self.cache_file) + self._delete_vpc_data() if all else None def clear(self, job_keys=None): """ @@ -761,8 +824,7 @@ def dismantle(self, include_master=True): ex.map(lambda worker: worker.stop(), self.workers) self.workers = [] - if include_master or self.mode == ExecMode.CONSUME.value: - # in consume mode master VM is a worker + if include_master: self.master.stop() def get_instance(self, name, **kwargs): @@ -773,11 +835,23 @@ def get_instance(self, name, **kwargs): instance = IBMVPCInstance(name, self.config, self.vpc_cli) for key in kwargs: - if hasattr(instance, key): + if hasattr(instance, key) and kwargs[key] is not None: setattr(instance, key, kwargs[key]) return instance + def get_worker_instance_type(self): + """ + Return the worker profile name + """ + return self.config['worker_profile_name'] + + def get_worker_cpu_count(self): + """ + Returns the number of CPUs in the worker 
instance type + """ + return int(self.config['worker_profile_name'].split('-')[1].split('x')[0]) + def create_worker(self, name): """ Creates a new worker VM instance @@ -791,9 +865,10 @@ def create_worker(self, name): with open(pub_key, 'r') as pk: pk_data = pk.read().strip() user_data = CLOUD_CONFIG_WORKER_PK.format(user, pk_data) - worker.ssh_credentials['key_filename'] = '~/.ssh/id_rsa' + worker.ssh_credentials['key_filename'] = '~/.ssh/lithops_id_rsa' worker.ssh_credentials.pop('password') else: + logger.error(f'Unable to locate {pub_key}') worker.ssh_credentials.pop('key_filename') token = worker.ssh_credentials['password'] user_data = CLOUD_CONFIG_WORKER.format(user, token) @@ -821,7 +896,7 @@ def __init__(self, name, ibm_vpc_config, ibm_vpc_client=None, public=False): self.config = ibm_vpc_config self.delete_on_dismantle = self.config['delete_on_dismantle'] - self.profile_name = self.config['worker_profile_name'] + self.instance_type = self.config['worker_profile_name'] self.vpc_cli = ibm_vpc_client or self._create_vpc_client() self.public = public @@ -838,7 +913,6 @@ def __init__(self, name, ibm_vpc_config, ibm_vpc_client=None, public=False): 'password': self.config['ssh_password'], 'key_filename': self.config.get('ssh_key_filename', '~/.ssh/id_rsa') } - self.validated = False def __str__(self): ip = self.public_ip if self.public else self.private_ip @@ -865,36 +939,11 @@ def get_ssh_client(self): """ Creates an ssh client against the VM """ - - if not self.validated and self.public and self.instance_id: - # validate that private ssh key in ssh_credentials is a pair of public key on instance - key_filename = self.ssh_credentials['key_filename'] - key_filename = os.path.abspath(os.path.expanduser(key_filename)) - - if not os.path.exists(key_filename): - raise LithopsValidationError(f"Private key file {key_filename} doesn't exist") - - initialization_data = self.vpc_cli.get_instance_initialization(self.instance_id).get_result() - - private_res = paramiko.RSAKey(filename=key_filename).get_base64() - key = None - names = [] - for k in initialization_data['keys']: - public_res = self.vpc_cli.get_key(k['id']).get_result()['public_key'].split(' ')[1] - if public_res == private_res: - self.validated = True - break - else: - names.append(k['name']) - - if not self.validated: - raise LithopsValidationError( - f"No public key from keys: {names} on master {self} not a pair for private ssh key {key_filename}") - - if not self.ssh_client: - if self.public and self.public_ip: + if self.public: + if not self.ssh_client or self.ssh_client.ip_address != self.public_ip: self.ssh_client = SSHClient(self.public_ip, self.ssh_credentials) - elif self.private_ip: + else: + if not self.ssh_client or self.ssh_client.ip_address != self.private_ip: self.ssh_client = SSHClient(self.private_ip, self.ssh_credentials) return self.ssh_client @@ -974,7 +1023,7 @@ def _create_instance(self, user_data): instance_prototype = {} instance_prototype['name'] = self.name instance_prototype['keys'] = [key_identity_model] - instance_prototype['profile'] = {'name': self.profile_name} + instance_prototype['profile'] = {'name': self.instance_type} instance_prototype['resource_group'] = {'id': self.config['resource_group_id']} instance_prototype['vpc'] = {'id': self.config['vpc_id']} instance_prototype['image'] = {'id': self.config['image_id']} @@ -1190,9 +1239,9 @@ def validate_capabilities(self): """ if self.config.get('singlesocket'): cmd = "lscpu -p=socket|grep -v '#'" - res = self.get_ssh_client().run_remote_command(cmd) + 
out, err = self.get_ssh_client().run_remote_command(cmd) sockets = set() - for char in res: + for char in out: if char != '\n': sockets.add(char) if len(sockets) != 1: @@ -1236,7 +1285,7 @@ def decorate_instance(instance, decorator): def vpc_retry_on_except(func): - RETRIES = 3 + RETRIES = 5 SLEEP_FACTOR = 1.5 MAX_SLEEP = 30 @@ -1259,7 +1308,8 @@ def _sleep_or_raise(sleep_time, err): return func(*args, **kwargs) except ApiException as err: if func.__name__ in IGNORED_404_METHODS and err.code == 404: - logger.debug((f'Got exception {err} when trying to invoke {func.__name__}, ignoring')) + # logger.debug((f'Got exception {err} when trying to invoke {func.__name__}, ignoring')) + pass else: sleep_time = _sleep_or_raise(sleep_time, err) except Exception as err: diff --git a/lithops/standalone/backends/vm/__init__.py b/lithops/standalone/backends/vm/__init__.py index 14ed5f71b..81f96b3ec 100644 --- a/lithops/standalone/backends/vm/__init__.py +++ b/lithops/standalone/backends/vm/__init__.py @@ -1 +1,3 @@ from .vm import VMBackend as StandaloneBackend + +__all__ = ['StandaloneBackend'] diff --git a/lithops/standalone/backends/vm/config.py b/lithops/standalone/backends/vm/config.py index fa311315d..6880c0a46 100644 --- a/lithops/standalone/backends/vm/config.py +++ b/lithops/standalone/backends/vm/config.py @@ -19,14 +19,14 @@ def load_config(config_data): - config_data['vm']['max_workers'] = 1 - - if 'worker_processes' not in config_data['vm']: - config_data['vm']['worker_processes'] = 1 - for param in REQ_PARAMETERS: if param not in config_data['vm']: msg = f"'{param}' is mandatory in 'vm' section of the configuration" raise Exception(msg) + config_data['vm']['max_workers'] = 1 + + if "worker_processes" not in config_data['vm']: + config_data['vm']['worker_processes'] = "AUTO" + config_data['standalone']['auto_dismantle'] = False diff --git a/lithops/standalone/backends/vm/vm.py b/lithops/standalone/backends/vm/vm.py index c79778f3f..524223c5b 100644 --- a/lithops/standalone/backends/vm/vm.py +++ b/lithops/standalone/backends/vm/vm.py @@ -18,10 +18,11 @@ import logging import time +from lithops.standalone.utils import StandaloneMode from lithops.version import __version__ from lithops.constants import COMPUTE_CLI_MSG from lithops.util.ssh_client import SSHClient -from lithops.standalone.standalone import LithopsValidationError +from lithops.standalone import LithopsValidationError logger = logging.getLogger(__name__) @@ -40,6 +41,12 @@ def __init__(self, vm_config, mode): logger.info(COMPUTE_CLI_MSG.format('Virtual Machine')) + def is_initialized(self): + """ + Checks if the backend is initialized + """ + return self.mode == StandaloneMode.CONSUME.value + def init(self): """ Initialize the VM backend @@ -65,6 +72,9 @@ def clear(self, **kwargs): def dismantle(self, **kwargs): pass + def get_worker_instance_type(self): + return None + def get_runtime_key(self, runtime_name, version=__version__): runtime = runtime_name.replace('/', '-').replace(':', '-') runtime_key = os.path.join(self.name, version, self.config['ip_address'], runtime) diff --git a/lithops/standalone/keeper.py b/lithops/standalone/keeper.py index cff083ca4..beb4495b6 100644 --- a/lithops/standalone/keeper.py +++ b/lithops/standalone/keeper.py @@ -1,10 +1,27 @@ -import json +# +# (C) Copyright Cloudlab URV 2020 +# (C) Copyright IBM Corp. 2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + import os import time import threading import logging -from lithops.standalone.standalone import StandaloneHandler -from lithops.constants import SA_DATA_FILE, JOBS_DIR +from lithops.standalone import StandaloneHandler +from lithops.constants import JOBS_DIR +from lithops.standalone.utils import JobStatus logger = logging.getLogger(__name__) @@ -14,64 +31,64 @@ class BudgetKeeper(threading.Thread): """ BudgetKeeper class used to automatically stop the VM instance """ - def __init__(self, config): + def __init__(self, config, instance_data, stop_callback=None, delete_callback=None): threading.Thread.__init__(self) self.last_usage_time = time.time() self.standalone_config = config + self.stop_callback = stop_callback + self.delete_callback = delete_callback self.auto_dismantle = config['auto_dismantle'] self.soft_dismantle_timeout = config['soft_dismantle_timeout'] self.hard_dismantle_timeout = config['hard_dismantle_timeout'] self.exec_mode = config['exec_mode'] + self.runing = False self.jobs = {} + self.time_to_dismantle = self.hard_dismantle_timeout - with open(SA_DATA_FILE, 'r') as ad: - instance_data = json.load(ad) - - self.sh = StandaloneHandler(self.standalone_config) - self.instance = self.sh.backend.get_instance(**instance_data) + self.standalone_handler = StandaloneHandler(self.standalone_config) + self.instance = self.standalone_handler.backend.get_instance(**instance_data) - logger.debug("Starting BudgetKeeper for {} ({}), instance ID: {}" - .format(self.instance.name, self.instance.private_ip, - self.instance.instance_id)) + logger.debug(f"Starting BudgetKeeper for {self.instance.name} ({self.instance.private_ip}), " + f"instance ID: {self.instance.instance_id}") logger.debug(f"Delete {self.instance.name} on dismantle: {self.instance.delete_on_dismantle}") - def update_config(self, config): - self.standalone_config.update(config) - self.auto_dismantle = config['auto_dismantle'] - self.soft_dismantle_timeout = config['soft_dismantle_timeout'] - self.hard_dismantle_timeout = config['hard_dismantle_timeout'] - self.exec_mode = config['exec_mode'] + def get_time_to_dismantle(self): + return self.time_to_dismantle + + def add_job(self, job_key): + self.last_usage_time = time.time() + self.jobs[job_key] = JobStatus.RUNNING.value + + def set_job_done(self, job_key): + self.last_usage_time = time.time() + self.jobs[job_key] = JobStatus.DONE.value def run(self): - runing = True + self.runing = True jobs_running = False logger.debug("BudgetKeeper started") if self.auto_dismantle: logger.debug('Auto dismantle activated - Soft timeout: {}s, Hard Timeout: {}s' - .format(self.soft_dismantle_timeout, - self.hard_dismantle_timeout)) + .format(self.soft_dismantle_timeout, self.hard_dismantle_timeout)) else: # If auto_dismantle is deactivated, the VM will be always automatically # stopped after hard_dismantle_timeout. 
This will prevent the VM # being started forever due a wrong configuration - logger.debug('Auto dismantle deactivated - Hard Timeout: {}s' - .format(self.hard_dismantle_timeout)) + logger.debug(f'Auto dismantle deactivated - Hard Timeout: {self.hard_dismantle_timeout}s') - while runing: + while self.runing: time_since_last_usage = time.time() - self.last_usage_time for job_key in self.jobs.keys(): done = os.path.join(JOBS_DIR, job_key + '.done') if os.path.isfile(done): - self.jobs[job_key] = 'done' - - logger.debug(f"self.jobs: {self.jobs}") + self.jobs[job_key] = JobStatus.DONE.value - if len(self.jobs) > 0 and all(value == 'done' for value in self.jobs.values()) \ + if len(self.jobs) > 0 and all(value == JobStatus.DONE.value for value in self.jobs.values()) \ and self.auto_dismantle: # here we need to catch a moment when number of running JOBS become zero. @@ -82,20 +99,29 @@ def run(self): time_since_last_usage = time.time() - self.last_usage_time - time_to_dismantle = int(self.soft_dismantle_timeout - time_since_last_usage) + self.time_to_dismantle = int(self.soft_dismantle_timeout - time_since_last_usage) else: - time_to_dismantle = int(self.hard_dismantle_timeout - time_since_last_usage) + self.time_to_dismantle = int(self.hard_dismantle_timeout - time_since_last_usage) jobs_running = True - if time_to_dismantle > 0: - logger.debug(f"Time to dismantle: {time_to_dismantle} seconds") - check_interval = max(time_to_dismantle / 10, 1) + if self.time_to_dismantle > 0: + logger.debug(f"Time to dismantle: {self.time_to_dismantle} seconds") + check_interval = min(60, max(self.time_to_dismantle / 10, 1)) time.sleep(check_interval) else: - logger.debug("Dismantling setup") - try: - self.instance.stop() - runing = False - except Exception as e: - logger.debug(f"Dismantle error {e}") - time.sleep(5) + self.stop_instance() + + def stop_instance(self): + logger.debug("Dismantling setup") + + if self.instance.delete_on_dismantle: + self.delete_callback() if self.delete_callback is not None else None + else: + self.stop_callback() if self.stop_callback is not None else None + + try: + self.instance.stop() + self.runing = False + except Exception as e: + logger.debug(f"Dismantle error {e}") + time.sleep(5) diff --git a/lithops/standalone/master.py b/lithops/standalone/master.py index aa3cd942e..57ae7d9bb 100644 --- a/lithops/standalone/master.py +++ b/lithops/standalone/master.py @@ -1,5 +1,6 @@ # -# Copyright Cloudlab URV 2020 +# (C) Copyright Cloudlab URV 2020 +# (C) Copyright IBM Corp. 2023 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -19,89 +20,243 @@ import time import json import uuid +import redis import flask -import queue import logging import requests -from pathlib import Path import concurrent.futures as cf +from pathlib import Path +from datetime import datetime from gevent.pywsgi import WSGIServer from threading import Thread from concurrent.futures import ThreadPoolExecutor -from lithops.constants import LITHOPS_TEMP_DIR, SA_LOG_FILE, JOBS_DIR,\ - SA_SERVICE_PORT, SA_CONFIG_FILE, SA_DATA_FILE -from lithops.localhost.localhost import LocalhostHandler -from lithops.standalone.standalone import LithopsValidationError -from lithops.utils import verify_runtime_name, iterchunks, setup_lithops_logger -from lithops.standalone.utils import get_worker_setup_script +from lithops.version import __version__ +from lithops.localhost import LocalhostHandler +from lithops.standalone import LithopsValidationError from lithops.standalone.keeper import BudgetKeeper +from lithops.config import extract_standalone_config +from lithops.standalone.standalone import StandaloneHandler from lithops.version import __version__ as lithops_version - +from lithops.constants import ( + CPU_COUNT, + LITHOPS_TEMP_DIR, + SA_MASTER_LOG_FILE, + JOBS_DIR, + SA_MASTER_SERVICE_PORT, + SA_WORKER_SERVICE_PORT, + SA_CONFIG_FILE, + SA_MASTER_DATA_FILE +) +from lithops.utils import ( + verify_runtime_name, + setup_lithops_logger +) +from lithops.standalone.utils import ( + JobStatus, + StandaloneMode, + WorkerStatus, + get_host_setup_script, + get_worker_setup_script +) + +os.makedirs(LITHOPS_TEMP_DIR, exist_ok=True) log_format = "%(asctime)s\t[%(levelname)s] %(name)s:%(lineno)s -- %(message)s" -setup_lithops_logger(logging.DEBUG, filename=SA_LOG_FILE, log_format=log_format) +setup_lithops_logger(logging.DEBUG, filename=SA_MASTER_LOG_FILE, log_format=log_format) logger = logging.getLogger('lithops.standalone.master') app = flask.Flask(__name__) MAX_INSTANCE_CREATE_RETRIES = 2 -REUSE_WORK_QUEUE_NAME = 'all' +JOB_MONITOR_CHECK_INTERVAL = 1 -exec_mode = 'consume' -workers = {} -workers_state = {} - -standalone_config = None -standalone_handler = None +redis_client = None budget_keeper = None -work_queues = {} master_ip = None -# variables for consume mode -localhost_manager_process = None -localhos_handler = None -last_job_key = None + +# /---------------------------------------------------------------------------/ +# Workers +# /---------------------------------------------------------------------------/ + +def is_worker_free(worker_private_ip): + """ + Checks if the Lithops service is ready and free in the worker VM instance + """ + url = f"http://{worker_private_ip}:{SA_WORKER_SERVICE_PORT}/ping" + try: + r = requests.get(url, timeout=0.5) + resp = r.json() + logger.debug(f'Worker processes status from {worker_private_ip}: {resp}') + return True if resp.get('free', 0) > 0 else False + except Exception: + return False -def is_worker_free(worker): +def get_worker_ttd(worker_private_ip): """ Checks if the Lithops service is ready and free in the worker VM instance """ - url = f"http://{worker.private_ip}:{SA_SERVICE_PORT}/ping" - r = requests.get(url, timeout=0.5) - if r.status_code == 200: - if r.json()['status'] == 'free': - return True - return False + try: + if master_ip == worker_private_ip: + ttd = str(budget_keeper.get_time_to_dismantle()) + else: + url = f"http://{worker_private_ip}:{SA_WORKER_SERVICE_PORT}/ttd" + r = requests.get(url, timeout=0.5) + ttd = r.text + logger.debug(f'Worker TTD from {worker_private_ip}: {ttd}') + return ttd + except 
Exception as e: + logger.error(f"Unable to get TTD from {worker_private_ip}: {e}") + return "Unknown" + + +@app.route('/worker/list', methods=['GET']) +def list_workers(): + """ + Returns the current workers list + """ + logger.debug('Listing workers') + + budget_keeper.last_usage_time = time.time() + result = [['Worker Name', 'Created', 'Instance Type', 'Processes', 'Runtime', 'Mode', 'Status', 'TTD']] + + def get_worker(worker): + worker_data = redis_client.hgetall(worker) + name = worker_data['name'] + status = worker_data['status'] + private_ip = worker_data['private_ip'] + ttd = get_worker_ttd(private_ip) + ttd = ttd if ttd in ["Unknown", "Disabled"] else ttd + "s" + timestamp = float(worker_data['created']) + created = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S UTC') + instance_type = worker_data['instance_type'] + worker_processes = str(worker_data['worker_processes']) + exec_mode = worker_data['exec_mode'] + runtime = worker_data['runtime'] + result.append((name, created, instance_type, worker_processes, runtime, exec_mode, status, ttd)) + + workers = redis_client.keys('worker:*') + if workers: + with ThreadPoolExecutor(len(workers)) as ex: + ex.map(get_worker, workers) + + logger.debug(f"workers: {result}") + return flask.jsonify(result) -def setup_worker(worker_info, work_queue_name): + +@app.route('/worker/get', methods=['GET']) +def get_workers(): """ - Run worker process - Install all the Lithops dependencies into the worker. - Runs the job + Returns the number of free workers """ - global workers, workers_state + budget_keeper.last_usage_time = time.time() + + workers = redis_client.keys('worker:*') + logger.debug(f'Getting workers - Total workers: {len(workers)}') + payload = flask.request.get_json(force=True, silent=True) + if payload and not isinstance(payload, dict): + return error('The action did not receive a dictionary as an argument.') + + worker_instance_type = payload['worker_instance_type'] + worker_processes = payload['worker_processes'] + runtime_name = payload['runtime_name'] + + active_workers = [] + + for worker in workers: + worker_data = redis_client.hgetall(worker) + if worker_data['instance_type'] == worker_instance_type \ + and worker_data['runtime'] == runtime_name \ + and int(worker_data['worker_processes']) == int(worker_processes): + active_workers.append(worker_data) + + worker_type = f'{worker_instance_type}-{worker_processes}-{runtime_name}' + logger.debug(f'Workers for {worker_type}: {len(active_workers)}') + + free_workers = [] + + def check_worker(worker_data): + if is_worker_free(worker_data['private_ip']): + free_workers.append( + ( + worker_data['name'], + worker_data['private_ip'], + worker_data['instance_id'], + worker_data['ssh_credentials'], + worker_data['instance_type'], + runtime_name + ) + ) + + if active_workers: + with ThreadPoolExecutor(len(active_workers)) as ex: + ex.map(check_worker, active_workers) + + logger.debug(f'Free workers for {worker_type}: {len(free_workers)}') + + response = flask.jsonify(free_workers) + response.status_code = 200 + + return response + + +def save_worker(worker, standalone_config, work_queue_name): + """ + Saves the worker instance with the provided data in redis + """ + config = copy.deepcopy(standalone_config) + del config[config['backend']] + config = {key: str(value) if isinstance(value, bool) else value for key, value in config.items()} + + worker_processes = CPU_COUNT if worker.config['worker_processes'] == 'AUTO' \ + else worker.config['worker_processes'] + + 
redis_client.hset(f"worker:{worker.name}", mapping={ + 'name': worker.name, + 'status': WorkerStatus.STARTING.value, + 'private_ip': worker.private_ip or '', + 'instance_id': worker.instance_id or '', + 'instance_type': worker.instance_type, + 'worker_processes': worker_processes, + 'created': str(time.time()), + 'ssh_credentials': json.dumps(worker.ssh_credentials), + 'queue_name': work_queue_name, + 'err': "", **config, + }) + + +def setup_worker_create_reuse(standalone_handler, worker_info, work_queue_name): + """ + Run the worker setup process and installs all the Lithops dependencies into it + """ worker = standalone_handler.backend.get_instance(**worker_info, public=False) - logger.debug(f'Starting setup for {worker}') - max_instance_create_retries = standalone_config.get('worker_create_retries', MAX_INSTANCE_CREATE_RETRIES) + if redis_client.hget(f"worker:{worker.name}", 'status') == WorkerStatus.ACTIVE.value: + return + + save_worker(worker, standalone_handler.config, work_queue_name) + + max_instance_create_retries = worker.config.get('worker_create_retries', MAX_INSTANCE_CREATE_RETRIES) def wait_worker_ready(worker): instance_ready_retries = 1 while instance_ready_retries <= max_instance_create_retries: try: - workers_state[worker.name] = {'state': 'starting'} worker.wait_ready() break except TimeoutError as e: # VM not started in time - workers_state[worker.name] = {'state': 'error', 'err': str(e)} + redis_client.hset(f"worker:{worker.name}", 'status', WorkerStatus.ERROR.value) + err_msg = 'Timeout Error while waitting the VM to get ready' + redis_client.hset(f"worker:{worker.name}", 'err', err_msg) if instance_ready_retries == max_instance_create_retries: + logger.debug(f'Readiness probe expired for {worker}') raise e - logger.warning(f'Timeout Error. Recreating VM instance {worker.name}') + logger.warning(f'Timeout Error. 
Recreating {worker}') worker.delete() worker.create() instance_ready_retries += 1 @@ -111,313 +266,297 @@ def wait_worker_ready(worker): instance_validate_retries = 1 while instance_validate_retries <= max_instance_create_retries: try: - logger.debug(f'Validating {worker.name}') + logger.debug(f'Validating {worker}') worker.validate_capabilities() break except LithopsValidationError as e: - logger.debug(f'{worker.name} validation error: {e}') - workers_state[worker.name] = {'state': 'error', 'err': str(e)} + redis_client.hset(f"worker:{worker.name}", 'status', WorkerStatus.ERROR.value) + redis_client.hset(f"worker:{worker.name}", 'err', f'Validation error: {e}') if instance_validate_retries == max_instance_create_retries: - workers_state[worker.name] = {'state': 'setup', 'err': str(e)} - break - logger.warning(f'Worker {worker.name} setup failed with error {e} after {instance_validate_retries} retries') + logger.debug(f'Validation probe expired for {worker}') + raise e + logger.warning(f'{worker} validation error: {e}') worker.delete() worker.create() instance_validate_retries += 1 wait_worker_ready(worker) - # upload zip lithops package - logger.debug(f'Uploading lithops files to {worker}') - worker.get_ssh_client().upload_local_file( - '/opt/lithops/lithops_standalone.zip', - '/tmp/lithops_standalone.zip') - - logger.debug(f'Executing lithops installation process on {worker}') - - vm_data = {'name': worker.name, - 'private_ip': worker.private_ip, - 'instance_id': worker.instance_id, - 'ssh_credentials': worker.ssh_credentials, - 'master_ip': master_ip, - 'work_queue': work_queue_name} + redis_client.hset(f"worker:{worker.name}", 'private_ip', worker.private_ip) + redis_client.hset(f"worker:{worker.name}", 'status', WorkerStatus.STARTED.value) + redis_client.hset(f"worker:{worker.name}", 'err', '') - remote_script = "/tmp/install_lithops.sh" - script = get_worker_setup_script(standalone_config, vm_data) - worker.get_ssh_client().upload_data_to_file(script, remote_script) - cmd = f"chmod 777 {remote_script}; sudo {remote_script};" - worker.get_ssh_client().run_remote_command(cmd, run_async=True) - worker.del_ssh_client() - logger.debug(f'Installation script submitted to {worker}') - workers_state[worker.name] = {'state': 'running', 'err': workers_state[worker.name].get('err')} + try: + logger.debug(f'Uploading lithops files to {worker}') + worker.get_ssh_client().upload_local_file( + '/opt/lithops/lithops_standalone.zip', + '/tmp/lithops_standalone.zip') + + logger.debug(f'Preparing installation script for {worker}') + vm_data = { + 'name': worker.name, + 'private_ip': worker.private_ip, + 'instance_id': worker.instance_id, + 'ssh_credentials': worker.ssh_credentials, + 'instance_type': worker.instance_type, + 'master_ip': master_ip, + 'work_queue_name': work_queue_name, + 'lithops_version': __version__ + } + remote_script = "/tmp/install_lithops.sh" + script = get_host_setup_script() + script += get_worker_setup_script(standalone_handler.config, vm_data) + + logger.debug(f'Submitting installation script to {worker}') + worker.get_ssh_client().upload_data_to_file(script, remote_script) + cmd = f"chmod 755 {remote_script}; sudo {remote_script}; rm {remote_script}" + worker.get_ssh_client().run_remote_command(cmd, run_async=True) + worker.del_ssh_client() + + logger.debug(f'Installation script submitted to {worker}') + redis_client.hset(f"worker:{worker.name}", 'status', WorkerStatus.INSTALLING.value) - logger.debug(f'Appending {worker.name} to Worker list') - workers[worker.name] = worker + 
except Exception as e: + redis_client.hset(f"worker:{worker.name}", 'status', WorkerStatus.ERROR.value) + worker.err = f'Unable to setup lithops in the VM: {str(e)}' + raise e -def start_workers(job_payload, work_queue_name): +def setup_worker_consume(standalone_handler, worker_info, work_queue_name): """ - Creates the workers (if any) + Run the worker setup process in the case of Consume mode """ - workers = job_payload['worker_instances'] + instance = standalone_handler.backend.get_instance(**worker_info, public=False) + instance.private_ip = master_ip - if not workers: + if redis_client.hget(f"worker:{instance.name}", 'status') == WorkerStatus.ACTIVE.value: return - futures = [] - with ThreadPoolExecutor(len(workers)) as executor: - for worker_info in workers: - futures.append(executor.submit(setup_worker, worker_info, work_queue_name)) - - for future in cf.as_completed(futures): - try: - future.result() - except Exception as e: - # TODO consider to update worker state - logger.error(e) - - logger.debug(f'All workers set up for work queue "{work_queue_name}"') - - -def run_job_local(work_queue): - """ - Localhost jobs manager process for consume mode - """ - global localhos_handler - global last_job_key - - pull_runtime = standalone_config.get('pull_runtime', False) - - def wait_job_completed(job_key): - done = os.path.join(JOBS_DIR, job_key + '.done') - while True: - if os.path.isfile(done): - break - time.sleep(1) + save_worker(instance, standalone_handler.config, work_queue_name) try: - localhos_handler = LocalhostHandler({'pull_runtime': pull_runtime}) - - while True: - job_payload = work_queue.get() - job_key = job_payload['job_key'] - last_job_key = job_key - job_payload['config']['lithops']['backend'] = 'localhost' - localhos_handler.invoke(job_payload) - wait_job_completed(job_key) + logger.debug(f'Setting up the worker in the current {instance}') + vm_data = { + 'name': instance.name, + 'private_ip': instance.private_ip, + 'instance_id': instance.instance_id, + 'ssh_credentials': instance.ssh_credentials, + 'instance_type': instance.instance_type, + 'master_ip': master_ip, + 'work_queue_name': work_queue_name, + 'lithops_version': __version__ + } + worker_setup_script = "/tmp/install_lithops.sh" + script = get_worker_setup_script(standalone_handler.config, vm_data) + with open(worker_setup_script, 'w') as wis: + wis.write(script) + + redis_client.hset(f"worker:{instance.name}", 'status', WorkerStatus.INSTALLING.value) + os.chmod(worker_setup_script, 0o755) + os.system("sudo " + worker_setup_script) + os.remove(worker_setup_script) except Exception as e: - logger.error(e) + redis_client.hset(f"worker:{instance.name}", 'status', WorkerStatus.ERROR.value) + instance.err = f'Unable to setup lithops in the VM: {str(e)}' + raise e -def run_job_worker(job_payload, work_queue): +def handle_workers(job_payload, workers, work_queue_name): """ - Process responsible to wait for workers to become ready, and - submit individual tasks of the job to them + Creates the workers (if any) """ - job_key = job_payload['job_key'] - call_ids = job_payload['call_ids'] - chunksize = job_payload['chunksize'] + if not workers: + return - for call_ids_range in iterchunks(call_ids, chunksize): - task_payload = copy.deepcopy(job_payload) - dbr = task_payload['data_byte_ranges'] - task_payload['call_ids'] = call_ids_range - task_payload['data_byte_ranges'] = [dbr[int(call_id)] for call_id in call_ids_range] - work_queue.put(task_payload) + logger.debug(f"Going to setup {len(workers)} workers") - while not 
work_queue.empty(): - time.sleep(1) + standalone_config = extract_standalone_config(job_payload['config']) + standalone_handler = StandaloneHandler(standalone_config) - done = os.path.join(JOBS_DIR, job_key + '.done') - Path(done).touch() + futures = [] + total_correct = 0 - logger.debug(f'Job process {job_key} finished') + if standalone_config['exec_mode'] == StandaloneMode.CONSUME.value: + try: + setup_worker_consume( + standalone_handler, + workers[0], + work_queue_name + ) + total_correct += 1 + except Exception as e: + # TODO: If the local worker can't start, cancel all jobs + # in the budget keeper + logger.error(e) + else: + with ThreadPoolExecutor(len(workers)) as executor: + for worker_info in workers: + future = executor.submit( + setup_worker_create_reuse, + standalone_handler, + worker_info, + work_queue_name + ) + futures.append(future) + + for future in cf.as_completed(futures): + try: + future.result() + total_correct += 1 + except Exception as e: + logger.error(e) + logger.debug( + f'{total_correct} of {len(workers)} workers started ' + f'for work queue: {work_queue_name}' + ) -def error(msg): - response = flask.jsonify({'error': msg}) - response.status_code = 404 - return response +# /---------------------------------------------------------------------------/ +# Jobs +# /---------------------------------------------------------------------------/ -@app.route('/workers', methods=['GET']) -def get_workers(): +def cancel_job_process(job_key_list): """ - Returns the number of free workers + Cleans the work queues and sends the SIGTERM to the workers """ - global workers - global budget_keeper + for job_key in job_key_list: + logger.debug(f'Received SIGTERM: Stopping job process {job_key}') - # update last_usage_time to prevent race condition when keeper stops the vm - budget_keeper.last_usage_time = time.time() + queue_name = redis_client.hget(f'job:{job_key}', 'queue_name') - current_workers = [(worker.name, worker.private_ip) for worker in workers.values()] - logger.debug(f'Current workers: {current_workers}') + tmp_queue = [] + while redis_client.llen(queue_name) > 0: + task_payload_json = redis_client.rpop(queue_name) + task_payload = json.loads(task_payload_json) + if task_payload['job_key'] != job_key: + tmp_queue.append(task_payload_json) - free_workers = [] + for task_payload_json in tmp_queue: + redis_client.lpush(queue_name, task_payload_json) - def check_worker(worker): - if is_worker_free(worker): - free_workers.append(( - worker.name, - worker.private_ip, - worker.instance_id, - worker.ssh_credentials) - ) + def stop_task(worker): + worker_data = redis_client.hgetall(worker) + url = f"http://{worker_data['private_ip']}:{SA_WORKER_SERVICE_PORT}/stop/{job_key}" + requests.post(url, timeout=0.5) - if workers: + # Send stop signal to all workers + workers = redis_client.keys('worker:*') with ThreadPoolExecutor(len(workers)) as ex: - ex.map(check_worker, workers.values()) + ex.map(stop_task, workers) - logger.debug(f'Total free workers: {len(free_workers)}') - - response = flask.jsonify(free_workers) - response.status_code = 200 - - return response + Path(os.path.join(JOBS_DIR, job_key + '.done')).touch() + if redis_client.hget(f"job:{job_key}", 'status') != JobStatus.DONE.value: + redis_client.hset(f"job:{job_key}", 'status', JobStatus.CANCELED.value) -@app.route('/workers-state', methods=['GET']) -def get_workers_state(): +@app.route('/job/stop', methods=['POST']) +def stop(): """ - Returns the current workers state + Stops received job processes """ - 
logger.debug(f'Workers state: {workers_state}') - return flask.jsonify(workers_state) + job_key_list = flask.request.get_json(force=True, silent=True) + # Start a separate thread to do the task in background, + # for not keeping the client waiting. + Thread(target=cancel_job_process, args=(job_key_list, )).start() + + return ('', 204) -@app.route('/get-task/', methods=['GET']) -def get_task(work_queue_name): +@app.route('/job/list', methods=['GET']) +def list_jobs(): """ - Returns a task from the work queue + Returns the current workers state """ - global work_queues + logger.debug('Listing jobs') - try: - task_payload = work_queues.setdefault(work_queue_name, queue.Queue()).get(False) - response = flask.jsonify(task_payload) - response.status_code = 200 - job_key = task_payload['job_key'] - calls = task_payload['call_ids'] - worker_ip = flask.request.remote_addr - logger.debug(f'Worker {worker_ip} retrieved Job {job_key} - Calls {calls}') - except queue.Empty: - response = ('', 204) - - return response + budget_keeper.last_usage_time = time.time() + result = [['Job ID', 'Function Name', 'Submitted', 'Worker Type', 'Runtime', 'Tasks Done', 'Job Status']] -def stop_job_process(job_key_list): - """ - Stops a job process - """ - global localhos_handler - global work_queues + for job_job_key in redis_client.keys('job:*'): + job_data = redis_client.hgetall(job_job_key) + job_key = job_data['job_key'] + exec_mode = job_data['exec_mode'] + status = job_data['status'] + func_name = job_data['func_name'] + "()" + timestamp = float(job_data['submitted']) + runtime = job_data['runtime_name'] + worker_type = job_data['worker_type'] if exec_mode != StandaloneMode.CONSUME.value else 'VM' + submitted = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S UTC') + total_tasks = str(job_data['total_tasks']) + done_tasks = str(redis_client.llen(f'tasksdone:{job_key}')) + job = (job_key, func_name, submitted, worker_type, runtime, f'{done_tasks}/{total_tasks}', status) + result.append(job) - for job_key in job_key_list: - logger.debug(f'Received SIGTERM: Stopping job process {job_key}') + logger.debug(f'jobs: {result}') + return flask.jsonify(result) - if exec_mode == 'consume': - if job_key == last_job_key: - # kill current running job process - localhos_handler.clear() - done = os.path.join(JOBS_DIR, job_key + '.done') - Path(done).touch() - else: - # Delete job_payload from pending queue - work_queue = work_queues['local'] - tmp_queue = [] - while not work_queue.empty(): - try: - job_payload = work_queue.get(False) - if job_payload['job_key'] != job_key: - tmp_queue.append(job_payload) - except Exception: - pass - for job_payload in tmp_queue: - work_queue.put(job_payload) - else: - wqn = job_key if exec_mode == 'create' else REUSE_WORK_QUEUE_NAME - # empty work queue - work_queue = work_queues.setdefault(wqn, queue.Queue()) - while not work_queue.empty(): - try: - work_queue.get(False) - except Exception: - pass - - def stop_task(worker): - private_ip = worker['private_ip'] - url = f"http://{private_ip}:{SA_SERVICE_PORT}/stop/{job_key}" - requests.post(url, timeout=0.5) - - # Send stop signal to all workers - with ThreadPoolExecutor(len(workers)) as ex: - ex.map(stop_task, workers.values()) - - -@app.route('/stop', methods=['POST']) -def stop(): +def handle_job(job_payload, queue_name): """ - Stops received job processes + Process responsible to put the job in redis and all the + individual tasks in a work queue """ - job_key_list = flask.request.get_json(force=True, silent=True) - # Start a 
separate thread to do the task in background, - # for not keeping the client waiting. - Thread(target=stop_job_process, args=(job_key_list, )).start() + job_key = job_payload['job_key'] - return ('', 204) + redis_client.hset(f"job:{job_key}", mapping={ + 'job_key': job_key, + 'status': JobStatus.SUBMITTED.value, + 'submitted': job_payload['host_submit_tstamp'], + 'func_name': job_payload['func_name'], + 'worker_type': job_payload.get('worker_instance_type', 'VM'), + 'runtime_name': job_payload['runtime_name'], + 'exec_mode': job_payload['config']['standalone']['exec_mode'], + 'total_tasks': len(job_payload['call_ids']), + 'queue_name': queue_name + }) + + dbr = job_payload['data_byte_ranges'] + for call_id in job_payload['call_ids']: + task_payload = copy.deepcopy(job_payload) + task_payload['call_ids'] = [call_id] + task_payload['data_byte_ranges'] = [dbr[int(call_id)]] + redis_client.lpush(queue_name, json.dumps(task_payload)) + logger.debug(f"Job {job_key} correctly submitted to work queue '{queue_name}'") -@app.route('/run-job', methods=['POST']) + +@app.route('/job/run', methods=['POST']) def run(): """ - Run a job locally, in consume mode + Entry point for running jobs """ - global budget_keeper - global work_queues - global exec_mode - global localhost_manager_process - job_payload = flask.request.get_json(force=True, silent=True) if job_payload and not isinstance(job_payload, dict): - return error('The action did not receive a dictionary as an argument.') + return error('The action did not receive a dictionary as an argument') try: - runtime = job_payload['runtime_name'] - verify_runtime_name(runtime) + runtime_name = job_payload['runtime_name'] + verify_runtime_name(runtime_name) except Exception as e: return error(str(e)) job_key = job_payload['job_key'] - logger.debug('Received job {}'.format(job_key)) + logger.debug(f'Received job {job_key}') - budget_keeper.last_usage_time = time.time() - budget_keeper.update_config(job_payload['config']['standalone']) - budget_keeper.jobs[job_key] = 'running' - - exec_mode = job_payload['config']['standalone'].get('exec_mode', 'consume') - - if exec_mode == 'consume': - # Consume mode runs jobs in this master VM - work_queue_name = 'local' - work_queue = work_queues.setdefault(work_queue_name, queue.Queue()) - if not localhost_manager_process: - logger.debug('Starting manager process for localhost jobs') - lmp = Thread(target=run_job_local, args=(work_queue, ), daemon=True) - lmp.start() - localhost_manager_process = lmp - logger.debug(f'Putting job {job_key} into master queue') - work_queue.put(job_payload) - - elif exec_mode in ['create', 'reuse']: - # Create and reuse mode runs jobs on woker VMs - logger.debug(f'Starting process for job {job_key}') - work_queue_name = job_key if exec_mode == 'create' else REUSE_WORK_QUEUE_NAME - work_queue = work_queues.setdefault(work_queue_name, queue.Queue()) - Thread(target=start_workers, args=(job_payload, work_queue_name)).start() - Thread(target=run_job_worker, args=(job_payload, work_queue), daemon=True).start() + budget_keeper.add_job(job_key) + + exec_mode = job_payload['config']['standalone']['exec_mode'] + exec_mode = StandaloneMode[exec_mode.upper()] + workers = job_payload.pop('worker_instances') + + if exec_mode == StandaloneMode.CONSUME: + queue_name = f'wq:localhost:{runtime_name.replace("/", "-")}'.lower() + elif exec_mode == StandaloneMode.CREATE: + queue_name = f'wq:{job_key}'.lower() + elif exec_mode == StandaloneMode.REUSE: + worker_it = job_payload['worker_instance_type'] + worker_wp 
= job_payload['worker_processes'] + queue_name = f'wq:{worker_it}-{worker_wp}-{runtime_name.replace("/", "-")}'.lower() + + Thread(target=handle_job, args=(job_payload, queue_name)).start() + Thread(target=handle_workers, args=(job_payload, workers, queue_name)).start() act_id = str(uuid.uuid4()).replace('-', '')[:12] response = flask.jsonify({'activationId': act_id}) @@ -426,6 +565,45 @@ def run(): return response +def job_monitor(): + logger.info("Starting job monitoring thread") + + jobs_data = {} + + while True: + time.sleep(JOB_MONITOR_CHECK_INTERVAL) + for job_job_key in redis_client.keys('job:*'): + job_key = job_job_key.replace("job:", "") + if job_key not in jobs_data: + budget_keeper.add_job(job_key) + job_data = redis_client.hgetall(job_job_key) + jobs_data[job_key] = {'total': int(job_data['total_tasks']), 'done': 0} + if jobs_data[job_key]['total'] == jobs_data[job_key]['done']: + continue + done_tasks = int(redis_client.llen(f"tasksdone:{job_key}")) + if jobs_data[job_key]['done'] != done_tasks: + total_tasks = jobs_data[job_key]['total'] + jobs_data[job_key]['done'] = done_tasks + exec_id, job_id = job_key.rsplit('-', 1) + msg = f"ExecutorID: {exec_id} | JobID: {job_id} - Tasks done: {done_tasks}/{total_tasks}" + if jobs_data[job_key]['total'] == jobs_data[job_key]['done']: + Path(os.path.join(JOBS_DIR, f'{job_key}.done')).touch() + msg += " - Completed!" + logger.debug(msg) + + +# /---------------------------------------------------------------------------/ +# Misc +# /---------------------------------------------------------------------------/ + +@app.route('/clean', methods=['POST']) +def clean(): + logger.debug("Clean command received. Cleaning all data from redis") + redis_client.flushall() + + return ('', 204) + + +@app.route('/ping', methods=['GET']) def ping(): response = flask.jsonify({'response': lithops_version}) @@ -433,25 +611,30 @@ def ping(): return response -@app.route('/get-metadata', methods=['GET']) +def error(msg): + response = flask.jsonify({'error': msg}) + response.status_code = 404 + return response + + +@app.route('/metadata', methods=['GET']) def get_metadata(): payload = flask.request.get_json(force=True, silent=True) if payload and not isinstance(payload, dict): return error('The action did not receive a dictionary as an argument.') try: - runtime = payload['runtime'] - verify_runtime_name(runtime) + verify_runtime_name(payload['runtime']) except Exception as e: return error(str(e)) - pull_runtime = standalone_config.get('pull_runtime', False) - lh = LocalhostHandler({'runtime': runtime, 'pull_runtime': pull_runtime}) - runtime_meta = lh.deploy_runtime(runtime) + localhos_handler = LocalhostHandler(payload) + localhos_handler.init() + runtime_meta = localhos_handler.deploy_runtime(payload['runtime']) if 'lithops_version' in runtime_meta: - logger.debug("Runtime metdata extracted correctly: Lithops " - f"{runtime_meta['lithops_version']}") + logger.debug(f"Runtime metadata extracted correctly from {payload['runtime']}" + f" - Lithops {runtime_meta['lithops_version']}") response = flask.jsonify(runtime_meta) response.status_code = 200 @@ -459,8 +642,7 @@ def get_metadata(): def main(): - global standalone_config - global standalone_handler + global redis_client global budget_keeper global master_ip @@ -469,20 +651,18 @@ def main(): with open(SA_CONFIG_FILE, 'r') as cf: standalone_config = json.load(cf) - # Delete ssh_key_filename - backend = standalone_config['backend'] - if 'ssh_key_filename' in standalone_config[backend]: - del 
standalone_config[backend]['ssh_key_filename'] + with open(SA_MASTER_DATA_FILE, 'r') as ad: + master_data = json.load(ad) + master_ip = master_data['private_ip'] - with open(SA_DATA_FILE, 'r') as ad: - master_ip = json.load(ad)['private_ip'] - - budget_keeper = BudgetKeeper(standalone_config) + budget_keeper = BudgetKeeper(standalone_config, master_data, stop_callback=clean) budget_keeper.start() - standalone_handler = budget_keeper.sh + redis_client = redis.Redis(decode_responses=True) + + Thread(target=job_monitor, daemon=True).start() - server = WSGIServer(('0.0.0.0', SA_SERVICE_PORT), app, log=app.logger) + server = WSGIServer(('0.0.0.0', SA_MASTER_SERVICE_PORT), app, log=app.logger) server.serve_forever() diff --git a/lithops/standalone/runner.py b/lithops/standalone/runner.py new file mode 100644 index 000000000..7ff764d96 --- /dev/null +++ b/lithops/standalone/runner.py @@ -0,0 +1,63 @@ +# +# (C) Copyright IBM Corp. 2024 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import sys +import json +import logging +import uuid + +from lithops.worker import function_handler +from lithops.constants import ( + RN_LOG_FILE, + LOGGER_FORMAT +) + +log_file_stream = open(RN_LOG_FILE, 'a') +logging.basicConfig(stream=log_file_stream, level=logging.INFO, format=LOGGER_FORMAT) +logger = logging.getLogger('lithops.standalone.runner') + + +def run_job(backend, task_filename): + logger.info(f'Got {task_filename} job file') + + with open(task_filename, 'rb') as jf: + task_payload = json.load(jf) + + executor_id = task_payload['executor_id'] + job_id = task_payload['job_id'] + call_id = task_payload['call_ids'][0] + + logger.info(f'ExecutorID {executor_id} | JobID {job_id} | CallID {call_id} - Starting execution') + + act_id = str(uuid.uuid4()).replace('-', '')[:12] + os.environ['__LITHOPS_ACTIVATION_ID'] = act_id + os.environ['__LITHOPS_BACKEND'] = backend.replace("_", " ").upper() + + task_payload['worker_processes'] = 1 + function_handler(task_payload) + + logger.info(f'ExecutorID {executor_id} | JobID {job_id} | CallID {call_id} - Execution Finished') + + +if __name__ == "__main__": + sys.stdout = log_file_stream + sys.stderr = log_file_stream + logger.info('Starting Standalone task runner') + backend = sys.argv[1] + task_filename = sys.argv[2] + run_job(backend, task_filename) + log_file_stream.close() diff --git a/lithops/standalone/standalone.py b/lithops/standalone/standalone.py index 5bb4c4a74..3b146887c 100644 --- a/lithops/standalone/standalone.py +++ b/lithops/standalone/standalone.py @@ -1,5 +1,6 @@ # -# Copyright Cloudlab URV 2020 +# (C) Copyright Cloudlab URV 2020 +# (C) Copyright IBM Corp. 2023 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -15,25 +16,36 @@ # import os +import uuid import json -import threading import time +import hashlib import logging import importlib import requests import shlex import concurrent.futures as cf -from lithops.utils import is_lithops_worker, create_handler_zip -from lithops.constants import SA_SERVICE_PORT, SA_INSTALL_DIR, TEMP_DIR -from lithops.standalone.utils import ExecMode, get_master_setup_script +from lithops.utils import ( + BackendType, + is_lithops_worker, + create_handler_zip +) +from lithops.constants import ( + TEMP_DIR, + SA_MASTER_SERVICE_PORT, + SA_MASTER_DATA_FILE, +) +from lithops.standalone.utils import ( + StandaloneMode, + LithopsValidationError, + get_host_setup_script, + get_master_setup_script +) from lithops.version import __version__ logger = logging.getLogger(__name__) -class LithopsValidationError(Exception): - pass - class StandaloneHandler: """ @@ -45,14 +57,13 @@ def __init__(self, standalone_config): self.config = standalone_config self.backend_name = self.config['backend'] self.start_timeout = self.config['start_timeout'] - self.exec_mode = self.config['exec_mode'] - self.workers_policy = self.config.get('workers_policy', 'permissive') # by default not forcing the creation of all workers + self.exec_mode = StandaloneMode[self.config['exec_mode'].upper()] self.is_lithops_worker = is_lithops_worker() module_location = f'lithops.standalone.backends.{self.backend_name}' sb_module = importlib.import_module(module_location) StandaloneBackend = getattr(sb_module, 'StandaloneBackend') - self.backend = StandaloneBackend(self.config[self.backend_name], self.exec_mode) + self.backend = StandaloneBackend(self.config[self.backend_name], self.exec_mode.value) self.jobs = [] # list to store executed jobs (job_keys) logger.debug("Standalone handler created successfully") @@ -63,11 +74,23 @@ def init(self): """ self.backend.init() - def build_image(self, image_name, script_file, overwrite, extra_args=[]): + def is_initialized(self): + """ + Check if the backend is initialized + """ + return self.backend.is_initialized() + + def build_image(self, image_name, script_file, overwrite, include, extra_args=[]): """ Builds a new VM Image """ - self.backend.build_image(image_name, script_file, overwrite, extra_args) + self.backend.build_image(image_name, script_file, overwrite, include, extra_args) + + def delete_image(self, name): + """ + Deletes VM Image + """ + self.backend.delete_image(name) def list_images(self): """ @@ -75,30 +98,52 @@ def list_images(self): """ return self.backend.list_images() + def _make_request(self, method, endpoint, data=None): + """ + Makes a requests to the master VM + """ + if self.is_lithops_worker: + url = f"http://lithops-master:{SA_MASTER_SERVICE_PORT}/{endpoint}" + if method == 'GET': + resp = requests.get(url, timeout=1) + return resp.json() + elif method == 'POST': + resp = requests.post(url, data=json.dumps(data)) + resp.raise_for_status() + return resp.json() + else: + url = f'http://127.0.0.1:{SA_MASTER_SERVICE_PORT}/{endpoint}' + cmd = f'curl -X {method} {url} -H \'Content-Type: application/json\'' + if data is not None: + json_data = json.dumps(data) + data_size = len(json_data) + if data_size < 130000: + data_str = shlex.quote(json_data) + cmd = f'{cmd} -d {data_str}' + else: + data_file_name = f'/tmp/lithops_data_{str(uuid.uuid4())[-6:]}.json' + self.backend.master.get_ssh_client().upload_data_to_file(json_data, data_file_name) + cmd = f'{cmd} -d @{data_file_name}; rm {data_file_name}' + out, err = 
self.backend.master.get_ssh_client().run_remote_command(cmd) + if not out: + raise ValueError(err) + try: + return json.loads(out) + except Exception: + raise ValueError(out) + def _is_master_service_ready(self): """ Checks if the proxy is ready to receive http connections """ try: - if self.is_lithops_worker: - url = f"http://lithops-master:{SA_SERVICE_PORT}/ping" - r = requests.get(url, timeout=1) - if r.status_code == 200: - return True - return False - else: - cmd = f'curl -X GET http://127.0.0.1:{SA_SERVICE_PORT}/ping' - out = self.backend.master.get_ssh_client().run_remote_command(cmd) - data = json.loads(out) - if data['response'] == __version__: - return True - else: - self.dismantle(include_master=False) - raise LithopsValidationError( - f"Lithops version {data['response']} on {self.backend.master}, " - f"doesn't match local lithops version {__version__}, consider " - "running 'lithops clean' to delete runtime metadata leftovers or " - "'lithops clean --all' to delete master instance as well") + resp = self._make_request('GET', 'ping') + if resp['response'] != __version__: + raise LithopsValidationError( + f"{self.backend.master} is running Lithops {resp['response']} and " + f"it doesn't match local lithops version {__version__}, consider running " + f"'lithops clean -b {self.backend_name} --all' to delete the master instance") + return True except LithopsValidationError as e: raise e except Exception: @@ -109,16 +154,15 @@ def _validate_master_service_setup(self): Checks the master VM is correctly installed """ logger.debug(f'Validating lithops master service is installed on {self.backend.master}') - ssh_client = self.backend.master.get_ssh_client() - res = ssh_client.run_remote_command(f'cat {SA_INSTALL_DIR}/access.data') - if not res: + out, err = ssh_client.run_remote_command(f'cat {SA_MASTER_DATA_FILE}') + if not out: self._setup_master_service() return logger.debug(f"Validating lithops master service is running on {self.backend.master}") - res = ssh_client.run_remote_command("service lithops-master status") - if not res or 'Active: active (running)' not in res: + out, err = ssh_client.run_remote_command("service lithops-master status") + if not out or 'Active: active (running)' not in out: self.dismantle() raise LithopsValidationError( f"Lithops master service not active on {self.backend.master}, " @@ -127,9 +171,9 @@ def _validate_master_service_setup(self): def _wait_master_service_ready(self): """ - Waits until the proxy is ready to receive http connections + Waits until the master service is ready to receive http connections """ - logger.info(f'Waiting Lithops service to become ready on {self.backend.master}') + logger.info(f'Waiting for Lithops service to become ready on {self.backend.master}') start = time.time() while (time.time() - start < self.start_timeout): @@ -142,82 +186,22 @@ def _wait_master_service_ready(self): self.dismantle() raise Exception(f'Lithops service readiness probe expired on {self.backend.master}') - def _get_workers_on_master(self): + def _get_workers_on_master(self, worker_instance_type, worker_processes, runtime_name): """ gets the total available workers on the master VM """ workers_on_master = [] try: - if self.is_lithops_worker: - url = f"http://lithops-master:{SA_SERVICE_PORT}/workers" - resp = requests.get(url) - workers_on_master = resp.json() - else: - cmd = (f'curl http://127.0.0.1:{SA_SERVICE_PORT}/workers ' - '-H \'Content-Type: application/json\' -X GET') - resp = self.backend.master.get_ssh_client().run_remote_command(cmd) - 
workers_on_master = json.loads(resp) - except LithopsValidationError as e: - raise e + payload = { + 'worker_instance_type': worker_instance_type, + 'worker_processes': worker_processes, + 'runtime_name': runtime_name + } + workers_on_master = self._make_request('GET', 'worker/get', payload) except Exception: pass return workers_on_master - def _wait_workers_ready(self, new_workers): - """ - Wait a given set of workers to become ready - """ - w_names = [w.name for w in new_workers] - logger.info(f'Waiting following workers to become ready: {w_names}') - - start = time.time() - workers_state_on_master = {} - while (time.time() - start < self.start_timeout * 2): - try: - cmd = (f'curl -X GET http://127.0.0.1:{SA_SERVICE_PORT}/workers-state ' - '-H \'Content-Type: application/json\'') - resp = self.backend.master.get_ssh_client().run_remote_command(cmd) - prev = workers_state_on_master - - workers_state_on_master = json.loads(resp) - - running = 0 - if prev != workers_state_on_master: - - msg = 'All workers states: ' - for w in workers_state_on_master: - w_state = workers_state_on_master[w]["state"] - msg += f'({w} - {w_state})' - if w in w_names and w_state == 'running': - if workers_state_on_master[w].get('err'): - logger.warning(f'Worker may operate not in desired ' - f'configuration, worker {w} error: ' - f'{workers_state_on_master[w].get("err")}') - running += 1 - - logger.info(msg) - - if running == len(w_names): - logger.info(f'All workers are ready: {w_names}') - - # on backend, in case workers failed to get optimal workers setup, they may run - # but in order to notify user they will have running state, but 'err' containing error - for w in workers_state_on_master: - if w in w_names and workers_state_on_master[w]["state"] == 'running' \ - and workers_state_on_master[w].get('err'): - logger.warning(f'Workers may operate not in desired configuration, ' - f'worker {w} error: {workers_state_on_master[w].get("err")}') - return - - except LithopsValidationError as e: - raise e - except Exception as e: - pass - - time.sleep(10) - - raise Exception(f'Lithops workers service readiness probe expired on {self.backend.master}') - def invoke(self, job_payload): """ Run the job description against the selected environment @@ -225,61 +209,83 @@ def invoke(self, job_payload): executor_id = job_payload['executor_id'] job_id = job_payload['job_id'] total_calls = job_payload['total_calls'] - chunksize = job_payload['chunksize'] - total_required_workers = (total_calls // chunksize + (total_calls % chunksize > 0) - if self.exec_mode in [ExecMode.CREATE.value, ExecMode.REUSE.value] else 1) + if self.exec_mode == StandaloneMode.CONSUME: + logger.debug( + f'ExecutorID {executor_id} | JobID {job_id} - Worker processes: ' + f'{job_payload["worker_processes"]}' + ) + else: + worker_instance_type = self.backend.get_worker_instance_type() + worker_processes = self.backend.get_worker_cpu_count() + + job_payload['worker_instance_type'] = worker_instance_type + + if job_payload['worker_processes'] == "AUTO": + job_payload['worker_processes'] = worker_processes + job_payload['config'][self.backend_name]['worker_processes'] = worker_processes + + wp = job_payload['worker_processes'] + max_workers = job_payload['max_workers'] + required_workers = min(max_workers, total_calls // wp + (total_calls % wp > 0)) + + logger.debug( + f'ExecutorID {executor_id} | JobID {job_id} - Instance Type: {worker_instance_type} - Worker ' + f'processes: {job_payload["worker_processes"]} - Required Workers: {required_workers}' + ) def 
create_workers(workers_to_create): current_workers_old = set(self.backend.workers) futures = [] with cf.ThreadPoolExecutor(min(workers_to_create, 48)) as ex: for vm_n in range(workers_to_create): - worker_id = "{:04d}".format(vm_n) - name = f'lithops-worker-{executor_id}-{job_id}-{worker_id}' + worker_id = f"{executor_id}-{job_id}-{vm_n}" + worker_hash = hashlib.sha1(worker_id.encode("utf-8")).hexdigest()[:8] + name = f'lithops-worker-{worker_hash}' futures.append(ex.submit(self.backend.create_worker, name)) for future in cf.as_completed(futures): try: future.result() - except Exception as e: - # if workers policy is strict, raise exception in case failed to create all workers - if self.workers_policy == 'strict': - raise e + except Exception: + pass current_workers_new = set(self.backend.workers) new_workers = current_workers_new - current_workers_old - logger.debug("Total worker VM instances created: {}/{}" - .format(len(new_workers), workers_to_create)) + logger.debug(f"Total worker VM instances created: {len(new_workers)}/{workers_to_create}") return list(new_workers) new_workers = [] - if self.exec_mode == ExecMode.CONSUME.value: - total_workers = total_required_workers + if self.exec_mode == StandaloneMode.CONSUME: + new_workers.append(self.backend.master) + total_workers = 1 - elif self.exec_mode == ExecMode.CREATE.value: - new_workers = create_workers(total_required_workers) + elif self.exec_mode == StandaloneMode.CREATE: + new_workers = create_workers(required_workers) total_workers = len(new_workers) - elif self.exec_mode == ExecMode.REUSE.value: - workers = self._get_workers_on_master() + elif self.exec_mode == StandaloneMode.REUSE: + workers = self._get_workers_on_master( + job_payload['worker_instance_type'], + job_payload['worker_processes'], + job_payload['runtime_name'], + ) total_workers = len(workers) - logger.debug(f"Found {total_workers} free workers " - f"connected to master {self.backend.master}") - if total_workers < total_required_workers: + logger.debug(f"Found {total_workers} free workers connected to {self.backend.master}") + if total_workers < required_workers: # create missing delta of workers - workers_to_create = total_required_workers - total_workers + workers_to_create = required_workers - total_workers logger.debug(f'Going to create {workers_to_create} new workers') new_workers = create_workers(workers_to_create) total_workers += len(new_workers) if total_workers == 0: - raise Exception('It was not possible to create any worker') + raise Exception('It was not possible to create any workers') - logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Going to run {total_calls} ' - f'activations in {min(total_workers, total_required_workers)} workers') + logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Going to run ' + f'{total_calls} activations in {total_workers} workers') logger.debug(f"Checking if {self.backend.master} is ready") if not self._is_master_service_ready(): @@ -288,37 +294,26 @@ def create_workers(workers_to_create): self._validate_master_service_setup() self._wait_master_service_ready() + # delete ssh key + backend = job_payload['config']['lithops']['backend'] + job_payload['config'][backend].pop('ssh_key_filename', None) + + # prepare worker instances data job_payload['worker_instances'] = [ {'name': inst.name, 'private_ip': inst.private_ip, 'instance_id': inst.instance_id, - 'ssh_credentials': inst.ssh_credentials} + 'ssh_credentials': inst.ssh_credentials, + 'instance_type': inst.instance_type} for inst in new_workers ] - # 
delete ssh key - backend = job_payload['config']['lithops']['backend'] - job_payload['config'][backend].pop('ssh_key_filename', None) - - if self.is_lithops_worker: - url = f"http://lithops-master:{SA_SERVICE_PORT}/run-job" - requests.post(url, data=json.dumps(job_payload)) - else: - pl = shlex.quote(json.dumps(job_payload)) - cmd = (f'curl http://127.0.0.1:{SA_SERVICE_PORT}/run-job -d {pl} ' - '-H \'Content-Type: application/json\' -X POST') - self.backend.master.get_ssh_client().run_remote_command(cmd) - # self.backend.master.del_ssh_client() # Client is deleted in clear() - + # invoke Job + self._make_request('POST', 'job/run', job_payload) logger.debug(f'Job invoked on {self.backend.master}') self.jobs.append(job_payload['job_key']) - # in case workers policy is strict, track all required workers create - # in case of 'consume' mode there no new workers created - if self.exec_mode != 'consume' and self.workers_policy == 'strict': - threading.Thread(target=self._wait_workers_ready, args=(new_workers,), daemon=True).start() - def deploy_runtime(self, runtime_name, *args): """ Installs the proxy and extracts the runtime metadata @@ -332,19 +327,8 @@ def deploy_runtime(self, runtime_name, *args): self._wait_master_service_ready() logger.debug('Extracting runtime metadata information') - payload = {'runtime': runtime_name, 'pull_runtime': True} - - if self.is_lithops_worker: - url = f"http://lithops-master:{SA_SERVICE_PORT}/get-metadata" - resp = requests.get(url, data=json.dumps(payload)) - runtime_meta = resp.json() - else: - pl = shlex.quote(json.dumps(payload)) - cmd = (f'curl http://127.0.0.1:{SA_SERVICE_PORT}/get-metadata -d {pl} ' - '-H \'Content-Type: application/json\' -X GET') - out = self.backend.master.get_ssh_client().run_remote_command(cmd) - runtime_meta = json.loads(out) + runtime_meta = self._make_request('GET', 'metadata', payload) return runtime_meta @@ -358,30 +342,41 @@ def clean(self, **kwargs): """ Clan all the backend resources """ + if self.is_initialized(): + try: + self.init() + self._make_request('POST', 'clean') + except Exception: + pass + self.backend.clean(**kwargs) - def clear(self, job_keys=None): + def clear(self, job_keys=None, exception=None): """ Clear all the backend resources. 
clear method is executed after the results are get, when an exception is produced, or when a user press ctrl+c """ try: - if self.is_lithops_worker: - url = f"http://lithops-master:{SA_SERVICE_PORT}/stop" - requests.post(url, data=json.dumps(self.jobs)) - else: - pl = shlex.quote(json.dumps(self.jobs)) - cmd = (f'curl http://127.0.0.1:{SA_SERVICE_PORT}/stop -d {pl} ' - '-H \'Content-Type: application/json\' -X POST') - self.backend.master.get_ssh_client().run_remote_command(cmd) - self.backend.master.del_ssh_client() + self._make_request('POST', 'job/stop', self.jobs) except Exception: pass - if self.exec_mode != ExecMode.REUSE.value: + if self.exec_mode != StandaloneMode.REUSE: self.backend.clear(job_keys) + def list_jobs(self): + """ + Lists jobs in master VM + """ + return self._make_request('GET', 'job/list') + + def list_workers(self): + """ + Lists available workers in master VM + """ + return self._make_request('GET', 'worker/list') + def get_runtime_key(self, runtime_name, runtime_memory, version=__version__): """ Wrapper method that returns a formated string that represents the @@ -408,7 +403,7 @@ def get_backend_type(self): """ Wrapper method that returns the type of the backend (Batch or FaaS) """ - return 'batch' + return BackendType.BATCH.value def _setup_master_service(self): """ @@ -418,34 +413,37 @@ def _setup_master_service(self): ssh_client = self.backend.master.get_ssh_client() - handler_zip = os.path.join(TEMP_DIR, 'lithops_standalone.zip') + handler_zip = os.path.join(TEMP_DIR, f'lithops_standalone_{str(uuid.uuid4())[-6:]}.zip') worker_path = os.path.join(os.path.dirname(__file__), 'worker.py') master_path = os.path.join(os.path.dirname(__file__), 'master.py') - create_handler_zip(handler_zip, [master_path, worker_path]) + runner_path = os.path.join(os.path.dirname(__file__), 'runner.py') + create_handler_zip(handler_zip, [master_path, worker_path, runner_path]) logger.debug(f'Uploading lithops files to {self.backend.master}') ssh_client.upload_local_file(handler_zip, '/tmp/lithops_standalone.zip') os.remove(handler_zip) - vm_data = {'name': self.backend.master.name, - 'instance_id': self.backend.master.get_instance_id(), - 'private_ip': self.backend.master.get_private_ip(), - 'delete_on_dismantle': self.backend.master.delete_on_dismantle, - 'lithops_version': __version__} + master_data = { + 'name': self.backend.master.name, + 'instance_id': self.backend.master.get_instance_id(), + 'private_ip': self.backend.master.get_private_ip(), + 'delete_on_dismantle': self.backend.master.delete_on_dismantle, + 'lithops_version': __version__ + } logger.debug(f'Executing lithops installation process on {self.backend.master}') logger.debug('Be patient, initial installation process may take up to 3 minutes') remote_script = "/tmp/install_lithops.sh" - script = get_master_setup_script(self.config, vm_data) - ssh_client.upload_data_to_file(script, remote_script) - ssh_client.run_remote_command(f"chmod 777 {remote_script}; sudo {remote_script};") + script = get_host_setup_script() + script += get_master_setup_script(self.config, master_data) - try: - # Download the master VM public key generated with the installation script - # This public key will be used to create to worker - ssh_client.download_remote_file( - f'{self.backend.master.home_dir}/.ssh/id_rsa.pub', - f'{self.backend.cache_dir}/{self.backend.master.name}-id_rsa.pub') - except FileNotFoundError: - pass + ssh_client.upload_data_to_file(script, remote_script) + cmd = f"chmod 755 {remote_script}; sudo {remote_script}; rm 
{remote_script}" + ssh_client.run_remote_command(cmd) + + # Download the master VM public key generated with the installation script + # This public key will be used to create the workers + ssh_client.download_remote_file( + f'{self.backend.master.home_dir}/.ssh/lithops_id_rsa.pub', + f'{self.backend.cache_dir}/{self.backend.master.name}-id_rsa.pub') diff --git a/lithops/standalone/utils.py b/lithops/standalone/utils.py index 9849459b0..3009ee456 100644 --- a/lithops/standalone/utils.py +++ b/lithops/standalone/utils.py @@ -1,24 +1,48 @@ +import os import json from enum import Enum from lithops.constants import ( SA_INSTALL_DIR, - SA_LOG_FILE, + SA_SETUP_LOG_FILE, SA_CONFIG_FILE, - SA_DATA_FILE, - SA_TMP_DIR + SA_WORKER_DATA_FILE, + SA_MASTER_DATA_FILE, + SA_WORKER_SERVICE_PORT, + SA_WORKER_LOG_FILE, + SA_SETUP_DONE_FILE ) -class ExecMode(Enum): - """ - Mode of execution - """ +class StandaloneMode(Enum): CONSUME = "consume" CREATE = "create" REUSE = "reuse" +class WorkerStatus(Enum): + STARTING = "starting" + STARTED = "started" + ERROR = "error" + INSTALLING = "installing" + ACTIVE = "active" + IDLE = "idle" + BUSY = "busy" + STOPPED = "stopped" + + +class JobStatus(Enum): + SUBMITTED = "submitted" + PENDING = "pending" + RUNNING = "running" + DONE = 'done' + CANCELED = 'canceled' + + +class LithopsValidationError(Exception): + pass + + MASTER_SERVICE_NAME = 'lithops-master.service' MASTER_SERVICE_FILE = f""" [Unit] @@ -34,14 +58,19 @@ class ExecMode(Enum): """ WORKER_SERVICE_NAME = 'lithops-worker.service' -WORKER_SERVICE_FILE = f""" +WORKER_SERVICE_FILE = """ [Unit] Description=Lithops Worker Service After=network.target +RestartSec=2s +StartLimitBurst=1 +StartLimitIntervalSec=5 [Service] -ExecStart=/usr/bin/python3 {SA_INSTALL_DIR}/worker.py -Restart=always +ExecStartPre={0} +ExecStart={1} +ExecStop={2} +Restart=on-failure [Install] WantedBy=multi-user.target @@ -77,7 +106,7 @@ def get_host_setup_script(docker=True): Returns the script necessary for installing a lithops VM host """ script = f"""#!/bin/bash - mkdir -p {SA_TMP_DIR}; + mkdir -p {SA_INSTALL_DIR}; wait_internet_connection(){{ echo "--> Checking internet connection" @@ -108,27 +137,28 @@ def get_host_setup_script(docker=True): apt-get update; if [ "$INSTALL_DOCKER" = true ] && [ "$DOCKER_REQUIRED" = true ]; then - apt-get install unzip python3-pip docker-ce docker-ce-cli containerd.io -y --fix-missing; + apt-get install unzip redis-server python3-pip docker-ce docker-ce-cli containerd.io -y --fix-missing; else - apt-get install unzip python3-pip -y --fix-missing; + apt-get install unzip redis-server python3-pip -y --fix-missing; fi; + sudo systemctl enable redis-server.service; + sed -i 's/^bind 127.0.0.1 ::1/bind 0.0.0.0/' /etc/redis/redis.conf; + sudo systemctl restart redis-server.service; fi; if [[ ! 
$(pip3 list|grep "lithops") ]]; then wait_internet_connection; echo "--> Installing Lithops python dependencies" - pip3 install -U flask gevent lithops[all]; + pip3 install -U pip flask gevent lithops[all]; fi; }} - install_packages >> {SA_LOG_FILE} 2>&1 - - unzip -o /tmp/lithops_standalone.zip -d {SA_INSTALL_DIR} > /dev/null 2>&1; - rm /tmp/lithops_standalone.zip + install_packages >> {SA_SETUP_LOG_FILE} 2>&1 + touch {SA_SETUP_DONE_FILE}; """ - return script + def docker_login(config): backend = config['backend'] if all(k in config[backend] for k in ("docker_server", "docker_user", "docker_password")): @@ -139,26 +169,20 @@ def docker_login(config): """ return "" + def get_master_setup_script(config, vm_data): """ Returns master VM installation script """ - script = f"""#!/bin/bash - rm -R {SA_INSTALL_DIR}; - mkdir -p {SA_INSTALL_DIR}; - mkdir -p {SA_TMP_DIR}; - + script = docker_login(config) + script += f""" setup_host(){{ - cp /tmp/lithops_standalone.zip {SA_INSTALL_DIR}; - echo '{json.dumps(vm_data)}' > {SA_DATA_FILE}; + unzip -o /tmp/lithops_standalone.zip -d {SA_INSTALL_DIR}; + mv /tmp/lithops_standalone.zip {SA_INSTALL_DIR}; + echo '{json.dumps(vm_data)}' > {SA_MASTER_DATA_FILE}; echo '{json.dumps(config)}' > {SA_CONFIG_FILE}; }} - setup_host >> {SA_LOG_FILE} 2>&1; - """ - script += get_host_setup_script() - - script += docker_login(config) - script += f""" + setup_host >> {SA_SETUP_LOG_FILE} 2>&1; setup_service(){{ echo '{MASTER_SERVICE_FILE}' > /etc/systemd/system/{MASTER_SERVICE_NAME}; chmod 644 /etc/systemd/system/{MASTER_SERVICE_NAME}; @@ -167,59 +191,77 @@ def get_master_setup_script(config, vm_data): systemctl enable {MASTER_SERVICE_NAME}; systemctl start {MASTER_SERVICE_NAME}; }} - setup_service >> {SA_LOG_FILE} 2>&1; - + setup_service >> {SA_SETUP_LOG_FILE} 2>&1; USER_HOME=$(eval echo ~${{SUDO_USER}}); - generate_ssh_key(){{ echo ' StrictHostKeyChecking no UserKnownHostsFile=/dev/null' >> /etc/ssh/ssh_config; - ssh-keygen -f $USER_HOME/.ssh/id_rsa -t rsa -N ''; - chown ${{SUDO_USER}}:${{SUDO_USER}} $USER_HOME/.ssh/id_rsa*; + ssh-keygen -f $USER_HOME/.ssh/lithops_id_rsa -t rsa -N ''; + chown ${{SUDO_USER}}:${{SUDO_USER}} $USER_HOME/.ssh/lithops_id_rsa*; + cp $USER_HOME/.ssh/lithops_id_rsa $USER_HOME/.ssh/id_rsa + cp $USER_HOME/.ssh/lithops_id_rsa.pub $USER_HOME/.ssh/id_rsa.pub cp $USER_HOME/.ssh/* /root/.ssh; echo '127.0.0.1 lithops-master' >> /etc/hosts; + cat $USER_HOME/.ssh/id_rsa.pub >> $USER_HOME/.ssh/authorized_keys; }} - test -f $USER_HOME/.ssh/id_rsa || generate_ssh_key >> {SA_LOG_FILE} 2>&1; + test -f $USER_HOME/.ssh/lithops_id_rsa || generate_ssh_key >> {SA_SETUP_LOG_FILE} 2>&1; + echo 'tail -f -n 100 /tmp/lithops-*/master-service.log'>> $USER_HOME/.bash_history """ - return script + def get_worker_setup_script(config, vm_data): """ Returns worker VM installation script this script is expected to be executed only from Master VM """ - ssh_user = vm_data['ssh_credentials']['username'] - home_dir = '/root' if ssh_user == 'root' else f'/home/{ssh_user}' - try: - master_pub_key = open(f'{home_dir}/.ssh/id_rsa.pub', 'r').read() - except Exception: - master_pub_key = '' - - script = f"""#!/bin/bash - rm -R {SA_INSTALL_DIR}; - mkdir -p {SA_INSTALL_DIR}; - """ - script += get_host_setup_script() - - script += docker_login(config) + if config['runtime'].startswith(('python', '/')): + cmd_pre = cmd_stop = "id" + cmd_start = f"/usr/bin/python3 {SA_INSTALL_DIR}/worker.py" + else: + cmd_pre = '-docker rm -f lithops_worker' + cmd_start = 'docker run --rm --name lithops_worker ' 
+ cmd_start += '--gpus all ' if config["use_gpu"] else '' + cmd_start += f'--user {os.getuid()}:{os.getgid()} ' + cmd_start += f'--env USER={os.getenv("USER", "root")} --env DOCKER=Lithops ' + cmd_start += f'-p {SA_WORKER_SERVICE_PORT}:{SA_WORKER_SERVICE_PORT} ' + cmd_start += f'-v {SA_INSTALL_DIR}:{SA_INSTALL_DIR} -v /tmp:/tmp ' + cmd_start += f'--entrypoint "python3" {config["runtime"]} {SA_INSTALL_DIR}/worker.py' + cmd_stop = '-docker rm -f lithops_worker' + + script = docker_login(config) script += f""" + setup_host(){{ + unzip -o /tmp/lithops_standalone.zip -d {SA_INSTALL_DIR}; + rm /tmp/lithops_standalone.zip; + echo '{json.dumps(vm_data)}' > {SA_WORKER_DATA_FILE}; echo '{json.dumps(config)}' > {SA_CONFIG_FILE}; - echo '{json.dumps(vm_data)}' > {SA_DATA_FILE}; - + }} + setup_host >> {SA_SETUP_LOG_FILE} 2>&1; + USER_HOME=$(eval echo ~${{SUDO_USER}}); setup_service(){{ - systemctl stop {MASTER_SERVICE_NAME}; - echo '{WORKER_SERVICE_FILE}' > /etc/systemd/system/{WORKER_SERVICE_NAME}; + echo '{WORKER_SERVICE_FILE.format(cmd_pre, cmd_start, cmd_stop)}' > /etc/systemd/system/{WORKER_SERVICE_NAME}; chmod 644 /etc/systemd/system/{WORKER_SERVICE_NAME}; systemctl daemon-reload; systemctl stop {WORKER_SERVICE_NAME}; systemctl enable {WORKER_SERVICE_NAME}; systemctl start {WORKER_SERVICE_NAME}; }} - setup_service >> {SA_LOG_FILE} 2>&1 - USER_HOME=$(eval echo ~${{SUDO_USER}}); - echo '{master_pub_key}' >> $USER_HOME/.ssh/authorized_keys; + setup_service >> {SA_SETUP_LOG_FILE} 2>&1 echo '{vm_data['master_ip']} lithops-master' >> /etc/hosts + echo 'tail -f -n 100 {SA_WORKER_LOG_FILE}'>> $USER_HOME/.bash_history """ + if "ssh_credentials" in vm_data: + ssh_user = vm_data['ssh_credentials']['username'] + home_dir = '/root' if ssh_user == 'root' else f'/home/{ssh_user}' + try: + master_pub_key = open(f'{home_dir}/.ssh/lithops_id_rsa.pub', 'r').read() + except Exception: + master_pub_key = '' + script += f""" + if ! grep -qF "{master_pub_key}" "$USER_HOME/.ssh/authorized_keys"; then + echo "{master_pub_key}" >> $USER_HOME/.ssh/authorized_keys; + fi + """ return script diff --git a/lithops/standalone/worker.py b/lithops/standalone/worker.py index b76c7f48f..03cb77faa 100644 --- a/lithops/standalone/worker.py +++ b/lithops/standalone/worker.py @@ -1,5 +1,6 @@ # -# Copyright Cloudlab URV 2020 +# (C) Copyright Cloudlab URV 2020 +# (C) Copyright IBM Corp. 2024 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
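The worker changes in the next hunk replace the old HTTP polling loop against the master (the removed /get-task/<queue> requests) with a Redis-backed work queue: each consumer thread drains the queue with RPOP in create mode (exiting once the queue is empty), blocks on BRPOP in reuse/consume mode, and then executes the task through runner.py as a subprocess. A minimal, self-contained sketch of that consumption pattern follows; the master address and queue name below are illustrative assumptions, since the real worker reads them from its data file (SA_WORKER_DATA_FILE):

    import json
    import redis

    # Illustrative values only; the actual worker loads these from its data file.
    MASTER_IP = "10.0.0.2"
    WORK_QUEUE_NAME = "example-work-queue"
    CREATE_MODE = False

    r = redis.Redis(host=MASTER_IP, decode_responses=True)

    while True:
        if CREATE_MODE:
            # create mode: drain the queue and stop when no tasks are left
            payload_str = r.rpop(WORK_QUEUE_NAME)
            if payload_str is None:
                break
        else:
            # reuse/consume mode: block until the master pushes a new task payload
            _, payload_str = r.brpop(WORK_QUEUE_NAME)
        task = json.loads(payload_str)
        print(f"got CallID {task['call_ids'][0]} of job {task['job_key']}")

Worker and job bookkeeping follow the same idea: each consumer updates a worker:<name> hash and a tasksdone:<job_key> list on the same Redis instance, which is what the notify_* helpers introduced in the hunk below encapsulate.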
@@ -15,164 +16,273 @@ # import os -import logging -import time import json +import redis import flask -import requests +import logging +import signal +import subprocess as sp from pathlib import Path from threading import Thread +from functools import partial from gevent.pywsgi import WSGIServer +from concurrent.futures import ThreadPoolExecutor -from lithops.constants import LITHOPS_TEMP_DIR, SA_LOG_FILE, JOBS_DIR,\ - SA_SERVICE_PORT, SA_CONFIG_FILE, SA_DATA_FILE -from lithops.localhost.localhost import LocalhostHandler -from lithops.utils import verify_runtime_name, setup_lithops_logger +from lithops.utils import setup_lithops_logger from lithops.standalone.keeper import BudgetKeeper +from lithops.standalone.utils import JobStatus, StandaloneMode, WorkerStatus +from lithops.constants import ( + CPU_COUNT, + LITHOPS_TEMP_DIR, + RN_LOG_FILE, + SA_INSTALL_DIR, + SA_WORKER_LOG_FILE, + JOBS_DIR, + LOGS_DIR, + SA_CONFIG_FILE, + SA_WORKER_DATA_FILE, + SA_WORKER_SERVICE_PORT +) + +os.makedirs(LITHOPS_TEMP_DIR, exist_ok=True) +os.makedirs(JOBS_DIR, exist_ok=True) +os.makedirs(LOGS_DIR, exist_ok=True) log_format = "%(asctime)s\t[%(levelname)s] %(name)s:%(lineno)s -- %(message)s" -setup_lithops_logger(logging.DEBUG, filename=SA_LOG_FILE, log_format=log_format) +setup_lithops_logger(logging.DEBUG, filename=SA_WORKER_LOG_FILE, log_format=log_format) logger = logging.getLogger('lithops.standalone.worker') app = flask.Flask(__name__) -stanbdalone_config = None +redis_client = None budget_keeper = None -localhos_handler = None -last_job_key = None + +job_processes = {} +worker_threads = {} +canceled = [] @app.route('/ping', methods=['GET']) def ping(): - bussy = localhos_handler.job_manager if localhos_handler else False - response = flask.jsonify({'status': 'bussy' if bussy else 'free'}) + idle_count = sum(1 for worker in worker_threads.values() if worker['status'] == WorkerStatus.IDLE.value) + busy_count = sum(1 for worker in worker_threads.values() if worker['status'] == WorkerStatus.BUSY.value) + response = flask.jsonify({'busy': busy_count, 'free': idle_count}) response.status_code = 200 return response +@app.route('/ttd', methods=['GET']) +def ttd(): + if budget_keeper: + ttd = budget_keeper.get_time_to_dismantle() + else: + ttd = "Disabled" + return str(ttd), 200 + + @app.route('/stop/', methods=['POST']) def stop(job_key): - if job_key == last_job_key: - logger.debug(f'Received SIGTERM: Stopping job process {job_key}') - localhos_handler.clear() - done = os.path.join(JOBS_DIR, job_key + '.done') - Path(done).touch() + logger.debug(f'Received SIGTERM: Stopping job process {job_key}') + canceled.append(job_key) + + for job_key_call_id in job_processes: + if job_key_call_id.startswith(job_key): + PID = job_processes[job_key_call_id].pid + PGID = os.getpgid(PID) + logger.debug(f"Killing Job {job_key} - PID {PID}") + os.killpg(PGID, signal.SIGKILL) + Path(os.path.join(JOBS_DIR, job_key_call_id + '.done')).touch() + job_processes[job_key_call_id] = None + response = flask.jsonify({'response': 'cancel'}) response.status_code = 200 return response -def wait_job_completed(job_key): - """ - Waits until the current job is completed - """ - global budget_keeper +def notify_worker_active(worker_name): + try: + redis_client.hset(f"worker:{worker_name}", 'status', WorkerStatus.ACTIVE.value) + except Exception as e: + logger.error(e) - done = os.path.join(JOBS_DIR, job_key + '.done') - while True: - if os.path.isfile(done): - os.remove(done) - budget_keeper.jobs[job_key] = 'done' - break - time.sleep(1) +def 
notify_worker_idle(worker_name): + try: + data = {'status': WorkerStatus.IDLE.value, 'runtime': '', 'worker_processes': ''} + redis_client.hset(f"worker:{worker_name}", mapping=data) + except Exception as e: + logger.error(e) -def run_worker(master_ip, work_queue): - """ - Run a job - """ - global budget_keeper - global localhos_handler - global last_job_key - pull_runtime = stanbdalone_config.get('pull_runtime', False) - localhos_handler = LocalhostHandler({'pull_runtime': pull_runtime}) +def notify_worker_stop(worker_name): + try: + redis_client.hset(f"worker:{worker_name}", 'status', WorkerStatus.STOPPED.value) + except Exception as e: + logger.error(e) + + +def notify_worker_delete(worker_name): + try: + redis_client.delete(f"worker:{worker_name}") + except Exception as e: + logger.error(e) + + +def notify_task_start(job_key, call_id): + try: + if redis_client.hget(f"job:{job_key}", 'status') == JobStatus.SUBMITTED.value: + redis_client.hset(f"job:{job_key}", 'status', JobStatus.RUNNING.value) + except Exception as e: + logger.error(e) + + +def notify_task_done(job_key, call_id): + try: + done_tasks = int(redis_client.rpush(f"tasksdone:{job_key}", call_id)) + if int(redis_client.hget(f"job:{job_key}", 'total_tasks')) == done_tasks: + redis_client.hset(f"job:{job_key}", 'status', JobStatus.DONE.value) + except Exception as e: + logger.error(e) + + +def redis_queue_consumer(pid, work_queue_name, exec_mode, backend): + worker_threads[pid]['status'] = WorkerStatus.IDLE.value + + logger.info(f"Redis consumer process {pid} started") while True: - url = f'http://{master_ip}:{SA_SERVICE_PORT}/get-task/{work_queue}' - logger.debug(f'Getting task from {url}') + if exec_mode == StandaloneMode.CREATE.value: + task_payload_str = redis_client.rpop(work_queue_name) + if task_payload_str is None: + break + else: + key, task_payload_str = redis_client.brpop(work_queue_name) - try: - resp = requests.get(url) - except Exception: - time.sleep(1) - continue - - if resp.status_code != 200: - if stanbdalone_config.get('exec_mode') == 'reuse': - time.sleep(1) - continue - else: - logger.debug(f'All tasks completed from {url}') - return + worker_threads[pid]['status'] = WorkerStatus.BUSY.value - job_payload = resp.json() + task_payload = json.loads(task_payload_str) + + executor_id = task_payload['executor_id'] + job_id = task_payload['job_id'] + job_key = task_payload['job_key'] + call_id = task_payload['call_ids'][0] + job_key_call_id = f'{job_key}-{call_id}' try: - runtime = job_payload['runtime_name'] - verify_runtime_name(runtime) - except Exception: - return + logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Running ' + f'CallID {call_id} in the local worker (consumer {pid})') + notify_task_start(job_key, call_id) - job_key = job_payload['job_key'] - last_job_key = job_key + if budget_keeper: + budget_keeper.add_job(job_key_call_id) - budget_keeper.last_usage_time = time.time() - budget_keeper.update_config(job_payload['config']['standalone']) - budget_keeper.jobs[job_key] = 'running' + task_filename = os.path.join(JOBS_DIR, f'{job_key_call_id}.task') - try: - localhos_handler.invoke(job_payload) + with open(task_filename, 'w') as jl: + json.dump(task_payload, jl, default=str) + + cmd = ["python3", f"{SA_INSTALL_DIR}/runner.py", backend, task_filename] + log = open(RN_LOG_FILE, 'a') + process = sp.Popen(cmd, stdout=log, stderr=log, start_new_session=True) + job_processes[job_key_call_id] = process + process.communicate() # blocks until the process finishes + del job_processes[job_key_call_id] + 
+ if os.path.exists(task_filename): + os.remove(task_filename) + + Path(os.path.join(JOBS_DIR, f'{job_key_call_id}.done')).touch() + + msg = f'ExecutorID {executor_id} | JobID {job_id} - ' + if job_key in canceled: + msg += f'CallID {call_id} execution canceled' + else: + notify_task_done(job_key, call_id) + msg += f'CallID {call_id} execution finished' + logger.debug(msg) except Exception as e: logger.error(e) - wait_job_completed(job_key) + worker_threads[pid]['status'] = WorkerStatus.IDLE.value + logger.info(f"Redis consumer process {pid} finished") -def main(): - global stanbdalone_config + +def run_worker(): + global redis_client global budget_keeper os.makedirs(LITHOPS_TEMP_DIR, exist_ok=True) # read the Lithops standaole configuration file with open(SA_CONFIG_FILE, 'r') as cf: - stanbdalone_config = json.load(cf) + standalone_config = json.load(cf) # Read the VM data file that contains the instance id, the master IP, # and the queue for getting tasks - with open(SA_DATA_FILE, 'r') as ad: - vm_data = json.load(ad) - worker_ip = vm_data['private_ip'] - master_ip = vm_data['master_ip'] - work_queue = vm_data['work_queue'] + with open(SA_WORKER_DATA_FILE, 'r') as ad: + worker_data = json.load(ad) + + # Start the redis client + redis_client = redis.Redis(host=worker_data['master_ip'], decode_responses=True) + + # Set the worker as Active + notify_worker_active(worker_data['name']) # Start the budget keeper. It is responsible to automatically terminate the # worker after X seconds - budget_keeper = BudgetKeeper(stanbdalone_config) - budget_keeper.start() + if worker_data['master_ip'] != worker_data['private_ip']: + stop_callback = partial(notify_worker_stop, worker_data['name']) + delete_callback = partial(notify_worker_delete, worker_data['name']) + budget_keeper = BudgetKeeper(standalone_config, worker_data, stop_callback, delete_callback) + budget_keeper.start() # Start the http server. This will be used by the master VM to pìng this # worker and for canceling tasks def run_wsgi(): - server = WSGIServer((worker_ip, SA_SERVICE_PORT), app, log=app.logger) + ip_address = "0.0.0.0" if os.getenv("DOCKER") == "Lithops" else worker_data['private_ip'] + server = WSGIServer((ip_address, SA_WORKER_SERVICE_PORT), app, log=app.logger) server.serve_forever() Thread(target=run_wsgi, daemon=True).start() - # Start the worker that will get tasks from the work queue - run_worker(master_ip, work_queue) - - # run_worker will run forever in reuse mode. 
In create mode it will + # Start the consumer threads + worker_processes = standalone_config[standalone_config['backend']]['worker_processes'] + worker_processes = CPU_COUNT if worker_processes == 'AUTO' else worker_processes + logger.info(f"Starting Worker - Instance type: {worker_data['instance_type']} - Runtime " + f"name: {standalone_config['runtime']} - Worker processes: {worker_processes}") + + # Create a ThreadPoolExecutor for cosnuming tasks + redis_queue_consumer_futures = [] + with ThreadPoolExecutor(max_workers=worker_processes) as executor: + for i in range(worker_processes): + worker_threads[i] = {} + future = executor.submit( + redis_queue_consumer, i, + worker_data['work_queue_name'], + standalone_config['exec_mode'], + standalone_config['backend'] + ) + redis_queue_consumer_futures.append(future) + worker_threads[i]['future'] = future + + for future in redis_queue_consumer_futures: + future.result() + + # Set the worker as idle + if standalone_config['exec_mode'] == StandaloneMode.CONSUME.value: + notify_worker_idle(worker_data['name']) + + # run_worker will run forever in reuse mode. In create and consume mode it will # run until there are no more tasks in the queue. - logger.debug('Finished') + logger.debug('Worker service finished') try: # Try to stop the current worker VM once no more pending tasks to run # in case of create mode - budget_keeper.vm.stop() + budget_keeper.stop_instance() except Exception: pass if __name__ == '__main__': - main() + run_worker() diff --git a/lithops/storage/__init__.py b/lithops/storage/__init__.py index 604926a61..c1568ee33 100644 --- a/lithops/storage/__init__.py +++ b/lithops/storage/__init__.py @@ -1,2 +1,7 @@ from .storage import InternalStorage from .storage import Storage + +__all__ = [ + 'InternalStorage', + 'Storage' +] diff --git a/lithops/storage/backends/aliyun_oss/__init__.py b/lithops/storage/backends/aliyun_oss/__init__.py index 00aca61d6..431cf073c 100644 --- a/lithops/storage/backends/aliyun_oss/__init__.py +++ b/lithops/storage/backends/aliyun_oss/__init__.py @@ -1 +1,3 @@ from .aliyun_oss import AliyunObjectStorageServiceBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/aliyun_oss/aliyun_oss.py b/lithops/storage/backends/aliyun_oss/aliyun_oss.py index a27d78d17..584e413cc 100644 --- a/lithops/storage/backends/aliyun_oss/aliyun_oss.py +++ b/lithops/storage/backends/aliyun_oss/aliyun_oss.py @@ -59,6 +59,15 @@ def _connect_bucket(self, bucket_name): def get_client(self): return self + def generate_bucket_name(self): + """ + Generates a unique bucket name + """ + key = self.config['access_key_id'] + self.config['storage_bucket'] = f'lithops-{self.region}-{key[:6].lower()}' + + return self.config['storage_bucket'] + def create_bucket(self, bucket_name): """ Create a bucket if it doesn't exist @@ -119,7 +128,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): except (oss2.exceptions.NoSuchKey, oss2.exceptions.NoSuchBucket): raise StorageNoSuchKeyError(bucket_name, key) - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file :param file_name: File to upload @@ -140,7 +149,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file :param 
bucket: Bucket to download from diff --git a/lithops/storage/backends/aliyun_oss/config.py b/lithops/storage/backends/aliyun_oss/config.py index 55b407499..73720aa2f 100644 --- a/lithops/storage/backends/aliyun_oss/config.py +++ b/lithops/storage/backends/aliyun_oss/config.py @@ -15,7 +15,6 @@ # import copy -import hashlib CONNECTION_POOL_SIZE = 300 @@ -48,9 +47,3 @@ def load_config(config_data=None): region = config_data['aliyun_oss']['region'] config_data['aliyun_oss']['public_endpoint'] = PUBLIC_ENDPOINT.format(region) config_data['aliyun_oss']['internal_endpoint'] = INTERNAL_ENDPOINT.format(region) - - if 'storage_bucket' not in config_data['aliyun_oss']: - ossc = config_data['aliyun_oss'] - key = ossc['access_key_id'] - endpoint = hashlib.sha1(ossc['public_endpoint'].encode()).hexdigest()[:6] - config_data['aliyun_oss']['storage_bucket'] = f'lithops-{endpoint}-{key[:6].lower()}' diff --git a/lithops/storage/backends/aws_s3/__init__.py b/lithops/storage/backends/aws_s3/__init__.py index 598afb5ec..0284a4c91 100644 --- a/lithops/storage/backends/aws_s3/__init__.py +++ b/lithops/storage/backends/aws_s3/__init__.py @@ -1,17 +1,3 @@ -# -# Copyright Cloudlab URV 2020 -# -# Licensed under the Apache License, Version 2.0 (the 'License'); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an 'AS IS' BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - from .aws_s3 import S3Backend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/aws_s3/aws_s3.py b/lithops/storage/backends/aws_s3/aws_s3.py index 2fcadbbe6..007e76384 100644 --- a/lithops/storage/backends/aws_s3/aws_s3.py +++ b/lithops/storage/backends/aws_s3/aws_s3.py @@ -17,7 +17,6 @@ import os import logging import boto3 -from botocore import UNSIGNED from botocore.config import Config import botocore @@ -34,46 +33,49 @@ class S3Backend: def __init__(self, s3_config): - logger.debug("Creating S3 client") + logger.debug("Creating AWS S3 Client") self.config = s3_config self.user_agent = s3_config['user_agent'] - self.region_name = s3_config.get('region') - self.access_key_id = s3_config.get('access_key_id') - self.secret_access_key = s3_config.get('secret_access_key') - self.session_token = s3_config.get('session_token') + self.region = s3_config.get('region') - if self.access_key_id and self.secret_access_key: - client_config = Config( - max_pool_connections=128, - user_agent_extra=self.user_agent, - connect_timeout=CONN_READ_TIMEOUT, - read_timeout=CONN_READ_TIMEOUT, - retries={'max_attempts': OBJ_REQ_RETRIES} - ) - self.s3_client = boto3.client( - 's3', aws_access_key_id=self.access_key_id, - aws_secret_access_key=self.secret_access_key, - aws_session_token=self.session_token, - config=client_config, - region_name=self.region_name - ) - else: - client_config = Config( - signature_version=UNSIGNED, - user_agent_extra=self.user_agent - ) - self.s3_client = boto3.client('s3', config=client_config) + self.aws_session = boto3.Session( + aws_access_key_id=s3_config.get('access_key_id'), + aws_secret_access_key=s3_config.get('secret_access_key'), + aws_session_token=s3_config.get('session_token'), + region_name=self.region + ) + + 
s3_client_config = Config( + max_pool_connections=128, + user_agent_extra=self.user_agent, + connect_timeout=CONN_READ_TIMEOUT, + read_timeout=CONN_READ_TIMEOUT, + retries={'max_attempts': OBJ_REQ_RETRIES} + ) + + self.s3_client = self.aws_session.client('s3', config=s3_client_config) msg = STORAGE_CLI_MSG.format('S3') - logger.info(f"{msg} - Region: {self.region_name}") + logger.info(f"{msg} - Region: {self.region}") def get_client(self): - ''' + """ Get boto3 client. :return: boto3 client - ''' + """ return self.s3_client + def generate_bucket_name(self): + """ + Generates a unique bucket name + """ + sts_client = self.aws_session.client('sts') + caller_id = sts_client.get_caller_identity() + user_key = caller_id["UserId"].split(":")[0][-4:].lower() + self.config['storage_bucket'] = f'lithops-{self.region}-{user_key}' + + return self.config['storage_bucket'] + def create_bucket(self, bucket_name): """ Create a bucket if it doesn't exist @@ -84,19 +86,26 @@ def create_bucket(self, bucket_name): if e.response['ResponseMetadata']['HTTPStatusCode'] == 404: logger.debug(f"Could not find the bucket {bucket_name} in the AWS S3 storage backend") logger.debug(f"Creating new bucket {bucket_name} in the AWS S3 storage backend") - bucket_config = {'LocationConstraint': self.region_name} - self.s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=bucket_config) + bucket_config = {'LocationConstraint': self.region} + try: + self.s3_client.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=bucket_config) + except botocore.exceptions.ClientError as ce: + error_code = ce.response.get('Error', {}).get('Code', 'Unknown') + if error_code == "InvalidLocationConstraint" and self.region == "us-east-1": + self.s3_client.create_bucket(Bucket=bucket_name) + else: + raise ce else: raise e def put_object(self, bucket_name, key, data): - ''' + """ Put an object in COS. Override the object if the key already exists. :param key: key of the object. :param data: data of the object :type data: str/bytes :return: None - ''' + """ try: res = self.s3_client.put_object(Bucket=bucket_name, Key=key, Body=data) status = 'OK' if res['ResponseMetadata']['HTTPStatusCode'] == 200 else 'Error' @@ -111,12 +120,12 @@ def put_object(self, bucket_name, key, data): raise e def get_object(self, bucket_name, key, stream=False, extra_get_args={}): - ''' + """ Get object from COS with a key. Throws StorageNoSuchKeyError if the given key does not exist. 
:param key: key of the object :return: Data of the object :rtype: str/bytes - ''' + """ try: r = self.s3_client.get_object(Bucket=bucket_name, Key=key, **extra_get_args) if stream: @@ -130,7 +139,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): else: raise e - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file to an S3 bucket :param file_name: File to upload @@ -142,15 +151,16 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): if key is None: key = os.path.basename(file_name) - # Upload the file + kwargs = {'ExtraArgs': extra_args} if extra_args else {} + kwargs.update({'Config': config} if config else {}) try: - self.s3_client.upload_file(file_name, bucket, key, ExtraArgs=extra_args) + self.s3_client.upload_file(Filename=file_name, Bucket=bucket, Key=key, **kwargs) except botocore.exceptions.ClientError as e: logging.error(e) return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file from an S3 bucket :param bucket: Bucket to download from @@ -162,21 +172,22 @@ def download_file(self, bucket, key, file_name=None, extra_args={}): if file_name is None: file_name = key - # Download the file + kwargs = {'ExtraArgs': extra_args} if extra_args else {} + kwargs.update({'Config': config} if config else {}) try: - self.s3_client.download_file(bucket, key, file_name, ExtraArgs=extra_args) + self.s3_client.download_file(Bucket=bucket, Key=key, Filename=file_name, **kwargs) except botocore.exceptions.ClientError as e: logging.error(e) return False return True def head_object(self, bucket_name, key): - ''' + """ Head object from COS with a key. Throws StorageNoSuchKeyError if the given key does not exist. :param key: key of the object :return: Data of the object :rtype: str/bytes - ''' + """ try: metadata = self.s3_client.head_object(Bucket=bucket_name, Key=key) return metadata['ResponseMetadata']['HTTPHeaders'] @@ -187,19 +198,19 @@ def head_object(self, bucket_name, key): raise e def delete_object(self, bucket_name, key): - ''' + """ Delete an object from storage. :param bucket: bucket name :param key: data key - ''' + """ return self.s3_client.delete_object(Bucket=bucket_name, Key=key) def delete_objects(self, bucket_name, key_list): - ''' + """ Delete a list of objects from storage. :param bucket: bucket name :param key_list: list of keys - ''' + """ result = [] max_keys_num = 1000 for i in range(0, len(key_list), max_keys_num): @@ -209,12 +220,12 @@ def delete_objects(self, bucket_name, key_list): return result def head_bucket(self, bucket_name): - ''' + """ Head bucket from COS with a name. Throws StorageNoSuchKeyError if the given bucket does not exist. :param bucket_name: name of the bucket :return: Metadata of the bucket :rtype: str/bytes - ''' + """ try: return self.s3_client.head_bucket(Bucket=bucket_name) except botocore.exceptions.ClientError as e: @@ -224,13 +235,13 @@ def head_bucket(self, bucket_name): raise e def list_objects(self, bucket_name, prefix=None, match_pattern=None): - ''' + """ Return a list of objects for the given bucket and prefix. :param bucket_name: Name of the bucket. :param prefix: Prefix to filter object names. :return: List of objects in bucket that match the given prefix. 
:rtype: list of str - ''' + """ try: prefix = '' if prefix is None else prefix paginator = self.s3_client.get_paginator('list_objects_v2') @@ -251,13 +262,13 @@ def list_objects(self, bucket_name, prefix=None, match_pattern=None): raise e def list_keys(self, bucket_name, prefix=None): - ''' + """ Return a list of keys for the given prefix. :param bucket_name: Name of the bucket. :param prefix: Prefix to filter object names. :return: List of keys in bucket that match the given prefix. :rtype: list of str - ''' + """ try: prefix = '' if prefix is None else prefix paginator = self.s3_client.get_paginator('list_objects_v2') diff --git a/lithops/storage/backends/aws_s3/config.py b/lithops/storage/backends/aws_s3/config.py index 60585a827..f72b9750a 100644 --- a/lithops/storage/backends/aws_s3/config.py +++ b/lithops/storage/backends/aws_s3/config.py @@ -22,9 +22,6 @@ def load_config(config_data): if 'aws' in config_data: - if not {'access_key_id', 'secret_access_key'}.issubset(set(config_data['aws'])): - raise Exception("'access_key_id' and 'secret_access_key' are mandatory under the 'aws' section of the configuration") - if 'aws_s3' not in config_data: config_data['aws_s3'] = {} @@ -37,8 +34,3 @@ def load_config(config_data): if 'region' not in config_data['aws_s3']: raise Exception("'region' is mandatory under 'aws_s3' or 'aws' section of the configuration") - - if 'storage_bucket' not in config_data['aws_s3']: - key = config_data['aws_s3']['access_key_id'] - region = config_data['aws_s3']['region'] - config_data['aws_s3']['storage_bucket'] = f'lithops-{region}-{key[:6].lower()}' diff --git a/lithops/storage/backends/azure_storage/__init__.py b/lithops/storage/backends/azure_storage/__init__.py index 2b0270ab6..d13e7e448 100644 --- a/lithops/storage/backends/azure_storage/__init__.py +++ b/lithops/storage/backends/azure_storage/__init__.py @@ -1 +1,3 @@ from .azure_storage import AzureBlobStorageBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/azure_storage/azure_storage.py b/lithops/storage/backends/azure_storage/azure_storage.py index c698306a1..123be7db7 100644 --- a/lithops/storage/backends/azure_storage/azure_storage.py +++ b/lithops/storage/backends/azure_storage/azure_storage.py @@ -15,6 +15,7 @@ # import os +import hashlib import shutil import logging from io import BytesIO @@ -30,10 +31,14 @@ class AzureBlobStorageBackend: def __init__(self, azure_blob_config): logger.debug("Creating Azure Blob Storage client") + self.config = azure_blob_config self.storage_account_name = azure_blob_config['storage_account_name'] self.blob_service_url = 'https://{}.blob.core.windows.net'.format(self.storage_account_name) - self.blob_client = BlobServiceClient(account_url=self.blob_service_url, - credential=azure_blob_config['storage_account_key']) + + self.blob_client = BlobServiceClient( + account_url=self.blob_service_url, + credential=azure_blob_config['storage_account_key'] + ) msg = STORAGE_CLI_MSG.format('Azure Blob') logger.info("{}".format(msg)) @@ -46,6 +51,16 @@ def get_client(self): """ return self.blob_client + def generate_bucket_name(self): + """ + Generates a unique bucket name + """ + key = self.config['storage_account_key'] + account = hashlib.sha1(self.config['storage_account_name'].encode()).hexdigest()[:6] + self.config['storage_bucket'] = f'lithops-{account}-{key[:6].lower()}' + + return self.config['storage_bucket'] + def create_bucket(self, bucket_name): """ Create a bucket if it doesn't exist @@ -95,7 +110,7 @@ def 
get_object(self, bucket_name, key, stream=False, extra_get_args={}): except ResourceNotFoundError: raise StorageNoSuchKeyError(bucket_name, key) - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file :param file_name: File to upload @@ -116,7 +131,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file :param bucket: Bucket to download from diff --git a/lithops/storage/backends/azure_storage/config.py b/lithops/storage/backends/azure_storage/config.py index 62cff7761..abb1ca949 100644 --- a/lithops/storage/backends/azure_storage/config.py +++ b/lithops/storage/backends/azure_storage/config.py @@ -14,8 +14,6 @@ # limitations under the License. # -import hashlib - REQ_PARAMS = ('storage_account_name', 'storage_account_key') @@ -29,9 +27,3 @@ def load_config(config_data=None): if param not in config_data['azure_storage']: msg = f"'{param}' is mandatory under 'azure_storage' section of the configuration" raise Exception(msg) - - if 'storage_bucket' not in config_data['azure_storage']: - azsc = config_data['azure_storage'] - key = azsc['storage_account_key'] - account = hashlib.sha1(azsc['storage_account_name'].encode()).hexdigest()[:6] - config_data['azure_storage']['storage_bucket'] = f'lithops-{account}-{key[:6].lower()}' diff --git a/lithops/storage/backends/ceph/__init__.py b/lithops/storage/backends/ceph/__init__.py index afe6f5ec7..baffdb9ae 100644 --- a/lithops/storage/backends/ceph/__init__.py +++ b/lithops/storage/backends/ceph/__init__.py @@ -1 +1,3 @@ from .ceph import CephStorageBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/ceph/ceph.py b/lithops/storage/backends/ceph/ceph.py index c58edce49..37691f029 100644 --- a/lithops/storage/backends/ceph/ceph.py +++ b/lithops/storage/backends/ceph/ceph.py @@ -17,6 +17,7 @@ import os import logging import boto3 +import hashlib import botocore from lithops.storage.utils import StorageNoSuchKeyError from lithops.utils import sizeof_fmt @@ -37,9 +38,9 @@ def __init__(self, ceph_config): logger.debug("Creating Ceph client") self.config = ceph_config user_agent = ceph_config['user_agent'] - service_endpoint = ceph_config['endpoint'] + self.service_endpoint = ceph_config['endpoint'] - logger.debug(f"Setting Ceph endpoint to {service_endpoint}") + logger.debug(f"Setting Ceph endpoint to {self.service_endpoint}") client_config = botocore.client.Config( max_pool_connections=128, @@ -48,17 +49,19 @@ def __init__(self, ceph_config): read_timeout=CONN_READ_TIMEOUT, retries={'max_attempts': OBJ_REQ_RETRIES} ) + if 'region' in ceph_config: + client_config.region_name = ceph_config['region'] self.s3_client = boto3.client( 's3', aws_access_key_id=ceph_config['access_key_id'], aws_secret_access_key=ceph_config['secret_access_key'], aws_session_token=ceph_config.get('session_token'), config=client_config, - endpoint_url=service_endpoint + endpoint_url=self.service_endpoint ) msg = STORAGE_CLI_MSG.format('Ceph') - logger.info(f"{msg} - Endpoint: {service_endpoint}") + logger.info(f"{msg} - Endpoint: {self.service_endpoint}") def get_client(self): """ @@ -67,6 +70,16 @@ def get_client(self): """ return self.s3_client + def generate_bucket_name(self): + """ + Generates a unique bucket name + 
""" + key = self.config['access_key_id'] + endpoint = hashlib.sha1(self.service_endpoint.encode()).hexdigest()[:6] + self.config['storage_bucket'] = f'lithops-{endpoint}-{key[:6].lower()}' + + return self.config['storage_bucket'] + def create_bucket(self, bucket_name): """ Create a bucket if it doesn't exist @@ -139,7 +152,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): retries += 1 return data - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file to an S3 bucket :param file_name: File to upload @@ -151,15 +164,16 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): if key is None: key = os.path.basename(file_name) - # Upload the file + kwargs = {'ExtraArgs': extra_args} if extra_args else {} + kwargs.update({'Config': config} if config else {}) try: - self.s3_client.upload_file(file_name, bucket, key, ExtraArgs=extra_args) + self.s3_client.upload_file(Filename=file_name, Bucket=bucket, Key=key, **kwargs) except botocore.exceptions.ClientError as e: logging.error(e) return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file from an S3 bucket :param bucket: Bucket to download from @@ -171,9 +185,10 @@ def download_file(self, bucket, key, file_name=None, extra_args={}): if file_name is None: file_name = key - # Download the file + kwargs = {'ExtraArgs': extra_args} if extra_args else {} + kwargs.update({'Config': config} if config else {}) try: - self.s3_client.download_file(bucket, key, file_name, ExtraArgs=extra_args) + self.s3_client.download_file(Bucket=bucket, Key=key, Filename=file_name, **kwargs) except botocore.exceptions.ClientError as e: logging.error(e) return False diff --git a/lithops/storage/backends/ceph/config.py b/lithops/storage/backends/ceph/config.py index 810ddc8c1..36bf6a48d 100644 --- a/lithops/storage/backends/ceph/config.py +++ b/lithops/storage/backends/ceph/config.py @@ -15,9 +15,6 @@ # -import hashlib - - REQ_PARAMS = ('endpoint', 'secret_access_key', 'access_key_id') @@ -32,8 +29,3 @@ def load_config(config_data): if not config_data['ceph']['endpoint'].startswith('http'): raise Exception('Ceph endpoint must start with http:// or https://') - - if 'storage_bucket' not in config_data['ceph']: - key = config_data['ceph']['access_key_id'] - endpoint = hashlib.sha1(config_data['ceph']['endpoint'].encode()).hexdigest()[:6] - config_data['ceph']['storage_bucket'] = f'lithops-{endpoint}-{key[:6].lower()}' diff --git a/lithops/storage/backends/gcp_storage/__init__.py b/lithops/storage/backends/gcp_storage/__init__.py index 4f69eedbb..8c6290057 100644 --- a/lithops/storage/backends/gcp_storage/__init__.py +++ b/lithops/storage/backends/gcp_storage/__init__.py @@ -1 +1,3 @@ from .gcp_storage import GCPStorageBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/gcp_storage/config.py b/lithops/storage/backends/gcp_storage/config.py index 432e51451..ad8453056 100644 --- a/lithops/storage/backends/gcp_storage/config.py +++ b/lithops/storage/backends/gcp_storage/config.py @@ -15,7 +15,6 @@ # import copy -import hashlib import os @@ -39,9 +38,3 @@ def load_config(config_data=None): if 'region' not in config_data['gcp_storage']: raise Exception("'region' parameter is mandatory under 'gcp_storage' or 'gcp' section of the configuration") - - if 
'storage_bucket' not in config_data['gcp_storage']: - gcps = config_data['gcp_storage'] - region = gcps['region'] - key = hashlib.sha1(gcps['credentials_path'].encode()).hexdigest()[:6] - config_data['gcp_storage']['storage_bucket'] = f'lithops-{region}-{key[:6].lower()}' diff --git a/lithops/storage/backends/gcp_storage/gcp_storage.py b/lithops/storage/backends/gcp_storage/gcp_storage.py index 638be1cf7..086995b72 100644 --- a/lithops/storage/backends/gcp_storage/gcp_storage.py +++ b/lithops/storage/backends/gcp_storage/gcp_storage.py @@ -18,12 +18,12 @@ import os import shutil import time +import hashlib import logging from requests.exceptions import SSLError as TooManyConnectionsError from io import BytesIO from google.api_core import exceptions as google_exceptions from google.cloud import storage -from google.cloud.exceptions import NotFound from lithops.constants import STORAGE_CLI_MSG from lithops.storage.utils import StorageNoSuchKeyError @@ -31,10 +31,12 @@ TIMEOUT = 5 + class GCPStorageBackend: def __init__(self, gcp_storage_config): logger.debug("Creating GCP Storage client") + self.config = gcp_storage_config self.credentials_path = gcp_storage_config.get('credentials_path') self.region = gcp_storage_config['region'] @@ -51,6 +53,22 @@ def __init__(self, gcp_storage_config): def get_client(self): return self.client + def generate_bucket_name(self): + """ + Generates a unique bucket name + """ + key = hashlib.sha1(self.credentials_path.encode()).hexdigest()[:6] + self.config['storage_bucket'] = f'lithops-{self.region}-{key[:6].lower()}' + + return self.config['storage_bucket'] + + def exists_bucket(self, bucket_name): + try: + self.client.get_bucket(bucket_name, timeout=TIMEOUT) + return True + except google_exceptions.NotFound: + return False + def create_bucket(self, bucket_name): """ Create a bucket if it doesn't exist @@ -58,9 +76,12 @@ def create_bucket(self, bucket_name): try: bucket = self.client.bucket(bucket_name) bucket.storage_class = "STANDARD" - self.client.create_bucket(bucket, location=self.region) - except google_exceptions.Conflict: - pass + if not self.exists_bucket(bucket_name): + logger.debug(f"Could not find the bucket {bucket_name} in the GCP storage backend") + logger.debug(f"Creating new bucket {bucket_name} in the GCP storage backend") + self.client.create_bucket(bucket, location=self.region) + except google_exceptions.Forbidden: + raise StorageNoSuchKeyError(bucket_name, '') def put_object(self, bucket_name, key, data): done = False @@ -102,9 +123,9 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): stream.seek(0) # Retrun to the initial buffer position return stream else: - return blob.download_as_string(start=start, end=end) + return blob.download_as_bytes(start=start, end=end) - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file :param file_name: File to upload @@ -125,7 +146,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file :param bucket: Bucket to download from diff --git a/lithops/storage/backends/ibm_cos/__init__.py b/lithops/storage/backends/ibm_cos/__init__.py index 8ec5846cc..a4ff4de1d 100644 --- a/lithops/storage/backends/ibm_cos/__init__.py +++ 
b/lithops/storage/backends/ibm_cos/__init__.py @@ -1 +1,3 @@ from .ibm_cos import IBMCloudObjectStorageBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/ibm_cos/config.py b/lithops/storage/backends/ibm_cos/config.py index 1a65242cb..2a03f0940 100644 --- a/lithops/storage/backends/ibm_cos/config.py +++ b/lithops/storage/backends/ibm_cos/config.py @@ -65,7 +65,8 @@ def load_config(config_data): if 'private_endpoint' not in config_data['ibm_cos']: raise Exception('You must provide the private_endpoint to access to IBM COS') elif 'direct' not in config_data['ibm_cos']['private_endpoint']: - raise Exception('The private_endpoint you provided to access to IBM COS is not valid. You must use the "direct" endpoint') + raise Exception('The private_endpoint you provided to access to IBM COS is not valid. You must use the ' + '"direct" endpoint') if not config_data['ibm_cos']['private_endpoint'].startswith('http'): raise Exception('IBM COS Private Endpoint must start with http:// or https://') @@ -79,17 +80,7 @@ def load_config(config_data): endpoint = config_data['ibm_cos']['endpoint'] config_data['ibm_cos']['region'] = endpoint.split('//')[1].split('.')[1] - if 'access_key' in config_data['ibm_cos']: - config_data['ibm_cos']['access_key_id'] = config_data['ibm_cos'].pop('access_key') - if 'secret_key' in config_data['ibm_cos']: - config_data['ibm_cos']['secret_access_key'] = config_data['ibm_cos'].pop('secret_key') - - if 'storage_bucket' not in config_data['ibm_cos']: - if not {'access_key_id', 'secret_access_key'}.issubset(config_data['ibm_cos']): - msg = "'storage_bucket' parameter not found in config. " - msg += "You must provide HMAC Credentials if you want the bucket to be automatically created" - raise Exception(msg) - cosc = config_data['ibm_cos'] - key = cosc.get('access_key_id') or cosc.get('api_key') or cosc.get('iam_api_key') - region = config_data['ibm_cos']['region'] - config_data['ibm_cos']['storage_bucket'] = f'lithops-{region}-{key[:6].lower()}' + if ('service_instance_id' not in config_data['ibm_cos'] + and {'access_key_id', 'secret_access_key'}.isdisjoint(config_data['ibm_cos'])): + raise Exception('You must provide the service_instance_id or the (access_key_id,secret_access_key) to ' + 'access to IBM COS') diff --git a/lithops/storage/backends/ibm_cos/ibm_cos.py b/lithops/storage/backends/ibm_cos/ibm_cos.py index a76c07173..9a41a0b0f 100644 --- a/lithops/storage/backends/ibm_cos/ibm_cos.py +++ b/lithops/storage/backends/ibm_cos/ibm_cos.py @@ -103,7 +103,8 @@ def __init__(self, ibm_cos_config): aws_secret_access_key="", aws_session_token=token, config=client_config, - endpoint_url=service_endpoint + endpoint_url=service_endpoint, + ibm_service_instance_id=self.config.get('service_instance_id') ) msg = STORAGE_CLI_MSG.format('IBM COS') @@ -116,6 +117,19 @@ def get_client(self): """ return self.cos_client + def generate_bucket_name(self): + """ + Generates a unique bucket name + """ + if not {'access_key_id', 'secret_access_key'}.issubset(self.config): + msg = "'storage_bucket' parameter not found in config. 
You must provide HMAC " + msg += "Credentials if you want the bucket to be automatically created" + raise Exception(msg) + key = self.config.get('access_key_id') or self.api_key or self.iam_api_key + self.config['storage_bucket'] = f"lithops-{self.region}-{key[:6].lower()}" + + return self.config['storage_bucket'] + def create_bucket(self, bucket_name): """ Create a bucket if it doesn't exist @@ -188,7 +202,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): retries += 1 return data - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file to an S3 bucket :param file_name: File to upload @@ -200,15 +214,16 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): if key is None: key = os.path.basename(file_name) - # Upload the file + kwargs = {'ExtraArgs': extra_args} if extra_args else {} + kwargs.update({'Config': config} if config else {}) try: - self.cos_client.upload_file(file_name, bucket, key, ExtraArgs=extra_args) + self.cos_client.upload_file(Filename=file_name, Bucket=bucket, Key=key, **kwargs) except ibm_botocore.exceptions.ClientError as e: logging.error(e) return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file from an S3 bucket :param bucket: Bucket to download from @@ -220,9 +235,10 @@ def download_file(self, bucket, key, file_name=None, extra_args={}): if file_name is None: file_name = key - # Download the file + kwargs = {'ExtraArgs': extra_args} if extra_args else {} + kwargs.update({'Config': config} if config else {}) try: - self.cos_client.download_file(bucket, key, file_name, ExtraArgs=extra_args) + self.cos_client.download_file(Bucket=bucket, Key=key, Filename=file_name, **kwargs) except ibm_botocore.exceptions.ClientError as e: logging.error(e) return False diff --git a/lithops/storage/backends/infinispan/__init__.py b/lithops/storage/backends/infinispan/__init__.py index e3b717d6b..8165d321f 100644 --- a/lithops/storage/backends/infinispan/__init__.py +++ b/lithops/storage/backends/infinispan/__init__.py @@ -1 +1,3 @@ -from lithops.storage.backends.infinispan.infinispan import InfinispanBackend as StorageBackend +from .infinispan import InfinispanBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/infinispan/infinispan.py b/lithops/storage/backends/infinispan/infinispan.py index 65e081a80..406f5a5cc 100644 --- a/lithops/storage/backends/infinispan/infinispan.py +++ b/lithops/storage/backends/infinispan/infinispan.py @@ -127,7 +127,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): return io.BytesIO(data) return data - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file :param file_name: File to upload @@ -148,7 +148,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file :param bucket: Bucket to download from diff --git a/lithops/storage/backends/infinispan_hotrod/__init__.py b/lithops/storage/backends/infinispan_hotrod/__init__.py index 636b69cb3..996aa27cd 100644 --- 
a/lithops/storage/backends/infinispan_hotrod/__init__.py +++ b/lithops/storage/backends/infinispan_hotrod/__init__.py @@ -1 +1,3 @@ -from lithops.storage.backends.infinispan_hotrod.infinispan_hotrod import InfinispanHotrodBackend as StorageBackend +from .infinispan_hotrod import InfinispanHotrodBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/infinispan_hotrod/infinispan_hotrod.py b/lithops/storage/backends/infinispan_hotrod/infinispan_hotrod.py index f9d3ebd54..7629437aa 100644 --- a/lithops/storage/backends/infinispan_hotrod/infinispan_hotrod.py +++ b/lithops/storage/backends/infinispan_hotrod/infinispan_hotrod.py @@ -104,7 +104,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): return io.BytesIO(b) return b - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file :param file_name: File to upload @@ -125,7 +125,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file :param bucket: Bucket to download from diff --git a/lithops/storage/backends/localhost/__init__.py b/lithops/storage/backends/localhost/__init__.py index 176bc2ec4..ab9bd3177 100644 --- a/lithops/storage/backends/localhost/__init__.py +++ b/lithops/storage/backends/localhost/__init__.py @@ -1 +1,3 @@ from .localhost import LocalhostStorageBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/localhost/config.py b/lithops/storage/backends/localhost/config.py index ee81bcbf8..068c3c34c 100644 --- a/lithops/storage/backends/localhost/config.py +++ b/lithops/storage/backends/localhost/config.py @@ -16,7 +16,10 @@ def load_config(config_data): - if 'localhost' not in config_data: + if 'localhost' not in config_data or config_data['localhost'] is None: config_data['localhost'] = {} + if 'monitoring_interval' not in config_data['lithops']: + config_data['lithops']['monitoring_interval'] = 0.1 + config_data['localhost']['storage_bucket'] = 'storage' diff --git a/lithops/storage/backends/localhost/localhost.py b/lithops/storage/backends/localhost/localhost.py index 1175be675..899e24aaa 100644 --- a/lithops/storage/backends/localhost/localhost.py +++ b/lithops/storage/backends/localhost/localhost.py @@ -108,7 +108,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): except Exception: raise StorageNoSuchKeyError(os.path.join(LITHOPS_TEMP_DIR, bucket_name), key) - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file :param file_name: File to upload @@ -129,7 +129,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file :param bucket: Bucket to download from diff --git a/lithops/storage/backends/minio/__init__.py b/lithops/storage/backends/minio/__init__.py index a6766b836..abe82db3d 100644 --- a/lithops/storage/backends/minio/__init__.py +++ b/lithops/storage/backends/minio/__init__.py @@ -1 +1,3 @@ from .minio import MinioStorageBackend as StorageBackend + 
+__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/minio/config.py b/lithops/storage/backends/minio/config.py index eff74165e..1f3a3f7fd 100644 --- a/lithops/storage/backends/minio/config.py +++ b/lithops/storage/backends/minio/config.py @@ -15,9 +15,6 @@ # -import hashlib - - REQ_PARAMS = ('endpoint', 'secret_access_key', 'access_key_id') @@ -32,8 +29,3 @@ def load_config(config_data): if not config_data['minio']['endpoint'].startswith('http'): raise Exception('MinIO endpoint must start with http:// or https://') - - if 'storage_bucket' not in config_data['minio']: - key = config_data['minio']['access_key_id'] - endpoint = hashlib.sha1(config_data['minio']['endpoint'].encode()).hexdigest()[:6] - config_data['minio']['storage_bucket'] = f'lithops-{endpoint}-{key[:6].lower()}' diff --git a/lithops/storage/backends/minio/minio.py b/lithops/storage/backends/minio/minio.py index 957b2c607..b2b14acb1 100644 --- a/lithops/storage/backends/minio/minio.py +++ b/lithops/storage/backends/minio/minio.py @@ -15,6 +15,7 @@ # import os +import hashlib import logging import boto3 import botocore @@ -37,9 +38,9 @@ def __init__(self, minio_config): logger.debug("Creating MinIO client") self.config = minio_config user_agent = minio_config['user_agent'] - service_endpoint = minio_config['endpoint'] + self.service_endpoint = minio_config['endpoint'] - logger.debug(f"Setting MinIO endpoint to {service_endpoint}") + logger.debug(f"Setting MinIO endpoint to {self.service_endpoint}") client_config = botocore.client.Config( max_pool_connections=128, @@ -54,11 +55,11 @@ def __init__(self, minio_config): aws_secret_access_key=minio_config['secret_access_key'], aws_session_token=minio_config.get('session_token'), config=client_config, - endpoint_url=service_endpoint + endpoint_url=self.service_endpoint ) msg = STORAGE_CLI_MSG.format('MinIO') - logger.info(f"{msg} - Endpoint: {service_endpoint}") + logger.info(f"{msg} - Endpoint: {self.service_endpoint}") def get_client(self): """ @@ -67,6 +68,16 @@ def get_client(self): """ return self.s3_client + def generate_bucket_name(self): + """ + Generates a unique bucket name + """ + key = self.config['access_key_id'] + endpoint = hashlib.sha1(self.service_endpoint.encode()).hexdigest()[:6] + self.config['storage_bucket'] = f'lithops-{endpoint}-{key[:6].lower()}' + + return self.config['storage_bucket'] + def create_bucket(self, bucket_name): """ Create a bucket if it doesn't exist @@ -139,7 +150,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): retries += 1 return data - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file to an S3 bucket :param file_name: File to upload @@ -151,15 +162,16 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): if key is None: key = os.path.basename(file_name) - # Upload the file + kwargs = {'ExtraArgs': extra_args} if extra_args else {} + kwargs.update({'Config': config} if config else {}) try: - self.s3_client.upload_file(file_name, bucket, key, ExtraArgs=extra_args) + self.s3_client.upload_file(Filename=file_name, Bucket=bucket, Key=key, **kwargs) except botocore.exceptions.ClientError as e: logging.error(e) return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file from an S3 bucket :param bucket: Bucket to download from @@ -171,9 
+183,10 @@ def download_file(self, bucket, key, file_name=None, extra_args={}): if file_name is None: file_name = key - # Download the file + kwargs = {'ExtraArgs': extra_args} if extra_args else {} + kwargs.update({'Config': config} if config else {}) try: - self.s3_client.download_file(bucket, key, file_name, ExtraArgs=extra_args) + self.s3_client.download_file(Bucket=bucket, Key=key, Filename=file_name, **kwargs) except botocore.exceptions.ClientError as e: logging.error(e) return False diff --git a/lithops/storage/backends/oracle_oss/__init__.py b/lithops/storage/backends/oracle_oss/__init__.py index f91bc4c43..e373099f2 100644 --- a/lithops/storage/backends/oracle_oss/__init__.py +++ b/lithops/storage/backends/oracle_oss/__init__.py @@ -1 +1,3 @@ from .oracle_oss import OCIObjectStorageBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/oracle_oss/config.py b/lithops/storage/backends/oracle_oss/config.py index da430610a..f6319e84e 100644 --- a/lithops/storage/backends/oracle_oss/config.py +++ b/lithops/storage/backends/oracle_oss/config.py @@ -13,7 +13,6 @@ # limitations under the License. # -import os import copy @@ -38,8 +37,3 @@ def load_config(config_data=None): temp = copy.deepcopy(config_data['oracle_oss']) config_data['oracle_oss'].update(config_data['oracle']) config_data['oracle_oss'].update(temp) - - if 'storage_bucket' not in config_data['oracle_oss']: - user = config_data['oracle_oss']['user'] - region = config_data['oracle_oss']['region'] - config_data['oracle_oss']['storage_bucket'] = f'lithops-{region}-{user[-8:-1].lower()}' diff --git a/lithops/storage/backends/oracle_oss/oracle_oss.py b/lithops/storage/backends/oracle_oss/oracle_oss.py index 824ffb616..f5ec60b59 100644 --- a/lithops/storage/backends/oracle_oss/oracle_oss.py +++ b/lithops/storage/backends/oracle_oss/oracle_oss.py @@ -28,6 +28,7 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) + class OCIObjectStorageBackend: def __init__(self, oci_config): @@ -55,6 +56,15 @@ def _init_storage_client(self): def get_client(self): return self + def generate_bucket_name(self): + """ + Generates a unique bucket name + """ + user = self.config['user'] + self.config['storage_bucket'] = f'lithops-{self.region}-{user[-8:-1].lower()}' + + return self.config['storage_bucket'] + def create_bucket(self, bucket_name): """ Create a bucket if it doesn't exist @@ -116,7 +126,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): raise e return data - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): ''' Uploads a file to OCI Object Storage. The file is read in binary mode and uploaded under a specified key. If no key is provided, the base name of the file is used as the key. @@ -139,7 +149,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): ''' Download a file from the specified bucket and key in the object storage. 
:param bucket: Name of the bucket diff --git a/lithops/storage/backends/redis/__init__.py b/lithops/storage/backends/redis/__init__.py index 7fbcf5359..5ee81307a 100644 --- a/lithops/storage/backends/redis/__init__.py +++ b/lithops/storage/backends/redis/__init__.py @@ -1 +1,3 @@ from .redis import RedisBackend as StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/redis/redis.py b/lithops/storage/backends/redis/redis.py index 431ea0935..d987b2c1d 100644 --- a/lithops/storage/backends/redis/redis.py +++ b/lithops/storage/backends/redis/redis.py @@ -108,7 +108,7 @@ def get_object(self, bucket_name, key, stream=False, extra_get_args={}): else: return data - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file :param file_name: File to upload @@ -129,7 +129,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file :param bucket: Bucket to download from @@ -208,7 +208,7 @@ def head_bucket(self, bucket_name): """ return bool(self._client.exists(self._format_key(bucket_name, ''))) - def list_objects(self, bucket_name, prefix=None, match_pattern = None): + def list_objects(self, bucket_name, prefix=None, match_pattern=None): """ Return a list of objects for the given bucket and prefix. :param bucket_name: name of the bucket. diff --git a/lithops/storage/backends/swift/__init__.py b/lithops/storage/backends/swift/__init__.py index 7268a55bf..97ce386c5 100644 --- a/lithops/storage/backends/swift/__init__.py +++ b/lithops/storage/backends/swift/__init__.py @@ -1 +1,3 @@ from .swift import StorageBackend + +__all__ = ['StorageBackend'] diff --git a/lithops/storage/backends/swift/swift.py b/lithops/storage/backends/swift/swift.py index e7999e28a..8366a3ca2 100644 --- a/lithops/storage/backends/swift/swift.py +++ b/lithops/storage/backends/swift/swift.py @@ -33,11 +33,13 @@ class StorageBackend: def __init__(self, swift_config): logger.debug("Creating OpenStack Swift client") - self.auth_url = swift_config['swift_auth_url'] - self.user_id = swift_config['swift_user_id'] - self.project_id = swift_config['swift_project_id'] - self.password = swift_config['swift_password'] - self.region = swift_config['swift_region'] + self.auth_url = swift_config['auth_url'] + self.user_id = swift_config['user_id'] + self.project_id = swift_config['project_id'] + self.password = swift_config['password'] + self.region = swift_config['region'] + self.user_domain_name = swift_config.get("user_domain_name", "default") + self.project_domain_name = swift_config.get("project_domain_name", "default") self.endpoint = None if 'token' in swift_config: @@ -57,6 +59,14 @@ def __init__(self, swift_config): msg = STORAGE_CLI_MSG.format('OpenStack Swift') logger.info("{} - Region: {}".format(msg, self.region)) + def generate_bucket_name(self): + """ + Generates a unique bucket name + """ + self.config['storage_bucket'] = f'lithops-{self.region}-{self.user_id[:6].lower()}' + + return self.config['storage_bucket'] + def generate_swift_token(self): """ Generates new token for accessing to Swift. 
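A recurring change in the storage hunks above is that automatic bucket naming moves out of each backend's config.py and into a generate_bucket_name() method on the backend class. As a rough, self-contained sketch of the naming patterns those methods use (the helper below and its argument values are illustrative only, not part of the Lithops API):

import hashlib

def example_bucket_names(region, access_key_id, endpoint, credentials_path, user_ocid, user_id):
    # Illustrative only: mirrors the per-backend naming patterns added in this diff.
    endpoint_hash = hashlib.sha1(endpoint.encode()).hexdigest()[:6]
    creds_hash = hashlib.sha1(credentials_path.encode()).hexdigest()[:6]
    return {
        'ceph / minio': f'lithops-{endpoint_hash}-{access_key_id[:6].lower()}',   # endpoint hash + access key prefix
        'gcp_storage': f'lithops-{region}-{creds_hash}',                          # region + credentials file hash
        'ibm_cos': f'lithops-{region}-{access_key_id[:6].lower()}',               # region + HMAC/API key prefix
        'oracle_oss': f'lithops-{region}-{user_ocid[-8:-1].lower()}',             # region + slice of the user OCID
        'swift': f'lithops-{region}-{user_id[:6].lower()}',                       # region + user id prefix
    }

print(example_bucket_names('us-east', 'AKIAEXAMPLE', 'https://s3.example.com',
                           '/home/user/gcp-creds.json', 'ocid1.user.oc1..aaaaexample', 'demo-user'))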
@@ -64,9 +74,26 @@ def generate_swift_token(self): """ url = self.auth_url + "/v3/auth/tokens" headers = {'Content-Type': 'application/json'} - data = {"auth": {"identity": {"methods": ["password"], - "password": {"user": {"id": self.user_id, "password": self.password}}}, - "scope": {"project": {"id": self.project_id}}}} + data = { + "auth": { + "identity": { + "methods": ["password"], + "password": { + "user": { + "name": self.user_id, + "password": self.password, + "domain": {"name": self.user_domain_name} + } + } + }, + "scope": { + "project": { + "id": self.project_id, + "domain": {"name": self.project_domain_name} + } + } + } + } json_data = json.dumps(data) r = requests.post(url, data=json_data, headers=headers) @@ -89,7 +116,7 @@ def generate_swift_token(self): message = json.loads(r.text)['error']['message'] raise Exception("{} - {} - {}".format(r.status_code, r.reason, message)) - def put_object(self, container_name, key, data): + def put_object(self, bucket_name, key, data): """ Put an object in Swift. Override the object if the key already exists. :param key: key of the object. @@ -97,7 +124,7 @@ def put_object(self, container_name, key, data): :type data: str/bytes :return: None """ - url = '/'.join([self.endpoint, container_name, key]) + url = '/'.join([self.endpoint, bucket_name, key]) try: res = self.session.put(url, data=data) status = 'OK' if res.status_code == 201 else 'Error' @@ -108,16 +135,14 @@ def put_object(self, container_name, key, data): except Exception as e: print(e) - def get_object(self, container_name, key, stream=False, extra_get_args={}): + def get_object(self, bucket_name, key, stream=False, extra_get_args={}): """ Get object from Swift with a key. Throws StorageNoSuchKeyError if the given key does not exist. :param key: key of the object :return: Data of the object :rtype: str/bytes """ - if not container_name: - container_name = self.storage_container - url = '/'.join([self.endpoint, container_name, key]) + url = '/'.join([self.endpoint, bucket_name, key]) headers = {'X-Auth-Token': self.token} headers.update(extra_get_args) try: @@ -129,16 +154,16 @@ def get_object(self, container_name, key, stream=False, extra_get_args={}): data = res.content return data elif res.status_code == 404: - raise StorageNoSuchKeyError(container_name, key) + raise StorageNoSuchKeyError(bucket_name, key) else: raise Exception('{} - {}'.format(res.status_code, key)) except StorageNoSuchKeyError: - raise StorageNoSuchKeyError(container_name, key) + raise StorageNoSuchKeyError(bucket_name, key) except Exception as e: print(e) - raise StorageNoSuchKeyError(container_name, key) + raise StorageNoSuchKeyError(bucket_name, key) - def upload_file(self, file_name, bucket, key=None, extra_args={}): + def upload_file(self, file_name, bucket, key=None, extra_args={}, config=None): """Upload a file :param file_name: File to upload @@ -159,7 +184,7 @@ def upload_file(self, file_name, bucket, key=None, extra_args={}): return False return True - def download_file(self, bucket, key, file_name=None, extra_args={}): + def download_file(self, bucket, key, file_name=None, extra_args={}, config=None): """Download a file :param bucket: Bucket to download from @@ -184,35 +209,35 @@ def download_file(self, bucket, key, file_name=None, extra_args={}): return False return True - def head_object(self, container_name, key): + def head_object(self, bucket_name, key): """ Head object from Swift with a key. Throws StorageNoSuchKeyError if the given key does not exist. 
:param key: key of the object :return: Data of the object :rtype: str/bytes """ - url = '/'.join([self.endpoint, container_name, key]) + url = '/'.join([self.endpoint, bucket_name, key]) try: res = self.session.head(url) if res.status_code == 200: return res.headers elif res.status_code == 404: - raise StorageNoSuchKeyError(container_name, key) + raise StorageNoSuchKeyError(bucket_name, key) else: raise Exception('{} - {}'.format(res.status_code, key)) - except Exception as e: - raise StorageNoSuchKeyError(container_name, key) + except Exception: + raise StorageNoSuchKeyError(bucket_name, key) - def delete_object(self, container_name, key): + def delete_object(self, bucket_name, key): """ Delete an object from Swift. :param bucket: bucket name :param key: data key """ - url = '/'.join([self.endpoint, container_name, key]) + url = '/'.join([self.endpoint, bucket_name, key]) return self.session.delete(url) - def delete_objects(self, container_name, key_list): + def delete_objects(self, bucket_name, key_list): """ Delete a list of objects from Swift. :param bucket: bucket name @@ -223,13 +248,13 @@ def delete_objects(self, container_name, key_list): keys_to_delete = [] for key in key_list: - keys_to_delete.append('/{}/{}'.format(container_name, key)) + keys_to_delete.append('/{}/{}'.format(bucket_name, key)) keys_to_delete = '\n'.join(keys_to_delete) url = '/'.join([self.endpoint, '?bulk-delete']) return self.session.delete(url, data=keys_to_delete, headers=headers) - def list_objects(self, container_name, prefix='', match_pattern=None): + def list_objects(self, bucket_name, prefix='', match_pattern=None): """ Lists the objects in a bucket. Throws StorageNoSuchKeyError if the given bucket does not exist. :param key: key of the object @@ -237,9 +262,9 @@ def list_objects(self, container_name, prefix='', match_pattern=None): :rtype: str/bytes """ if prefix: - url = '/'.join([self.endpoint, container_name, '?format=json&prefix=' + prefix]) + url = '/'.join([self.endpoint, bucket_name, '?format=json&prefix=' + prefix]) else: - url = '/'.join([self.endpoint, container_name, '?format=json']) + url = '/'.join([self.endpoint, bucket_name, '?format=json']) try: res = self.session.get(url) objects = res.json() @@ -249,7 +274,7 @@ def list_objects(self, container_name, prefix='', match_pattern=None): except Exception as e: raise e - def list_keys(self, container_name, prefix): + def list_keys(self, bucket_name, prefix): """ Return a list of keys for the given prefix. :param prefix: Prefix to filter object names. 
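To make the updated generate_swift_token() above easier to follow, here is a standalone sketch of the Keystone v3 password-authentication request it now assembles, with user and project domain scoping. The endpoint and credential values are placeholders; Keystone returns the issued token in the X-Subject-Token response header.

import json
import requests

# Placeholder values; in the backend these come from the 'swift' config section.
AUTH_URL = "https://keystone.example.com:5000"
USER_NAME = "demo-user"
PASSWORD = "secret"
PROJECT_ID = "0123456789abcdef"
USER_DOMAIN = "default"
PROJECT_DOMAIN = "default"

payload = {
    "auth": {
        "identity": {
            "methods": ["password"],
            "password": {
                "user": {
                    "name": USER_NAME,
                    "password": PASSWORD,
                    "domain": {"name": USER_DOMAIN},
                }
            },
        },
        "scope": {
            "project": {
                "id": PROJECT_ID,
                "domain": {"name": PROJECT_DOMAIN},
            }
        },
    }
}

r = requests.post(
    AUTH_URL + "/v3/auth/tokens",
    data=json.dumps(payload),
    headers={"Content-Type": "application/json"},
)
token = r.headers.get("X-Subject-Token")  # token used for subsequent Swift requests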
@@ -257,7 +282,7 @@ def list_keys(self, container_name, prefix): :rtype: list of str """ try: - objects = self.list_objects(container_name, prefix) + objects = self.list_objects(bucket_name, prefix) object_keys = [r['name'] for r in objects] return object_keys except Exception as e: diff --git a/lithops/storage/cloud_proxy.py b/lithops/storage/cloud_proxy.py index add5fd59b..d3f0005c7 100644 --- a/lithops/storage/cloud_proxy.py +++ b/lithops/storage/cloud_proxy.py @@ -24,8 +24,8 @@ def remove_lithops_keys(keys): - return list(filter(lambda key: not any([key.startswith(prefix) for prefix in - [JOBS_PREFIX, TEMP_PREFIX, LOGS_PREFIX, RUNTIMES_PREFIX]]), keys)) + return list(filter(lambda key: not any([key.startswith(prefix) for prefix in [ + JOBS_PREFIX, TEMP_PREFIX, LOGS_PREFIX, RUNTIMES_PREFIX]]), keys)) # @@ -88,13 +88,15 @@ def listdir(self, path='', suffix_dirs=False): paths = self._storage.list_bucket_keys(prefix=prefix) names = set() for p in paths: - if any([p.startswith(prefix) for prefix in [JOBS_PREFIX, TEMP_PREFIX, LOGS_PREFIX, RUNTIMES_PREFIX]]): + if any([p.startswith(prefix) for prefix in [ + JOBS_PREFIX, TEMP_PREFIX, LOGS_PREFIX, RUNTIMES_PREFIX]]): continue p = p[len(prefix):] if p.startswith(prefix) else p if p.startswith('/'): p = p[1:] splits = p.split('/') - name = splits[0] + '/' if suffix_dirs and len(splits) > 1 else splits[0] + name = splits[0] + \ + '/' if suffix_dirs and len(splits) > 1 else splits[0] names |= {name} return list(names) @@ -113,11 +115,23 @@ def walk(self, top, topdown=True, onerror=None, followlinks=False): elif topdown: yield top, dirs, files for dir_name in dirs: - for result in self.walk(base_os.path.join(top, dir_name), topdown, onerror, followlinks): + for result in self.walk( + base_os.path.join( + top, + dir_name), + topdown, + onerror, + followlinks): yield result else: for dir_name in dirs: - for result in self.walk(base_os.path.join(top, dir_name), topdown, onerror, followlinks): + for result in self.walk( + base_os.path.join( + top, + dir_name), + topdown, + onerror, + followlinks): yield result yield top, dirs, files @@ -144,7 +158,9 @@ def isfile(self, path): if path.startswith('/'): prefix = path[1:] - keys = remove_lithops_keys(self._storage.list_bucket_keys(prefix=prefix)) + keys = remove_lithops_keys( + self._storage.list_bucket_keys( + prefix=prefix)) if len(keys) == 1: key = keys.pop() key = key[len(prefix):] @@ -160,7 +176,9 @@ def isdir(self, path): if prefix != '' and not prefix.endswith('/'): prefix = prefix + '/' - keys = remove_lithops_keys(self._storage.list_bucket_keys(prefix=prefix)) + keys = remove_lithops_keys( + self._storage.list_bucket_keys( + prefix=prefix)) return bool(keys) def exists(self, path): diff --git a/lithops/storage/storage.py b/lithops/storage/storage.py index 4956d80da..c14800643 100644 --- a/lithops/storage/storage.py +++ b/lithops/storage/storage.py @@ -20,7 +20,7 @@ import logging import itertools import importlib -from typing import Optional, List, Union, Tuple, Dict, TextIO, BinaryIO +from typing import Optional, List, Union, Dict, TextIO, BinaryIO, Any from lithops.constants import CACHE_DIR, RUNTIMES_PREFIX, JOBS_PREFIX, TEMP_PREFIX from lithops.utils import is_lithops_worker @@ -31,13 +31,15 @@ RUNTIME_META_CACHE = {} +COBJECTS_INDEX = itertools.count() class Storage: """ An Storage object is used by partitioner and other components to access - underlying storage backend without exposing the the implementation details. + underlying storage backend without exposing the implementation details. 
""" + def __init__(self, config=None, backend=None, storage_config=None): """ Creates an Storage instance @@ -51,11 +53,11 @@ def __init__(self, config=None, backend=None, storage_config=None): if storage_config: self.config = storage_config else: - storage_config = default_storage_config(config_data=config, backend=backend) + storage_config = default_storage_config( + config_data=config, backend=backend) self.config = extract_storage_config(storage_config) self.backend = self.config['backend'] - self.bucket = self.config['bucket'] try: module_location = f'lithops.storage.backends.{self.backend}' @@ -67,7 +69,8 @@ def __init__(self, config=None, backend=None, storage_config=None): f"'{self.backend}' storage backend") raise e - self._created_cobjects_n = itertools.count() + bucket = self.config[self.backend].get('storage_bucket') + self.bucket = bucket or self.storage_handler.generate_bucket_name() def get_client(self) -> object: """ @@ -94,7 +97,8 @@ def create_bucket(self, bucket: str): if hasattr(self.storage_handler, 'create_bucket'): return self.storage_handler.create_bucket(bucket) - def put_object(self, bucket: str, key: str, body: Union[str, bytes, TextIO, BinaryIO]): + def put_object(self, bucket: str, key: str, + body: Union[str, bytes, TextIO, BinaryIO]): """ Adds an object to a bucket of the storage backend. @@ -104,23 +108,37 @@ def put_object(self, bucket: str, key: str, body: Union[str, bytes, TextIO, Bina """ return self.storage_handler.put_object(bucket, key, body) - def get_object(self, bucket: str, key: str, + def get_object(self, + bucket: str, + key: str, stream: Optional[bool] = False, - extra_get_args: Optional[Dict] = {}) -> Union[str, bytes, TextIO, BinaryIO]: + extra_get_args: Optional[Dict] = {}) -> Union[str, + bytes, + TextIO, + BinaryIO]: """ Retrieves objects from the storage backend. :param bucket: Name of the bucket :param key: Key of the object :param stream: Get the object data or a file-like object - :param extra_get_args: Extra get arguments to be passed to the underlying backend implementation (dict). For example, to specify the byte-range to read: ``extra_get_args={'Range': 'bytes=0-100'}``. + :param extra_get_args: Extra get arguments to be passed to the underlying backend implementation (dict). + For example, to specify the byte-range to read: ``extra_get_args={'Range': 'bytes=0-100'}``. :return: Object, as a binary array or as a file-like stream if parameter `stream` is enabled """ - return self.storage_handler.get_object(bucket, key, stream, extra_get_args) + return self.storage_handler.get_object( + bucket, key, stream, extra_get_args) - def upload_file(self, file_name: str, bucket: str, key: Optional[str] = None, - extra_args: Optional[Dict] = {}) -> Union[str, bytes, TextIO, BinaryIO]: + def upload_file(self, + file_name: str, + bucket: str, + key: Optional[str] = None, + extra_args: Optional[Dict] = {}, + config: Optional[Any] = None) -> Union[str, + bytes, + TextIO, + BinaryIO]: """ Upload a file to a bucket of the storage backend. (Multipart upload) @@ -128,11 +146,19 @@ def upload_file(self, file_name: str, bucket: str, key: Optional[str] = None, :param bucket: Name of the bucket :param key: Key of the object :param extra_args: Extra get arguments to be passed to the underlying backend implementation (dict). + :param config: The transfer configuration to be used when performing the transfer (boto3.s3.transfer.TransferConfig). 
""" - return self.storage_handler.upload_file(file_name, bucket, key, extra_args) + return self.storage_handler.upload_file(file_name, bucket, key, extra_args, config) - def download_file(self, bucket: str, key: str, file_name: Optional[str] = None, - extra_args: Optional[Dict] = {}) -> Union[str, bytes, TextIO, BinaryIO]: + def download_file(self, + bucket: str, + key: str, + file_name: Optional[str] = None, + extra_args: Optional[Dict] = {}, + config: Optional[Any] = None) -> Union[str, + bytes, + TextIO, + BinaryIO]: """ Download a file from the storage backend. (Multipart download) @@ -140,10 +166,11 @@ def download_file(self, bucket: str, key: str, file_name: Optional[str] = None, :param key: Key of the object :param file_name: Name of the file to save the object data :param extra_args: Extra get arguments to be passed to the underlying backend implementation (dict). + :param config: The transfer configuration to be used when performing the transfer (boto3.s3.transfer.TransferConfig). :return: Object, as a binary array or as a file-like stream if parameter `stream` is enabled """ - return self.storage_handler.download_file(bucket, key, file_name, extra_args) + return self.storage_handler.download_file(bucket, key, file_name, extra_args, config) def head_object(self, bucket: str, key: str) -> Dict: """ @@ -189,15 +216,20 @@ def head_bucket(self, bucket: str) -> Dict: """ return self.storage_handler.head_bucket(bucket) - def list_objects(self, bucket: str, prefix: Optional[str] = None, match_pattern: Optional[str] = None) -> List[Tuple[str, int]]: + def list_objects(self, + bucket: str, + prefix: Optional[str] = None, + match_pattern: Optional[str] = None) -> List[Dict[str, + Any]]: """ - Returns all of the object keys in a bucket. For each object, the list contains the name - of the object (key) and the size. + Returns all of the object keys in a bucket. For each object, the list contains a dictionary + with at least the object key ('Key') and the size in bytes ('Size'). Additional fields may be + present, depending on the backend implementation. :param bucket: Name of the bucket :param prefix: Key prefix for filtering - :return: List of tuples containing the object key and size in bytes + :return: List of dictionaries containing at least 'Key' and 'Size' for each object """ return self.storage_handler.list_objects(bucket, prefix, match_pattern) @@ -214,8 +246,13 @@ def list_keys(self, bucket, prefix=None) -> List[str]: """ return self.storage_handler.list_keys(bucket, prefix) - def put_cloudobject(self, body: Union[str, bytes, TextIO, BinaryIO], - bucket: Optional[str] = None, key: Optional[str] = None) -> utils.CloudObject: + def put_cloudobject(self, + body: Union[str, + bytes, + TextIO, + BinaryIO], + bucket: Optional[str] = None, + key: Optional[str] = None) -> utils.CloudObject: """ Put a CloudObject into storage. 
@@ -226,7 +263,7 @@ def put_cloudobject(self, body: Union[str, bytes, TextIO, BinaryIO], :return: CloudObject instance """ prefix = os.environ.get('__LITHOPS_SESSION_ID', '') - coid = hex(next(self._created_cobjects_n))[2:] + coid = hex(next(COBJECTS_INDEX))[2:] coname = 'cloudobject_{}'.format(coid) name = '/'.join([prefix, coname]) if prefix else coname key = key or '/'.join([TEMP_PREFIX, name]) @@ -235,8 +272,12 @@ def put_cloudobject(self, body: Union[str, bytes, TextIO, BinaryIO], return utils.CloudObject(self.backend, bucket, key) - def get_cloudobject(self, cloudobject: utils.CloudObject, - stream: Optional[bool] = False) -> Union[str, bytes, TextIO, BinaryIO]: + def get_cloudobject(self, + cloudobject: utils.CloudObject, + stream: Optional[bool] = False) -> Union[str, + bytes, + TextIO, + BinaryIO]: """ Get a CloudObject's content from storage. @@ -282,7 +323,8 @@ def delete_cloudobjects(self, cloudobjects: List[utils.CloudObject]): for backend in cobjs: if backend == self.backend: for bucket in cobjs[backend]: - self.storage_handler.delete_objects(bucket, cobjs[backend][co.bucket]) + self.storage_handler.delete_objects( + bucket, cobjs[backend][co.bucket]) else: raise Exception("CloudObject: Invalid Storage backend") @@ -304,8 +346,9 @@ def __init__(self, storage_config): self.bucket = self.storage.bucket if not self.bucket: - raise Exception(f"'storage_bucket' is mandatory under '{self.backend}'" - " section of the configuration") + raise Exception( + f"'storage_bucket' is mandatory under '{self.backend}'" + " section of the configuration") self.storage.create_bucket(self.bucket) @@ -347,7 +390,8 @@ def get_data(self, key, stream=False, extra_get_args={}): :param key: data key :return: data content """ - return self.storage.get_object(self.bucket, key, stream, extra_get_args) + return self.storage.get_object( + self.bucket, key, stream, extra_get_args) def get_func(self, key): """ @@ -374,12 +418,14 @@ def get_job_status(self, executor_id): callset_prefix = '/'.join([JOBS_PREFIX, executor_id]) keys = self.storage.list_keys(self.bucket, callset_prefix) - running_keys = [k.split('/') for k in keys if utils.init_key_suffix in k] + running_keys = [k.split('/') + for k in keys if utils.init_key_suffix in k] running_callids = [(tuple(k[1].rsplit("-", 1) + [k[2]]), k[3].replace(utils.init_key_suffix, '')) for k in running_keys] - done_keys = [k.split('/')[1:] for k in keys if utils.status_key_suffix in k] + done_keys = [k.split('/')[1:] + for k in keys if utils.status_key_suffix in k] done_callids = [tuple(k[0].rsplit("-", 1) + [k[1]]) for k in done_keys] return set(running_callids), set(done_callids) @@ -417,9 +463,6 @@ def get_runtime_meta(self, key): :param runtime: name of the runtime :return: runtime metadata """ - - global RUNTIME_META_CACHE - path = [RUNTIMES_PREFIX, key + ".meta.json"] filename_local_path = os.path.join(CACHE_DIR, *path) @@ -435,24 +478,28 @@ def get_runtime_meta(self, key): return runtime_meta else: - logger.debug("Runtime metadata not found in local cache. Retrieving it from storage") + logger.debug( + "Runtime metadata not found in local cache. 
Retrieving it from storage") try: obj_key = '/'.join(path).replace('\\', '/') - logger.debug('Trying to download runtime metadata from: {}://{}/{}' - .format(self.backend, self.bucket, obj_key)) + logger.debug( + 'Trying to download runtime metadata from: {}://{}/{}' .format( + self.backend, self.bucket, obj_key)) json_str = self.storage.get_object(self.bucket, obj_key) logger.debug('Runtime metadata found in storage') runtime_meta = json.loads(json_str.decode("ascii")) # Save runtime meta to cache try: - if not os.path.exists(os.path.dirname(filename_local_path)): + if not os.path.exists( + os.path.dirname(filename_local_path)): os.makedirs(os.path.dirname(filename_local_path)) with open(filename_local_path, "w") as f: f.write(json.dumps(runtime_meta)) except Exception as e: - logger.error("Could not save runtime meta to local cache: {}".format(e)) + logger.error( + "Could not save runtime meta to local cache: {}".format(e)) RUNTIME_META_CACHE['/'.join(path)] = runtime_meta return runtime_meta @@ -474,7 +521,8 @@ def put_runtime_meta(self, key, runtime_meta): if not is_lithops_worker(): filename_local_path = os.path.join(CACHE_DIR, *path) - logger.debug("Storing runtime metadata into local cache: {}".format(filename_local_path)) + logger.debug( + "Storing runtime metadata into local cache: {}".format(filename_local_path)) if not os.path.exists(os.path.dirname(filename_local_path)): os.makedirs(os.path.dirname(filename_local_path)) diff --git a/lithops/storage/utils.py b/lithops/storage/utils.py index 6bc093255..23f1ed0a9 100644 --- a/lithops/storage/utils.py +++ b/lithops/storage/utils.py @@ -78,7 +78,7 @@ def clean_bucket(storage, bucket, prefix, sleep=5): Deletes all the files from COS. These files include the function, the data serialization and the function invocation results. 
""" - msg = f"Going to delete all objects from bucket '{bucket}'" + msg = f"Deleting objects from bucket '{bucket}'" msg = msg + f" and prefix '{prefix}'" if prefix else msg logger.info(msg) total_objects = 0 @@ -165,13 +165,13 @@ def create_init_key(executor_id, job_id, call_id, act_id): def get_storage_path(storage_config): - storage_bucket = storage_config['bucket'] - storage_backend = storage_config['backend'] + backend = storage_config['backend'] + bucket = storage_config[backend]['storage_bucket'] - return [storage_backend, storage_bucket] + return [backend, bucket] -def check_storage_path(config, prev_path): - current_path = get_storage_path(config) +def check_storage_path(storage_config, prev_path): + current_path = get_storage_path(storage_config) if current_path != prev_path: raise StorageConfigMismatchError(current_path, prev_path) diff --git a/lithops/tests/conftest.py b/lithops/tests/conftest.py new file mode 100644 index 000000000..7fbcac6be --- /dev/null +++ b/lithops/tests/conftest.py @@ -0,0 +1,47 @@ +import os +import pytest +import logging +from lithops.config import ( + default_config, + load_yaml_config +) + +logger = logging.getLogger(__name__) + +TESTS_PREFIX = '__lithops.test' + + +def pytest_addoption(parser): + parser.addoption("--config", metavar="", default=None, help="path to lithops config file") + parser.addoption("--backend", metavar="", default=None, help="compute backend") + parser.addoption("--storage", metavar="", default=None, help="storage backend") + parser.addoption("--region", metavar="", default=None, help="region") + + +@pytest.fixture(scope="session", autouse=True) +def setup_global(request): + config = request.config + config_file = config.getoption("--config") + backend = config.getoption("--backend") + storage = config.getoption("--storage") + region = config.getoption("--region") + + config_data = None + + if config_file: + if os.path.exists(config_file): + config_data = load_yaml_config(config_file) + else: + raise FileNotFoundError(f"The provided config file '{config_file}' does not exist") + + config_ow = {'lithops': {}, 'backend': {}} + config_ow['lithops']['log_level'] = 'DEBUG' + if storage: + config_ow['lithops']['storage'] = storage + if backend: + config_ow['lithops']['backend'] = backend + if region: + config_ow['backend']['region'] = region + + lithops_config = default_config(config_data=config_data, config_overwrite=config_ow) + pytest.lithops_config = lithops_config diff --git a/lithops/tests/functions.py b/lithops/tests/functions.py new file mode 100644 index 000000000..ce3dfe3ca --- /dev/null +++ b/lithops/tests/functions.py @@ -0,0 +1,179 @@ +import lithops +import time +import pickle + + +def simple_map_function(x, y): + return x + y + + +def concat(lst): + return " ".join(lst) + + +def hello_world(param): + return "Hello World!" 
+ + +def lithops_inside_lithops_map_function(x): + def _func(x): + return x + + fexec = lithops.FunctionExecutor() + fexec.map(_func, range(x)) + return fexec.get_result() + + +def lithops_return_futures_map(x): + def _func(x): + return x + 1 + + fexec = lithops.FunctionExecutor() + futures = fexec.map(_func, range(x)) + + # this while loop is required to pass localhost tests on Windows + while not all(f.running or f.ready for f in futures): + time.sleep(0.1) + + return futures + + +def lithops_return_futures_call_async(x): + def _func(x): + return x + 1 + + fexec = lithops.FunctionExecutor() + fut = fexec.call_async(_func, x + 5) + + # this while loop is required to pass localhost tests on Windows + while not (fut.running or fut.ready): + time.sleep(0.1) + + return fut + + +def lithops_return_futures_map_multiple(x): + def _func(x): + return x + 1 + + fexec = lithops.FunctionExecutor() + fut1 = fexec.map(_func, range(x)) + fut2 = fexec.map(_func, range(x)) + + # this while loop is required to pass localhost tests on Windows + while not all(f.running or f.ready for f in fut1 + fut2): + time.sleep(0.1) + + return fut1 + fut2 + + +def my_map_function_obj(obj, id): + """returns a dictionary of {word:number of appearances} key:value items.""" + print('Function id: {}'.format(id)) + print('Bucket: {}'.format(obj.bucket)) + print('Key: {}'.format(obj.key)) + print('Partition num: {}'.format(obj.part)) + + print('Chunk size: {}'.format(obj.chunk_size)) + print('Byte range: {}'.format(obj.data_byte_range)) + + counter = {} + data = obj.data_stream.read() + + # chunk = obj.data_stream.read(10000) + # data = b"" + # while chunk: + # data += chunk + # chunk = obj.data_stream.read(10000) + + print('Data lenght: {}'.format(len(data))) + + for line in data.splitlines(): + for word in line.decode('utf-8').split(): + if word not in counter: + counter[word] = 1 + else: + counter[word] += 1 + print('Testing map_reduce() over a bucket') + return counter + + +def my_map_function_url(id, obj): + print('I am processing the object from {}'.format(obj.url)) + print('Function id: {}'.format(id)) + print('Partition num: {}'.format(obj.part)) + print('Chunk size: {}'.format(obj.chunk_size)) + print('Byte range: {}'.format(obj.data_byte_range)) + + counter = {} + data = obj.data_stream.read() + + print('Data lenght: {}'.format(len(data))) + + for line in data.splitlines(): + for word in line.decode('utf-8').split(): + if word not in counter: + counter[word] = 1 + else: + counter[word] += 1 + return counter + + +def simple_reduce_function(results): + """general purpose reduce function that sums up the results + of previous activations of map functions """ + total = 0 + for map_result in results: + total = total + map_result + return total + + +def my_reduce_function(results): + """sums up the number of words by totaling the number of appearances of each word. 
+ @param results: dictionary that counts the appearances of each word within a url.""" + final_result = 0 + for count in results: + for word in count: + final_result += count[word] + return final_result + + +def my_cloudobject_put(obj, storage): + """uploads to storage pickled dict of type: {word:number of appearances} """ + counter = my_map_function_obj(obj, 0) + cloudobject = storage.put_cloudobject(pickle.dumps(counter)) + return cloudobject + + +def my_cloudobject_get(cloudobjects, storage): + """unpickles list of data from storage and return their sum by using a reduce function """ + data = [pickle.loads(storage.get_cloudobject(co)) for co in cloudobjects] + return my_reduce_function(data) + + +def my_map_function_storage(key_i, bucket_name, storage): + print(f'I am processing the object /{bucket_name}/{key_i}') + counter = {} + data = storage.get_object(bucket_name, key_i) + for line in data.splitlines(): + for word in line.decode('utf-8').split(): + if word not in counter: + counter[word] = 1 + else: + counter[word] += 1 + return counter + + +class SideEffect: + def __init__(self): + pass + + @property + def foo(self): + raise RuntimeError("Side effect triggered") + + result = 5 + + +def passthrough_function(x): + return x.result diff --git a/lithops/tests/main_util.py b/lithops/tests/main_util.py deleted file mode 100644 index d64e752dc..000000000 --- a/lithops/tests/main_util.py +++ /dev/null @@ -1,45 +0,0 @@ -# -# (C) Copyright IBM Corp. 2020 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import logging - -logger = logging.getLogger(__name__) - -CONFIG = None -STORAGE_CONFIG = None -STORAGE = None -TEST_FILES_URLS = None -WORDS_IN_DATA_SET = None - - -def init_config(config, storage, storage_config, words_in_data_set, test_files_urls): - global CONFIG, STORAGE, STORAGE_CONFIG, WORDS_IN_DATA_SET, TEST_FILES_URLS - - CONFIG, STORAGE, STORAGE_CONFIG, WORDS_IN_DATA_SET, TEST_FILES_URLS = \ - config, storage, storage_config, words_in_data_set, test_files_urls - - -def get_config(): - return {'config': CONFIG, 'storage': STORAGE, 'storage_config': STORAGE_CONFIG} - - -def get_data_sets(): - """returns urls to data-sets that contains many single word rows (for easy processing) """ - return TEST_FILES_URLS - - -def get_words_in_files(): - return WORDS_IN_DATA_SET diff --git a/lithops/tests/pytest.ini b/lithops/tests/pytest.ini new file mode 100644 index 000000000..fabc2730d --- /dev/null +++ b/lithops/tests/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +log_format = %(asctime)s,%(msecs)03d [%(levelname)s] %(filename)s:%(lineno)s -- %(message)s +log_date_format = %Y-%m-%d %H:%M:%S diff --git a/lithops/tests/test_call_async.py b/lithops/tests/test_call_async.py index 5b892f74a..b74d8fac4 100644 --- a/lithops/tests/test_call_async.py +++ b/lithops/tests/test_call_async.py @@ -14,55 +14,50 @@ # limitations under the License. 
# - -import unittest -import logging +import pytest import lithops -from lithops.tests import main_util -from lithops.tests.util_func.map_util import simple_map_function +import logging +from lithops.tests.functions import ( + SideEffect, + passthrough_function, + simple_map_function +) logger = logging.getLogger(__name__) -CONFIG = None -STORAGE_CONFIG = None -STORAGE = None - - -class TestAsync(unittest.TestCase): - @classmethod - def setUpClass(cls): - global CONFIG, STORAGE, STORAGE_CONFIG +class TestCallAsync: - CONFIG, STORAGE, STORAGE_CONFIG = main_util.get_config().values() - - @classmethod - def setUp(cls): - print('\n-------------------------------------------------------------\n') - - def test_call_async(self): + def test_hello_world(self): def hello_world(param): return "Hello World!" - logger.info('Testing call_async()') - fexec = lithops.FunctionExecutor(config=CONFIG) + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.call_async(hello_world, "") result = fexec.get_result() - self.assertEqual(result, "Hello World!") + assert result == "Hello World!" - fexec = lithops.FunctionExecutor(config=CONFIG) + def test_lambda_fn(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.call_async(lambda x: " ".join(x), ["a", "b"]) result = fexec.get_result() - self.assertEqual(result, "a b") + assert result == "a b" - fexec = lithops.FunctionExecutor(config=CONFIG) + def test_set_iterdata(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.call_async(simple_map_function, (4, 6)) result = fexec.get_result() - self.assertEqual(result, 10) + assert result == 10 - fexec = lithops.FunctionExecutor(config=CONFIG) + def test_dict_iterdata(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.call_async(simple_map_function, {'x': 2, 'y': 8}) result = fexec.get_result() - self.assertEqual(result, 10) - + assert result == 10 + def test_object_with_side_effects(self): + se = SideEffect() + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + fexec.call_async(passthrough_function, se) + result = fexec.get_result() + assert result == 5 diff --git a/lithops/tests/test_future.py b/lithops/tests/test_future.py new file mode 100644 index 000000000..4d18dd9f2 --- /dev/null +++ b/lithops/tests/test_future.py @@ -0,0 +1,26 @@ +import pytest + +import lithops + + +class HasAmbiguousTruthValue: + """An object with an ambiguous truth value, simulates pandas.DataFrame and numpy.NDArray.""" + + def __init__(self, data): + self.data = data + + def __bool__(self): + raise ValueError( + f"The truth value of a {type(self).__name__} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + + +def test_fn_returns_obj_with_ambiguous_truth_value(): + def returns_obj_with_ambiguous_truth_value(param): + return HasAmbiguousTruthValue(param) + + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + future = fexec.call_async(returns_obj_with_ambiguous_truth_value, "Hello World!") + result = future.result() + assert result.data == "Hello World!" diff --git a/lithops/tests/test_map.py b/lithops/tests/test_map.py index e064e6e7c..835924a51 100644 --- a/lithops/tests/test_map.py +++ b/lithops/tests/test_map.py @@ -14,85 +14,76 @@ # limitations under the License. 
# - -import unittest -import logging +import pytest import lithops -from lithops.tests import main_util -from lithops.tests.util_func.map_util import simple_map_function, hello_world, lithops_inside_lithops_map_function, \ - lithops_return_futures_map_function1, lithops_return_futures_map_function3, lithops_return_futures_map_function2, \ +from lithops.tests.functions import ( + simple_map_function, + hello_world, + lithops_inside_lithops_map_function, + lithops_return_futures_map, + lithops_return_futures_call_async, + lithops_return_futures_map_multiple, concat +) -logger = logging.getLogger(__name__) - -CONFIG = None -STORAGE_CONFIG = None -STORAGE = None - - -class TestMap(unittest.TestCase): - - @classmethod - def setUpClass(cls): - global CONFIG, STORAGE, STORAGE_CONFIG - CONFIG, STORAGE, STORAGE_CONFIG = main_util.get_config().values() +class TestMap: - @classmethod - def setUp(cls): - print('\n-------------------------------------------------------------\n') - - def test_map(self): - logger.info('Testing map()') + def test_simple_map(self): iterdata = [(1, 1), (2, 2), (3, 3), (4, 4)] - fexec = lithops.FunctionExecutor(config=CONFIG) + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.map(simple_map_function, iterdata) result = fexec.get_result() - self.assertEqual(result, [2, 4, 6, 8]) + assert result == [2, 4, 6, 8] - fexec = lithops.FunctionExecutor(config=CONFIG, max_workers=1) + def test_max_workers(self): + iterdata = [(1, 1), (2, 2), (3, 3), (4, 4)] + fexec = lithops.FunctionExecutor(config=pytest.lithops_config, max_workers=1) fexec.map(simple_map_function, iterdata) result = fexec.get_result() - self.assertEqual(result, [2, 4, 6, 8]) - - fexec = lithops.FunctionExecutor(config=CONFIG) - set_iterdata = set(range(2)) - fexec.map(hello_world, set_iterdata) - result = fexec.get_result() - self.assertEqual(result, ['Hello World!'] * 2) + assert result == [2, 4, 6, 8] - fexec = lithops.FunctionExecutor(config=CONFIG) + def test_range_iterdata(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) generator_iterdata = range(2) fexec.map(hello_world, generator_iterdata) result = fexec.get_result() - self.assertEqual(result, ['Hello World!'] * 2) + assert result == ['Hello World!'] * 2 - fexec = lithops.FunctionExecutor(config=CONFIG) + def test_dict_iterdata(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) listDicts_iterdata = [{'x': 2, 'y': 8}, {'x': 2, 'y': 8}] fexec.map(simple_map_function, listDicts_iterdata) result = fexec.get_result() - self.assertEqual(result, [10, 10]) + assert result == [10, 10] - fexec = lithops.FunctionExecutor(config=CONFIG) + def test_set_iterdata(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) set_iterdata = [["a", "b"], ["c", "d"]] fexec.map(concat, set_iterdata) result = fexec.get_result() - self.assertEqual(result, ["a b", "c d"]) + assert result == ["a b", "c d"] + + def test_set_range_iterdata(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + set_iterdata = set(range(2)) + fexec.map(hello_world, set_iterdata) + result = fexec.get_result() + assert result == ['Hello World!'] * 2 def test_multiple_executions(self): - logger.info('Testing multiple executions before requesting results') - fexec = lithops.FunctionExecutor(config=CONFIG) + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) iterdata = [(1, 1), (2, 2)] fexec.map(simple_map_function, iterdata) iterdata = [(3, 3), (4, 4)] fexec.map(simple_map_function, iterdata) result = 
fexec.get_result() - self.assertEqual(result, [2, 4, 6, 8]) + assert result == [2, 4, 6, 8] iterdata = [(1, 1), (2, 2)] fexec.map(simple_map_function, iterdata) result = fexec.get_result() - self.assertEqual(result, [2, 4]) + assert result == [2, 4] iterdata = [(1, 1), (2, 2)] futures1 = fexec.map(simple_map_function, iterdata) @@ -100,25 +91,30 @@ def test_multiple_executions(self): iterdata = [(3, 3), (4, 4)] futures2 = fexec.map(simple_map_function, iterdata) result2 = fexec.get_result(fs=futures2) - self.assertEqual(result1, [2, 4]) - self.assertEqual(result2, [6, 8]) + assert result1 == [2, 4] + assert result2 == [6, 8] - def test_internal_executions(self): - logger.info('Testing internal executions') - fexec = lithops.FunctionExecutor(config=CONFIG) + def test_lithops_inside_lithops(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.map(lithops_inside_lithops_map_function, range(1, 5)) result = fexec.get_result() - self.assertEqual(result, [list(range(i)) for i in range(1, 5)]) + assert result == [list(range(i)) for i in range(1, 5)] - fexec = lithops.FunctionExecutor(config=CONFIG) - fexec.call_async(lithops_return_futures_map_function1, 3) - fexec.get_result() + def test_lithops_return_futures_map(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + fexec.call_async(lithops_return_futures_map, 3) + result = fexec.get_result() + assert result == [1, 2, 3] - fexec = lithops.FunctionExecutor(config=CONFIG) - fexec.call_async(lithops_return_futures_map_function2, 3) - fexec.get_result() + def test_lithops_return_futures_call_async(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + fexec.call_async(lithops_return_futures_call_async, 3) + result = fexec.get_result() + assert result == 9 - fexec = lithops.FunctionExecutor(config=CONFIG) - fexec.call_async(lithops_return_futures_map_function3, 3) + def test_lithops_return_futures_map_multiple(self): + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + fexec.call_async(lithops_return_futures_map_multiple, 3) fexec.wait() - fexec.get_result() + result = fexec.get_result() + assert result == [1, 2, 3, 1, 2, 3] diff --git a/lithops/tests/test_map_reduce.py b/lithops/tests/test_map_reduce.py index 76aa27d9c..279508eca 100644 --- a/lithops/tests/test_map_reduce.py +++ b/lithops/tests/test_map_reduce.py @@ -12,167 +12,227 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
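# Hedged sketch of the renamed helpers imported from lithops.tests.functions; they are assumed
# to mirror the removed util_func/map_util.py versions, which matches the results asserted
# above ([1, 2, 3], 9 and [1, 2, 3, 1, 2, 3] for an input of 3).
import lithops

def lithops_return_futures_map(x):
    def _func(n):
        return n + 1
    fexec = lithops.FunctionExecutor()
    return fexec.map(_func, range(x))        # futures resolving to [1, 2, 3] for x=3

def lithops_return_futures_call_async(x):
    def _func(n):
        return n + 1
    fexec = lithops.FunctionExecutor()
    return fexec.call_async(_func, x + 5)    # future resolving to 9 for x=3

def lithops_return_futures_map_multiple(x):
    def _func(n):
        return n + 1
    fexec = lithops.FunctionExecutor()
    return fexec.map(_func, range(x)) + fexec.map(_func, range(x))  # [1, 2, 3, 1, 2, 3]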
+ +import pytest import math -import unittest +import base64 +import urllib import logging - import lithops -from lithops.tests import main_util -from lithops.tests.util_func.map_reduce_util import simple_reduce_function, my_reduce_function -from lithops.tests.util_func.map_util import simple_map_function, my_map_function_obj, my_map_function_url -from lithops.tests.util_func.storage_util import list_dataset_keys, get_dataset_key_size +from concurrent.futures import ThreadPoolExecutor +from lithops.tests.conftest import TESTS_PREFIX +from lithops.config import extract_storage_config +from lithops.tests.functions import ( + simple_reduce_function, + my_reduce_function, + simple_map_function, + my_map_function_obj, + my_map_function_url +) + + +DATASET_PREFIX = TESTS_PREFIX + '/dataset' +base64_bytes = 'aHR0cHM6Ly9zMy1ldS13ZXN0LTEuYW1hem9uYXdzLmNvbS9hcnRtLw=='.encode('ascii') +TEST_FILES_REPO = base64.b64decode(base64_bytes).decode('ascii') +TEST_FILES_URLS = [ + TEST_FILES_REPO + "vocab.enron.txt", + TEST_FILES_REPO + "vocab.kos.txt", + TEST_FILES_REPO + "vocab.nips.txt", + TEST_FILES_REPO + "vocab.nytimes.txt", + TEST_FILES_REPO + "vocab.pubmed.txt" +] logger = logging.getLogger(__name__) -CONFIG = None -STORAGE_CONFIG = None -STORAGE = None -TEST_FILES_URLS = None -PREFIX = '__lithops.test' -DATASET_PREFIX = PREFIX + '/dataset' - -class TestMapReduce(unittest.TestCase): - words_in_cos_files = None +class TestMapReduce: @classmethod - def setUpClass(cls): - global CONFIG, STORAGE, STORAGE_CONFIG, TEST_FILES_URLS - - CONFIG, STORAGE, STORAGE_CONFIG = main_util.get_config().values() - TEST_FILES_URLS = main_util.get_data_sets() - cls.words_in_cos_files = main_util.get_words_in_files() + def setup_class(cls): + storage_config = extract_storage_config(pytest.lithops_config) + storage = lithops.Storage(storage_config=storage_config) + cls.words_in_files = upload_data_sets(storage) + cls.storage = storage + cls.storage_backend = storage.backend + cls.bucket = storage.bucket @classmethod - def setUp(cls): - print('\n-------------------------------------------------------------\n') + def teardown_class(cls): + for key in cls.storage.list_keys(bucket=cls.bucket, prefix=DATASET_PREFIX): + cls.storage.delete_object(bucket=cls.bucket, key=key) - def test_map_reduce(self): + def test_simple_map_reduce(self): logger.info('Testing map_reduce() using memory') iterdata = [(1, 1), (2, 2), (3, 3), (4, 4)] - fexec = lithops.FunctionExecutor(config=CONFIG) - fexec.map_reduce(simple_map_function, iterdata, - simple_reduce_function) + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + fexec.map_reduce(simple_map_function, iterdata, simple_reduce_function) result = fexec.get_result() - self.assertEqual(result, 20) + assert result == 20 - def test_map_reduce_obj_bucket(self): + def test_obj_bucket(self): logger.info('Testing map_reduce() over a bucket') - sb = STORAGE_CONFIG['backend'] - data_prefix = sb + '://' + STORAGE_CONFIG['bucket'] + '/' + DATASET_PREFIX + '/' - fexec = lithops.FunctionExecutor(config=CONFIG) + data_prefix = self.storage_backend + '://' + self.bucket + '/' + DATASET_PREFIX + '/' + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.map_reduce(my_map_function_obj, data_prefix, my_reduce_function) result = fexec.get_result() - self.assertEqual(result, self.__class__.words_in_cos_files) + assert result == self.words_in_files - def test_map_reduce_obj_bucket_reduce_by_key(self): + def test_obj_bucket_reduce_by_key(self): logger.info('Testing map_reduce() over a 
bucket with one reducer per object') - sb = STORAGE_CONFIG['backend'] - data_prefix = sb + '://' + STORAGE_CONFIG['bucket'] + '/' + DATASET_PREFIX + '/' - fexec = lithops.FunctionExecutor(config=CONFIG) + data_prefix = self.storage_backend + '://' + self.bucket + '/' + DATASET_PREFIX + '/' + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.map_reduce(my_map_function_obj, data_prefix, - my_reduce_function, - obj_reduce_by_key=True) + my_reduce_function, obj_reduce_by_key=True) result = fexec.get_result() - # the reducer returns a list containing sum of the words uploaded via each file. - self.assertEqual(sum(result), self.__class__.words_in_cos_files) + assert sum(result) == self.words_in_files - def test_map_reduce_obj_key(self): + def test_obj_key(self): logger.info('Testing map_reduce() over object keys') - sb = STORAGE_CONFIG['backend'] - bucket_name = STORAGE_CONFIG['bucket'] - iterdata = [sb + '://' + bucket_name + '/' + key for key in list_dataset_keys(STORAGE, STORAGE_CONFIG)] - fexec = lithops.FunctionExecutor(config=CONFIG) - fexec.map_reduce(my_map_function_obj, iterdata, - my_reduce_function) + keys = self.storage.list_keys(bucket=self.bucket, prefix=DATASET_PREFIX + '/') + iterdata = [self.storage_backend + '://' + self.bucket + '/' + key for key in keys] + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + fexec.map_reduce(my_map_function_obj, iterdata, my_reduce_function) result = fexec.get_result() - self.assertEqual(result, self.__class__.words_in_cos_files) + assert result == self.words_in_files - def test_map_reduce_obj_key_reduce_by_key(self): + def test_obj_key_reduce_by_key(self): logger.info('Testing map_reduce() over object keys with one reducer per object') - sb = STORAGE_CONFIG['backend'] - bucket_name = STORAGE_CONFIG['bucket'] - iterdata = [sb + '://' + bucket_name + '/' + key for key in list_dataset_keys(STORAGE, STORAGE_CONFIG)] - fexec = lithops.FunctionExecutor(config=CONFIG) + keys = self.storage.list_keys(bucket=self.bucket, prefix=DATASET_PREFIX + '/') + iterdata = [self.storage_backend + '://' + self.bucket + '/' + key for key in keys] + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.map_reduce(my_map_function_obj, iterdata, - my_reduce_function, - obj_reduce_by_key=True) + my_reduce_function, obj_reduce_by_key=True) result = fexec.get_result() - self.assertEqual(sum(result), self.__class__.words_in_cos_files) + assert sum(result) == self.words_in_files - def test_map_reduce_url(self): + def test_url_processing(self): logger.info('Testing map_reduce() over URLs') - fexec = lithops.FunctionExecutor(config=CONFIG) + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) fexec.map_reduce(my_map_function_url, TEST_FILES_URLS, my_reduce_function, obj_chunk_number=2) result = fexec.get_result() - self.assertEqual(result, self.__class__.words_in_cos_files) + assert result == self.words_in_files - def test_chunks_bucket(self): - """tests the ability to create a separate function invocation based on the following parameters: - chunk_size - creates [file_size//chunk_size] invocations to process each chunk_size bytes, of a given object. - chunk_number - creates 'chunk_number' invocations that process [file_size//chunk_number] bytes each. 
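# Worked example of the activation arithmetic used by the chunking tests that follow
# (the object sizes are illustrative, not the real dataset sizes):
import math

OBJ_CHUNK_SIZE = 1 * 800 ** 2                      # 640,000 bytes per partition
object_sizes = [2_200_000, 900_000]                # two hypothetical objects in the bucket

map_activations = sum(math.ceil(size / OBJ_CHUNK_SIZE) for size in object_sizes)
assert map_activations == 4 + 2                    # ceil(2.2e6/6.4e5) + ceil(9e5/6.4e5)
assert map_activations + 1 == 7                    # +1 future for the single reduce function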
""" - - logger.info('Testing chunks on a bucket') + def test_bucket_chunk_size(self): + """tests the ability to create a separate function invocation + based on the following parameters: chunk_size creates [file_size//chunk_size] + invocations to process each chunk_size bytes, of a given object. + """ OBJ_CHUNK_SIZE = 1 * 800 ** 2 # create a new invocation - OBJ_CHUNK_NUMBER = 2 activations = 0 - sb = STORAGE_CONFIG['backend'] - data_prefix = sb + '://' + STORAGE_CONFIG['bucket'] + '/' + DATASET_PREFIX + '/' + data_prefix = self.storage_backend + '://' + self.bucket + '/' + DATASET_PREFIX + '/' - fexec = lithops.FunctionExecutor(config=CONFIG) - futures = fexec.map_reduce(my_map_function_obj, data_prefix, - my_reduce_function, - obj_chunk_size=OBJ_CHUNK_SIZE) + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + futures = fexec.map_reduce( + my_map_function_obj, data_prefix, + my_reduce_function, obj_chunk_size=OBJ_CHUNK_SIZE + ) result = fexec.get_result(futures) - self.assertEqual(result, self.__class__.words_in_cos_files) + assert result == self.words_in_files - for size in get_dataset_key_size(STORAGE, STORAGE_CONFIG): + for size in get_dataset_key_size(self.storage, self.bucket): activations += math.ceil(size / OBJ_CHUNK_SIZE) - self.assertEqual(len(futures), activations + 1) # +1 due to the reduce function + assert len(futures) == activations + 1 # +1 due to the reduce function - fexec = lithops.FunctionExecutor(config=CONFIG) - futures = fexec.map_reduce(my_map_function_obj, data_prefix, - my_reduce_function, obj_chunk_number=OBJ_CHUNK_NUMBER) - result = fexec.get_result(futures) - self.assertEqual(result, self.__class__.words_in_cos_files) + def test_bucket_chunk_number(self): + """tests the ability to create a separate function invocation + based on the following parameters: chunk_number + creates 'chunk_number' invocations that process [file_size//chunk_number] bytes each. + """ + OBJ_CHUNK_NUMBER = 2 - self.assertEqual(len(futures), len(TEST_FILES_URLS)*OBJ_CHUNK_NUMBER + 1) + data_prefix = self.storage_backend + '://' + self.bucket + '/' + DATASET_PREFIX + '/' + + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + futures = fexec.map_reduce( + my_map_function_obj, data_prefix, + my_reduce_function, obj_chunk_number=OBJ_CHUNK_NUMBER + ) + result = fexec.get_result(futures) + assert result == self.words_in_files - def test_chunks_bucket_one_reducer_per_object(self): - """tests the ability to create a separate function invocation based on the following parameters, as well as - create a separate invocation of a reduce function for each object: - chunk_size - creates [file_size//chunk_size] invocations to process each chunk_size bytes, of a given object. - chunk_number - creates 'chunk_number' invocations that process [file_size//chunk_number] bytes each. """ + assert len(futures) == len(TEST_FILES_URLS) * OBJ_CHUNK_NUMBER + 1 - logger.info('Testing chunks on a bucket with one reducer per object') + def test_bucket_chunk_size_one_reducer_per_object(self): + """tests the ability to create a separate function invocation based + on the following parameters, as well as create a separate invocation + of a reduce function for each object: chunk_size creates [file_size//chunk_size] + invocations to process each chunk_size bytes, of a given object. hunk_number + creates 'chunk_number' invocations that process [file_size//chunk_number] bytes each. 
+ """ OBJ_CHUNK_SIZE = 1 * 1024 ** 2 - OBJ_CHUNK_NUMBER = 2 activations = 0 - - sb = STORAGE_CONFIG['backend'] - data_prefix = sb + '://' + STORAGE_CONFIG['bucket'] + '/' + DATASET_PREFIX + '/' - - fexec = lithops.FunctionExecutor(config=CONFIG) - futures = fexec.map_reduce(my_map_function_obj, data_prefix, - my_reduce_function, - obj_chunk_size=OBJ_CHUNK_SIZE, - obj_reduce_by_key=True) + data_prefix = self.storage_backend + '://' + self.bucket + '/' + DATASET_PREFIX + '/' + + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + futures = fexec.map_reduce( + my_map_function_obj, data_prefix, + my_reduce_function, + obj_chunk_size=OBJ_CHUNK_SIZE, + obj_reduce_by_key=True + ) result = fexec.get_result(futures) - self.assertEqual(sum(result), self.__class__.words_in_cos_files) + assert sum(result) == self.words_in_files - for size in get_dataset_key_size(STORAGE, STORAGE_CONFIG): + for size in get_dataset_key_size(self.storage, self.bucket): activations += math.ceil(size / OBJ_CHUNK_SIZE) - self.assertEqual(len(futures), activations + len(TEST_FILES_URLS)) # + len(TEST_FILES_URLS) due to map_reduce activation per object + # + len(TEST_FILES_URLS) due to map_reduce activation per object + assert len(futures) == activations + len(TEST_FILES_URLS) - fexec = lithops.FunctionExecutor(config=CONFIG) - futures = fexec.map_reduce(my_map_function_obj, data_prefix, - my_reduce_function, obj_chunk_number=OBJ_CHUNK_NUMBER, - obj_reduce_by_key=True) + def test_bucket_chunk_number_one_reducer_per_object(self): + """tests the ability to create a separate function invocation based + on the following parameters, as well as create a separate invocation + of a reduce function for each object: chunk_size creates [file_size//chunk_size] + invocations to process each chunk_size bytes, of a given object. chunk_number + creates 'chunk_number' invocations that process [file_size//chunk_number] bytes each.
+ """ + OBJ_CHUNK_NUMBER = 2 + data_prefix = self.storage_backend + '://' + self.bucket + '/' + DATASET_PREFIX + '/' + + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + futures = fexec.map_reduce( + my_map_function_obj, + data_prefix, + my_reduce_function, + obj_chunk_number=OBJ_CHUNK_NUMBER, + obj_reduce_by_key=True + ) result = fexec.get_result(futures) - self.assertEqual(sum(result), self.__class__.words_in_cos_files) - self.assertEqual(len(futures), len(TEST_FILES_URLS) * OBJ_CHUNK_NUMBER + len(TEST_FILES_URLS)) # + len(TEST_FILES_URLS) due to map_reduce activation per object + assert sum(result) == self.words_in_files + # + len(TEST_FILES_URLS) due to map_reduce activation per object + assert len(futures) == len(TEST_FILES_URLS) * OBJ_CHUNK_NUMBER + len(TEST_FILES_URLS) + + +def get_dataset_key_size(storage, bucket): + """return a list of file sizes in bytes for the objects stored + under DATASET_PREFIX """ + sizes = [] + keys = storage.list_keys(bucket=bucket, prefix=DATASET_PREFIX + '/') + for key in keys: + sizes.append(float(storage.head_object(bucket, key)['content-length'])) + return sizes + + +def upload_data_sets(storage): + """ + Uploads the test datasets to storage and returns the total + number of words across all test files + """ + def up(param): + dataset_name = param[1].split("/")[-1] + logger.info(f'Uploading bag-of-words dataset: {dataset_name}') + i, url = param + content = urllib.request.urlopen(url).read() + storage.put_object(bucket=storage.bucket, + key=f'{DATASET_PREFIX}/{dataset_name}', + body=content) + return len(content.split()) + + with ThreadPoolExecutor() as pool: + results = list(pool.map(up, enumerate(TEST_FILES_URLS))) + + return sum(results) diff --git a/lithops/tests/test_retries.py b/lithops/tests/test_retries.py new file mode 100644 index 000000000..66e892a19 --- /dev/null +++ b/lithops/tests/test_retries.py @@ -0,0 +1,156 @@ +import time +import pytest + +from lithops import FunctionExecutor +from lithops import RetryingFunctionExecutor + + +def run_test(function, input, retries, timeout=5): + fexec = FunctionExecutor(config=pytest.lithops_config) + with RetryingFunctionExecutor(fexec) as executor: + futures = executor.map( + function, + input, + timeout=timeout, + retries=retries, + ) + done, pending = executor.wait(futures, throw_except=False) + assert len(pending) == 0 + outputs = set(f.result() for f in done) + return outputs + + +# fmt: off +@pytest.mark.parametrize( + "timing_map, n_tasks, retries", + [ + # no failures + ({}, 3, 2), + # first invocation fails + ({0: [-1], 1: [-1], 2: [-1]}, 3, 2), + # first two invocations fail + ({0: [-1, -1], 1: [-1, -1], 2: [-1, -1]}, 3, 2), + # first input sleeps once + ({0: [20]}, 3, 2), + ], +) +# fmt: on +def test_success(tmp_path, timing_map, n_tasks, retries): + def partial_map_function(x): + return deterministic_failure(tmp_path, timing_map, x) + + outputs = run_test( + function=partial_map_function, + input=range(n_tasks), + retries=retries, + ) + + assert outputs == set(range(n_tasks)) + + check_invocation_counts(tmp_path, timing_map, n_tasks, retries) + + +# fmt: off +@pytest.mark.parametrize( + "timing_map, n_tasks, retries", + [ + # too many failures + ({0: [-1], 1: [-1], 2: [-1, -1, -1]}, 3, 2), + ], +) +# fmt: on +def test_failure(tmp_path, timing_map, n_tasks, retries): + def partial_map_function(x): + return deterministic_failure(tmp_path, timing_map, x) + + with pytest.raises(RuntimeError): + run_test( + function=partial_map_function, + input=range(n_tasks), +
retries=retries, + ) + + check_invocation_counts(tmp_path, timing_map, n_tasks, retries) + + +def read_int_from_file(path): + with open(path) as f: + return int(f.read()) + + +def write_int_to_file(path, i): + with open(path, "w") as f: + f.write(str(i)) + + +def deterministic_failure(path, timing_map, i): + """A function that can either run normally, run slowly, or raise + an exception, depending on input and invocation count. + The timing_map is a dictionary whose keys are inputs and values + are sequences of timing information for each invocation. + The magnitude of the value is the time to sleep in seconds, and + the sign indicates whether the input is returned normally (positive or zero) + or an exception is raised (negative). + If an input is missing then all invocations will run normally. + If there are more invocations than entries in the sequence, then + the extra ones will all run normally. + """ + # increment number of invocations of this function with arg i + invocation_count_file = path / str(i) + if invocation_count_file.exists(): + invocation_count = read_int_from_file(invocation_count_file) + else: + invocation_count = 0 + write_int_to_file(invocation_count_file, invocation_count + 1) + + timing_code = 0 + if i in timing_map: + timing_codes = timing_map[i] + if invocation_count >= len(timing_codes): + timing_code = 0 + else: + timing_code = timing_codes[invocation_count] + + if timing_code >= 0: + time.sleep(timing_code) + return i + else: + time.sleep(-timing_code) + raise RuntimeError( + f"Deliberately fail on invocation number {invocation_count+1} for input {i}" + ) + + +def check_invocation_counts( + path, timing_map, n_tasks, retries=None, expected_invocation_counts_overrides=None +): + expected_invocation_counts = {} + for i in range(n_tasks): + if i not in timing_map: + expected_invocation_counts[i] = 1 + else: + timing_codes = timing_map[i] + expected_invocation_count = len(timing_codes) + 1 + + if retries is not None: + # there shouldn't have been more than retries + 1 invocations + max_invocations = retries + 1 + expected_invocation_count = min( + expected_invocation_count, max_invocations + ) + + expected_invocation_counts[i] = expected_invocation_count + + if expected_invocation_counts_overrides is not None: + expected_invocation_counts.update(expected_invocation_counts_overrides) + + actual_invocation_counts = {i: read_int_from_file(path / str(i)) for i in range(n_tasks)} + + if actual_invocation_counts != expected_invocation_counts: + for i, expected_count in expected_invocation_counts.items(): + actual_count = actual_invocation_counts[i] + if actual_count != expected_count: + print( + f"Invocation count for {i}, expected: {expected_count}, actual: {actual_count}" + ) + assert actual_invocation_counts == expected_invocation_counts diff --git a/lithops/tests/test_storage.py b/lithops/tests/test_storage.py index b22cbed25..04c7eac34 100644 --- a/lithops/tests/test_storage.py +++ b/lithops/tests/test_storage.py @@ -14,297 +14,277 @@ # limitations under the License.
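# Hedged walk-through of the timing_map convention documented in deterministic_failure above,
# with the retries=2 setting used by the parametrized cases (at most retries + 1 = 3 attempts).
# The timing_map here is illustrative, not one of the parametrized cases.
timing_map = {0: [-1, -1], 2: [-1, -1, -1]}
retries = 2

# input 0: attempts 1-2 raise, attempt 3 succeeds            -> 3 invocations, success
# input 1: absent from timing_map                            -> 1 normal invocation
# input 2: would fail 3 times but only 3 attempts are made   -> 3 invocations, overall failure
expected = {
    0: min(len(timing_map[0]) + 1, retries + 1),
    1: 1,
    2: min(len(timing_map[2]) + 1, retries + 1),
}
assert expected == {0: 3, 1: 1, 2: 3}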
# - -import unittest +import pytest import logging +import lithops from io import BytesIO +from lithops.config import extract_storage_config from lithops.storage.utils import CloudObject, StorageNoSuchKeyError -import lithops -from lithops.tests import main_util -from lithops.tests.util_func.map_reduce_util import my_reduce_function -from lithops.tests.util_func.storage_util import my_map_function_storage, \ - my_cloudobject_put, my_cloudobject_get, list_dataset_keys, extract_keys +from lithops.tests.conftest import TESTS_PREFIX +from lithops.tests.functions import my_map_function_storage, \ + my_cloudobject_put, my_cloudobject_get, my_reduce_function + logger = logging.getLogger(__name__) -CONFIG = None -STORAGE_CONFIG = None -STORAGE = None -TEST_FILES_URLS = None -PREFIX = '__lithops.test' -DATASET_PREFIX = PREFIX + '/dataset' +STORAGE_PREFIX = TESTS_PREFIX + '/storage' -class TestStorage(unittest.TestCase): - words_in_cos_files = None +class TestStorage: @classmethod - def setUpClass(cls): - global CONFIG, STORAGE, STORAGE_CONFIG, TEST_FILES_URLS - - CONFIG, STORAGE, STORAGE_CONFIG = main_util.get_config().values() - TEST_FILES_URLS = main_util.get_data_sets() - cls.words_in_cos_files = main_util.get_words_in_files() + def setup_class(cls): + storage_config = extract_storage_config(pytest.lithops_config) + storage = lithops.Storage(storage_config=storage_config) + cls.storage = storage + cls.storage_backend = storage.backend + cls.bucket = storage.bucket + storage.put_object(cls.bucket, STORAGE_PREFIX + '/test0', BytesIO(b'test storage handler')) + storage.put_object(cls.bucket, STORAGE_PREFIX + '/test1', BytesIO(b'test storage')) + cls.words_in_files = 5 @classmethod - def setUp(cls): - print('\n-------------------------------------------------------------\n') + def teardown_class(cls): + for key in cls.storage.list_keys(bucket=cls.bucket, prefix=STORAGE_PREFIX): + cls.storage.delete_object(bucket=cls.bucket, key=key) def test_storage_handler(self): logger.info('Testing "storage" function arg') - iterdata = [(key, STORAGE_CONFIG['bucket']) for key in list_dataset_keys(STORAGE, STORAGE_CONFIG)] - fexec = lithops.FunctionExecutor(config=CONFIG) - fexec.map_reduce(my_map_function_storage, iterdata, - my_reduce_function) + keys = self.storage.list_keys(bucket=self.bucket, prefix=STORAGE_PREFIX) + iterdata = [(key, self.bucket) for key in keys] + fexec = lithops.FunctionExecutor(config=pytest.lithops_config) + fexec.map_reduce(my_map_function_storage, iterdata, my_reduce_function) result = fexec.get_result() - self.assertEqual(result, self.__class__.words_in_cos_files) + assert result == self.words_in_files def test_cloudobject(self): logger.info('Testing cloudobjects') - sb = STORAGE_CONFIG['backend'] - data_prefix = sb + '://' + STORAGE_CONFIG['bucket'] + '/' + DATASET_PREFIX + '/' - with lithops.FunctionExecutor(config=CONFIG) as fexec: + data_prefix = self.storage_backend + '://' + self.bucket + '/' + STORAGE_PREFIX + '/' + with lithops.FunctionExecutor(config=pytest.lithops_config) as fexec: fexec.map(my_cloudobject_put, data_prefix) cloudobjects = fexec.get_result() fexec.call_async(my_cloudobject_get, cloudobjects) result = fexec.get_result() - self.assertEqual(result, self.__class__.words_in_cos_files) + assert result == self.words_in_files fexec.clean(cs=cloudobjects) - def test_storage_put_get_by_stream(self): + def test_put_get_by_stream(self): logger.info('Testing Storage.put_object and get_object with streams') - bucket = STORAGE_CONFIG['bucket'] - bytes_data = b'123' - 
bytes_key = PREFIX + '/bytes' - STORAGE.put_object(bucket, bytes_key, BytesIO(bytes_data)) - bytes_stream = STORAGE.get_object(bucket, bytes_key, stream=True) + bytes_data = b'123' + bytes_key = STORAGE_PREFIX + '/bytes' + self.storage.put_object(self.bucket, bytes_key, BytesIO(bytes_data)) + bytes_stream = self.storage.get_object(self.bucket, bytes_key, stream=True) - self.assertTrue(hasattr(bytes_stream, 'read')) - self.assertEqual(bytes_stream.read(), bytes_data) + assert hasattr(bytes_stream, 'read') + assert bytes_stream.read() == bytes_data - def test_storage_get_by_range(self): + def test_get_by_range(self): logger.info('Testing Storage.get_object with Range argument') - bucket = STORAGE_CONFIG['bucket'] - key = PREFIX + '/bytes' - STORAGE.put_object(bucket, key, b'0123456789') + key = STORAGE_PREFIX + '/bytes' + self.storage.put_object(self.bucket, key, b'0123456789') - result = STORAGE.get_object(bucket, key, extra_get_args={'Range': 'bytes=1-4'}) + result = self.storage.get_object(self.bucket, key, extra_get_args={'Range': 'bytes=1-4'}) - self.assertEqual(result, b'1234') + assert result == b'1234' - def test_storage_list_keys(self): + def test_list_keys(self): logger.info('Testing Storage.list_keys') - bucket = STORAGE_CONFIG['bucket'] test_keys = sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_bar/baz', - PREFIX + '/foo_baz', - PREFIX + '/bar', - PREFIX + '/bar_baz', + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + STORAGE_PREFIX + '/foo_bar/baz', + STORAGE_PREFIX + '/foo_baz', + STORAGE_PREFIX + '/bar', + STORAGE_PREFIX + '/bar_baz', ]) for key in test_keys: - STORAGE.put_object(bucket, key, key.encode()) - - all_bucket_keys = STORAGE.list_keys(bucket) - prefix_keys = STORAGE.list_keys(bucket, PREFIX) - foo_keys = STORAGE.list_keys(bucket, PREFIX + '/foo') - foo_slash_keys = STORAGE.list_keys(bucket, PREFIX + '/foo/') - bar_keys = STORAGE.list_keys(bucket, PREFIX + '/bar') - non_existent_keys = STORAGE.list_keys(bucket, PREFIX + '/doesnt_exist') - - self.assertTrue(set(all_bucket_keys).issuperset(test_keys)) - self.assertTrue(set(prefix_keys).issuperset(test_keys)) - self.assertTrue(all(key.startswith(PREFIX) for key in prefix_keys)) - # To ensure parity between filesystem and object storage implementations, test that - # prefixes are treated as textual prefixes, not directory names. 
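# Small self-contained illustration of the textual-prefix semantics asserted in this test
# (keys are made up; only the startswith behaviour matters):
keys = ['t/foo/baz', 't/foo/bar/baz', 't/foo_bar/baz', 't/foo_baz', 't/bar']

assert [k for k in keys if k.startswith('t/foo')] == ['t/foo/baz', 't/foo/bar/baz', 't/foo_bar/baz', 't/foo_baz']
assert [k for k in keys if k.startswith('t/foo/')] == ['t/foo/baz', 't/foo/bar/baz']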
- self.assertEqual(sorted(foo_keys), sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_bar/baz', - PREFIX + '/foo_baz', - ])) - self.assertEqual(sorted(foo_slash_keys), sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - ])) - self.assertEqual(sorted(bar_keys), sorted([ - PREFIX + '/bar', - PREFIX + '/bar_baz', - ])) - - self.assertEqual(non_existent_keys, []) - - def test_storage_head_object(self): + self.storage.put_object(self.bucket, key, key.encode()) + + all_bucket_keys = self.storage.list_keys(self.bucket) + prefix_keys = self.storage.list_keys(self.bucket, STORAGE_PREFIX) + foo_keys = self.storage.list_keys(self.bucket, STORAGE_PREFIX + '/foo') + foo_slash_keys = self.storage.list_keys(self.bucket, STORAGE_PREFIX + '/foo/') + bar_keys = self.storage.list_keys(self.bucket, STORAGE_PREFIX + '/bar') + non_existent_keys = self.storage.list_keys(self.bucket, STORAGE_PREFIX + '/doesnt_exist') + + assert set(all_bucket_keys).issuperset(test_keys) + assert set(prefix_keys).issuperset(test_keys) + assert all(key.startswith(STORAGE_PREFIX) for key in prefix_keys) + assert sorted(foo_keys) == sorted([ + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + STORAGE_PREFIX + '/foo_bar/baz', + STORAGE_PREFIX + '/foo_baz', + ]) + assert sorted(foo_slash_keys) == sorted([ + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + ]) + assert sorted(bar_keys) == sorted([ + STORAGE_PREFIX + '/bar', + STORAGE_PREFIX + '/bar_baz', + ]) + + assert non_existent_keys == [] + + def test_head_object(self): logger.info('Testing Storage.head_object') - bucket = STORAGE_CONFIG['bucket'] data = b'123456789' - STORAGE.put_object(bucket, PREFIX + '/data', data) - - result = STORAGE.head_object(bucket, PREFIX + '/data') - self.assertEqual(result['content-length'], str(len(data))) + self.storage.put_object(self.bucket, STORAGE_PREFIX + '/data', data) - def get_nonexistent_object(): - STORAGE.head_object(bucket, PREFIX + '/doesnt_exist') + result = self.storage.head_object(self.bucket, STORAGE_PREFIX + '/data') + assert result['content-length'] == str(len(data)) - self.assertRaises(StorageNoSuchKeyError, get_nonexistent_object) + with pytest.raises(StorageNoSuchKeyError): + self.storage.head_object(self.bucket, STORAGE_PREFIX + '/doesnt_exist') - def test_storage_list_objects(self): + def test_list_objects(self): logger.info('Testing Storage.list_objects') - bucket = STORAGE_CONFIG['bucket'] test_keys = sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_bar/baz', - PREFIX + '/foo_baz', - PREFIX + '/bar', - PREFIX + '/bar_baz', + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + STORAGE_PREFIX + '/foo_bar/baz', + STORAGE_PREFIX + '/foo_baz', + STORAGE_PREFIX + '/bar', + STORAGE_PREFIX + '/bar_baz', ]) for key in test_keys: - STORAGE.put_object(bucket, key, key.encode()) - - all_bucket_objects = STORAGE.list_objects(bucket) - prefix_objects = STORAGE.list_objects(bucket, PREFIX) - foo_objects = STORAGE.list_objects(bucket, PREFIX + '/foo') - foo_slash_objects = STORAGE.list_objects(bucket, PREFIX + '/foo/') - bar_objects = STORAGE.list_objects(bucket, PREFIX + '/bar') - non_existent_objects = STORAGE.list_objects(bucket, PREFIX + '/doesnt_exist') - - self.assertTrue(set(extract_keys(all_bucket_objects)).issuperset(test_keys)) - self.assertTrue(set(extract_keys(prefix_objects)).issuperset(test_keys)) - self.assertTrue(all(key.startswith(PREFIX) for key in extract_keys(prefix_objects))) - 
self.assertEqual(sorted(extract_keys(foo_objects)), sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_bar/baz', - PREFIX + '/foo_baz', - ])) - self.assertEqual(sorted(extract_keys(foo_slash_objects)), sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - ])) - self.assertEqual(sorted(extract_keys(bar_objects)), sorted([ - PREFIX + '/bar', - PREFIX + '/bar_baz', - ])) - - self.assertEqual(non_existent_objects, []) - - def test_storage_list_objects_size(self): - logger.info('Testing Storage.list_objects_size') - bucket = STORAGE_CONFIG['bucket'] - isEqual = True + self.storage.put_object(self.bucket, key, key.encode()) + + all_bucket_objects = self.storage.list_objects(self.bucket) + prefix_objects = self.storage.list_objects(self.bucket, STORAGE_PREFIX) + foo_objects = self.storage.list_objects(self.bucket, STORAGE_PREFIX + '/foo') + foo_slash_objects = self.storage.list_objects(self.bucket, STORAGE_PREFIX + '/foo/') + bar_objects = self.storage.list_objects(self.bucket, STORAGE_PREFIX + '/bar') + non_existent_objects = self.storage.list_objects(self.bucket, STORAGE_PREFIX + '/doesnt_exist') + + def extract_keys(bucket_objects): + keys = [] + for obj in bucket_objects: + keys.append(obj['Key']) + return keys + + assert set(extract_keys(all_bucket_objects)).issuperset(test_keys) + assert set(extract_keys(prefix_objects)).issuperset(test_keys) + assert all(key.startswith(STORAGE_PREFIX) for key in extract_keys(prefix_objects)) + assert sorted(extract_keys(foo_objects)) == sorted([ + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + STORAGE_PREFIX + '/foo_bar/baz', + STORAGE_PREFIX + '/foo_baz', + ]) + assert sorted(extract_keys(foo_slash_objects)) == sorted([ + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + ]) + assert sorted(extract_keys(bar_objects)) == sorted([ + STORAGE_PREFIX + '/bar', + STORAGE_PREFIX + '/bar_baz', + ]) + + assert non_existent_objects == [] + def test_list_objects_size(self): + logger.info('Testing Storage.list_objects_size') test_keys = sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_bar/baz', - PREFIX + '/foo_baz', - PREFIX + '/bar', - PREFIX + '/bar_baz', + STORAGE_PREFIX + '/list/foo/baz', + STORAGE_PREFIX + '/list/foo/bar/baz', + STORAGE_PREFIX + '/list/foo_bar/baz', + STORAGE_PREFIX + '/list/foo_baz', + STORAGE_PREFIX + '/list/bar', + STORAGE_PREFIX + '/list/bar_baz', ]) for key in test_keys: - STORAGE.put_object(bucket, key, key.encode()) + self.storage.put_object(self.bucket, key, key.encode()) - all_bucket_objects = STORAGE.list_objects(bucket) - - for key in test_keys: - for obj in all_bucket_objects: - if obj['Key'] == key and obj['Size'] != len(key.encode()): - isEqual = False - self.assertTrue(isEqual) + all_bucket_objects = self.storage.list_objects(self.bucket, prefix=STORAGE_PREFIX + '/list') + isEqual = all(obj['Size'] == len(obj['Key'].encode()) for obj in all_bucket_objects) + assert isEqual def test_delete_object(self): logger.info('Testing Storage.delete_object') - bucket = STORAGE_CONFIG['bucket'] test_keys = sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_baz', - PREFIX + '/bar', - PREFIX + '/to_be_deleted', + STORAGE_PREFIX + '/delete/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + STORAGE_PREFIX + '/foo_baz', + STORAGE_PREFIX + '/bar', + STORAGE_PREFIX + '/to_be_deleted', ]) for key in test_keys: - STORAGE.put_object(bucket, key, key.encode()) + self.storage.put_object(self.bucket, key, key.encode()) - 
STORAGE.delete_object(bucket, PREFIX + '/to_be_deleted') - all_bucket_keys = STORAGE.list_keys(bucket) - self.assertFalse(PREFIX + '/to_be_deleted' in all_bucket_keys) + self.storage.delete_object(self.bucket, STORAGE_PREFIX + '/to_be_deleted') + all_bucket_keys = self.storage.list_keys(self.bucket) + assert STORAGE_PREFIX + '/to_be_deleted' not in all_bucket_keys def test_delete_objects(self): logger.info('Testing Storage.delete_objects') - bucket = STORAGE_CONFIG['bucket'] test_keys = sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_baz', - PREFIX + '/bar', - PREFIX + '/to_be_deleted1', - PREFIX + '/to_be_deleted2', - PREFIX + '/to_be_deleted3' + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + STORAGE_PREFIX + '/foo_baz', + STORAGE_PREFIX + '/bar', + STORAGE_PREFIX + '/to_be_deleted1', + STORAGE_PREFIX + '/to_be_deleted2', + STORAGE_PREFIX + '/to_be_deleted3' ]) keys_to_delete = [ - PREFIX + '/to_be_deleted1', - PREFIX + '/to_be_deleted2', - PREFIX + '/to_be_deleted3' + STORAGE_PREFIX + '/to_be_deleted1', + STORAGE_PREFIX + '/to_be_deleted2', + STORAGE_PREFIX + '/to_be_deleted3' ] for key in test_keys: - STORAGE.put_object(bucket, key, key.encode()) + self.storage.put_object(self.bucket, key, key.encode()) - STORAGE.delete_objects(bucket, keys_to_delete) - all_bucket_keys = STORAGE.list_keys(bucket) - self.assertTrue(all(key not in all_bucket_keys for key in keys_to_delete)) + self.storage.delete_objects(self.bucket, keys_to_delete) + all_bucket_keys = self.storage.list_keys(self.bucket) + assert all(key not in all_bucket_keys for key in keys_to_delete) def test_head_bucket(self): logger.info('Testing Storage.head_bucket') - bucket = STORAGE_CONFIG['bucket'] - result = STORAGE.head_bucket(bucket) - self.assertEqual(result['ResponseMetadata']['HTTPStatusCode'], 200) + result = self.storage.head_bucket(self.bucket) + assert result['ResponseMetadata']['HTTPStatusCode'] == 200 def test_delete_cloudobject(self): logger.info('Testing Storage.delete_cloudobject') - sb = STORAGE_CONFIG['backend'] - bucket = STORAGE_CONFIG['bucket'] test_keys = sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_baz', - PREFIX + '/bar', - PREFIX + '/to_be_deleted', + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + STORAGE_PREFIX + '/foo_baz', + STORAGE_PREFIX + '/bar', + STORAGE_PREFIX + '/to_be_deleted', ]) for key in test_keys: - STORAGE.put_object(bucket, key, key.encode()) - cloudobject = CloudObject(sb, bucket, PREFIX + '/to_be_deleted') - STORAGE.delete_cloudobject(cloudobject) - all_bucket_keys = STORAGE.list_keys(bucket) - self.assertFalse(PREFIX + '/to_be_deleted' in all_bucket_keys) + self.storage.put_object(self.bucket, key, key.encode()) + cloudobject = CloudObject(self.storage_backend, self.bucket, STORAGE_PREFIX + '/to_be_deleted') + self.storage.delete_cloudobject(cloudobject) + all_bucket_keys = self.storage.list_keys(self.bucket) + assert STORAGE_PREFIX + '/to_be_deleted' not in all_bucket_keys def test_delete_cloudobjects(self): logger.info('Testing Storage.delete_cloudobjects') - sb = STORAGE_CONFIG['backend'] - bucket = STORAGE_CONFIG['bucket'] test_keys = sorted([ - PREFIX + '/foo/baz', - PREFIX + '/foo/bar/baz', - PREFIX + '/foo_baz', - PREFIX + '/bar', - PREFIX + '/to_be_deleted1', - PREFIX + '/to_be_deleted2', - PREFIX + '/to_be_deleted3' + STORAGE_PREFIX + '/foo/baz', + STORAGE_PREFIX + '/foo/bar/baz', + STORAGE_PREFIX + '/foo_baz', + STORAGE_PREFIX + '/bar', + STORAGE_PREFIX + '/to_be_deleted1', + 
STORAGE_PREFIX + '/to_be_deleted2', + STORAGE_PREFIX + '/to_be_deleted3' ]) cloudobjects = [] keys_to_delete = [ - PREFIX + '/to_be_deleted1', - PREFIX + '/to_be_deleted2', - PREFIX + '/to_be_deleted3' + STORAGE_PREFIX + '/to_be_deleted1', + STORAGE_PREFIX + '/to_be_deleted2', + STORAGE_PREFIX + '/to_be_deleted3' ] for key in keys_to_delete: - cobject = CloudObject(sb, bucket, key) + cobject = CloudObject(self.storage_backend, self.bucket, key) cloudobjects.append(cobject) for key in test_keys: - STORAGE.put_object(bucket, key, key.encode()) + self.storage.put_object(self.bucket, key, key.encode()) - STORAGE.delete_cloudobjects(cloudobjects) - all_bucket_keys = STORAGE.list_keys(bucket) - self.assertTrue(all(key not in all_bucket_keys for key in keys_to_delete)) + self.storage.delete_cloudobjects(cloudobjects) + all_bucket_keys = self.storage.list_keys(self.bucket) + assert all(key not in all_bucket_keys for key in keys_to_delete) diff --git a/lithops/tests/test_template.py b/lithops/tests/test_template.py deleted file mode 100644 index e603bce20..000000000 --- a/lithops/tests/test_template.py +++ /dev/null @@ -1,80 +0,0 @@ -# -# (C) Copyright IBM Corp. 2020 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import subprocess -import unittest # Mandatory. Required to incorporate the test class into the test framework -import lithops # Mandatory, unless test class is confined to storage testing -from lithops.tests import main_util # required to initialize config variables and other constants below. - - -# To utilize the project's logging capabilities: -import logging -logger = logging.getLogger(__name__) - -# Global constants that upon necessity, can be initialized at setUpClass: -CONFIG = None # contains the entire config data, including compute backend as well as STORAGE_CONFIG and STORAGE -STORAGE_CONFIG = None # to get access to the details of the chosen storage, e.g - bucket in COS. -STORAGE = None # storage class to directly use specific storage backend functions -TEST_FILES_URLS = None # to run tests using the urls of the test files as parameters -PREFIX = '__lithops.test' # prefix of the files uploaded to storage -DATASET_PREFIX = PREFIX + '/dataset' # prefix of the dataset files uploaded to storage - - -class TestFeatureName(unittest.TestCase): # Mandatory,unittest test classes are in camel format and inherit as demonstrated. - words_in_cos_files = None # an example of a class variable - - # method called once, before the tests are run. - @classmethod - def setUpClass(cls): - - # config variables to gain access to config variables as needed: - global CONFIG, STORAGE, STORAGE_CONFIG, TEST_FILES_URLS - CONFIG, STORAGE, STORAGE_CONFIG = main_util.get_config().values() - - TEST_FILES_URLS = main_util.get_data_sets() - cls.words_in_cos_files = main_util.get_words_in_files() # get number of words in test files for testing. - - # called once, after the tests are run. 
- - # @classmethod - # def tearDownClass(cls): - # print('--------- All tests in template have been completed ---------') - - # Method called automatically before every single test method. - - # @classmethod - # def setUp(cls): - # print('\n-------------------------------------------------------------\n') - - # Method called automatically after every single test method. - - # @classmethod - # def tearDown(cls): - # print('--------- A test in template has been completed ---------') - -# ------------------------------------ Incorporate your test function here --------------------------------------------- - - @unittest.skipIf(subprocess.getoutput("lithops --version").split()[2] >= "2.3.4", - "This test function isn't a part of the test procedure.") # conditionally skip a test - def test_example_function(self): # unittest's function naming convention requires functions to be named as demonstrated. - """A simple test function using memory against a lithop's map function.""" - - from lithops.tests.util_func import map_util - logger.info('Testing test_tester_name()') - - fexec = lithops.FunctionExecutor(config=CONFIG) # Passing the config parameter to allow your test function to work on users that provided a path to the config file via a flag - fexec.map(map_util.simple_map_function, [(1, 1), (2, 2), (3, 3), (4, 4)]) - result = fexec.get_result() - self.assertEqual(result, [2, 4, 6, 8]) diff --git a/lithops/tests/tests_main.py b/lithops/tests/tests_main.py deleted file mode 100644 index 0dc7c36ae..000000000 --- a/lithops/tests/tests_main.py +++ /dev/null @@ -1,256 +0,0 @@ -# -# (C) Copyright IBM Corp. 2020 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import os -import sys -import inspect -import pathlib -import argparse -import base64 -import unittest -import logging -import urllib.request -from os import walk -from importlib import import_module - -from lithops.storage import Storage -from lithops.config import default_config, extract_storage_config, load_yaml_config -from concurrent.futures import ThreadPoolExecutor -from lithops.tests import main_util -from lithops.tests.util_func.storage_util import clean_tests -from lithops.utils import setup_lithops_logger - -TEST_MODULES = None # test files, e.g. 
test_map -TEST_GROUPS = {} # dict of test classes in the format: {test class names:test class objects} -CONFIG = None -STORAGE_CONFIG = None -STORAGE = None -PREFIX = '__lithops.test' -DATASET_PREFIX = PREFIX + '/dataset' - -base64_bytes = 'aHR0cHM6Ly9zMy1ldS13ZXN0LTEuYW1hem9uYXdzLmNvbS9hcnRtLw=='.encode('ascii') - -TEST_FILES_REPO = base64.b64decode(base64_bytes).decode('ascii') -TEST_FILES_URLS = [ - TEST_FILES_REPO + "vocab.enron.txt", - TEST_FILES_REPO + "vocab.kos.txt", - TEST_FILES_REPO + "vocab.nips.txt", - TEST_FILES_REPO + "vocab.nytimes.txt", - TEST_FILES_REPO + "vocab.pubmed.txt" -] - -logger = logging.getLogger(__name__) - - -def get_tests_of_class(class_obj): - """returns a list of all test methods of a given test class """ - method_list = [] - for attribute in dir(class_obj): - attribute_value = getattr(class_obj, attribute) - if callable(attribute_value): - if attribute.startswith('test'): - method_list.append(attribute) - return method_list - - -def print_test_functions(): - """responds to '-t help' from CLI by printing the test functions within the various test_modules""" - print("\nAvailable test functions:") - init_test_variables() - - for test_group in sorted(TEST_GROUPS.keys()): - print(f'\n{test_group}:') - for test in get_tests_of_class(TEST_GROUPS[test_group]): - print(f' ->{test}') - - -def print_test_groups(): - """responds to '-g help' from CLI by printing test groups within the various test_modules, e.g. storage/map etc. """ - print("\nAvailable test groups:\n") - init_test_variables() - for test_group in sorted(TEST_GROUPS.keys()): - print(f'{test_group} \n-----------------') - - -def register_test_groups(): - """initializes the TEST_GROUPS variable - test classes within given test modules""" - global TEST_GROUPS - for module in TEST_MODULES: - group_name = str(module).split('test_')[1].split('\'')[0] - # A test group is created for every module that contains a class inheriting from unittest.TestCase. - for member in inspect.getmembers(module, inspect.isclass): - if issubclass(member[1], unittest.TestCase): - TEST_GROUPS[group_name] = member[1] - - -def import_test_modules(): - """dynamically imports test modules from test files within the tests package""" - global TEST_MODULES - TEST_MODULES = [import_module(module) for module in ["lithops.tests." + file[:-3] - for file in - next(walk(pathlib.Path(__file__).parent.absolute()))[2] - if file.startswith("test_")]] - - -def init_test_variables(): - """initializes the global TEST variables in case they haven't been initialized""" - if not TEST_MODULES: - import_test_modules() - if not TEST_GROUPS: - register_test_groups() - - -def upload_data_sets(): - """uploads datasets to storage and return a list of the number of words within each test file""" - - def up(param): - logger.info(f'Uploading bag-of-words dataset: {param[1].split("/")[-1]}') - i, url = param - content = urllib.request.urlopen(url).read() - STORAGE.put_object(bucket=STORAGE_CONFIG['bucket'], - key=f'{DATASET_PREFIX}/test{str(i)}', - body=content) - return len(content.split()) - - with ThreadPoolExecutor() as pool: - results = list(pool.map(up, enumerate(TEST_FILES_URLS))) - result_to_compare = sum(results) - return result_to_compare - - -def config_suite(suite, tests, groups): - """ Loads tests into unittest's test-suite according to user input. 
""" - - if groups: # user specified the name(s) of a test group(s) - groups_list = groups.split(',') - for test_group in groups_list: - if test_group in TEST_GROUPS: - suite.addTest(unittest.makeSuite(TEST_GROUPS[test_group])) - else: - terminate('group', test_group) - - if tests: - if tests == 'all': - for test_class in TEST_GROUPS.values(): # values of TEST_GROUPS are test class objects. - suite.addTest(unittest.makeSuite(test_class)) - - else: # user specified specific test/s - tests_list = tests.split(',') - for test in tests_list: - test_found = False - - if test.find( - '.') != -1: # user specified a test class along with the tester, i.e - test_class = TEST_GROUPS.get(test.split('.')[0]) - test_name = test.split('.')[1] - if test_name in get_tests_of_class(test_class): - suite.addTest(test_class(test_name)) - test_found = True - - else: # user simply specified a test function, i.e - for test_class in TEST_GROUPS.values(): - if test in get_tests_of_class(test_class): - suite.addTest(test_class(test)) - test_found = True - - if not test_found: - terminate('test', test) - - -def run_tests(tests, config=None, group=None, backend=None, storage=None, region=None, - fail_fast=False, keep_datasets=False): - global CONFIG, STORAGE_CONFIG, STORAGE - - config_ow = {'lithops': {}, 'backend': {}} - if storage: - config_ow['lithops']['storage'] = storage - if backend: - config_ow['lithops']['backend'] = backend - if region: - config_ow['backend']['region'] = region - - CONFIG = default_config(config_data=config, config_overwrite=config_ow) - STORAGE_CONFIG = extract_storage_config(CONFIG) - STORAGE = Storage(storage_config=STORAGE_CONFIG) - init_test_variables() - - suite = unittest.TestSuite() - config_suite(suite, tests, group) - words_in_data_set = upload_data_sets() # uploads datasets and returns word count - main_util.init_config(CONFIG, STORAGE, STORAGE_CONFIG, words_in_data_set, TEST_FILES_URLS) - - runner = unittest.TextTestRunner(verbosity=2, failfast=fail_fast) - tests_results = runner.run(suite) - - # removes previously uploaded datasets from storage. - if not keep_datasets: - clean_tests(STORAGE, STORAGE_CONFIG, PREFIX) - - if not tests_results.wasSuccessful(): # Fails github workflow action to reject merge to repository - sys.tracebacklimit = 0 # avoid displaying redundant stack track-back info - raise Exception("--------Test procedure failed. Merge rejected--------") - - -def terminate(msg_type, failed_input): - if msg_type == 'group': # group not found - print(f'unknown test group: {failed_input}, use: "test -g help" to get a list of the available test groups') - else: # test not found - print(f'unknown test: {failed_input}, use: "test -t help" to get a list of the available testers') - sys.exit() - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="test all Lithops's functionality", - usage='python -m lithops.tests.tests_main [-c CONFIG] [-t TESTNAME] ...') - parser.add_argument('-c', '--config', metavar='', default=None, - help="'path to yaml config file") - parser.add_argument('-t', '--test', metavar='', default='all', - help='run a specific test, type "-t help" for tests list') - parser.add_argument('-g', '--groups', metavar='', default='', - help='run all tests belonging to a specific group.' 
- ' type "-g help" for groups list') - parser.add_argument('-b', '--backend', metavar='', default=None, - help='compute backend') - parser.add_argument('-s', '--storage', metavar='', default=None, - help='storage backend') - parser.add_argument('-d', '--debug', action='store_true', default=False, - help='activate debug logging') - parser.add_argument('-f', '--fail_fast', action='store_true', default=False, - help='Stops test run upon first occurrence of a failed test') - parser.add_argument('-k', '--keep_datasets', action='store_true', default=False, - help='keeps datasets in storage after the test run. ' - 'Mainly for some instances in github workflow.') - args = parser.parse_args() - - if args.config: - if os.path.exists(args.config): - args.config = load_yaml_config(args.config) - else: - raise FileNotFoundError("Provided config file '{}' does not exist".format(args.config)) - - log_level = logging.INFO if not args.debug else logging.DEBUG - setup_lithops_logger(log_level) - - if args.groups and args.test == 'all': # if user specified test a group(s) avoid running all tests. - args.test = '' - - if args.groups == 'help': - print_test_groups() - elif args.test == 'help': - print_test_functions() - else: - run_tests(args.test, args.config, args.groups, args.backend, - args.storage, args.fail_fast, args.keep_datasets) diff --git a/lithops/tests/util_func/map_reduce_util.py b/lithops/tests/util_func/map_reduce_util.py deleted file mode 100644 index 4aa26f084..000000000 --- a/lithops/tests/util_func/map_reduce_util.py +++ /dev/null @@ -1,17 +0,0 @@ -def simple_reduce_function(results): - """general purpose reduce function that sums up the results - of previous activations of map functions """ - total = 0 - for map_result in results: - total = total + map_result - return total - - -def my_reduce_function(results): - """sums up the number of words by totaling the number of appearances of each word. - @param results: dictionary that counts the appearances of each word within a url.""" - final_result = 0 - for count in results: - for word in count: - final_result += count[word] - return final_result diff --git a/lithops/tests/util_func/map_util.py b/lithops/tests/util_func/map_util.py deleted file mode 100644 index acb6ae3d3..000000000 --- a/lithops/tests/util_func/map_util.py +++ /dev/null @@ -1,97 +0,0 @@ -import lithops -import logging - -logger = logging.getLogger(__name__) - - -def simple_map_function(x, y): - return x + y - - -def concat(lst): - return " ".join(lst) - - -def hello_world(param): - return "Hello World!" 
- - -def lithops_inside_lithops_map_function(x): - def _func(x): - return x - - fexec = lithops.FunctionExecutor() - fexec.map(_func, range(x)) - return fexec.get_result() - - -def lithops_return_futures_map_function1(x): - def _func(x): - return x + 1 - - fexec = lithops.FunctionExecutor() - return fexec.map(_func, range(x)) - - -def lithops_return_futures_map_function2(x): - def _func(x): - return x + 1 - - fexec = lithops.FunctionExecutor() - return fexec.call_async(_func, x + 5) - - -def lithops_return_futures_map_function3(x): - def _func(x): - return x + 1 - - fexec = lithops.FunctionExecutor() - fut1 = fexec.map(_func, range(x)) - fut2 = fexec.map(_func, range(x)) - return fut1 + fut2 - - -def my_map_function_obj(obj, id): - """returns a dictionary of {word:number of appearances} key:value items.""" - print('Function id: {}'.format(id)) - print('Bucket: {}'.format(obj.bucket)) - print('Key: {}'.format(obj.key)) - print('Partition num: {}'.format(obj.part)) - - print('Chunk size: {}'.format(obj.chunk_size)) - print('Byte range: {}'.format(obj.data_byte_range)) - - counter = {} - data = obj.data_stream.read() - - print('Data lenght: {}'.format(len(data))) - - for line in data.splitlines(): - for word in line.decode('utf-8').split(): - if word not in counter: - counter[word] = 1 - else: - counter[word] += 1 - logger.info('Testing map_reduce() over a bucket') - return counter - - -def my_map_function_url(id, obj): - print('I am processing the object from {}'.format(obj.url)) - print('Function id: {}'.format(id)) - print('Partition num: {}'.format(obj.part)) - print('Chunk size: {}'.format(obj.chunk_size)) - print('Byte range: {}'.format(obj.data_byte_range)) - - counter = {} - data = obj.data_stream.read() - - print('Data lenght: {}'.format(len(data))) - - for line in data.splitlines(): - for word in line.decode('utf-8').split(): - if word not in counter: - counter[word] = 1 - else: - counter[word] += 1 - return counter diff --git a/lithops/tests/util_func/storage_util.py b/lithops/tests/util_func/storage_util.py deleted file mode 100644 index 8ed81e7da..000000000 --- a/lithops/tests/util_func/storage_util.py +++ /dev/null @@ -1,66 +0,0 @@ -import pickle -from lithops.tests.util_func.map_reduce_util import my_reduce_function -from lithops.tests.util_func.map_util import my_map_function_obj - -PREFIX = '__lithops.test' -DATASET_PREFIX = PREFIX + '/dataset' - - -def clean_tests(storage, storage_config, prefix=PREFIX): - """removes datasets from storage""" - def _list_test_keys(storage, storage_config, prefix): - return storage.list_keys(bucket=storage_config['bucket'], prefix=prefix + '/') - - for key in _list_test_keys(storage, storage_config, prefix): - storage.delete_object(bucket=storage_config['bucket'], - key=key) - - -def list_dataset_keys(storage, storage_config, dataset_prefix=DATASET_PREFIX): - return storage.list_keys(bucket=storage_config['bucket'], - prefix=dataset_prefix + '/') - - -def my_cloudobject_put(obj, storage): - """uploads to storage pickled dict of type: {word:number of appearances} """ - counter = my_map_function_obj(obj, 0) - cloudobject = storage.put_cloudobject(pickle.dumps(counter)) - return cloudobject - - -def my_cloudobject_get(cloudobjects, storage): - """unpickles list of data from storage and return their sum by using a reduce function """ - data = [pickle.loads(storage.get_cloudobject(co)) for co in cloudobjects] - return my_reduce_function(data) - - -def my_map_function_storage(key_i, bucket_name, storage): - print(f'I am processing the object 
/{bucket_name}/{key_i}') - counter = {} - data = storage.get_object(bucket_name, key_i) - for line in data.splitlines(): - for word in line.decode('utf-8').split(): - if word not in counter: - counter[word] = 1 - else: - counter[word] += 1 - return counter - - -def get_dataset_key_size(storage, storage_config, key_prefix=DATASET_PREFIX): - """return a list of file sizes in bytes, belonging to files whose names are - prefixed by 'key_prefix' """ - - sizes = [] - bucket_name = storage_config['bucket'] - keys = list_dataset_keys(storage, storage_config, key_prefix) - for key in keys: - sizes.append(float(storage.head_object(bucket_name, key)['content-length'])) - return sizes - - -def extract_keys(bucket_objects): - keys = [] - for obj in bucket_objects: - keys.append(obj['Key']) - return keys diff --git a/lithops/util/joblib/lithops_backend.py b/lithops/util/joblib/lithops_backend.py index cdf8c3c11..c742f80c8 100644 --- a/lithops/util/joblib/lithops_backend.py +++ b/lithops/util/joblib/lithops_backend.py @@ -14,18 +14,22 @@ # limitations under the License. # -import os -import gc import logging +import os import pickle import diskcache - -from joblib._parallel_backends import ParallelBackendBase, PoolManagerMixin -from joblib.parallel import register_parallel_backend from numpy import ndarray +from multiprocessing.pool import ThreadPool from concurrent.futures import ThreadPoolExecutor +from typing import Any, Dict, Optional -from lithops.multiprocessing import Pool +from joblib import Parallel +from joblib._parallel_backends import MultiprocessingBackend +from joblib.pool import PicklingPool +from joblib.parallel import register_parallel_backend + +from lithops.multiprocessing import Pool, cpu_count +from lithops.constants import LITHOPS_TEMP_DIR from lithops.storage import Storage logger = logging.getLogger(__name__) @@ -36,76 +40,91 @@ def register_lithops(): register_parallel_backend("lithops", LithopsBackend) -class LithopsBackend(ParallelBackendBase, PoolManagerMixin): +class LithopsBackend(MultiprocessingBackend): """A ParallelBackend which will use a multiprocessing.Pool. Will introduce some communication and memory overhead when exchanging input and output data with the with the worker Python processes. However, does not suffer from the Python Global Interpreter Lock. """ - def __init__(self, nesting_level=None, inner_max_num_threads=None, **pool_kwargs): - super().__init__(nesting_level, inner_max_num_threads, **{}) - self.__pool_kwargs = pool_kwargs - - # Environment variables to protect against bad situations when nesting - JOBLIB_SPAWNED_PROCESS = "__JOBLIB_SPAWNED_PARALLEL__" - supports_timeout = True supports_sharedmem = False + supports_retrieve_callback = False + + def __init__( + self, + nesting_level: Optional[int] = None, + inner_max_num_threads: Optional[int] = None, + lithops_args: Optional[Dict[str, Any]] = None, + **kwargs + ): + self.lithops_args = lithops_args + self.eff_n_jobs = None + self.prefer = None + super().__init__( + nesting_level=nesting_level, + inner_max_num_threads=inner_max_num_threads, + **kwargs + ) + + def configure( + self, + n_jobs: int = 1, + parallel: Optional[Parallel] = None, + prefer: Optional[str] = None, + require: Optional[str] = None, + **memmappingpool_args + ): + """Make Lithops Pool the father class of PicklingPool. PicklingPool is a + father class that inherits Pool from multiprocessing.pool. 
The next + line is a patch, which changes the inheritance of Pool to be from + lithops.multiprocessing.pool + """ + self.prefer = prefer + PicklingPool.__bases__ = (Pool,) + + if n_jobs == -1: + n_jobs = self.effective_n_jobs(n_jobs) + + eff_n_jobs = super(LithopsBackend, self).configure( + n_jobs, + parallel, + prefer, + require, + **memmappingpool_args + ) + return eff_n_jobs def effective_n_jobs(self, n_jobs): - """Determine the number of jobs which are going to run in parallel. - This also checks if we are attempting to create a nested parallel - loop. - """ - # this must be 1 as we only want to create 1 LithopsExecutor() - return 1 - - def configure(self, n_jobs=1, parallel=None, prefer=None, require=None, - **memmappingpool_args): - """Build a process or thread pool and return the number of workers""" - - n_jobs = self.effective_n_jobs(n_jobs) - - already_forked = int(os.environ.get(self.JOBLIB_SPAWNED_PROCESS, 0)) - if already_forked: - raise ImportError( - '[joblib] Attempting to do parallel computing ' - 'without protecting your import on a system that does ' - 'not support forking. To use parallel-computing in a ' - 'script, you must protect your main loop using "if ' - "__name__ == '__main__'" - '". Please see the joblib documentation on Parallel ' - 'for more information') - # Set an environment variable to avoid infinite loops - os.environ[self.JOBLIB_SPAWNED_PROCESS] = '1' - - # Make sure to free as much memory as possible before forking - gc.collect() - self._pool = Pool() - self.parallel = parallel - - return n_jobs - - def terminate(self): - """Shutdown the process or thread pool""" - super().terminate() - if self.JOBLIB_SPAWNED_PROCESS in os.environ: - del os.environ[self.JOBLIB_SPAWNED_PROCESS] + eff_n_jobs = super(LithopsBackend, self).effective_n_jobs(n_jobs) + if n_jobs == -1: + self.eff_n_jobs = self.eff_n_jobs or cpu_count() + eff_n_jobs = self.eff_n_jobs + return eff_n_jobs + + def start_call(self): + """This is a workaround to make "batch size" working properly + and invoke all the tasks using a single map() instead of + individual apply_async()""" + self.parallel._cached_effective_n_jobs = 1 + self.parallel.pre_dispatch = 'all' def compute_batch_size(self): return int(1e6) def apply_async(self, func, callback=None): """Schedule a func to be run""" - # return self._get_pool().map_async(handle_call, func.items, callback=callback) # bypass - mem_opt_calls = find_shared_objects(func.items) - return self._get_pool().starmap_async(handle_call, mem_opt_calls) + if self.prefer == "threads": + return self._get_pool().apply_async(handle_call_threads, (mem_opt_calls, ), callback=callback) + else: + return self._get_pool().starmap_async(handle_call_process, mem_opt_calls, callback=callback) def find_shared_objects(calls): # find and annotate repeated arguments + logger.info('Optimizing shared data between tasks') + record = {} for i, call in enumerate(calls): for j, arg in enumerate(call[1]): @@ -159,7 +178,14 @@ def put_arg_obj(positions): return [tuple(item) for item in calls] -def handle_call(func, args, kwargs, proxy_positions=[]): +def handle_call_threads(mem_opt_calls): + with ThreadPool(processes=len(mem_opt_calls)) as pool: + results = pool.starmap(handle_call_process, mem_opt_calls) + + return list(results) + + +def handle_call_process(func, args, kwargs, proxy_positions=[]): if len(proxy_positions) > 0: args, kwargs = replace_with_values(args, kwargs, proxy_positions) @@ -169,7 +195,7 @@ def handle_call(func, args, kwargs, proxy_positions=[]): def 
replace_with_values(args, kwargs, proxy_positions): args_as_list = list(args) thread_pool = ThreadPoolExecutor(max_workers=len(proxy_positions)) - cache = diskcache.Cache('/tmp/lithops/cache') + cache = diskcache.Cache(os.path.join(LITHOPS_TEMP_DIR, 'cache')) def get_arg_obj(idx_or_key): if isinstance(idx_or_key, str): diff --git a/lithops/util/ssh_client.py b/lithops/util/ssh_client.py index 6870125b7..4e33a8742 100644 --- a/lithops/util/ssh_client.py +++ b/lithops/util/ssh_client.py @@ -1,7 +1,6 @@ import paramiko import logging import os -from lithops.standalone.standalone import LithopsValidationError logger = logging.getLogger(__name__) @@ -15,9 +14,10 @@ def __init__(self, ip_address, ssh_credentials): if 'key_filename' in self.ssh_credentials: fpath = os.path.expanduser(self.ssh_credentials['key_filename']) - if not os.path.exists(fpath): - raise LithopsValidationError(f"Private key file {fpath} doesn't exist") self.ssh_credentials['key_filename'] = fpath + if not os.path.exists(fpath): + logger.debug(f"Private key file {fpath} doesn't exist. Trying with the default key") + self.ssh_credentials['key_filename'] = os.path.expanduser('~/.ssh/id_rsa') def close(self): """ @@ -28,7 +28,7 @@ def close(self): def create_client(self, timeout=2): """ - Crate the SSH client connection + Create the SSH client connection """ try: self.ssh_client = paramiko.SSHClient() @@ -51,9 +51,6 @@ def create_client(self, timeout=2): logger.debug(f"{self.ip_address} ssh client created") except Exception as e: - pk = self.ssh_credentials.get('key_filename') - if pk and str(e) == 'Authentication failed.': - raise LithopsValidationError(f'Private key {pk} is not valid') raise e return self.ssh_client @@ -72,17 +69,19 @@ def run_remote_command(self, cmd, timeout=None, run_async=False): try: stdin, stdout, stderr = self.ssh_client.exec_command(cmd, timeout=timeout) - except Exception as e: + except Exception: # Normally this is a timeout exception self.ssh_client = self.create_client() stdin, stdout, stderr = self.ssh_client.exec_command(cmd, timeout=timeout) out = None + err = None + if not run_async: out = stdout.read().decode().strip() - error = stderr.read().decode().strip() + err = stderr.read().decode().strip() - return out + return out, err def download_remote_file(self, remote_src, local_dst): """ diff --git a/lithops/utils.py b/lithops/utils.py index 9a00fcb4b..b6fc4076b 100644 --- a/lithops/utils.py +++ b/lithops/utils.py @@ -21,6 +21,7 @@ import sys import uuid import json +import socket import shutil import base64 import inspect @@ -28,8 +29,11 @@ import lithops import zipfile import platform +import threading import logging.config import subprocess as sp +from enum import Enum +from contextlib import closing from lithops import constants from lithops.version import __version__ @@ -252,8 +256,8 @@ def add_folder_to_zip(zip_file, full_dir_path, sub_dir=''): def verify_runtime_name(runtime_name): """Check if the runtime name has a correct formating""" - assert re.match("^[A-Za-z0-9_/.:-]*$", runtime_name),\ - 'Runtime name "{}" not valid'.format(runtime_name) + assert re.match("^[A-Za-z0-9_/.:-]*$", runtime_name), \ + f'Runtime name "{runtime_name}" not valid' def timeout_handler(error_msg, signum, frame): @@ -316,7 +320,7 @@ def convert_bools_to_string(extra_env): Converts all booleans of a dictionary to a string """ for key in extra_env: - if type(extra_env[key]) == bool: + if type(extra_env[key]) is bool: extra_env[key] = str(extra_env[key]) return extra_env @@ -437,6 +441,13 @@ def 
get_docker_username(): return user +def find_free_port(): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return s.getsockname()[1] + + def split_object_url(obj_url): if '://' in obj_url: sb, path = obj_url.split('://') @@ -482,7 +493,7 @@ def format_data(iterdata, extra_args): # Format iterdata in a proper way if type(iterdata) in [range, set]: data = list(iterdata) - elif type(iterdata) != list and type(iterdata) != FuturesList: + elif type(iterdata) is not list and type(iterdata) is not FuturesList: data = [iterdata] else: data = iterdata @@ -531,7 +542,7 @@ def verify_args(func, iterdata, extra_args): new_data = list() for elem in data: - if type(elem) == dict: + if isinstance(elem, dict): if set(list(new_func_sig.parameters.keys())) <= set(elem): new_data.append(elem) else: @@ -540,7 +551,7 @@ def verify_args(func, iterdata, extra_args): "the args must be: {}" .format(list(elem.keys()), list(new_func_sig.parameters.keys()))) - elif type(elem) == tuple: + elif isinstance(elem, tuple): new_elem = dict(new_func_sig.bind(*list(elem)).arguments) new_data.append(new_elem) else: @@ -658,8 +669,8 @@ def read(self, n=None): self._first_byte = self.sb.read(self._plusbytes) retval = self.sb.read(n) - - self.pos += len(retval) + last_row_end_pos = len(retval) + self.pos += last_row_end_pos first_row_start_pos = 0 if self._first_read and self._first_byte and \ @@ -670,11 +681,11 @@ def read(self, n=None): first_row_start_pos = retval.find(self.newline_char) + 1 self._first_read = False - last_row_end_pos = self.pos # Find end of the line in threshold - if self.pos > self.size: - last_byte_pos = retval[self.size - 1:].find(self.newline_char) - last_row_end_pos = self.size + last_byte_pos + if self.pos >= self.size: + current_end_pos = last_row_end_pos - (self.pos - self.size) + last_byte_pos = retval[current_end_pos - 1:].find(self.newline_char) + last_row_end_pos = current_end_pos + last_byte_pos self._eof = True return retval[first_row_start_pos:last_row_end_pos] @@ -727,4 +738,30 @@ def is_podman(docker_path): return False +class BackendType(Enum): + BATCH = 'batch' + FAAS = 'faas' + + +class CountDownLatch: + def __init__(self, count): + self.count = count + self.event = threading.Event() + self.lock = threading.Lock() + + def unlock(self): + with self.lock: + self.count -= 1 + if self.count == 0: + self.event.set() + + def wait(self): + if self.count > 0: + self.event.wait() + + @property + def done(self): + return self.count == 0 + + CURRENT_PY_VERSION = version_str(sys.version_info) diff --git a/lithops/version.py b/lithops/version.py index b7be92b95..56fcc5d20 100644 --- a/lithops/version.py +++ b/lithops/version.py @@ -1,5 +1,5 @@ -__version__ = "3.0.1.dev0" +__version__ = "3.6.3.dev0" if __name__ == "__main__": print(__version__) diff --git a/lithops/wait.py b/lithops/wait.py index a07c39813..22f12b2e8 100644 --- a/lithops/wait.py +++ b/lithops/wait.py @@ -49,8 +49,9 @@ def wait(fs: Union[ResponseFuture, FuturesList, List[ResponseFuture]], download_results: Optional[bool] = False, timeout: Optional[int] = None, threadpool_size: Optional[int] = THREADPOOL_SIZE, - wait_dur_sec: Optional[int] = WAIT_DUR_SEC, - show_progressbar: Optional[bool] = True) -> Tuple[FuturesList, FuturesList]: + wait_dur_sec: Optional[int] = None, + show_progressbar: Optional[bool] = True, + futures_from_executor_wait: Optional[bool] = False) -> Tuple[FuturesList, FuturesList]: """ Wait for the Future instances 
(possibly created by different Executor instances) given by fs to complete. Returns a named 2-tuple of sets. The first set, named done, @@ -67,7 +68,7 @@ def wait(fs: Union[ResponseFuture, FuturesList, List[ResponseFuture]], :param download_results: Download results. Default false (Only get statuses) :param timeout: Timeout of waiting for results. :param threadpool_size: Number of threads to use. Default 64 - :param wait_dur_sec: Time interval between each check. + :param wait_dur_sec: Time interval between each check. Default 1 second :param show_progressbar: whether or not to show the progress bar. :return: `(fs_done, fs_notdone)` @@ -78,28 +79,32 @@ def wait(fs: Union[ResponseFuture, FuturesList, List[ResponseFuture]], if not fs: return - if type(fs) != list and type(fs) != FuturesList: + if type(fs) is not list and type(fs) is not FuturesList: fs = [fs] if download_results: - fs_to_wait = len(fs) - msg = (f'ExecutorID {fs[0].executor_id} - Getting results from {len(fs)} function activations') fs_done = [f for f in fs if f.done] fs_not_done = [f for f in fs if not f.done] - else: - fs_to_wait = math.ceil(return_when * len(fs) / 100) - msg_text = 'any' if return_when == ANY_COMPLETED else f'{return_when}%' - msg = (f'ExecutorID {fs[0].executor_id} - Waiting for {msg_text} of ' - f'{len(fs)} function activations to complete') fs_done = [f for f in fs if f.success or f.done] fs_not_done = [f for f in fs if not (f.success or f.done)] - logger.info(msg) - if not fs_not_done: + logger.debug(f'ExecutorID {fs[0].executor_id} - All function activations are done') return fs_done, fs_not_done + not_done_futures = fs_not_done if futures_from_executor_wait else fs + + fs_to_wait = math.ceil(return_when * len(not_done_futures) / 100) + + if return_when == ALL_COMPLETED: + logger.info(f'ExecutorID {fs[0].executor_id} - Waiting for ' + f'{len(not_done_futures)} function activations to complete') + else: + txt = 'any' if return_when == ANY_COMPLETED else f'{return_when}%' + logger.info(f'ExecutorID {fs[0].executor_id} - Waiting for {txt} of ' + f'{len(not_done_futures)} function activations to complete') + if is_unix_system() and timeout is not None: logger.debug(f'Setting waiting timeout to {timeout} seconds') error_msg = 'Timeout of {timeout} seconds exceeded waiting for function activations to finish' @@ -108,8 +113,7 @@ def wait(fs: Union[ResponseFuture, FuturesList, List[ResponseFuture]], # Setup progress bar pbar = None - if not is_lithops_worker() and logger.getEffectiveLevel() == logging.INFO \ - and show_progressbar: + if not is_lithops_worker() and show_progressbar and logger.getEffectiveLevel() != logging.DEBUG: from tqdm.auto import tqdm if not is_notebook(): print() @@ -127,7 +131,8 @@ def wait(fs: Union[ResponseFuture, FuturesList, List[ResponseFuture]], internal_storage=executor_data.internal_storage) job_monitor.start(fs=executor_data.futures) - sleep_sec = wait_dur_sec if job_monitor.backend == 'storage' else 0.3 + sleep_sec = wait_dur_sec or WAIT_DUR_SEC if job_monitor.type == 'storage' \ + and job_monitor.storage_backend != 'localhost' else 0.1 if return_when == ALWAYS: for executor_data in executors_data: @@ -137,6 +142,8 @@ def wait(fs: Union[ResponseFuture, FuturesList, List[ResponseFuture]], threadpool_size=threadpool_size) else: while not _check_done(fs, return_when, download_results): + if not job_monitor.is_alive(): + job_monitor.start(fs=fs) for executor_data in executors_data: new_data = _get_executor_data(fs, executor_data, pbar=pbar, throw_except=throw_except, @@ -149,7 
+156,7 @@ def wait(fs: Union[ResponseFuture, FuturesList, List[ResponseFuture]], not_dones_call_ids = [(f.job_id, f.call_id) for f in fs if not f.done] else: not_dones_call_ids = [(f.job_id, f.call_id) for f in fs if not f.success and not f.done] - msg = ('Cancelled - Total Activations not done: {}'.format(len(not_dones_call_ids))) + msg = (f'Cancelled - Total Activations not done: {len(not_dones_call_ids)}') if pbar: pbar.close() print() @@ -182,7 +189,7 @@ def get_result(fs: Optional[Union[ResponseFuture, FuturesList, List[ResponseFutu throw_except: Optional[bool] = True, timeout: Optional[int] = None, threadpool_size: Optional[int] = THREADPOOL_SIZE, - wait_dur_sec: Optional[int] = WAIT_DUR_SEC, + wait_dur_sec: Optional[int] = None, show_progressbar: Optional[bool] = True): """ For getting the results from all function activations @@ -192,14 +199,19 @@ def get_result(fs: Optional[Union[ResponseFuture, FuturesList, List[ResponseFutu :param throw_except: Reraise exception if call raised. Default True. :param timeout: Timeout for waiting for results. :param threadpool_size: Number of threads to use. Default 128 - :param wait_dur_sec: Time interval between each check. + :param wait_dur_sec: Time interval between each check. Default 1 second :param show_progressbar: whether or not to show the progress bar. :return: The result of the future/s """ - if type(fs) != list and type(fs) != FuturesList: + if type(fs) is not list and type(fs) is not FuturesList: fs = [fs] + logger.info( + (f'ExecutorID {fs[0].executor_id} - Getting results from ' + f'{len(fs)} function activations') + ) + fs_done, _ = wait(fs=fs, throw_except=throw_except, timeout=timeout, download_results=True, internal_storage=internal_storage, @@ -207,11 +219,10 @@ def get_result(fs: Optional[Union[ResponseFuture, FuturesList, List[ResponseFutu wait_dur_sec=wait_dur_sec, show_progressbar=show_progressbar) result = [] - fs_done = [f for f in fs_done if not f.futures and f._produce_output] - for f in fs_done: + for f in [f for f in fs_done if not f.futures and f._produce_output]: result.append(f.result(throw_except=throw_except)) - logger.debug("ExecutorID {} - Finished getting results".format(fs[0].executor_id)) + logger.debug(f"ExecutorID {fs[0].executor_id} - Finished getting results") return result diff --git a/lithops/worker/__init__.py b/lithops/worker/__init__.py index 43b3ad34d..433b8a8ad 100644 --- a/lithops/worker/__init__.py +++ b/lithops/worker/__init__.py @@ -1,2 +1,7 @@ from .handler import function_handler from .invoker import function_invoker + +__all__ = [ + 'function_handler', + 'function_invoker' +] diff --git a/lithops/worker/handler.py b/lithops/worker/handler.py index b9f6b4578..0de58c580 100644 --- a/lithops/worker/handler.py +++ b/lithops/worker/handler.py @@ -20,12 +20,13 @@ import zlib import time import json -import queue +import uuid import base64 import pickle import logging import traceback import multiprocessing as mp +from queue import Queue, Empty from threading import Thread from multiprocessing import Process, Pipe from tblib import pickling_support @@ -36,11 +37,12 @@ from lithops.config import extract_storage_config from lithops.storage import InternalStorage from lithops.worker.jobrunner import JobRunner -from lithops.worker.utils import LogStream, custom_redirection,\ +from lithops.worker.utils import LogStream, custom_redirection, \ get_function_and_modules, get_function_data from lithops.constants import JOBS_PREFIX, LITHOPS_TEMP_DIR, MODULES_DIR from lithops.utils import 
setup_lithops_logger, is_unix_system from lithops.worker.status import create_call_status +from lithops.worker.utils import SystemMonitor pickling_support.install() @@ -52,48 +54,48 @@ class ShutdownSentinel: pass -def function_handler(payload): +def create_job(payload: dict) -> SimpleNamespace: job = SimpleNamespace(**payload) + storage_config = extract_storage_config(job.config) + internal_storage = InternalStorage(storage_config) + job.func = get_function_and_modules(job, internal_storage) + job.data = get_function_data(job, internal_storage) + + return job + + +def function_handler(payload): + """ + Default function entry point called from Serverless backends + """ + job = create_job(payload) setup_lithops_logger(job.log_level) worker_processes = min(job.worker_processes, len(job.call_ids)) logger.info(f'Tasks received: {len(job.call_ids)} - Worker processes: {worker_processes}') - env = job.extra_env - env['LITHOPS_WORKER'] = 'True' - env['PYTHONUNBUFFERED'] = 'True' - os.environ.update(env) - - storage_config = extract_storage_config(job.config) - internal_storage = InternalStorage(storage_config) - job.func = get_function_and_modules(job, internal_storage) - job_data = get_function_data(job, internal_storage) - if worker_processes == 1: - job_queue = queue.Queue() + work_queue = Queue() for call_id in job.call_ids: - data = job_data.pop(0) - job_queue.put((job, call_id, data)) - job_queue.put(ShutdownSentinel()) - process_runner(job_queue) + data = job.data.pop(0) + work_queue.put((job, call_id, data)) + work_queue.put(ShutdownSentinel()) + python_queue_consumer(0, work_queue, ) else: manager = SyncManager() manager.start() - job_queue = manager.Queue() + work_queue = manager.Queue() job_runners = [] for call_id in job.call_ids: - data = job_data.pop(0) - job_queue.put((job, call_id, data)) - - for i in range(worker_processes): - job_queue.put(ShutdownSentinel()) + data = job.data.pop(0) + work_queue.put((job, call_id, data)) - for runner_id in range(worker_processes): - p = mp.Process(target=process_runner, args=(job_queue,)) + for pid in range(worker_processes): + work_queue.put(ShutdownSentinel()) + p = mp.Process(target=python_queue_consumer, args=(pid, work_queue,)) job_runners.append(p) p.start() - logger.info(f'Worker process {runner_id} started') for runner in job_runners: runner.join() @@ -105,67 +107,93 @@ def function_handler(payload): if module_path in sys.path: sys.path.remove(module_path) - # Unset specific job env vars - for key in job.extra_env: - os.environ.pop(key, None) os.environ.pop('__LITHOPS_TOTAL_EXECUTORS', None) -def process_runner(job_queue): +def python_queue_consumer(pid, work_queue, initializer=None, callback=None): """ - Listens the job_queue and executes the jobs + Listens to the job_queue and executes the individual job tasks """ + logger.info(f'Worker process {pid} started') while True: try: - event = job_queue.get(block=True) + event = work_queue.get(block=True) + except Empty: + break except BrokenPipeError: break if isinstance(event, ShutdownSentinel): break - job, call_id, data = event - job.start_tstamp = time.time() - job.call_id = call_id - job.data = data + task, call_id, data = event + task.call_id = call_id + task.data = data - storage_backend = job.config['lithops']['storage'] - bucket = job.config[storage_backend]['storage_bucket'] - job.task_dir = os.path.join(LITHOPS_TEMP_DIR, bucket, JOBS_PREFIX, job.job_key, job.call_id) - job.log_file = os.path.join(job.task_dir, 'execution.log') - job.stats_file = os.path.join(job.task_dir, 
'job_stats.txt') - os.makedirs(job.task_dir, exist_ok=True) + initializer(pid, task) if initializer is not None else None - with open(job.log_file, 'a') as log_strem: - job.log_stream = LogStream(log_strem) - with custom_redirection(job.log_stream): - run_job(job) + prepare_and_run_task(task) + callback(pid, task) if callback is not None else None -def run_job(job): + logger.info(f'Worker process {pid} finished') + + +def prepare_and_run_task(task): + task.start_tstamp = time.time() + + if '__LITHOPS_ACTIVATION_ID' not in os.environ: + act_id = str(uuid.uuid4()).replace('-', '')[:12] + os.environ['__LITHOPS_ACTIVATION_ID'] = act_id + + os.environ['LITHOPS_WORKER'] = 'True' + os.environ['PYTHONUNBUFFERED'] = 'True' + os.environ.update(task.extra_env) + + storage_backend = task.config['lithops']['storage'] + bucket = task.config[storage_backend]['storage_bucket'] + task.task_dir = os.path.join(LITHOPS_TEMP_DIR, bucket, JOBS_PREFIX, task.job_key, task.call_id) + task.log_file = os.path.join(task.task_dir, 'execution.log') + task.stats_file = os.path.join(task.task_dir, 'job_stats.txt') + os.makedirs(task.task_dir, exist_ok=True) + + with open(task.log_file, 'a') as log_strem: + task.log_stream = LogStream(log_strem) + with custom_redirection(task.log_stream): + run_task(task) + + # Unset specific job env vars + for key in task.extra_env: + os.environ.pop(key, None) + + +def run_task(task): """ Runs a single job within a separate process """ - setup_lithops_logger(job.log_level) + setup_lithops_logger(task.log_level) backend = os.environ.get('__LITHOPS_BACKEND', '') - logger.info("Lithops v{} - Starting {} execution".format(__version__, backend)) - logger.info("Execution ID: {}/{}".format(job.job_key, job.call_id)) + logger.info(f"Lithops v{__version__} - Starting {backend} execution") + logger.info(f"Execution ID: {task.job_key}/{task.call_id}") - env = job.extra_env - env['LITHOPS_CONFIG'] = json.dumps(job.config) - env['__LITHOPS_SESSION_ID'] = '-'.join([job.job_key, job.call_id]) + env = task.extra_env + env['LITHOPS_CONFIG'] = json.dumps(task.config) + env['__LITHOPS_SESSION_ID'] = '-'.join([task.job_key, task.call_id]) os.environ.update(env) - storage_config = extract_storage_config(job.config) + storage_config = extract_storage_config(task.config) internal_storage = InternalStorage(storage_config) - call_status = create_call_status(job, internal_storage) + call_status = create_call_status(task, internal_storage) - if job.runtime_memory: - logger.debug('Runtime: {} - Memory: {}MB - Timeout: {} seconds' - .format(job.runtime_name, job.runtime_memory, job.execution_timeout)) + runtime_name = task.runtime_name + memory = task.runtime_memory + timeout = task.execution_timeout + + if task.runtime_memory: + logger.debug(f'Runtime: {runtime_name} - Memory: {memory}MB - Timeout: {timeout} seconds') else: - logger.debug('Runtime: {} - Timeout: {} seconds'.format(job.runtime_name, job.execution_timeout)) + logger.debug(f'Runtime: {runtime_name} - Timeout: {timeout} seconds') job_interruped = False @@ -174,13 +202,34 @@ def run_job(job): call_status.send_init_event() handler_conn, jobrunner_conn = Pipe() - jobrunner = JobRunner(job, jobrunner_conn, internal_storage) + jobrunner = JobRunner(task, jobrunner_conn, internal_storage) logger.debug('Starting JobRunner process') jrp = Process(target=jobrunner.run) if is_unix_system() else Thread(target=jobrunner.run) + + process_id = os.getpid() if is_unix_system() else mp.current_process().pid + sys_monitor = SystemMonitor(process_id) + 
sys_monitor.start() + jrp.start() - jrp.join(job.execution_timeout) + jrp.join(task.execution_timeout) + + sys_monitor.stop() logger.debug('JobRunner process finished') + cpu_info = sys_monitor.get_cpu_info() + call_status.add('worker_func_cpu_usage', cpu_info['usage']) + call_status.add('worker_func_cpu_system_time', round(cpu_info['system'], 8)) + call_status.add('worker_func_cpu_user_time', round(cpu_info['user'], 8)) + + net_io = sys_monitor.get_network_io() + call_status.add('worker_func_sent_net_io', net_io['sent']) + call_status.add('worker_func_recv_net_io', net_io['recv']) + + mem_info = sys_monitor.get_memory_info() + call_status.add('worker_func_rss', mem_info['rss']) + call_status.add('worker_func_vms', mem_info['vms']) + call_status.add('worker_func_uss', mem_info['uss']) + if jrp.is_alive(): # If process is still alive after jr.join(job_max_runtime), kill it try: @@ -189,7 +238,7 @@ def run_job(job): # thread does not have terminate method pass msg = ('Function exceeded maximum time of {} seconds and was ' - 'killed'.format(job.execution_timeout)) + 'killed'.format(task.execution_timeout)) raise TimeoutError('HANDLER', msg) if not handler_conn.poll(): @@ -201,8 +250,8 @@ def run_job(job): msg = 'Function exceeded maximum memory and was killed' raise MemoryError('HANDLER', msg) - if os.path.exists(job.stats_file): - with open(job.stats_file, 'r') as fid: + if os.path.exists(task.stats_file): + with open(task.stats_file, 'r') as fid: for line in fid.readlines(): key, value = line.strip().split(" ", 1) try: @@ -232,9 +281,9 @@ def run_job(job): call_status.add('worker_end_tstamp', time.time()) # Flush log stream and save it to the call status - job.log_stream.flush() - if os.path.isfile(job.log_file): - with open(job.log_file, 'rb') as lf: + task.log_stream.flush() + if os.path.isfile(task.log_file): + with open(task.log_file, 'rb') as lf: log_str = base64.b64encode(zlib.compress(lf.read())).decode() call_status.add('logs', log_str) diff --git a/lithops/worker/jobrunner.py b/lithops/worker/jobrunner.py index 809c20ba9..fa60d0307 100644 --- a/lithops/worker/jobrunner.py +++ b/lithops/worker/jobrunner.py @@ -18,6 +18,7 @@ import os import io import sys +import ast import pika import time import pickle @@ -26,20 +27,20 @@ import requests import traceback from pydoc import locate -from distutils.util import strtobool from lithops.worker.utils import peak_memory try: import numpy as np + np.__version__ except ModuleNotFoundError: pass from lithops.storage import Storage from lithops.wait import wait from lithops.future import ResponseFuture -from lithops.utils import WrappedStreamingBody, sizeof_fmt, is_object_processing_function, FuturesList,\ - verify_args +from lithops.utils import WrappedStreamingBody, sizeof_fmt, \ + is_object_processing_function, FuturesList, verify_args from lithops.utils import WrappedStreamingBodyPartition from lithops.util.metrics import PrometheusExporter from lithops.storage.utils import create_output_key @@ -209,7 +210,7 @@ def run(self): func = pickle.loads(self.job.func) data = pickle.loads(self.job.data) - if strtobool(os.environ.get('__LITHOPS_REDUCE_JOB', 'False')): + if ast.literal_eval(os.environ.get('__LITHOPS_REDUCE_JOB', 'False')): self._wait_futures(data) elif is_object_processing_function(func): self._load_object(data) @@ -230,7 +231,7 @@ def run(self): ) ) - logger.info("Going to execute '{}()'".format(str(fn_name))) + logger.info(f"Going to execute '{str(fn_name)}()'") print('---------------------- FUNCTION LOG ----------------------') 
function_start_tstamp = time.time() result = func(**data) @@ -246,7 +247,7 @@ def run(self): if result is not None: # Check for new futures if isinstance(result, ResponseFuture) or isinstance(result, FuturesList) \ - or (type(result) == list and len(result) > 0 and isinstance(result[0], ResponseFuture)): + or (type(result) is list and len(result) > 0 and isinstance(result[0], ResponseFuture)): self.stats.write('new_futures', pickle.dumps(result)) result = None else: diff --git a/lithops/worker/status.py b/lithops/worker/status.py index 427d212cf..b6880782f 100644 --- a/lithops/worker/status.py +++ b/lithops/worker/status.py @@ -1,10 +1,10 @@ import os +import ast import pika import json import time import logging from tblib import pickling_support -from distutils.util import strtobool from contextlib import contextmanager import lithops.worker @@ -45,7 +45,7 @@ def __init__(self, job, internal_storage): 'chunksize': job.chunksize } - if strtobool(os.environ.get('WARM_CONTAINER', 'False')): + if ast.literal_eval(os.environ.get('WARM_CONTAINER', 'False')): self.status['worker_cold_start'] = False else: self.status['worker_cold_start'] = True diff --git a/lithops/worker/utils.py b/lithops/worker/utils.py index bba04dc55..fb20c1d37 100644 --- a/lithops/worker/utils.py +++ b/lithops/worker/utils.py @@ -25,7 +25,14 @@ from lithops.version import __version__ as lithops_ver from lithops.utils import sizeof_fmt, is_unix_system, b64str_to_bytes -from lithops.constants import LITHOPS_TEMP_DIR, MODULES_DIR +from lithops.constants import MODULES_DIR, SA_INSTALL_DIR, LITHOPS_TEMP_DIR + +try: + import psutil + psutil_found = True +except ModuleNotFoundError: + psutil_found = False + logger = logging.getLogger(__name__) @@ -40,22 +47,26 @@ def get_function_and_modules(job, internal_storage): """ Gets the function and modules from storage """ - logger.debug("Getting function and modules") - - if job.config['lithops'].get('customized_runtime'): - logger.debug("Customized runtime feature activated. Loading " - "function and modules from local runtime") - func_path = '/'.join([LITHOPS_TEMP_DIR, job.func_key]) + logger.info("Getting function and modules") + backend = job.config['lithops']['backend'] + func_path = '/'.join([LITHOPS_TEMP_DIR, job.func_key]) + func_obj = None + + if job.config[backend].get('runtime_include_function'): + logger.info("Runtime include function feature activated. 
Loading " + "function/mods from local runtime") + func_path = '/'.join([SA_INSTALL_DIR, job.func_key]) with open(func_path, "rb") as f: func_obj = f.read() else: + logger.info(f"Loading {job.func_key} from storage") func_obj = internal_storage.get_func(job.func_key) loaded_func_all = pickle.loads(func_obj) if loaded_func_all.get('module_data'): module_path = os.path.join(MODULES_DIR, job.job_key) - logger.debug("Writing function dependencies to {}".format(module_path)) + logger.info(f"Writing function dependencies to {module_path}") os.makedirs(module_path, exist_ok=True) sys.path.append(module_path) @@ -85,16 +96,15 @@ def get_function_data(job, internal_storage): """ Get function data (iteradata) from storage """ - logger.debug("Getting function data") - if job.data_key: extra_get_args = {} if job.data_byte_ranges is not None: init_byte = job.data_byte_ranges[0][0] last_byte = job.data_byte_ranges[-1][1] - range_str = 'bytes={}-{}'.format(init_byte, last_byte) + range_str = f'bytes={init_byte}-{last_byte}' extra_get_args['Range'] = range_str + logger.info("Loading function data parameters from storage") data_obj = internal_storage.get_data(job.data_key, extra_get_args=extra_get_args) loaded_data = [] @@ -257,8 +267,82 @@ def write(self, log): def flush(self): try: self._stream.flush() + self._stdout.flush() except ValueError: pass def fileno(self): return self._stdout.fileno() + + +class SystemMonitor: + + def __init__(self, process_id=None): + """ + Initialize the SystemMonitor. + If process_id is None, monitor the current process. + """ + self.process_id = process_id + self.cpu_usage = [] + self.process = None + self.cpu_times = None + self.current_net_io = None + self.mem_info = None + + def start(self): + """ + Start monitoring. + """ + if not psutil_found: + return + + self.process = psutil.Process(self.process_id) + + # record the initial CPU usage (to be ignored). + psutil.cpu_percent(interval=None, percpu=True) + + # Reset the network IO counters cache and baseline. + psutil.net_io_counters.cache_clear() + self.start_net_io = psutil.net_io_counters() + + def stop(self): + """ + Stop monitoring. + """ + if not psutil_found: + return + + # Record the CPU usage since the last call (start). + self.cpu_usage = psutil.cpu_percent(interval=None, percpu=True) + self.cpu_times = psutil.cpu_times() + self.current_net_io = psutil.net_io_counters() + self.mem_info = self.process.memory_full_info() + + def get_cpu_info(self): + """ + Return CPU usage, system time, and user time for each CPU core. + """ + if not psutil_found: + return {"usage": [], "system": 0, "user": 0} + + return {"usage": self.cpu_usage, "system": self.cpu_times.system, "user": self.cpu_times.user} + + def get_network_io(self): + """ + Calculate network IO (bytes sent and received) since the last reset. + """ + if not psutil_found: + return {"sent": 0, "recv": 0} + + bytes_sent = self.current_net_io.bytes_sent - self.start_net_io.bytes_sent + bytes_recv = self.current_net_io.bytes_recv - self.start_net_io.bytes_recv + return {"sent": bytes_sent, "recv": bytes_recv} + + def get_memory_info(self): + """ + Get memory usage information of the monitored process. + """ + if not psutil_found: + return {"rss": 0, "vms": 0, "uss": 0} + + return {"rss": self.mem_info.rss, "vms": self.mem_info.vms, "uss": self.mem_info.uss} diff --git a/runtime/README.md b/runtime/README.md index 47bc329b1..34cda20d3 100644 --- a/runtime/README.md +++ b/runtime/README.md @@ -2,15 +2,16 @@ Choose your compute backend: -1. 
[IBM Cloud Functions](ibm_cf/) -2. [IBM Code Engine](code_engine/) -3. [IBM VPC](ibm_vpc/) -4. [AWS Lambda](aws_lambda/) -5. [AWS EC2](aws_ec2/) -6. [Google Cloud Functions](gcp_functions/) -7. [Google Cloud Run](gcp_cloudrun/) -8. [Aliyun Functions Compute](aliyun_fc/) -9. [Azure Functions](azure_functions/) -10. [Azure Container APPs](azure_containers/) -11. [Knative](knative/) -12. [Kubernetes](kubernetes/) +1. [IBM Code Engine](code_engine/) +2. [IBM VPC](ibm_vpc/) +3. [AWS Lambda](aws_lambda/) +4. [AWS EC2](aws_ec2/) +2. [Google Cloud Functions](gcp_functions/) +3. [Google Cloud Run](gcp_cloudrun/) +7. [Azure Functions](azure_functions/) +8. [Azure Container APPs](azure_containers/) +9. [Aliyun Functions Compute](aliyun_fc/) +10. [Kubernetes](kubernetes/) +11. [OpenWhisk](openwhisk/) +12. [Knative](knative/) +13. [Singularity](singularity/) diff --git a/runtime/aliyun_fc/README.md b/runtime/aliyun_fc/README.md index f70a18442..55fe8b8f9 100644 --- a/runtime/aliyun_fc/README.md +++ b/runtime/aliyun_fc/README.md @@ -1,27 +1,10 @@ # Lithops runtime for Aliyun Functions Compute -The runtime is the place where your functions are executed. +The runtime is the place where your functions are executed. The default runtime is automatically created the first time you execute a function. Lithops automatically detects the Python version of your environment and deploys the default runtime based on it. -The default runtime is created the first time you execute a function. Lithops automatically detects the Python version of your environment and deploys the default runtime based on it. +Currently, Aliyun Functions Compute supports Python 3.6, 3.9 and 3.10. You can find the list of pre-installed modules [here](https://www.alibabacloud.com/help/en/function-compute/latest/python-event-functions). In addition, the Lithops default runtimes are built with the packages included in this [requirements.txt](requirements.txt) file: -Currently, Aliyun Functions Compute supports Python 3.6, and 3.9, and it provides the following default runtimes with some packages already preinstalled: - - -| Runtime name | Python version | Packages included | -| ----| ----| ---- | -| lithops-default-runtime-v36 | 3.6 | [list of packages](https://www.alibabacloud.com/help/en/function-compute/latest/python-event-functions) | -| lithops-default-runtime-v38 | 3.9 | [list of packages](https://www.alibabacloud.com/help/en/function-compute/latest/python-event-functions) | - -Lithops default runtimes are also ship with the following packages: -``` -pika -tblib -cloudpickle -ps-mem -``` - - -To run a function with the default runtime you don't need to specify anything in the code, since everything is managed internally by Lithops: +To run a function with the default runtime you don't need to specify anything in the code, since everything is handled internally by Lithops: ```python import lithops @@ -45,44 +28,14 @@ pw = lithops.FunctionExecutor(runtime_memory=512) **Build your own Lithops runtime for Aliyun Functions Compute** -If you need some Python modules which are not included in the default runtime, it is possible to build your own Lithops runtime with all of them. - -To build your own runtime, you have to collect all necessary modules in a `requirements.txt` file. +If you require additional Python modules not included in the default runtime, you can create your own custom Lithops runtime incorporating them. To create a custom runtime, compile all the necessary modules into a `requirements.txt` file. 
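A minimal usage sketch in Python, assuming your Aliyun Functions Compute credentials are already present in the Lithops config and that the custom runtime has been deployed under the name `my_matplotlib_runtime` (the illustrative name used in the build command further below):

```python
import lithops

def check_runtime(x):
    # matplotlib is only importable here because it was added to the custom runtime
    import matplotlib
    return matplotlib.__version__

# 'backend' selects Aliyun Functions Compute; 'runtime' selects the custom runtime by name
fexec = lithops.FunctionExecutor(backend='aliyun_fc', runtime='my_matplotlib_runtime')
fexec.map(check_runtime, range(2))
print(fexec.get_result())
```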
-For example, we want to add module `matplotlib` to our runtime, since it is not provided in the default runtime. +For instance, if you wish to integrate the `matplotlib` module into your runtime, which isn't part of the default setup, you need to append it to the existing [requirements.txt](requirements.txt) file. Note that this `requirements.txt` contains the mandatory pakcges required by lithops, so you don't have to remove any of them from the list, but just add your packages at the end. -First, we need to extend the default `requirements.txt` file provided with Lithops with all the modules we need. For our example, the `requirements.txt` will contain the following modules: -``` -numpy -scikit-learn -scipy -pandas -google-cloud -google-cloud-storage -google-cloud-pubsub -certifi -chardet -docutils -httplib2 -idna -jmespath -kafka-python -lxml -pika==0.13.0 -python-dateutil -redis -requests -simplejson -six -urllib3 -virtualenv -PyYAML -matplotlib -``` +After updating the file accordingly, you can proceed to build the custom runtime by specifying the modified `requirements.txt` file along with a chosen runtime name: -Then, we will build the runtime, specifying the modified `requirements.txt` file and a runtime name: ``` -$ lithops runtime build -f requirements.txt my_matplotlib_runtime -b aliyun_fc +$ lithops runtime build -b aliyun_fc -f requirements.txt my_matplotlib_runtime ``` This command will built and deploy a runtime called `my_matplotlib_runtime` to the available Aliyun FUnctions Compute runtimes. diff --git a/runtime/aliyun_fc/requirements.txt b/runtime/aliyun_fc/requirements.txt index 31a097e1f..0e9ce45f7 100644 --- a/runtime/aliyun_fc/requirements.txt +++ b/runtime/aliyun_fc/requirements.txt @@ -1,3 +1,6 @@ +# Requirements.txt contains a list of dependencies for the Python Application # + +# Mandatory Lithops packages aliyun-fc2 oss2 pika @@ -10,4 +13,9 @@ kubernetes numpy cloudpickle ps-mem -tblib \ No newline at end of file +tblib +psutil + +# Optional packages (Add your packages below) +#pandas +#matplotlib \ No newline at end of file diff --git a/runtime/aws_batch/Dockerfile b/runtime/aws_batch/Dockerfile index e3307b00d..2d54e8aa0 100644 --- a/runtime/aws_batch/Dockerfile +++ b/runtime/aws_batch/Dockerfile @@ -30,7 +30,8 @@ RUN pip install --upgrade --ignore-installed setuptools six pip \ cloudpickle \ ps-mem \ tblib \ - matplotlib + matplotlib \ + psutil # Copy Lithops proxy and lib to the container image. ENV APP_HOME /lithops diff --git a/runtime/aws_batch/conda.Dockerfile b/runtime/aws_batch/conda.Dockerfile index e7b4d76b6..2b671bfe1 100644 --- a/runtime/aws_batch/conda.Dockerfile +++ b/runtime/aws_batch/conda.Dockerfile @@ -52,7 +52,8 @@ RUN pip install --upgrade --no-cache-dir --ignore-installed \ kafka-python \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil # Put here your pip dependencies... ENV APP_HOME /lithops diff --git a/runtime/aws_ec2/README.md b/runtime/aws_ec2/README.md index ce2878d15..824336589 100644 --- a/runtime/aws_ec2/README.md +++ b/runtime/aws_ec2/README.md @@ -1,10 +1,8 @@ # Lithops runtime for AWS EC2 -In AWS EC2, you can run functions by using a Virtual machine (VM). In the VM, functions run using parallel processes. In this case, it is not needed to install anything in the remote VMs since Lithops does this process automatically the first time you use them. However, use a custom VM it is a preferable approach, since using a pre-built custom image will greatly improve the overall execution time. 
To benefit from this approach, follow the following steps: +In AWS EC2, you can execute functions using a Virtual Machine (VM). These functions run as parallel processes within the VM. When using Lithops for the first time, there's no need to manually install anything on the remote VMs, as Lithops handles this process automatically. However, utilizing a custom VM is preferable, as employing a pre-built custom image significantly improves overall execution time. To benefit from this approach, follow these steps: -## Option 1 (beta): - -**Note**: This is a beta feature. Please open an issue if you encounter any errors using this way of creating VM images in AWS EC2. +## Option 1: For building the default VM image that contains all dependencies required by Lithops, execute: @@ -12,8 +10,8 @@ For building the default VM image that contains all dependencies required by Lit lithops image build -b aws_ec2 ``` -This command will create an image called "lithops-worker-default" in the target region. -If the image already exists, and you want to updete it, use the '--overwrite' or '-o' flag: +This command will create an image called "lithops-ubuntu-jammy-22.04-amd64-server" in the target region. +If the image already exists, and you want to update it, use the `--overwrite` or `-o` parameter: ``` lithops image build -b aws_ec2 --overwrite @@ -27,7 +25,13 @@ For creating a custom VM image, you can provide an `.sh` script with all the des lithops image build -b aws_ec2 -f myscript.sh custom-lithops-runtime ``` -In this case, if you use a custom name, you must provide the `target_ami`, printed at the end of the build command, in your lithops config: +If you want to upload local files to the custom VM Image, you can include them using the `--include` or `-i` parameter (src:dst), for example: + +``` +lithops image build -b aws_ec2 -f myscript.sh -i /home/user/test.bin:/home/ubuntu/test.bin custom-lithops-runtime +``` + +In the case of using using a custom name, you must provide the `target_ami`, printed at the end of the build command, in your lithops config, for example: ```yaml aws_ec2: @@ -39,7 +43,7 @@ aws_ec2: ## Option 2: -You can create a VM image manually. For example, you can create a VM in you AWS region, access the VM, install all the dependencies in the VM itself (apt-get, pip3 install, ...), stop the VM, create a VM Image, and then put the image_id in your lithops config file: +You can create a VM image manually. For example, you can create a VM in your AWS region, access the VM, install all the dependencies in the VM itself (apt-get, pip3 install, ...), stop the VM, create a VM Image, and then put the AMI ID in your lithops config, for example: ```yaml aws_ec2: @@ -48,4 +52,4 @@ aws_ec2: ... ``` -Note that if you name your VM Image (AMI) as "lithops-worker-default", there is no need to provide the `target_ami` in the config, since lithops automatically looks for this AMI name. +Note that if you name your VM Image (AMI) as "lithops-ubuntu-jammy-22.04-amd64-server", there is no need to provide the `target_ami` in the config, since lithops automatically looks for this AMI name. 
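To illustrate the end result, here is a minimal sketch of running functions on the custom image, assuming the `target_ami` (or an AMI named "lithops-ubuntu-jammy-22.04-amd64-server") is already referenced in your Lithops config file as described above:

```python
import lithops

def double(x):
    # executes as a local process inside the EC2 worker VM created from the image
    return x * 2

# No code changes are needed to use the custom image: once the AMI is referenced
# in the config file, selecting the aws_ec2 backend is enough.
fexec = lithops.FunctionExecutor(backend='aws_ec2')
fexec.map(double, range(4))
print(fexec.get_result())  # [0, 2, 4, 6]
```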
diff --git a/runtime/aws_lambda/Dockerfile b/runtime/aws_lambda/Dockerfile index 8ecc94d28..1f999b4ad 100644 --- a/runtime/aws_lambda/Dockerfile +++ b/runtime/aws_lambda/Dockerfile @@ -1,17 +1,17 @@ # Python 3.6 -#FROM python:3.6-slim-buster +#FROM python:3.6-slim-trixie # Python 3.7 -#FROM python:3.7-slim-buster +#FROM python:3.7-slim-trixie # Python 3.8 -#FROM python:3.8-slim-buster +#FROM python:3.8-slim-trixie # Python 3.9 -#FROM python:3.9-slim-buster +#FROM python:3.9-slim-trixie # Python 3.10 -FROM python:3.10-slim-buster +FROM python:3.10-slim-trixie RUN apt-get update \ # Install aws-lambda-cpp build dependencies @@ -44,7 +44,8 @@ RUN pip install --upgrade --ignore-installed pip wheel six setuptools \ kafka-python \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil # Set working directory to function root directory WORKDIR ${FUNCTION_DIR} @@ -60,4 +61,4 @@ RUN unzip lithops_lambda.zip \ # Put your dependencies here, using RUN pip install... or RUN apt install... ENTRYPOINT [ "/usr/local/bin/python", "-m", "awslambdaric" ] -CMD [ "handler.entry_point.lambda_handler" ] \ No newline at end of file +CMD [ "handler.entry_point.lambda_handler" ] diff --git a/runtime/aws_lambda/Dockerfile.arm64 b/runtime/aws_lambda/Dockerfile.arm64 index 0b016656f..e5b2ca986 100644 --- a/runtime/aws_lambda/Dockerfile.arm64 +++ b/runtime/aws_lambda/Dockerfile.arm64 @@ -22,7 +22,8 @@ RUN pip install --upgrade pip six && \ kafka-python \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil # Define custom function directory ARG FUNCTION_DIR="/function" diff --git a/runtime/aws_lambda/README.md b/runtime/aws_lambda/README.md index e5c8c0dbf..471df70e0 100644 --- a/runtime/aws_lambda/README.md +++ b/runtime/aws_lambda/README.md @@ -5,7 +5,7 @@ The runtime is the place where your functions are executed. AWS Lambda provides two methods for packaging the function code and dependencies of a runtime: ## Using predefined **runtimes** and **layers** -An AWS Lambda *runtime* is a predefined environment to run code on Lambda. For example, for Lithops we use runtimes `python3.9`, `python3.8`, `python3.7` or `python3.6` that come with already preinstalled modules. A *layer* is a set of packaged dependencies that can be used by multiple runtimes. For example, Lithops dependencies are deployed as a layer, so if multiple runtimes are created with different memory values, they can mount the same layer containing the dependencies, instead +An AWS Lambda *runtime* is a predefined environment to run code on Lambda. For example, for Lithops we use runtimes for python >= 3.6 that come with already preinstalled modules. A *layer* is a set of packaged dependencies that can be used by multiple runtimes. For example, Lithops dependencies are deployed as a layer, so if multiple runtimes are created with different memory values, they can mount the same layer containing the dependencies, instead of deploying them separately for each runtime. [In this link](https://gist.github.com/gene1wood/4a052f39490fae00e0c3#gistcomment-3131227) you can find which modules are preinstalled by default in an AWS Lambda Python runtime. Moreover, Lithops runtime also ships with the following packages: @@ -65,16 +65,23 @@ Note that you only need to specify the container name: `my-container-runtime-nam As far as possible, avoid using 'points' ('.') in the runtime name. 
``` -$ lithops runtime build -f MyDockerfile -b aws_lambda my-container-runtime-name +lithops runtime build -f MyDockerfile -b aws_lambda my-container-runtime-name ``` For example: ``` -$ lithops runtime build -f MyDockerfile -b aws_lambda lithops-ndvi-v39:01 +lithops runtime build -f MyDockerfile -b aws_lambda lithops-ndvi-v312:01 ``` -Finally, we can specify this new runtime when creating a Lithops Function Executor: +Finally, we can specify this new runtime in the lithops config: + +```yaml +aws_lambda: + runtime: lithops-ndvi-v312:01 +``` + +or when creating a Lithops Function Executor: ```python import lithops @@ -82,8 +89,16 @@ import lithops def test(): return 'hello' -lith = lithops.FunctionExecutor(runtime='lithops-ndvi-v39:01') +lith = lithops.FunctionExecutor(runtime='lithops-ndvi-v312:01') lith.call_async(test, data=()) res = lith.get_result() print(res) # Prints 'hello' ``` + +**View your deployed runtimes** + +To view the already deployed runtimes in your account, you can submit the next command in the console: + +``` +lithops runtime list -b aws_lambda +``` diff --git a/runtime/aws_lambda/conda/conda-python3-9.Dockerfile b/runtime/aws_lambda/conda/conda-python3-9.Dockerfile index c50bbb149..95714fcae 100644 --- a/runtime/aws_lambda/conda/conda-python3-9.Dockerfile +++ b/runtime/aws_lambda/conda/conda-python3-9.Dockerfile @@ -30,7 +30,8 @@ RUN /opt/conda-env/bin/pip install --upgrade --no-cache-dir --ignore-installed \ kafka-python \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil # Put here your PIP dependencies... # RUN /opt/conda-env/bin/pip install --upgrade --no-cache-dir --ignore-installed diff --git a/runtime/azure_containers/Dockerfile b/runtime/azure_containers/Dockerfile index 4b8c4373b..5f3a71a79 100644 --- a/runtime/azure_containers/Dockerfile +++ b/runtime/azure_containers/Dockerfile @@ -31,7 +31,8 @@ RUN pip install --upgrade --ignore-installed setuptools six pip \ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil WORKDIR /app COPY lithops_azure_ca.zip . diff --git a/runtime/azure_containers/README.md b/runtime/azure_containers/README.md index 089bfa9f7..d677c0bf8 100644 --- a/runtime/azure_containers/README.md +++ b/runtime/azure_containers/README.md @@ -51,27 +51,27 @@ azure_containers: $ docker login Update the Dockerfile that better fits to your requirements with your required system packages and Python modules. - If you need another Python version, for example Python 3.9, you must change the initial line of the Dockefile: + If you need another Python version, for example Python 3.12, you must change the initial line of the Dockefile: - $ lithops runtime build docker_username/runtimename:tag -b azure_containers + $ lithops runtime build -b azure_containers docker_username/runtimename:tag Note that Docker hub image names look like *"docker_username/runtimename:tag"* and must be all lower case, for example: - $ lithops runtime build -b azure_containers myaccount/lithops-ca-custom-v39:01 + $ lithops runtime build -b azure_containers myaccount/lithops-ca-custom-v312:01 By default the Dockerfile should be located in the same folder from where you execute the **lithops runtime** command. 
If your Dockerfile is located in another folder, or the Dockerfile has another name, you can specify its location with the **-f** parameter, for example: - $ lithops runtime build -b azure_containers -f azure_containers/Dockerfile myaccount/lithops-ca-custom-v39:01 + $ lithops runtime build -b azure_containers -f azure_containers/Dockerfile myaccount/lithops-ca-custom-v312:01 Once you have built your runtime with all of your necessary packages, you can already use it with Lithops. To do so, you have to specify the full docker image name in the configuration or when you create the **FunctionExecutor** instance, or directly in the config file, for example: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='myaccount/lithops-ca-custom-v39:01') + fexec = lithops.FunctionExecutor(runtime='myaccount/lithops-ca-custom-v312:01') ``` - *NOTE: In this previous example shows how to build a Docker image based on Python 3.9, this means that now you also need Python 3.9 in the client machine.* + *NOTE: In this previous example shows how to build a Docker image based on Python 3.12, this means that now you also need Python 3.12 in the client machine.* 2. **Use an already built runtime from a public repository** @@ -80,29 +80,29 @@ azure_containers: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='lithopscloud/ca-conda-v39:01') + fexec = lithops.FunctionExecutor(runtime='lithopscloud/ca-conda-v312:01') ``` Alternatively, you can create a Lithops runtime based on already built Docker image by executing the following command, which will deploy all the necessary information to use the runtime with your Lithops. $ lithops runtime deploy -b azure_containers -s azure_storage docker_username/runtimename:tag - For example, you can use an already created runtime based on Python 3.9 and with the *matplotlib* and *nltk* libraries by running: + For example, you can use an already created runtime based on Python 3.12 and with the *matplotlib* and *nltk* libraries by running: - $ lithops runtime deploy -b azure_containers -s azure_storage lithopscloud/ca-matplotlib-v39:01 + $ lithops runtime deploy -b azure_containers -s azure_storage lithopscloud/ca-matplotlib-v312:01 Once finished, you can use the runtime in your Lithops code: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='lithopscloud/ca-matplotlib:v39:01') + fexec = lithops.FunctionExecutor(runtime='lithopscloud/ca-matplotlib:v312:01') ``` ## Runtime Management 1. **Update an existing runtime** - If you are a developer, and modified the PyWeen source code, you need to deploy the changes before executing Lithops. + If you are a developer, and modified the Lithops source code, you need to deploy the changes before executing Lithops. 
You can update default runtime by: @@ -112,9 +112,9 @@ azure_containers: $ lithops runtime update docker_username/runtimename:tag -b azure_containers -s azure_storage - For example, you can update an already created runtime based on the Docker image `lithopscloud/ca-matplotlib-v39:01` by: + For example, you can update an already created runtime based on the Docker image `lithopscloud/ca-matplotlib-v312:01` by: - $ lithops runtime update lithopscloud/ca-matplotlib-v39:01 -b azure_containers -s azure_storage + $ lithops runtime update lithopscloud/ca-matplotlib-v312:01 -b azure_containers -s azure_storage Alternatively, you can update all the deployed runtimes at a time by: @@ -132,9 +132,9 @@ azure_containers: $ lithops runtime delete docker_username/runtimename:tag -b azure_containers -s azure_storage - For example, you can delete runtime based on the Docker image `lithopscloud/ca-conda-v39:01` by: + For example, you can delete runtime based on the Docker image `lithopscloud/ca-conda-v312:01` by: - $ lithops runtime delete lithopscloud/ca-conda-v39:01 -b azure_containers -s azure_storage + $ lithops runtime delete lithopscloud/ca-conda-v312:01 -b azure_containers -s azure_storage You can delete all the runtimes at a time by: diff --git a/runtime/azure_functions/README.md b/runtime/azure_functions/README.md index 5de3962d8..4980ccf63 100644 --- a/runtime/azure_functions/README.md +++ b/runtime/azure_functions/README.md @@ -1,37 +1,10 @@ # Lithops runtime for Azure Functions -The runtime is the place where your functions are executed. +The runtime is the place where your functions are executed. The default runtime is automatically created the first time you execute a function. Lithops automatically detects the Python version of your environment and deploys the default runtime based on it. -The default runtime is created the first time you execute a function. Lithops automatically detects the Python version of your environment and deploys the default runtime based on it. +Currently, Azure Functions supports Python 3.6, 3.7, 3.8 and 3.9. You can find the list of pre-installed modules [here](https://github.com/Azure/azure-functions-python-worker/wiki/Preinstalled-Python-Libraries). 
In addition, the Lithops default runtimes are built with the packages included in this [requirements.txt](requirements.txt) file -Currently, Azure Functions supports Python 3.6, 3.7, 3.8 and 3.9, and it provides the following default runtimes with some packages already preinstalled: - -| Runtime name | Python version | Packages included | -| ----| ----| ---- | -| lithops-runtime-v36 | 3.6 | [list of packages](https://github.com/Azure/azure-functions-python-worker/wiki/Preinstalled-Python-Libraries) | -| lithops-runtime-v38 | 3.7 | [list of packages](https://github.com/Azure/azure-functions-python-worker/wiki/Preinstalled-Python-Libraries) | -| lithops-runtime-v38 | 3.8 | [list of packages](https://github.com/Azure/azure-functions-python-worker/wiki/Preinstalled-Python-Libraries) | -| lithops-runtime-v38 | 3.9 | [list of packages](https://github.com/Azure/azure-functions-python-worker/wiki/Preinstalled-Python-Libraries) | - -Lithops default runtimes are also ship with the following packages: -``` -azure-functions -azure-storage-blob -azure-storage-queue -pika -flask -gevent -redis -requests -PyYAML -kubernetes -numpy -cloudpickle -ps-mem -tblib -``` - -To run a function with the default runtime you don't need to specify anything in the code, since everything is managed internally by Lithops: +To run a function with the default runtime you don't need to specify anything in the code, since everything is handled internally by Lithops: ```python import lithops @@ -50,32 +23,13 @@ result = lithops.get_result() **Build your own Lithops runtime for Azure Functions** -If you need some Python modules which are not included in the default runtime, it is possible to build your own Lithops runtime with all of them. - -To build your own runtime, you have to collect all necessary modules in a `requirements.txt` file. For example, if you want to add the modules `numpy` and `matplotlib` to our runtime, since they are not provided in the default runtime. +If you require additional Python modules not included in the default runtime, you can create your own custom Lithops runtime incorporating them. To create a custom runtime, compile all the necessary modules into a `requirements.txt` file. -First, we need to extend the `requirements.txt` file provided with Lithops with all the modules we need. For our example, the `requirements.txt` should contain the following modules (note that we added `numpy` and `matplotlib` at the end): -``` -azure-functions -azure-storage-blob -azure-storage-queue -pika -flask -gevent -redis -requests -PyYAML -kubernetes -cloudpickle -ps-mem -tblib -numpy -matplotlib -``` +For instance, if you wish to integrate the `matplotlib` module into your runtime, which isn't part of the default setup, you need to append it to the existing [requirements.txt](requirements.txt) file. Note that this `requirements.txt` contains the mandatory pakcges required by lithops, so you don't have to remove any of them from the list, but just add your packages at the end. **IMPORTANT**: Note that the runtime is built using your local machine, and some libraries, like Numpy, compile some *C* code based on the Operating System you are using. Azure functions run on a Linux machine, this mean that if you use **MacOS** or **Windows** for building the runtime, those libraries that compiled *C* code cannot be executed from within the function. In this case, you must use a Linux machine for building the runtime. 
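As an illustration of the point above, a minimal sketch that checks the worker environment, assuming your Azure credentials are configured and that the `matplotlib-runtime` built with the command below has been deployed:

```python
import lithops

def where_am_i(x):
    # matplotlib ships compiled C extensions, so this import only succeeds if the
    # runtime was built on a Linux machine matching the Azure Functions environment
    import platform
    import matplotlib
    return platform.system(), matplotlib.__version__

fexec = lithops.FunctionExecutor(backend='azure_functions', runtime='matplotlib-runtime')
fexec.call_async(where_am_i, 0)
print(fexec.get_result())
```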
-Then, we will build the runtime, specifying the modified `requirements.txt` file and a runtime name: +After updating the file accordingly, you can proceed to build the custom runtime by specifying the modified `requirements.txt` file along with a chosen runtime name: ``` $ lithops runtime build -b azure_functions -f requirements.txt matplotlib-runtime ``` diff --git a/runtime/azure_functions/requirements.txt b/runtime/azure_functions/requirements.txt index be5b651da..dcf6336c2 100644 --- a/runtime/azure_functions/requirements.txt +++ b/runtime/azure_functions/requirements.txt @@ -1,3 +1,6 @@ +# Requirements.txt contains a list of dependencies for the Python Application # + +# Mandatory Lithops packages azure-functions azure-storage-blob azure-storage-queue @@ -11,4 +14,9 @@ PyYAML kubernetes cloudpickle ps-mem -tblib \ No newline at end of file +tblib +psutil + +# Optional packages (Add your packages below) +#pandas +#matplotlib \ No newline at end of file diff --git a/runtime/code_engine/Dockerfile b/runtime/code_engine/Dockerfile index 54216945b..d0e3a38be 100644 --- a/runtime/code_engine/Dockerfile +++ b/runtime/code_engine/Dockerfile @@ -13,16 +13,12 @@ # Python 3.10 FROM python:3.10-slim-buster +# Python 3.11 +# FROM python:3.11-slim-buster + RUN apt-get update \ - # add some packages required for the pip install - && apt-get install -y \ - gcc \ - zlib1g-dev \ - libxslt-dev \ - libxml2-dev \ - zip \ - unzip \ - make \ + # add the required packages + && apt-get install -y unzip \ # cleanup package lists, they are not used anymore in this image && rm -rf /var/lib/apt/lists/* \ && apt-cache search linux-headers-generic diff --git a/runtime/code_engine/Dockerfile.conda b/runtime/code_engine/Dockerfile.conda index 4baff2faf..fe1d7985a 100644 --- a/runtime/code_engine/Dockerfile.conda +++ b/runtime/code_engine/Dockerfile.conda @@ -1,51 +1,59 @@ # Python 3.6 -#FROM continuumio/miniconda3:4.5.4 +#FROM docker.io/continuumio/miniconda3:4.5.4 # Python 3.7 -#FROM continuumio/miniconda3:4.7.12 +#FROM docker.io/continuumio/miniconda3:4.7.12 # Python 3.8 -FROM continuumio/miniconda3:4.9.2 +#FROM docker.io/continuumio/miniconda3:4.9.2 # Python 3.9 -#FROM continuumio/miniconda3:4.10.3 +#FROM docker.io/continuumio/miniconda3:4.10.3 + +# Python 3.10 +FROM docker.io/continuumio/miniconda3:23.3.1-0 + +# Python 3.11 +#FROM docker.io/continuumio/miniconda3:23.10.0-1 # YOU MUST PIN THE PYTHON VERSION TO PREVENT IT TO BE UPDATED # For python 3.6 use "python==3.6.5" # For python 3.7 use "python==3.7.4" # For python 3.8 use "python==3.8.5" # For python 3.9 use "python==3.9.5" -RUN echo "python==3.8.5" >> /opt/conda/conda-meta/pinned +# For python 3.10 use "python==3.10.10" +# For python 3.11 use "python==3.11.7" +RUN echo "python==3.10.10" >> /opt/conda/conda-meta/pinned ENV FLASK_PROXY_PORT 8080 RUN apt-get --allow-releaseinfo-change update \ # Upgrade installed packages to get latest security fixes if the base image does not contain them already. && apt-get upgrade -y --no-install-recommends \ - # add some packages required for the pip install - && apt-get install -y --no-install-recommends \ - gcc \ - libc-dev \ - libxslt-dev \ - libxml2-dev \ - libffi-dev \ - libssl-dev \ - zip \ - unzip \ - vim \ - make \ + # add the required packages + && apt-get install -y --no-install-recommends unzip \ # cleanup package lists, they are not used anymore in this image && rm -rf /var/lib/apt/lists/* \ && apt-cache search linux-headers-generic -# Add your Conda required packages here. 
Ensure "conda clean --all" at -# the end to remove temporary data. One "RUN" line is better than multiple -# ones in terms of image size. For example: -#RUN conda update -n base conda && conda install -c conda-forge opencv && conda install sortedcontainers gevent-websocket && conda clean --all +# Add your Conda required packages here. One "RUN" line is better +# than multiple ones in terms of image size. Install the libmamba +# solver for faster package resolution. Make sure to put "conda clean --all" +# at the end to remove temporary data. for example: +RUN conda update -n base -y conda \ + && conda install -n base -y conda-libmamba-solver \ + && conda config --set solver libmamba \ + && conda install -c conda-forge -c anaconda -c defaults -y opencv \ + && conda clean --all # install additional python modules COPY requirements.txt requirements.txt -RUN pip install --upgrade pip setuptools six gevent && pip install --no-cache-dir -r requirements.txt + +RUN pip install --upgrade pip setuptools six gevent \ + && pip install --no-cache-dir -r requirements.txt + +# Replace python intepreter with conda's +RUN ln -sf /opt/conda/bin/python /usr/local/bin/python ENV PORT 8080 ENV CONCURRENCY 1 diff --git a/runtime/code_engine/Dockerfile.githubci b/runtime/code_engine/Dockerfile.githubci deleted file mode 100644 index 8e69b74e0..000000000 --- a/runtime/code_engine/Dockerfile.githubci +++ /dev/null @@ -1,8 +0,0 @@ -FROM jsampe/lithops-codeengine-default-v310:latest - -# Copy Lithops proxy and lib to the container image. -ENV APP_HOME /lithops -WORKDIR $APP_HOME - -COPY lithops_codeengine.zip . -RUN rm -rf lithops && unzip -o lithops_codeengine.zip && rm lithops_codeengine.zip diff --git a/runtime/code_engine/Dockerfile.mamba b/runtime/code_engine/Dockerfile.mamba new file mode 100644 index 000000000..04c4b80b1 --- /dev/null +++ b/runtime/code_engine/Dockerfile.mamba @@ -0,0 +1,81 @@ +# Python 3.6 +#FROM python:3.6-slim-buster + +# Python 3.7 +#FROM python:3.7-slim-buster + +# Python 3.8 +#FROM python:3.8-slim-buster + +# Python 3.9 +#FROM python:3.9-slim-buster + +# Python 3.10 +FROM python:3.10-slim-buster + +# Python 3.11 +# FROM python:3.11-slim-buster + +RUN apt-get update \ + # add the required packages + && apt-get install -y wget unzip git \ + # cleanup package lists, they are not used anymore in this image + && rm -rf /var/lib/apt/lists/* \ + && apt-cache search linux-headers-generic + +# Install conda +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh && \ + sh Miniconda3-latest-Linux-x86_64.sh -b -p /opt/miniconda && \ + rm miniconda.sh Miniconda3-latest-Linux-x86_64.sh + +COPY lithops-env-py311.yml /tmp/lithops-conda.yml + +# Install mamba and create the new environment +RUN /opt/miniconda/bin/conda install -c conda-forge -y mamba && \ + /opt/miniconda/bin/conda install -c anaconda -y git && \ + /opt/miniconda/bin/conda update -n base -c defaults -y conda && \ + /opt/miniconda/bin/conda clean --all && \ + /opt/miniconda/bin/mamba env create --file /tmp/lithops-conda.yml --prefix /opt/conda-env && \ + /opt/miniconda/bin/mamba clean --all + +# Install Lithops dependencies +RUN /opt/conda-env/bin/pip install --upgrade --no-cache-dir --ignore-installed \ + gunicorn \ + flask \ + pika \ + boto3 \ + ibm-cloud-sdk-core \ + ibm-cos-sdk \ + ibm-code-engine-sdk \ + scikit-learn \ + diskcache \ + redis \ + gevent \ + requests \ + PyYAML \ + numpy \ + cloudpickle \ + ps-mem \ + tblib \ + psutil \ + kubernetes + +# Replace python intepreter with conda's 
+RUN ln -sf /opt/conda-env/bin/python /usr/local/bin/python + +ENV PATH="/opt/conda-env/bin/:${PATH}" + +ENV PORT 8080 +ENV CONCURRENCY 1 +ENV TIMEOUT 600 +ENV PYTHONUNBUFFERED TRUE + +# Copy Lithops proxy and lib to the container image. +ENV APP_HOME /lithops +WORKDIR $APP_HOME + +COPY lithops_codeengine.zip . +RUN unzip lithops_codeengine.zip && rm lithops_codeengine.zip + +CMD exec gunicorn --bind :$PORT --workers $CONCURRENCY --timeout $TIMEOUT lithopsentry:proxy + \ No newline at end of file diff --git a/runtime/code_engine/README.md b/runtime/code_engine/README.md index d3e462509..04639a4fa 100644 --- a/runtime/code_engine/README.md +++ b/runtime/code_engine/README.md @@ -52,27 +52,27 @@ code_enigne: $ docker login Update the Dockerfile that better fits to your requirements with your required system packages and Python modules. - If you need another Python version, for example Python 3.9, you must change the initial line of the Dockefile: + If you need another Python version, for example Python 3.12, you must change the initial line of the Dockefile: $ lithops runtime build docker_username/runtimename:tag -b code_engine Note that Docker hub image names look like *"docker_username/runtimename:tag"* and must be all lower case, for example: - $ lithops runtime build -b code_engine myaccount/lithops-ce-custom-v39:01 + $ lithops runtime build -b code_engine myaccount/lithops-ce-custom-v312:01 By default the Dockerfile should be located in the same folder from where you execute the **lithops runtime** command. If your Dockerfile is located in another folder, or the Dockerfile has another name, you can specify its location with the **-f** parameter, for example: - $ lithops runtime build -b code_engine -f code_engine/Dockerfile.conda myaccount/lithops-ce-custom-v39:01 + $ lithops runtime build -b code_engine -f code_engine/Dockerfile.conda myaccount/lithops-ce-custom-v312:01 Once you have built your runtime with all of your necessary packages, you can already use it with Lithops. To do so, you have to specify the full docker image name in the configuration or when you create the **FunctionExecutor** instance, or directly in the config file, for example: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='myaccount/lithops-ce-custom-v39:01') + fexec = lithops.FunctionExecutor(runtime='myaccount/lithops-ce-custom-v312:01') ``` - *NOTE: In this previous example shows how to build a Docker image based on Python 3.9, this means that now you also need Python 3.9 in the client machine.* + *NOTE: In this previous example shows how to build a Docker image based on Python 3.12, this means that now you also need Python 3.12 in the client machine.* 2. **Use an already built runtime from a public repository** @@ -81,29 +81,29 @@ code_enigne: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='lithopscloud/ce-conda-v39:01') + fexec = lithops.FunctionExecutor(runtime='lithopscloud/ce-conda-v312:01') ``` Alternatively, you can create a Lithops runtime based on already built Docker image by executing the following command, which will deploy all the necessary information to use the runtime with your Lithops. 
$ lithops runtime deploy -b code_engine -s ibm_cos docker_username/runtimename:tag - For example, you can use an already created runtime based on Python 3.9 and with the *matplotlib* and *nltk* libraries by running: + For example, you can use an already created runtime based on Python 3.12 and with the *matplotlib* and *nltk* libraries by running: - $ lithops runtime deploy -b code_engine -s ibm_cos lithopscloud/ce-matplotlib-v39:01 + $ lithops runtime deploy -b code_engine -s ibm_cos lithopscloud/ce-matplotlib-v312:01 Once finished, you can use the runtime in your Lithops code: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='lithopscloud/ce-matplotlib:v39:01') + fexec = lithops.FunctionExecutor(runtime='lithopscloud/ce-matplotlib-v312:01') ``` ## Runtime Management 1. **Update an existing runtime** - If you are a developer, and modified the PyWeen source code, you need to deploy the changes before executing Lithops. + If you are a developer, and modified the Lithops source code, you need to deploy the changes before executing Lithops. You can update default runtime by: @@ -113,9 +113,9 @@ code_enigne: $ lithops runtime update docker_username/runtimename:tag -b code_engine -s ibm_cos - For example, you can update an already created runtime based on the Docker image `lithopscloud/ce-matplotlib-v39:01` by: + For example, you can update an already created runtime based on the Docker image `lithopscloud/ce-matplotlib-v312:01` by: - $ lithops runtime update lithopscloud/ce-matplotlib-v39:01 -b code_engine -s ibm_cos + $ lithops runtime update lithopscloud/ce-matplotlib-v312:01 -b code_engine -s ibm_cos Alternatively, you can update all the deployed runtimes at a time by: @@ -133,9 +133,9 @@ code_enigne: $ lithops runtime delete docker_username/runtimename:tag -b code_engine -s ibm_cos - For example, you can delete runtime based on the Docker image `lithopscloud/ce-conda-v39:01` by: + For example, you can delete runtime based on the Docker image `lithopscloud/ce-conda-v312:01` by: - $ lithops runtime delete lithopscloud/ce-conda-v39:01 -b code_engine -s ibm_cos + $ lithops runtime delete lithopscloud/ce-conda-v312:01 -b code_engine -s ibm_cos You can delete all the runtimes at a time by: diff --git a/runtime/code_engine/lithops-env-py311.yml b/runtime/code_engine/lithops-env-py311.yml new file mode 100644 index 000000000..70608c315 --- /dev/null +++ b/runtime/code_engine/lithops-env-py311.yml @@ -0,0 +1,18 @@ +name: lithops-conda +channels: + - conda-forge + - anaconda + - defaults +dependencies: + - python=3.11 + - pip + # Put here your conda dependencies... 
+ - intake-xarray + - fastparquet + - gdal + - zarr + - s3fs + - adios-db + - cartopy + - pip: + - git+https://github.com/OpenDrift/opendrift@master \ No newline at end of file diff --git a/runtime/code_engine/requirements.txt b/runtime/code_engine/requirements.txt index 56bc2ed34..a587ec205 100644 --- a/runtime/code_engine/requirements.txt +++ b/runtime/code_engine/requirements.txt @@ -1,52 +1,26 @@ # Requirements.txt contains a list of dependencies for the Python Application # -# Setup modules +# Mandatory Lithops packages gunicorn -gevent flask - -# Lithops +pika +boto3 +ibm-cloud-sdk-core +ibm-cos-sdk +ibm-code-engine-sdk +scikit-learn +diskcache +redis +gevent +requests +PyYAML +numpy cloudpickle ps-mem tblib -PyYAML +psutil kubernetes -# default available packages for python3 -beautifulsoup4 -httplib2 -kafka_python -lxml -python-dateutil -requests -scrapy -simplejson -#virtualenv -Twisted -PyJWT - -# packages for numerics -numpy -scikit-learn -scipy -pandas - -# packages for image processing -Pillow - -# IBM specific python modules -#ibm_db -#cloudant -#watson-developer-cloud -ibm-cos-sdk +# Optional packages (Add your packages below) ibm-vpc -ibm-code-engine-sdk -#ibmcloudsql - -# Compose Libs -psycopg2-binary -#pymongo -redis -pika -#elasticsearch -etcd3 +#pandas diff --git a/runtime/gcp_cloudrun/Dockerfile b/runtime/gcp_cloudrun/Dockerfile index ffdef471b..9c02809cc 100644 --- a/runtime/gcp_cloudrun/Dockerfile +++ b/runtime/gcp_cloudrun/Dockerfile @@ -37,7 +37,11 @@ RUN pip install --upgrade setuptools six pip \ google-cloud-storage \ google-api-python-client \ gcsfs \ - google-auth + google-auth \ + psutil + +ENV CONCURRENCY 1 +ENV TIMEOUT 600 # Copy Lithops proxy and lib to the container image. ENV APP_HOME /lithops @@ -46,4 +50,4 @@ WORKDIR $APP_HOME COPY lithops_cloudrun.zip . RUN unzip lithops_cloudrun.zip && rm lithops_cloudrun.zip -CMD exec gunicorn --bind :$PORT lithopsproxy:proxy +CMD exec gunicorn --bind :$PORT --workers $CONCURRENCY --timeout $TIMEOUT lithopsproxy:proxy diff --git a/runtime/gcp_cloudrun/Dockerfile.conda b/runtime/gcp_cloudrun/Dockerfile.conda index 3e67289df..e2fd1da4b 100644 --- a/runtime/gcp_cloudrun/Dockerfile.conda +++ b/runtime/gcp_cloudrun/Dockerfile.conda @@ -39,7 +39,11 @@ RUN pip install --upgrade setuptools six pip \ google-cloud-storage \ google-api-python-client \ gcsfs \ - google-auth + google-auth \ + psutil + +ENV CONCURRENCY 1 +ENV TIMEOUT 600 # Add your Conda required packages here. Ensure "conda clean --all" at # the end to remove temporary data. One "RUN" line is better than multiple @@ -53,4 +57,4 @@ WORKDIR $APP_HOME COPY lithops_cloudrun.zip . RUN unzip lithops_cloudrun.zip && rm lithops_cloudrun.zip -CMD exec gunicorn --bind :$PORT lithopsproxy:proxy +CMD exec gunicorn --bind :$PORT --workers $CONCURRENCY --timeout $TIMEOUT lithopsproxy:proxy diff --git a/runtime/gcp_cloudrun/README.md b/runtime/gcp_cloudrun/README.md index f0a080f50..1ff6c117e 100644 --- a/runtime/gcp_cloudrun/README.md +++ b/runtime/gcp_cloudrun/README.md @@ -55,11 +55,11 @@ gcp_cloudrun: To build your own runtime, first install the Docker CE version in your client machine. You can find the instructions [here](https://docs.docker.com/get-docker/). If you already have Docker installed omit this step. Update the Dockerfile that better fits to your requirements with your required system packages and Python modules. - If you need another Python version, for example Python 3.8, you must change the initial line of the Dockefile. 
+ If you need another Python version, for example Python 3.12, you must change the initial line of the Dockerfile. For example, we will add `PyTorch` to our Lithops runtime. The Dockerfile would look like this: ```dockerfile - FROM python:3.8-slim-buster + FROM python:3.12-slim-bookworm RUN apt-get update && apt-get install -y \ zip \ @@ -93,8 +93,8 @@ gcp_cloudrun: ENV APP_HOME /lithops WORKDIR $APP_HOME - COPY lithops_knative.zip . - RUN unzip lithops_knative.zip && rm lithops_knative.zip + COPY lithops_cloudrun.zip . + RUN unzip lithops_cloudrun.zip && rm lithops_cloudrun.zip CMD exec gunicorn --bind :$PORT lithopsproxy:proxy ``` @@ -119,14 +119,14 @@ gcp_cloudrun: fexec = lithops.FunctionExecutor(runtime='pytorchruntime') fexec.call_async(my_function, 'hello') - print(fexec.get_result()) # Prints + print(fexec.get_result()) # Prints ``` ## Runtime Management 1. **Update an existing runtime** - If you are a developer, and modified the PyWeen source code, you need to deploy the changes before executing Lithops. + If you are a developer, and modified the Lithops source code, you need to deploy the changes before executing Lithops. You can update default runtime by: diff --git a/runtime/gcp_functions/README.md b/runtime/gcp_functions/README.md index 2f5c39a41..e80016859 100644 --- a/runtime/gcp_functions/README.md +++ b/runtime/gcp_functions/README.md @@ -1,49 +1,10 @@ # Lithops runtime for Google Cloud Functions -The runtime is the place where your functions are executed. +Google Cloud Functions operate within a runtime environment distinct from other serverless platforms like Google Cloud Run, as they do not rely on containers from the user's perspective. Consequently, specifying a container image as the function's runtime isn't feasible. However, you can enhance the default package set by providing a custom `requirements.txt` file, allowing for the inclusion of additional Python modules automatically installable via `pip`. -Unlike other Serverless backends like IBM Cloud Function or Google Cloud Run, Google Cloud Functions is not based on Docker, so it is not possible to provide a Docker image as the function's runtime. -However, it is possible to expand the default installed packages by providing a different `requirements.txt` file. In consequence, it is not possible to add a system library to the runtime, only Python modules that can be installed using `pip`. +Currently, Google Cloud Functions supports Python >= 3.7. You can find the list of pre-installed modules [here](https://cloud.google.com/functions/docs/writing/specifying-dependencies-python#pre-installed_packages).
In addition, the Lithops default runtimes are built with the packages included in this [requirements.txt](requirements.txt) file: -Currently, Google Cloud Functions supports Python 3.7, 3.8 and 3.9, and it provides the following default runtimes with some packages already preinstalled: - -| Runtime name | Python version | Packages included | -| ----| ----| ---- | -| lithops-default-runtime-v37 | 3.7 | [list of packages](https://cloud.google.com/functions/docs/writing/specifying-dependencies-python#pre-installed_packages) | -| lithops-default-runtime-v38 | 3.8 | [list of packages](https://cloud.google.com/functions/docs/writing/specifying-dependencies-python#pre-installed_packages) | -| lithops-default-runtime-v38 | 3.9 | [list of packages](https://cloud.google.com/functions/docs/writing/specifying-dependencies-python#pre-installed_packages) | - -Lithops default runtimes are also ship with the following packages: -``` -numpy -scikit-learn -scipy -pandas -google-cloud -google-cloud-storage -google-cloud-pubsub -certifi -chardet -docutils -httplib2 -idna -jmespath -kafka-python -lxml -pika==0.13.0 -python-dateutil -redis -requests -simplejson -six -urllib3 -virtualenv -PyYAML -``` - -The default runtime is created the first time you execute a function. Lithops automatically detects the Python version of your environment and deploys the default runtime based on it. - -To run a function with the default runtime you don't need to specify anything in the code, since everything is managed internally by Lithops: +The default runtime is created automatically the first time you execute a function. Lithops automatically detects the Python version of your environment and deploys the default runtime based on it. In this sense, to run a function with the default runtime you don't need to specify anything in the code, since everything is managed internally by Lithops: ```python import lithops @@ -67,49 +28,19 @@ pw = lithops.FunctionExecutor(runtime_memory=512) **Build your own Lithops runtime for Google Cloud Functions** -If you need some Python modules which are not included in the default runtime, it is possible to build your own Lithops runtime with all of them. - -To build your own runtime, you have to collect all necessary modules in a `requirements.txt` file. +If you require additional Python modules not included in the default runtime, you can create your own custom Lithops runtime incorporating them. To create a custom runtime, compile all the necessary modules into a `requirements.txt` file. -For example, we want to add module `matplotlib` to our runtime, since it is not provided in the default runtime. +For instance, if you wish to integrate the `matplotlib` module into your runtime, which isn't part of the default setup, you need to append it to the existing [requirements.txt](requirements.txt) file. Note that this `requirements.txt` contains the mandatory pakcges required by lithops, so you don't have to remove any of them from the list, but just add your packages at the end. -First, we need to extend the default `requirements.txt` file provided with Lithops with all the modules we need. 
For our example, the `requirements.txt` will contain the following modules: -``` -numpy -scikit-learn -scipy -pandas -google-cloud -google-cloud-storage -google-cloud-pubsub -certifi -chardet -docutils -httplib2 -idna -jmespath -kafka-python -lxml -pika==0.13.0 -python-dateutil -redis -requests -simplejson -six -urllib3 -virtualenv -PyYAML -matplotlib -``` +After updating the file accordingly, you can proceed to build the custom runtime by specifying the modified `requirements.txt` file along with a chosen runtime name: -Then, we will build the runtime, specifying the modified `requirements.txt` file and a runtime name: ``` -$ lithops runtime build -f requirements.txt my_matplotlib_runtime -b gcp_functions +$ lithops runtime build -b gcp_functions -f requirements.txt my_matplotlib_runtime ``` This command will add an extra runtime called `my_matplotlib_runtime` to the available Google Cloud Function runtimes. -Finally, we can specify this new runtime when creating a Lithops Function Executor: +Finally, you can specify this new runtime when creating a Lithops Function Executor: ```python import lithops @@ -121,7 +52,7 @@ def test(): lith = lithops.FunctionExecutor(runtime='my_matplotlib_runtime') lith.call_async(test, data=()) res = lith.get_result() -print(res) # Prints +print(res) # Prints ``` -If we are running Lithops, for example, with Python 3.8, `my_matplotlib_runtime` will be a Python 3.8 runtime with the extra modules specified installed. +If we are running Lithops, for example, with Python 3.12, `my_matplotlib_runtime` will be a Python 3.12 runtime with the extra modules specified installed. diff --git a/runtime/gcp_functions/requirements.txt b/runtime/gcp_functions/requirements.txt index 6fa335f91..eb92ad4bb 100644 --- a/runtime/gcp_functions/requirements.txt +++ b/runtime/gcp_functions/requirements.txt @@ -1,27 +1,25 @@ -numpy -scikit-learn -scipy -pandas +# Requirements.txt contains a list of dependencies for the Python Application # + +# Mandatory Lithops packages google-cloud google-cloud-storage google-cloud-pubsub -certifi -chardet -docutils -httplib2 -idna -jmespath -kafka-python -lxml -pika -python-dateutil -redis -requests -simplejson +google-auth +google-api-python-client +numpy six -urllib3 -virtualenv -PyYAML +requests +redis +pika +scikit-learn +diskcache cloudpickle ps-mem -tblib \ No newline at end of file +tblib +PyYAML +urllib3 +psutil + +# Optional packages (Add your packages below) +#pandas +#matplotlib \ No newline at end of file diff --git a/runtime/ibm_cf/Dockerfile.githubci b/runtime/ibm_cf/Dockerfile.githubci deleted file mode 100644 index bb7bad926..000000000 --- a/runtime/ibm_cf/Dockerfile.githubci +++ /dev/null @@ -1 +0,0 @@ -FROM lithopscloud/ibmcf-python-v310 diff --git a/runtime/ibm_cf/Dockerfile.protomol b/runtime/ibm_cf/Dockerfile.protomol deleted file mode 100644 index 083b0703c..000000000 --- a/runtime/ibm_cf/Dockerfile.protomol +++ /dev/null @@ -1,12 +0,0 @@ -FROM lithopscloud/ibmcf-python-v38 - -RUN apt-get update && apt-get install -y \ - wget build-essential cmake pkg-config \ - && rm -rf /var/lib/apt/lists/* \ - && apt-cache search linux-headers-generic - -RUN pip install opencv-contrib-python-headless opencv-python-headless dlib \ - && wget https://sourceforge.net/projects/protomol/files/ProtoMol/Protomol%203.3/ProtoMol-3.3.0-Linux-64bit.tar.gz \ - && tar -zxvf ProtoMol-3.3.0-Linux-64bit.tar.gz \ - && cp /ProtoMol-3.3.0-Linux-64bit/ProtoMol /tmp/ProtoMol \ - && chmod +x /tmp/ProtoMol diff --git a/runtime/ibm_cf/Dockerfile.python310 
b/runtime/ibm_cf/Dockerfile.python310 deleted file mode 100644 index c11c91af0..000000000 --- a/runtime/ibm_cf/Dockerfile.python310 +++ /dev/null @@ -1,30 +0,0 @@ -FROM python:3.10-slim-buster - -ENV FLASK_PROXY_PORT 8080 - -RUN apt-get update \ - # add some packages required for the pip install - && apt-get install -y \ - gcc \ - zlib1g-dev \ - libxslt-dev \ - libxml2-dev \ - zip \ - unzip \ - make \ - # cleanup package lists, they are not used anymore in this image - && rm -rf /var/lib/apt/lists/* \ - && apt-cache search linux-headers-generic - -COPY requirements.txt requirements.txt -RUN pip install --upgrade pip setuptools six && pip install --no-cache-dir -r requirements.txt - -# create action working directory -RUN mkdir -p /action \ - && mkdir -p /actionProxy \ - && mkdir -p /pythonAction - -ADD https://raw.githubusercontent.com/apache/openwhisk-runtime-docker/8b2e205c39d84ed5ede6b1b08cccf314a2b13105/core/actionProxy/actionproxy.py /actionProxy/actionproxy.py -ADD https://raw.githubusercontent.com/apache/openwhisk-runtime-python/3%401.0.3/core/pythonAction/pythonrunner.py /pythonAction/pythonrunner.py - -CMD ["/bin/bash", "-c", "cd /pythonAction && python -u pythonrunner.py"] diff --git a/runtime/ibm_cf/Dockerfile.python36 b/runtime/ibm_cf/Dockerfile.python36 deleted file mode 100644 index 827b8daa3..000000000 --- a/runtime/ibm_cf/Dockerfile.python36 +++ /dev/null @@ -1,34 +0,0 @@ -# Based on: https://github.com/ibm-functions/runtime-python/tree/master/python3.6 - -# Python 3.6 -FROM python:3.6-slim-buster - -ENV FLASK_PROXY_PORT 8080 - -RUN apt-get update \ - # add some packages required for the pip install - && apt-get install -y \ - gcc \ - zlib1g-dev \ - libxslt-dev \ - libxml2-dev \ - zip \ - unzip \ - make \ - # cleanup package lists, they are not used anymore in this image - && rm -rf /var/lib/apt/lists/* \ - && apt-cache search linux-headers-generic - -# install additional python modules -COPY requirements.txt requirements.txt -RUN pip install --upgrade pip setuptools six gevent && pip install --no-cache-dir -r requirements.txt - -# create action working directory -RUN mkdir -p /action \ - && mkdir -p /actionProxy \ - && mkdir -p /pythonAction - -ADD https://raw.githubusercontent.com/apache/openwhisk-runtime-docker/8b2e205c39d84ed5ede6b1b08cccf314a2b13105/core/actionProxy/actionproxy.py /actionProxy/actionproxy.py -ADD https://raw.githubusercontent.com/apache/openwhisk-runtime-python/3%401.0.3/core/pythonAction/pythonrunner.py /pythonAction/pythonrunner.py - -CMD ["/bin/bash", "-c", "cd /pythonAction && python -u pythonrunner.py"] diff --git a/runtime/ibm_cf/Dockerfile.python37 b/runtime/ibm_cf/Dockerfile.python37 deleted file mode 100644 index b4f0d9faa..000000000 --- a/runtime/ibm_cf/Dockerfile.python37 +++ /dev/null @@ -1,16 +0,0 @@ -# Based on: https://github.com/ibm-functions/runtime-python/tree/master/python3.7 - -FROM openwhisk/actionloop-python-v3.7:4e43668 - -RUN apt-get update \ - # Upgrade installed packages to get latest security fixes if the base image does not contain them already. - && apt-get upgrade -y --no-install-recommends \ - # cleanup package lists, they are not used anymore in this image - && rm -rf /var/lib/apt/lists/* \ - # We do not have mysql-server installed but mysql-common contains config files (/etc/mysql/my.cnf) for it. - # We need to add some dummy entries to /etc/mysql/my.cnf to sattisfy vulnerability checking of it. 
- && echo "\n[mysqld]\nssl-ca=/tmp/ca.pem\nssl-cert=/tmp/server-cert.pem\nssl-key=/tmp/server-key.pem\n" >> /etc/mysql/my.cnf - -# install additional python modules -COPY requirements.txt requirements.txt -RUN pip install --upgrade pip setuptools six && pip install --no-cache-dir -r requirements.txt diff --git a/runtime/ibm_cf/Dockerfile.python38 b/runtime/ibm_cf/Dockerfile.python38 deleted file mode 100644 index d625c339a..000000000 --- a/runtime/ibm_cf/Dockerfile.python38 +++ /dev/null @@ -1,30 +0,0 @@ -FROM python:3.8-slim-buster - -ENV FLASK_PROXY_PORT 8080 - -RUN apt-get update \ - # add some packages required for the pip install - && apt-get install -y \ - gcc \ - zlib1g-dev \ - libxslt-dev \ - libxml2-dev \ - zip \ - unzip \ - make \ - # cleanup package lists, they are not used anymore in this image - && rm -rf /var/lib/apt/lists/* \ - && apt-cache search linux-headers-generic - -COPY requirements.txt requirements.txt -RUN pip install --upgrade pip setuptools six && pip install --no-cache-dir -r requirements.txt - -# create action working directory -RUN mkdir -p /action \ - && mkdir -p /actionProxy \ - && mkdir -p /pythonAction - -ADD https://raw.githubusercontent.com/apache/openwhisk-runtime-docker/8b2e205c39d84ed5ede6b1b08cccf314a2b13105/core/actionProxy/actionproxy.py /actionProxy/actionproxy.py -ADD https://raw.githubusercontent.com/apache/openwhisk-runtime-python/3%401.0.3/core/pythonAction/pythonrunner.py /pythonAction/pythonrunner.py - -CMD ["/bin/bash", "-c", "cd /pythonAction && python -u pythonrunner.py"] diff --git a/runtime/ibm_cf/requirements.txt b/runtime/ibm_cf/requirements.txt deleted file mode 100644 index 44b567062..000000000 --- a/runtime/ibm_cf/requirements.txt +++ /dev/null @@ -1,48 +0,0 @@ -# Requirements.txt contains a list of dependencies for the Python Application # - -# Setup modules -gevent -flask - -# Lithops -cloudpickle -ps-mem -tblib - -# default available packages for python3 -beautifulsoup4 -httplib2 -#kafka_python -lxml -python-dateutil -requests -scrapy -simplejson -#virtualenv -Twisted -PyJWT - -# packages for numerics -numpy -#scikit-learn -scipy -pandas - -# packages for image processing -Pillow - -# IBM specific python modules -#ibm_db -#cloudant -#watson-developer-cloud -ibm-cos-sdk -#ibmcloudsql -ibm-vpc - -# Compose Libs -psycopg2-binary -#pymongo -redis -pika -#elasticsearch -etcd3 diff --git a/runtime/ibm_vpc/README.md b/runtime/ibm_vpc/README.md index 547699ae9..ec97b3be4 100644 --- a/runtime/ibm_vpc/README.md +++ b/runtime/ibm_vpc/README.md @@ -1,10 +1,8 @@ # Lithops runtime for IBM VPC -In IBM VPC, you can run functions by using a Virtual machine (VM). In the VM, functions run using parallel processes. In this case, it is not needed to install anything in the remote VMs since Lithops does this process automatically the first time you use them. However, use a custom VM it is a preferable approach, since using a pre-built custom image will greatly improve the overall execution time. To benefit from this approach, follow the following steps: +In IBM VPC, you can execute functions using a Virtual Machine (VM). These functions operate through parallel processes within the VM. When utilizing Lithops for the first time, there's no need to manually install anything on the remote VMs, as Lithops handles this process automatically. However, employing a custom VM is recommended, as utilizing a pre-built custom image significantly enhances overall execution time. 
To implement this approach effectively, follow these steps: -## Option 1 (beta): - -**Note**: This is a beta feature. Please open an issue if you encounter any error using this way of creating VM images in IBM VPC. +## Option 1: For building the default VM image that contains all dependencies required by Lithops, execute: @@ -12,8 +10,8 @@ For building the default VM image that contains all dependencies required by Lit lithops image build -b ibm_vpc ``` -This command will create an image called "lithops-worker-default" in the target region. -If the image already exists, and you want to updete it, use the '--overwrite' or '-o' flag: +This command will create an image called "lithops-ubuntu-22-04-3-minimal-amd64-1" in the target region. +If the image already exists, and you want to update it, use the `--overwrite` or `-o` parameter: ``` lithops image build -b ibm_vpc --overwrite @@ -27,10 +25,34 @@ For creating a custom VM image, you can provide an `.sh` script with all the des lithops image build -b ibm_vpc -f myscript.sh custom-lithops-runtime ``` -In this case, if you use a custom name, you must provide the Image ID, printed at the end of the build command, in your lithops config. +If you want to upload local files to the custom VM Image, you can include them using the `--include` or `-i` parameter (src:dst), for example: + +``` +lithops image build -b ibm_vpc -f myscript.sh -i /home/user/test.bin:/home/ubuntu/test.bin custom-lithops-runtime +``` + +In the case of using a custom name, you must provide the Image ID, printed at the end of the build command, in your lithops config, for example: + +```yaml +ibm_vpc: + ... + image_id: + ... +``` ## Option 2: +You can create a VM image manually. For example, you can create a VM in your IBM Cloud region, access the VM, install all the dependencies in the VM itself (apt-get, pip3 install, ...), stop the VM, create a VM Image, and then put the image_id in your lithops config, for example: + +```yaml +ibm_vpc: + ... + image_id: + ... +``` + +## Option 3 (Discontinued): + For building the VM image that contains all dependencies required by Lithops, execute the [build script](build_lithops_runtime.sh) located in this folder. The best is to use vanilla Ubuntu machine to run this script and this script will use a base image based on **ubuntu-20.04-server-cloudimg-amd64**. There is need to have sudo privileges to run this script. Once you accessed the machine, download the script @@ -89,14 +111,3 @@ Once local image is ready you need to upload it to COS. The best would be to use ``` 3. [Navigate to IBM VPC dashboard, custom images](https://cloud.ibm.com/vpc-ext/compute/images) and follow instructions to create new custom image based on the `lithops-ubuntu-20.04.qcow2` - -4.
**Clean everything** - - You can clean everything related to Lithops, such as all deployed workers and cache information, and start from scratch by simply running the next command (Configuration is not deleted): - ``` - $ lithops clean -b ibm_vpc - ``` - In order to delete also master VM use `--all` flag - ``` - $ lithops clean -b ibm_vpc --all - ``` diff --git a/runtime/knative/Dockerfile b/runtime/knative/Dockerfile index 2c597a8d6..dc4d7592d 100644 --- a/runtime/knative/Dockerfile +++ b/runtime/knative/Dockerfile @@ -35,7 +35,8 @@ RUN pip install --upgrade setuptools six pip \ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil ENV CONCURRENCY 4 ENV TIMEOUT 600 diff --git a/runtime/knative/Dockerfile.conda b/runtime/knative/Dockerfile.conda index 3cf680675..17113e876 100644 --- a/runtime/knative/Dockerfile.conda +++ b/runtime/knative/Dockerfile.conda @@ -39,7 +39,8 @@ RUN pip install --upgrade setuptools six pip \ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil # Add your Conda required packages here. Ensure "conda clean --all" at # the end to remove temporary data. One "RUN" line is better than multiple diff --git a/runtime/knative/README.md b/runtime/knative/README.md index 3056456ec..8ca56ef12 100644 --- a/runtime/knative/README.md +++ b/runtime/knative/README.md @@ -57,27 +57,27 @@ knative: $ docker login Update the Dockerfile that better fits to your requirements with your required system packages and Python modules. - If you need another Python version, for example Python 3.8, you must change the initial line of the Dockefile: + If you need another Python version, for example Python 3.12, you must change the initial line of the Dockefile: $ lithops runtime build -b knative docker.io/username/runtimename:tag Note that Docker hub image names look like *"docker_username/runtimename:tag"* and must be all lower case, and must not include '.' or '_', for example: - $ lithops runtime build -b knative docker.io/username/lithops-kn-custom-v38:01 + $ lithops runtime build -b knative docker.io/username/lithops-kn-custom-v312:01 By default the Dockerfile should be located in the same folder from where you execute the **lithops runtime** command. If your Dockerfile is located in another folder, or the Dockerfile has another name, you can specify its location with the **-f** parameter, for example: - $ lithops runtime build -b knative -f knative/Dockerfile.conda docker.io/username/lithops-kn-custom-v38:01 + $ lithops runtime build -b knative -f knative/Dockerfile.conda docker.io/username/lithops-kn-custom-v312:01 Once you have built your runtime with all of your necessary packages, you can already use it with Lithops. To do so, you have to specify the full docker image name in the configuration or when you create the **FunctionExecutor** instance, or directly in the config file, for example: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='docker.io/username/lithops-kn-custom-v38:01') + fexec = lithops.FunctionExecutor(runtime='docker.io/username/lithops-kn-custom-v312:01') ``` - *NOTE: In this previous example shows how to build a Docker image based on Python 3.8, this means that now you also need Python 3.8 in the client machine.* + *NOTE: In this previous example shows how to build a Docker image based on Python 3.12, this means that now you also need Python 3.12 in the client machine.* 2. 
**Use an already built runtime from a public repository** @@ -86,29 +86,29 @@ knative: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='docker.io/username/lithops-kn-conda-v38:01') + fexec = lithops.FunctionExecutor(runtime='docker.io/username/lithops-kn-conda-v312:01') ``` Alternatively, you can create a Lithops runtime based on already built Docker image by executing the following command, which will deploy all the necessary information to use the runtime with your Lithops. $ lithops runtime deploy -b knative docker.io/username/runtimename:tag - For example, you can use an already built runtime based on Python 3.8 and with the *matplotlib* and *nltk* libraries by running: + For example, you can use an already built runtime based on Python 3.12 and with the *matplotlib* and *nltk* libraries by running: - $ lithops runtime deploy -b knative docker.io/username/lithops-kn-matplotlib-v38:01 + $ lithops runtime deploy -b knative docker.io/username/lithops-kn-matplotlib-v312:01 Once finished, you can use the runtime in your Lithops code: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='docker.io/username/lithops-kn-matplotlib:v38:01') + fexec = lithops.FunctionExecutor(runtime='docker.io/username/lithops-kn-matplotlib:v312:01') ``` ## Runtime Management 1. **Update an existing runtime** - If you are a developer, and modified the PyWeen source code, you need to deploy the changes before executing Lithops. + If you are a developer, and modified the Lithops source code, you need to deploy the changes before executing Lithops. You can update default runtime by: @@ -118,9 +118,9 @@ knative: $ lithops runtime update docker.io/username/runtimename:tag -b knative - For example, you can update an already created runtime based on the Docker image `jsampe/lithops-kn-matplotlib-v38:01` by: + For example, you can update an already created runtime based on the Docker image `jsampe/lithops-kn-matplotlib-v312:01` by: - $ lithops runtime update docker.io/username/lithops-kn-matplotlib-v38:01 -b knative + $ lithops runtime update docker.io/username/lithops-kn-matplotlib-v312:01 -b knative Alternatively, you can update all the deployed runtimes at a time by: @@ -138,9 +138,9 @@ knative: $ lithops runtime delete docker.io/username/runtimename:tag -b knative - For example, you can delete runtime based on the Docker image `docker.io/username/lithops-kn-conda-v36:01` by: + For example, you can delete runtime based on the Docker image `docker.io/username/lithops-kn-conda-v312:01` by: - $ lithops runtime delete docker.io/username/lithops-kn-conda-v36:01 -b knative + $ lithops runtime delete docker.io/username/lithops-kn-conda-v312:01 -b knative You can delete all the runtimes at a time by: diff --git a/runtime/kubernetes/Dockerfile b/runtime/kubernetes/Dockerfile index 729ecc138..f1d682d90 100644 --- a/runtime/kubernetes/Dockerfile +++ b/runtime/kubernetes/Dockerfile @@ -33,7 +33,8 @@ RUN pip install --upgrade setuptools six pip \ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil ENV PYTHONUNBUFFERED TRUE diff --git a/runtime/kubernetes/Dockerfile.conda b/runtime/kubernetes/Dockerfile.conda index 94f4265d3..37ddcc14c 100644 --- a/runtime/kubernetes/Dockerfile.conda +++ b/runtime/kubernetes/Dockerfile.conda @@ -37,7 +37,8 @@ RUN pip install --upgrade setuptools six pip \ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil # Add your Conda required packages here. Ensure "conda clean --all" at # the end to remove temporary data. 
One "RUN" line is better than multiple diff --git a/runtime/kubernetes/README.md b/runtime/kubernetes/README.md index 341092818..58cad0e4a 100644 --- a/runtime/kubernetes/README.md +++ b/runtime/kubernetes/README.md @@ -33,27 +33,27 @@ k8s: $ docker login Update the Dockerfile that better fits to your requirements with your required system packages and Python modules. - If you need another Python version, for example Python 3.9, you must change the initial line of the Dockefile: + If you need another Python version, for example Python 3.12, you must change the initial line of the Dockerfile: $ lithops runtime build -b k8s docker_username/runtimename:tag Note that Docker hub image names look like *"docker_username/runtimename:tag"* and must be all lower case, for example: - $ lithops runtime build -b k8s myaccount/lithops-k8s-custom-v39:01 + $ lithops runtime build -b k8s myaccount/lithops-k8s-custom-v312:01 By default the Dockerfile should be located in the same folder from where you execute the **lithops runtime** command. If your Dockerfile is located in another folder, or the Dockerfile has another name, you can specify its location with the **-f** parameter, for example: - $ lithops runtime build -b k8s -f kubernetes/Dockerfile.conda myaccount/lithops-k8s-custom-v39:01 + $ lithops runtime build -b k8s -f kubernetes/Dockerfile.conda myaccount/lithops-k8s-custom-v312:01 Once you have built your runtime with all of your necessary packages, you can already use it with Lithops. To do so, you have to specify the full docker image name in the configuration or when you create the **FunctionExecutor** instance, or directly in the config file, for example: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='myaccount/lithops-k8s-custom-v39:01') + fexec = lithops.FunctionExecutor(runtime='myaccount/lithops-k8s-custom-v312:01') ``` - *NOTE: In this previous example shows how to build a Docker image based on Python 3.9, this means that now you also need Python 3.9 in the client machine.* + *NOTE: The previous example shows how to build a Docker image based on Python 3.12, which means that you also need Python 3.12 in the client machine.* 2. **Use an already built runtime from a public repository** @@ -62,7 +62,7 @@ k8s: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='docker.io/lithopscloud/lithops-k8s-conda-v39:01') + fexec = lithops.FunctionExecutor(runtime='docker.io/lithopscloud/lithops-k8s-conda-v312:01') ``` Alternatively, you can create a Lithops runtime based on already built Docker image by executing the following command, which will deploy all the necessary information to use the runtime with your Lithops. @@ -71,15 +71,15 @@ k8s: ``` $ lithops runtime deploy -b k8s docker_username/runtimename:tag ``` - For example, you can use an already buit runtime based on Python 3.9 and with the *matplotlib* and *nltk* libraries by running: + For example, you can use an already built runtime based on Python 3.12 and with the *matplotlib* and *nltk* libraries by running: ``` - $ lithops runtime deploy -b k8s docker.io/lithopscloud/lithops-k8s-matplotlib-v39:01 + $ lithops runtime deploy -b k8s docker.io/lithopscloud/lithops-k8s-matplotlib-v312:01 ``` ```python import lithops - fexec = lithops.FunctionExecutor(runtime='docker.io/lithopscloud/lithops-k8s-matplotlib:v39:01') + fexec = lithops.FunctionExecutor(runtime='docker.io/lithopscloud/lithops-k8s-matplotlib-v312:01') ``` 3.
**Clean everything** diff --git a/runtime/ibm_cf/Dockerfile.python39 b/runtime/openwhisk/Dockerfile similarity index 84% rename from runtime/ibm_cf/Dockerfile.python39 rename to runtime/openwhisk/Dockerfile index 54e30d9ed..0ef00df2a 100644 --- a/runtime/ibm_cf/Dockerfile.python39 +++ b/runtime/openwhisk/Dockerfile @@ -1,17 +1,17 @@ -FROM python:3.9-slim-buster +#FROM python:3.6-slim-buster +#FROM python:3.7-slim-buster +#FROM python:3.8-slim-buster +#FROM python:3.9-slim-buster +#FROM python:3.10-slim-buster +FROM python:3.11-slim-buster ENV FLASK_PROXY_PORT 8080 RUN apt-get update \ # add some packages required for the pip install && apt-get install -y \ - gcc \ - zlib1g-dev \ - libxslt-dev \ - libxml2-dev \ zip \ unzip \ - make \ # cleanup package lists, they are not used anymore in this image && rm -rf /var/lib/apt/lists/* \ && apt-cache search linux-headers-generic diff --git a/runtime/ibm_cf/Dockerfile.conda b/runtime/openwhisk/Dockerfile.conda similarity index 88% rename from runtime/ibm_cf/Dockerfile.conda rename to runtime/openwhisk/Dockerfile.conda index 1beb334e0..2b35c4914 100644 --- a/runtime/ibm_cf/Dockerfile.conda +++ b/runtime/openwhisk/Dockerfile.conda @@ -5,17 +5,25 @@ #FROM continuumio/miniconda3:4.7.12 # Python 3.8 -FROM continuumio/miniconda3:4.9.2 +#FROM continuumio/miniconda3:4.9.2 # Python 3.9 #FROM continuumio/miniconda3:4.10.3 +# Python 3.10 +#FROM continuumio/miniconda3:23.3.1-0 + +# Python 3.11 +FROM continuumio/miniconda3:23.10.0-1 + # YOU MUST PIN THE PYTHON VERSION TO PREVENT IT TO BE UPDATED # For python 3.6 use "python==3.6.5" # For python 3.7 use "python==3.7.4" # For python 3.8 use "python==3.8.5" # For python 3.9 use "python==3.9.5" -RUN echo "python==3.8.5" >> /opt/conda/conda-meta/pinned +# For python 3.10 use "python==3.10.10" +# For python 3.11 use "python==3.11.7" +RUN echo "python==3.11.7" >> /opt/conda/conda-meta/pinned ENV FLASK_PROXY_PORT 8080 diff --git a/runtime/ibm_cf/Dockerfile.dlib b/runtime/openwhisk/Dockerfile.dlib similarity index 100% rename from runtime/ibm_cf/Dockerfile.dlib rename to runtime/openwhisk/Dockerfile.dlib diff --git a/runtime/ibm_cf/Docker.dso b/runtime/openwhisk/Dockerfile.dso similarity index 100% rename from runtime/ibm_cf/Docker.dso rename to runtime/openwhisk/Dockerfile.dso diff --git a/runtime/ibm_cf/Dockerfile.gdal b/runtime/openwhisk/Dockerfile.gdal similarity index 98% rename from runtime/ibm_cf/Dockerfile.gdal rename to runtime/openwhisk/Dockerfile.gdal index 1d7d9b449..145a3c748 100644 --- a/runtime/ibm_cf/Dockerfile.gdal +++ b/runtime/openwhisk/Dockerfile.gdal @@ -9,9 +9,6 @@ ARG PROJ_INSTALL_PREFIX=/usr/local FROM ubuntu:18.04 as builder -# Derived from osgeo/proj by Howard Butler -MAINTAINER Even Rouault - # Setup build env for PROJ RUN apt-get update -y \ && apt-get install -y --fix-missing --no-install-recommends \ diff --git a/runtime/ibm_cf/Dockerfile.slim b/runtime/openwhisk/Dockerfile.slim similarity index 81% rename from runtime/ibm_cf/Dockerfile.slim rename to runtime/openwhisk/Dockerfile.slim index 5aba00bbb..407a1cffa 100644 --- a/runtime/ibm_cf/Dockerfile.slim +++ b/runtime/openwhisk/Dockerfile.slim @@ -1,9 +1,12 @@ # Minimal Docker image for Lithops +#FROM python:3.6-slim-buster +#FROM python:3.7-slim-buster #FROM python:3.8-slim-buster #FROM python:3.9-slim-buster #FROM python:3.10-slim-buster FROM python:3.11-slim-buster +#FROM python:3.12-slim-bookworm ENV FLASK_PROXY_PORT 8080 @@ -11,7 +14,11 @@ RUN pip install --upgrade setuptools six \ && pip install --no-cache-dir \ flask \ pika \ + boto3 
\ + ibm-cloud-sdk-core \ ibm-cos-sdk \ + scikit-learn \ + diskcache \ redis \ gevent \ requests \ @@ -19,7 +26,8 @@ RUN pip install --upgrade setuptools six \ numpy \ cloudpickle \ ps-mem \ - tblib + tblib \ + psutil # create action working directory RUN mkdir -p /action \ diff --git a/runtime/ibm_cf/README.md b/runtime/openwhisk/README.md similarity index 56% rename from runtime/ibm_cf/README.md rename to runtime/openwhisk/README.md index d46029a4e..0432f4056 100644 --- a/runtime/ibm_cf/README.md +++ b/runtime/openwhisk/README.md @@ -1,23 +1,14 @@ -# Lithops runtime for IBM Cloud Functions +# Lithops runtime for OpenWhisk -The runtime is the place where your functions are executed. In Lithops, runtimes are based on docker images, and it includes by default three different runtimes that allows you to run functions with Python 3.6, 3.7, 3.8, 3.9 and 3.10 environments. Lithops main runtime is responsible to execute Python functions within IBM Cloud Functions cluster. The strong requirement here is to match Python versions between the client and the runtime. The runtime may also contain additional packages which your code depends on. +The runtime is the place where your functions are executed. In Lithops, runtimes are based on docker images, and it includes by default three different runtimes that allows you to run functions with Python >= 3.6 environments. Lithops main runtime is responsible to execute Python functions within OpenWhisk. The strong requirement here is to match Python versions between the client and the runtime. The runtime may also contain additional packages which your code depends on. -Lithops for IBM Cloud is shipped with these default runtimes: - -| Runtime name | Python version | Packages included | -| ----| ----| ---- | -| lithopscloud/ibmcf-python-v36 | 3.6 | [list of packages](requirements.txt) | -| lithopscloud/ibmcf-python-v37 | 3.7 | [list of packages](requirements.txt) | -| lithopscloud/ibmcf-python-v38 | 3.8 | [list of packages](requirements.txt) | -| lithopscloud/ibmcf-python-v39 | 3.9 | [list of packages](requirements.txt) | -| lithopscloud/ibmcf-python-v310 | 3.10 | [list of packages](requirements.txt) | The default runtime is created the first time you execute a function. Lithops automatically detects the Python version of your environment and deploys the default runtime based on it. Alternatively, you can create the default runtime by running the following command: ```bash -$ lithops runtime deploy default -b ibm_cf -s ibm_cos +$ lithops runtime deploy default -b openwhisk -s minio ``` To run a function with the default runtime you don't need to specify anything in the code, since everything is managed internally by Lithops: @@ -28,16 +19,16 @@ import lithops def my_function(x): return x + 7 -pw = lithops.FunctionExecutor() -pw.call_async(my_function, 3) -result = pw.get_result() +fexec = lithops.FunctionExecutor() +fexec.call_async(my_function, 3) +result = fexec.get_result() ``` By default, Lithops uses 256MB as runtime memory size. However, you can change it in the `config` or when you obtain the executor, for example: ```python import lithops -pw = lithops.FunctionExecutor(runtime_memory=512) +fexec = lithops.FunctionExecutor(runtime_memory=512) ``` ## Custom runtime @@ -46,42 +37,37 @@ pw = lithops.FunctionExecutor(runtime_memory=512) If you need some Python modules (or other system libraries) which are not included in the default docker images (see table above), it is possible to build your own Lithops runtime with all of them. 
- This alternative usage is based on to build a local Docker image, deploy it to the docker hub (you need a [Docker Hub account](https://hub.docker.com)) and use it as a Lithops base runtime. - Project provides some base skeletons of Docker images for each supported python version, for example: - - * [Dockerfile](ibm_cf/Dockerfile.python36) - The image is based on `python:3.6-slim-buster`. - * [Dockerfile](ibm_cf/Dockerfile.python37) - The image is based on `python:3.7-slim-buster`. - * [Dockerfile](ibm_cf/Dockerfile.python38) - The image is based on `python:3.8-slim-buster`. - - To build your own runtime, first install the Docker CE version in your client machine. You can find the instructions [here](https://docs.docker.com/get-docker/). If you already have Docker installed omit this step. + This alternative usage is based on to build a local Docker image, deploy it to the docker hub (you need a [Docker Hub account](https://hub.docker.com)) (or another registry) and use it as a Lithops base runtime. + + In this folder you will finde some Dockerfile skeletons that you can use to build your runtimes. To build your own runtime, first install the Docker CE version in your client machine. You can find the instructions [here](https://docs.docker.com/get-docker/). If you already have Docker installed omit this step. Login to your Docker hub account by running in a terminal the next command. $ docker login - Navigate to [ibm_cf/](imb_cf/) and update the Dockerfile that better fits to your requirements with your required system packages and Python modules. - If you need another Python version, for example Python 3.8, you must use the [Dockerfile.python38](ibm_cf/Dockerfile.python38) that - points to a source image based on Python 3.8. Finally run the build script: + Navigate to [openwhisk/](openwhisk/) and update the Dockerfile that better fits to your requirements with your required system packages and Python modules. + If you need another Python version, for example Python v3.12, you must use this [Dockerfile](openwhisk/Dockerfile) that + points to a source image based on Python 3.12. Finally run the build script: - $ lithops runtime build -b ibm_cf docker_username/runtimename:tag + $ lithops runtime build -b openwhisk docker_username/runtimename:tag Note that Docker hub image names look like *"docker_username/runtimename:tag"* and must be all lower case, for example: - $ lithops runtime build -b ibm_cf jsampe/lithops-custom-runtime-3.8:0.1 + $ lithops runtime build -b openwhisk jsampe/lithops-custom-runtime-v312:0.1 By default the Dockerfile should be located in the same folder from where you execute the **lithops runtime** command. If your Dockerfile is located in another folder, or the Dockerfile has another name, you can specify its location with the **-f** parameter, for example: - $ lithops runtime build -b ibm_cf -f ibm_cf/Dockerfile.conda username/lithops-conda-runtime-3.8:0.1 + $ lithops runtime build -b openwhisk -f openwhisk/Dockerfile.conda username/lithops-conda-runtime-v312:0.1 Once you have built your runtime with all of your necessary packages, you can already use it with Lithops. 
To do so, you have to specify the full docker image name in the configuration or when you create the **ibm_cf_executor** instance, for example: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='jsampe/lithops-custom-runtime-3.8:0.1') + fexec = lithops.FunctionExecutor(runtime='jsampe/lithops-custom-runtime-v312:0.1') ``` - *NOTE: In this previous example we built a Docker image based on Python 3.8, this means that now we also need Python 3.8 in the client machine.* + *NOTE: In this previous example we built a Docker image based on Python v312, this means that now we also need Python v312 in the client machine.* 2. **Use an already built runtime from a public repository** @@ -90,22 +76,22 @@ pw = lithops.FunctionExecutor(runtime_memory=512) ```python import lithops - fexec = lithops.FunctionExecutor(runtime='jsampe/lithops-conda-3.8:0.1') + fexec = lithops.FunctionExecutor(runtime='jsampe/lithops-conda-v312:0.1') ``` Alternatively, you can create a Lithops runtime based on already built Docker image by executing the following command, which will deploy all the necessary information to use the runtime with your Lithops. - $ lithops runtime deploy docker_username/runtimename:tag -b ibm_cf -s ibm_cos + $ lithops runtime deploy docker_username/runtimename:tag -b openwhisk -s minio - For example, you can use an already built runtime based on Python 3.8 and with the *matplotlib* and *nltk* libraries by running: + For example, you can use an already built runtime based on Python v312 and with the *matplotlib* and *nltk* libraries by running: - $ lithops runtime deploy jsampe/lithops-matplotlib-3.8:0.1 -b ibm_cf -s ibm_cos + $ lithops runtime deploy jsampe/lithops-matplotlib-v312:0.1 -b openwhisk -s minio Once finished, you can use the runtime in your Lithops code: ```python import lithops - fexec = lithops.FunctionExecutor(runtime='jsampe/lithops-matplotlib:3.8:0.1') + fexec = lithops.FunctionExecutor(runtime='jsampe/lithops-matplotlib:v312:0.1') ``` ## Runtime Management @@ -116,19 +102,19 @@ pw = lithops.FunctionExecutor(runtime_memory=512) You can update default runtime by: - $ lithops runtime update default -b ibm_cf -s ibm_cos + $ lithops runtime update default -b openwhisk -s minio You can update any other runtime deployed in your namespace by specifying the docker image that the runtime depends on: - $ lithops runtime update docker_username/runtimename:tag -b ibm_cf -s ibm_cos + $ lithops runtime update docker_username/runtimename:tag -b openwhisk -s minio - For example, you can update an already created runtime based on the Docker image `jsampe/lithops-conda-3.8:0.1` by: + For example, you can update an already created runtime based on the Docker image `jsampe/lithops-conda-v312:0.1` by: - $ lithops runtime update jsampe/lithops-conda-3.8:0.1 -b ibm_cf -s ibm_cos + $ lithops runtime update jsampe/lithops-conda-v312:0.1 -b openwhisk -s minio Alternatively, you can update all the deployed runtimes at a time by: - $ lithops runtime update all -b ibm_cf -s ibm_cos + $ lithops runtime update all -b openwhisk -s minio 2. 
@@ -136,22 +122,22 @@ pw = lithops.FunctionExecutor(runtime_memory=512)

    You can delete default runtime by:

-       $ lithops runtime delete default -b ibm_cf -s ibm_cos
+       $ lithops runtime delete default -b openwhisk -s minio

    You can delete any other runtime deployed in your namespace by specifying the docker image that the runtime depends on:

-       $ lithops runtime delete docker_username/runtimename:tag -b ibm_cf -s ibm_cos
+       $ lithops runtime delete docker_username/runtimename:tag -b openwhisk -s minio

-   For example, you can delete runtime based on the Docker image `jsampe/lithops-conda-3.8:0.1` by:
+   For example, you can delete the runtime based on the Docker image `jsampe/lithops-conda-v312:0.1` by:

-       $ lithops runtime delete jsampe/lithops-conda-3.8:0.1 -b ibm_cf -s ibm_cos
+       $ lithops runtime delete jsampe/lithops-conda-v312:0.1 -b openwhisk -s minio

    You can delete all the runtimes at a time by:

-       $ lithops runtime delete all -b ibm_cf -s ibm_cos
+       $ lithops runtime delete all -b openwhisk -s minio

3. **Clean everything**

    You can clean everything related to Lithops, such as all deployed runtimes and cache information, and start from scratch by simply running the next command (Configuration is not deleted):

-       $ lithops clean -b ibm_cf -s ibm_cos
+       $ lithops clean -b openwhisk -s minio
diff --git a/runtime/openwhisk/requirements.txt b/runtime/openwhisk/requirements.txt
new file mode 100644
index 000000000..d3e25cb6a
--- /dev/null
+++ b/runtime/openwhisk/requirements.txt
@@ -0,0 +1,22 @@
+# Requirements.txt contains a list of dependencies for the Python Application #
+
+# Mandatory Lithops packages
+flask
+pika
+boto3
+ibm-cloud-sdk-core
+ibm-cos-sdk
+scikit-learn
+diskcache
+redis
+gevent
+requests
+PyYAML
+numpy
+cloudpickle
+ps-mem
+tblib
+psutil
+
+# Optional packages (Add your packages below)
+#pandas
\ No newline at end of file
diff --git a/runtime/oracle_f/Dockerfile b/runtime/oracle_f/Dockerfile
index 9a1cfe003..af667fec3 100644
--- a/runtime/oracle_f/Dockerfile
+++ b/runtime/oracle_f/Dockerfile
@@ -30,7 +30,8 @@ RUN pip install --upgrade --ignore-installed setuptools six pip \
        cloudpickle \
        ps-mem \
        tblib \
-       oci
+       oci \
+       psutil

ARG FUNCTION_DIR="/function"
diff --git a/runtime/singularity/README.md b/runtime/singularity/README.md
new file mode 100644
index 000000000..57c8a4dc7
--- /dev/null
+++ b/runtime/singularity/README.md
@@ -0,0 +1,67 @@
+# Lithops runtime for Singularity
+This document describes how to use Singularity containers as runtimes for your Lithops functions.
+
+The runtime provides a pre-configured environment to execute your Lithops functions within Singularity containers. It includes the necessary packages and dependencies to execute your Lithops code.
+
+**Note:** This backend requires a RabbitMQ server for communication.
+
+## Building a Singularity Runtime
+
+Use the `lithops runtime build` command to create a `.sif` image containing the necessary Lithops environment.
+
+1. **Building a basic runtime**
+
+       $ lithops runtime build -b singularity singularity-runtime-name --fakeroot --force
+
+   This command creates a new `singularity-runtime-name.sif` file with the necessary libraries.
+
+
+2. **Building a custom runtime from a definition file**
+
+   For greater control and flexibility with custom packages, use a Singularity definition file (.def):
+
+       $ lithops runtime build -b singularity my-custom-runtime -f my-runtime.def --fakeroot --force
+
+   This command generates `my-custom-runtime.sif` based on `my-runtime.def`.
+
+
+**Building flags:**
+* `--fakeroot`: Often required for building Singularity images without root privileges.
+* `--force`: Overwrites any existing images with the same name.
+
+You can find more information about the Singularity flags in the [Singularity documentation](https://docs.sylabs.io/guides/latest/user-guide/build_a_container.html).
+
+
+## Deploying and Running the Runtime
+
+Deployment is manual because Lithops does not have direct access to the cluster nodes: the built `.sif` image must be transferred to each cluster node by hand. This ensures that the required runtime environment is available on every node that will run Lithops functions.
+
+1. **Image Transfer:** Manually transfer the built `.sif` image to each cluster node that will run Lithops functions.
+
+2. **Starting the Singularity Instance:** On each node, start a Singularity instance with your runtime image:
+
+       $ singularity instance start --fakeroot /tmp/singularity-runtime-name.sif lithops-worker
+
+   This creates a Singularity instance called `lithops-worker`.
+
+3. **Running Functions:** Execute Lithops functions within this instance:
+
+       $ singularity run --env AMQP_URL=amqp://<user>:<password>@<host>:<port>/ instance://lithops-worker
+
+   Replace the placeholders (e.g., `<user>`) with your RabbitMQ credentials to enable communication between your Lithops client and the function runtime.
+
+## Configuration
+By default, the Singularity runtime uses the `/tmp` directory to store the `.sif` images. You can customize this path in your Lithops configuration file:
+
+```yaml
+singularity:
+    sif_path: /your/custom/path
+    runtime: singularity-runtime-name
+```
+
+Also, to execute the Singularity backend, you need to set the RabbitMQ AMQP URL in your configuration file:
+
+```yaml
+rabbitmq:
+    amqp_url: amqp://<user>:<password>@<host>:<port>/
+```
\ No newline at end of file
diff --git a/runtime/singularity/singularity-template.def b/runtime/singularity/singularity-template.def
new file mode 100644
index 000000000..424995cf3
--- /dev/null
+++ b/runtime/singularity/singularity-template.def
@@ -0,0 +1,46 @@
+Bootstrap: docker
+
+# Python 3.7
+#FROM python:3.7-slim-buster
+
+# Python 3.8
+#FROM python:3.8-slim-buster
+
+# Python 3.9
+#FROM python:3.9-slim-buster
+
+# Python 3.10
+#FROM: python:3.10-slim-buster
+
+# Python 3.11
+FROM: python:3.11-slim-buster
+
+%post
+    apt-get update && apt-get install -y \
+        zip \
+        && rm -rf /var/lib/apt/lists/*
+
+    pip install --upgrade setuptools six pip \
+        && pip install --no-cache-dir \
+        boto3 \
+        pika \
+        flask \
+        gevent \
+        redis \
+        requests \
+        PyYAML \
+        numpy \
+        cloudpickle \
+        ps-mem \
+        tblib \
+        psutil
+
+%files
+    lithops_singularity.zip /lithops/lithops_singularity.zip
+
+%post
+    cd /lithops
+    unzip lithops_singularity.zip && rm lithops_singularity.zip
+
+%runscript
+    python3 /lithops/lithopsentry.py $AMQP_URL
\ No newline at end of file
diff --git a/setup.py b/setup.py
index ab20c1f74..cff86327d 100644
--- a/setup.py
+++ b/setup.py
@@ -5,30 +5,27 @@
 install_requires = [
     'Click',
-    'pandas',
+    'tabulate',
+    'six',
     'PyYAML',
-    'python-dateutil',
     'pika',
     'tqdm',
-    'lxml',
     'tblib',
-    'docker',
     'requests',
-    'seaborn',
     'paramiko',
-    'matplotlib',
-    'kubernetes',
-    'ibm-cos-sdk',
-    'ibm-code-engine-sdk',
-    'redis',
-    'ibm-vpc',
     'cloudpickle',
-    'tblib',
-    'ps-mem'
+    'ps-mem',
+    'psutil'
 ]

 extras_require = {
+    'ibm': [
+        'ibm-cos-sdk',
+        'ibm-code-engine-sdk',
+        'ibm-vpc',
+        'kubernetes',
+    ],
     'aws': [
         'boto3'
     ],
@@ -39,10 +36,6 @@
         'google-api-python-client',
         'google-auth'
     ],
-    'aliyun': [
-        'aliyun-fc2',
-        'oss2'
-    ],
     'azure': [
         'azure-mgmt-resource',
         'azure-mgmt-compute',
@@ -51,7 +44,27 @@
         'azure-storage-blob',
         'azure-storage-queue'
     ],
+    'aliyun': [
+        'aliyun-fc2',
+        'oss2'
+    ],
+    'ceph': [
+        'boto3'
+    ],
+    'knative': [
+        'kubernetes',
+    ],
+    'kubernetes': [
+        'kubernetes',
+    ],
+    'minio': [
+        'boto3'
+    ],
+    'redis': [
+        'redis'
+    ],
     'multiprocessing': [
+        'redis',
         'pynng'
     ],
     'joblib': [
@@ -59,11 +72,21 @@
         'diskcache',
         'numpy'
     ],
+    'plotting': [
+        'pandas',
+        'matplotlib',
+        'seaborn',
+        'numpy'
+    ],
     'oracle': [
         'oci',
+    ],
+    'tests': [
+        'pytest',
     ]
 }
+
 extras_require["all"] = list(set(chain.from_iterable(extras_require.values())))
@@ -75,7 +98,7 @@
     url='https://github.com/lithops-cloud/lithops',
     author='Gil Vernik, Josep Sampe',
     description='Lithops lets you transparently run your Python applications in the Cloud',
-    author_email='gilv@il.ibm.com, josep.sampe@gmail.com',
+    author_email='gilv@ibm.com, josep.sampe@gmail.com',
     packages=find_packages(),
     install_requires=install_requires,
     extras_require=extras_require,
@@ -90,13 +113,11 @@
         'Operating System :: OS Independent',
         'Natural Language :: English',
         'License :: OSI Approved :: Apache Software License',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
         'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
+        'Programming Language :: Python :: 3.13',
         'Topic :: Software Development :: Libraries :: Python Modules',
         'Topic :: Scientific/Engineering',
         'Topic :: System :: Distributed Computing',
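
As a quick illustration of the `extras_require` layout introduced in the `setup.py` changes above, the snippet below is a minimal standalone sketch (not the project's actual `setup.py`, and with the group contents abbreviated) showing how the per-backend groups are aggregated into an `all` extra with `itertools.chain`, which is what lets installs such as `pip install lithops[ibm]` or `pip install lithops[all]` resolve to the right optional dependencies.

```python
# Minimal sketch of the extras_require pattern used in the setup.py diff above.
# Group contents are abbreviated; see the diff for the full lists.
from itertools import chain

extras_require = {
    'ibm': ['ibm-cos-sdk', 'ibm-code-engine-sdk', 'ibm-vpc', 'kubernetes'],
    'aws': ['boto3'],
    'plotting': ['pandas', 'matplotlib', 'seaborn', 'numpy'],
    'tests': ['pytest'],
}

# 'all' is the deduplicated union of every optional group, so installing
# the 'all' extra pulls in every optional dependency at once.
extras_require['all'] = list(set(chain.from_iterable(extras_require.values())))

print(sorted(extras_require['all']))
```

Running the sketch just prints the combined dependency list; in the real package the same dictionary is passed to `setup(extras_require=...)`, as the diff shows.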