From a5dd4983c75ebc722ee81275aca4f9246a8d7e0d Mon Sep 17 00:00:00 2001
From: Jason Schmidt
Date: Mon, 18 Apr 2022 15:15:23 -0600
Subject: [PATCH 1/5] feat: add minikube deployment test

---
 extras/jenkins/Minikube/Jenkinsfile | 268 ++++++++++++++++++++++++++++
 1 file changed, 268 insertions(+)
 create mode 100644 extras/jenkins/Minikube/Jenkinsfile

diff --git a/extras/jenkins/Minikube/Jenkinsfile b/extras/jenkins/Minikube/Jenkinsfile
new file mode 100644
index 00000000..8e2d7c2e
--- /dev/null
+++ b/extras/jenkins/Minikube/Jenkinsfile
@@ -0,0 +1,268 @@
+pipeline {
+    agent {
+        /*
+         * Nodes that are configured for local deployments are tagged as "localdeploy". Unlike the deployment to cloud
+         * providers, this logic will install Minikube on the Jenkins Agent. This means that the agent should have
+         * sufficient resources available to run Minikube. A minimum of 16GB RAM, 2 vCPU, and 20GB of disk is
+         * recommended. Testing is done with 20GB of RAM, 4 vCPU, and 64GB of disk.
+         *
+         * This has been tested on Ubuntu 20.04. Be sure to check that your Agent has the necessary components
+         * installed if you are using a different OS.
+         */
+        node {
+            label 'localdeploy'
+        }
+    }
+
+    /*
+     * The JWT for using NGINX Plus is passed in via a variable; if the JWT is not found the process will deploy the
+     * open source IC.
+     *
+     * The POSTRUN_CMD is used to execute an arbitrary command following the cleanup process; this is just a work-around
+     * for the time being and will be addressed in the future.
+     */
+
+    environment {
+        NGINX_JWT = credentials('NGINX_JWT')
+        POSTRUN_CMD = credentials('POSTRUN_CMD')
+        NO_COLOR = "TRUE"
+        PULUMI_ACCESS_TOKEN = credentials('PULUMI_ACCESS_TOKEN')
+    }
+
+    stages {
+
+        /*
+         * This logic allows any branch to be checked out and built, and is designed to be triggered by a GitHub
+         * webhook. If desired, you can change the branch specification to force the process to only build from a
+         * specific branch.
+         *
+         * Note that we also init the submodule(s).
+         */
+
+        stage('Checkout Scm') {
+            steps {
+                checkout([$class: 'GitSCM', branches: [[name: '**']], extensions: [[$class: 'CheckoutOption'],
+                    [$class: 'CloneOption', noTags: false, reference: '', shallow: false],
+                    [$class: 'SubmoduleOption', disableSubmodules: false, parentCredentials: false,
+                     recursiveSubmodules: true, reference: '', trackingSubmodules: false]],
+                    userRemoteConfigs: [[url: 'https://github.com/nginxinc/kic-reference-architectures']]])
+            }
+        }
+
+        stage('Prepping OS') {
+
+            /*
+             * This step handles ensuring the OS has the necessary tooling to build the project. This process has been
+             * built for Ubuntu 20.04. It assumes that you are running with an account that has access to passwordless
+             * sudo.
+             *
+             * We also make a few opinionated decisions, such as making sure we are always running on the latest set of
+             * packages for our Jenkins Agent.
+             *
+             * This should work on other Ubuntu-related distributions, but most certainly will not work on RHEL or
+             * friends.
+             */
+
+            steps {
+                sh '''
+                    # Update catalogs
+                    sudo apt update
+                    # Make sure our deps are installed
+                    sudo apt -y install figlet openjdk-11-jdk make docker.io python3-venv expect
+                    # Upgrade everything; we always run with latest patches.
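+                    # (Of the packages above, 'expect' is needed later by the 'Configure Minikube' stage to answer
+                    # the interactive metallb addon prompts, and 'python3-venv' is needed when bin/setup_venv.sh
+                    # builds the Pulumi virtual environment.)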
+                    sudo apt -y upgrade
+                    # Make sure our kubeconfig dir exists…
+                    mkdir $HOME/.kube || true
+                '''
+            }
+        }
+
+        stage('Cleaning Up') {
+            steps {
+
+                /*
+                 * Run a find and check for any stacks that currently exist with our generated stack name; this should
+                 * not happen in normal operation, but could potentially happen if things break so better safe than
+                 * sorry.
+                 *
+                 * This function also tries to remove both K3S and Microk8s if they are found on the host; this is
+                 * because we will be installing Minikube and we want to make sure that the Jenkins Agent does not
+                 * still have a K3S or Microk8s installation on it from an earlier deployment.
+                 */
+
+                sh '''
+                    # Reset our K3S Environment
+                    /usr/local/bin/k3s-killall.sh || true
+                    /usr/local/bin/k3s-uninstall.sh || true
+                    # Reset our Microk8s Environment; true if it’s not there
+                    microk8s reset --destroy-storage || true
+                    # True if it’s not there…
+                    sudo snap remove microk8s || true
+                    # Clean up the Pulumi stack if it exists for our run - which it shouldn\'t, but you never know.
+                    find $WORKSPACE -mindepth 2 -maxdepth 7 -type f -name Pulumi.yaml -execdir $WORKSPACE/pulumi/python/venv/bin/pulumi stack rm marajenk${BUILD_NUMBER} --force --yes \\;
+                '''
+            }
+        }
+
+        stage('Minikube Setup') {
+
+            /*
+             * This step installs podman (from the Kubic repository) and then installs the latest Minikube release,
+             * which is started with the podman driver. Note that because we always pull the latest version of the
+             * software you may end up with a deployment that does not work as expected; if this happens please check
+             * back with the GitHub repository and verify the known working configurations.
+             */
+
+            steps {
+                sh '''
+                    apt-get install curl wget gnupg2 -y
+                    source /etc/os-release
+                    sh -c "echo 'deb http://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_${VERSION_ID}/ /' > /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list"
+                    wget -nv https://download.opensuse.org/repositories/devel:kubic:libcontainers:stable/xUbuntu_${VERSION_ID}/Release.key -O- | apt-key add -
+                    apt-get update -qq -y
+                    apt-get -qq --yes install podman
+                    curl -Lo minikube https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 && chmod +x minikube
+                    mkdir -p /usr/local/bin/
+                    install minikube /usr/local/bin/
+                    minikube start --vm-driver=podman
+                '''
+            }
+        }
+
+        stage('Configure Minikube') {
+            steps {
+
+                /*
+                 * This step configures the metallb addon for Minikube; the addon setup is interactive, so we drive it
+                 * with expect and supply the load balancer start and end IP addresses.
+                 */
+
+                sh '''
+                    expect << _EOF_
+                    spawn minikube addons configure metallb
+                    expect "Enter Load Balancer Start IP:" { send "192.168.100.105\\r" }
+                    expect "Enter Load Balancer End IP:" { send "192.168.100.120\\r" }
+                    expect eof
+_EOF_
+                '''
+            }
+        }
+
+        stage('Create VENV') {
+            steps {
+
+                /*
+                 * Create our virtual environment.
+                 */
+
+                sh '''
+                    $WORKSPACE/bin/setup_venv.sh
+                '''
+            }
+        }
+
+        stage('Configure Pulumi') {
+            steps {
+
+                /*
+                 * This logic sets the necessary variables in the configuration files; this differs from the manual
+                 * procedure where we prompt the user for a number of these required variables. This same approach can
+                 * be used as part of the manual deployment if required.
+                 *
+                 * This will likely evolve further as the project does, and we may reach a point where these defaults
+                 * are assumed for a given development type.
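+                 *
+                 * The settings below select a per-build stack name, set a unique FQDN for this build, raise the helm
+                 * timeouts to 600 seconds (local deployments tend to be slower than the managed clouds), set the
+                 * cluster name and kubeconfig file used for the kubeconfig-based deployment, and store the Bank of
+                 * Sirius (sirius) passwords as Pulumi secrets.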
+                 */
+
+                sh '''
+                    echo "PULUMI_STACK=marajenk${BUILD_NUMBER}" > $WORKSPACE/config/pulumi/environment
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi stack select --create marajenk${BUILD_NUMBER} -C pulumi/python/config
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi stack select --create marajenk${BUILD_NUMBER} -C pulumi/python/kubernetes/applications/sirius
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set certmgr:helm_timeout "600" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set kic-helm:fqdn "marajenks${BUILD_NUMBER}.zathras.io" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set kic-helm:helm_timeout "600" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set kubernetes:cluster_name "microk8s-cluster" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set kubernetes:infra_type "kubeconfig" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set kubernetes:kubeconfig "$HOME/.kube/config" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set logagent:helm_timeout "600" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set logstore:helm_timeout "600" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set prometheus:adminpass "password" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set prometheus:helm_timeout "600" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set prometheus:helm_timeout "600" -C pulumi/python/config -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set sirius:accounts_pwd --secret "Password" -C pulumi/python/kubernetes/applications/sirius -s marajenk${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set sirius:ledger_pwd --secret "Password" -C pulumi/python/kubernetes/applications/sirius -s marajenk${BUILD_NUMBER}'''
+            }
+        }
+
+        stage('Pulumi Deployment') {
+
+            /*
+             * This step echoes the JWT into the correct file for the startup to find it and then calls the script to
+             * build the MARA deployment in Minikube.
+             */
+
+            steps {
+                sh '''
+                    echo $NGINX_JWT > $WORKSPACE/extras/jwt.token
+                    $WORKSPACE/bin/start_kube.sh
+                '''
+            }
+        }
+
+        stage('Reset Environment') {
+
+            /*
+             * Clean up the environment; this includes running the destroy script to remove our pulumi resources and
+             * destroy the deployed Minikube installation (along with any Microk8s leftovers).
+             *
+             * After that completes, we remove the pulumi stack from the project with the find command; this is because
+             * we need to delete the stack in each project it's been instantiated in.
+             */
+
+            steps {
+                sh '''
+                    $WORKSPACE/bin/destroy.sh
+                    # Reset our Microk8s Environment; true if it’s not there
+                    microk8s reset --destroy-storage || true
+                    # True if it’s not there…
+                    sudo snap remove microk8s || true
+                    # True if it's not there
+                    minikube delete || true
+                    find . -mindepth 2 -maxdepth 6 -type f -name Pulumi.yaml -execdir $WORKSPACE/pulumi/python/venv/bin/pulumi stack rm marajenk${BUILD_NUMBER} --force --yes \\;
+                    # This is a hack to allow additional commands to be issued following cleanup. This is needed because the VMs
+                    # that currently run as agents for K3S and Microk8s deployments need to be rebooted following some number of
+                    # runs due to zombie processes and other issues. Long term we want to deploy these VMs via IaC so they only
+                    # exist for the lifetime of the project. We do it this way in order to provide some flexibility for the
+                    # Jenkins configuration.
+                    ${POSTRUN_CMD- true}
+                '''
+
+            }
+        }
+
+    }
+    post {
+        failure {
+
+            /*
+             * On failure we still need to remove the partial build; however, we want to make sure we exit with a zero
+             * status so we can move on to the next step. Hence the "or true" logic below.
+             *
+             * We also clean up Minikube and any Microk8s remnants.
+             */
+
+            sh '''
+                # Destroy our partial build...
+                $WORKSPACE/bin/destroy.sh || true
+                # Reset our Microk8s Environment; true if it’s not there
+                microk8s reset --destroy-storage || true
+                # True if it’s not there…
+                sudo snap remove microk8s || true
+                # True if it's not there
+                minikube delete || true
+                # Clean up the Pulumi stack if it exists for our run - which it shouldn\'t, but you never know.
+                find $WORKSPACE -mindepth 2 -maxdepth 7 -type f -name Pulumi.yaml -execdir $WORKSPACE/pulumi/python/venv/bin/pulumi stack rm marajenk${BUILD_NUMBER} --force --yes \\;
+                # This is a hack to allow additional commands to be issued following cleanup. This is needed because the VMs
+                # that currently run as agents for K3S and Microk8s deployments need to be rebooted following some number of
+                # runs due to zombie processes and other issues. Long term we want to deploy these VMs via IaC so they only
+                # exist for the lifetime of the project. We do it this way in order to provide some flexibility for the
+                # Jenkins configuration.
+                ${POSTRUN_CMD- true}
+            '''
+        }
+    }
+}

From 3719e5fb34864a0f006729c3a543f024cd03ea20 Mon Sep 17 00:00:00 2001
From: Jason Schmidt
Date: Mon, 18 Apr 2022 15:15:56 -0600
Subject: [PATCH 2/5] fix: update AWS jenkinsfile for new deployment methodology

---
 extras/jenkins/AWS/Jenkinsfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extras/jenkins/AWS/Jenkinsfile b/extras/jenkins/AWS/Jenkinsfile
index 48855e55..9ac0ee09 100644
--- a/extras/jenkins/AWS/Jenkinsfile
+++ b/extras/jenkins/AWS/Jenkinsfile
@@ -149,7 +149,7 @@ pipeline {
                     $WORKSPACE/pulumi/python/venv/bin/pulumi config set kic-helm:fqdn "marajenks${BUILD_NUMBER}.zathras.io" -C pulumi/python/config -s marajenkaws${BUILD_NUMBER}
                     $WORKSPACE/pulumi/python/venv/bin/pulumi config set kic-helm:helm_timeout "600" -C pulumi/python/config -s marajenkaws${BUILD_NUMBER}
                     $WORKSPACE/pulumi/python/venv/bin/pulumi config set kubernetes:infra_type "AWS" -C pulumi/python/config -s marajenkaws${BUILD_NUMBER}
-                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set kubernetes:kubeconfig "/home/jerkins/.kube/config" -C pulumi/python/config -s marajenkaws${BUILD_NUMBER}
+                    $WORKSPACE/pulumi/python/venv/bin/pulumi config set kubernetes:kubeconfig "$HOME/.kube/config" -C pulumi/python/config -s marajenkaws${BUILD_NUMBER}
                     $WORKSPACE/pulumi/python/venv/bin/pulumi config set logagent:helm_timeout "600" -C pulumi/python/config -s marajenkaws${BUILD_NUMBER}
                     $WORKSPACE/pulumi/python/venv/bin/pulumi config set logstore:helm_timeout "600" -C pulumi/python/config -s marajenkaws${BUILD_NUMBER}
                     $WORKSPACE/pulumi/python/venv/bin/pulumi config set prometheus:adminpass "password" -C pulumi/python/config -s marajenkaws${BUILD_NUMBER}

From 9b6c420c3ddaff7ab6d81f9949c4003b439440da Mon Sep 17 00:00:00 2001
From: Jason Schmidt
Date: Mon, 18 Apr 2022 17:45:25 -0600
Subject: [PATCH 3/5] feat: updated docs to indicate minikube success
---
 docs/status-and-issues.md | 66 +++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 31 deletions(-)

diff --git a/docs/status-and-issues.md b/docs/status-and-issues.md
index eded05bb..1016c1d6 100644
--- a/docs/status-and-issues.md
+++ b/docs/status-and-issues.md
@@ -28,43 +28,47 @@ All of these configurations use Pulumi code within Python as the Infrastructure
 | Google GKE | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | |
 | MicroK8s | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Storage, DNS, and Metallb need to be Enabled (4) |
 | Harvester/RKE2 | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Needs Storage, K8 LoadBalancer |
-| K3S | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Needs Storage, K8 LoadBalancer |
+| K3S | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | |
 | Rancher Desktop | No | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Needs Storage, K8 LoadBalancer |
-| Minikube | No | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Needs Storage, K8 LoadBalancer |
+| Minikube | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Needs Metallb Enabled and configured |
 
 ### Notes:
-1. The NGINX IC build/deploy process is currently under active development and support for IC will be standardized across
-all providers. Follow [#81](https://github.com/nginxinc/kic-reference-architectures/issues/81) and
-[#86](https://github.com/nginxinc/kic-reference-architectures/issues/86) for details. Currently, for all non-AWS environments
-you have the option to specify either NGINX or NGINX Plus as your IC. The latter does require an active subscription and a
-JWT to be included at build time. Please see the documentation for more details.
-2. The process via which the IP and FQDN are created and used is currently under active development, and will be streamlined
-and standardized for all providers. Follow [#82](https://github.com/nginxinc/kic-reference-architectures/issues/82) for
-details.
-3. The initial deployment was entirely built to work with AWS. As part of our reorganization the ability to use a
-kubeconfig file was added, along with the necessary configuration to support additional standup options. This is currently
-in active development and will result in this process being streamlined for these additional environments. Please follow
-[#80](https://github.com/nginxinc/kic-reference-architectures/issues/80) for details.
-4. We are currently using filebeat as our logging agent. This deployment requires that the correct paths to the container
-log directory are present in the deployment data. We have discovered that this differs based on the K8 provider. Please
-see [#76](https://github.com/nginxinc/kic-reference-architectures/issues/76) for more detail.
+
+1. The NGINX IC build/deploy process is currently under active development and support for IC will be standardized
+   across all providers. Follow [#81](https://github.com/nginxinc/kic-reference-architectures/issues/81) and
+   [#86](https://github.com/nginxinc/kic-reference-architectures/issues/86) for details. Currently, for all non-AWS
+   environments you have the option to specify either NGINX or NGINX Plus as your IC. The latter does require an active
+   subscription and a JWT to be included at build time. Please see the documentation for more details.
+2. The process via which the IP and FQDN are created and used is currently under active development, and will be
+   streamlined and standardized for all providers.
+   Follow [#82](https://github.com/nginxinc/kic-reference-architectures/issues/82) for details.
+3. The initial deployment was entirely built to work with AWS. As part of our reorganization the ability to use a
+   kubeconfig file was added, along with the necessary configuration to support additional standup options. This is
+   currently in active development and will result in this process being streamlined for these additional environments.
+   Please follow
+   [#80](https://github.com/nginxinc/kic-reference-architectures/issues/80) for details.
+4. We are currently using filebeat as our logging agent. This deployment requires that the correct paths to the
+   container log directory are present in the deployment data. We have discovered that this differs based on the K8
+   provider. Please see [#76](https://github.com/nginxinc/kic-reference-architectures/issues/76) for more detail.
 
 ## Known Issues / Caveats
+
 1. Currently, the use of the Elastic tooling has shown to be problematic under heavy load, with containers falling over
-and causing disruptions. Please see the tuning variables in the configuration file to adjust the number of replicas
-deployed for the Elastic logstore to tune to your environment.
+   and causing disruptions. Please see the tuning variables in the configuration file to adjust the number of replicas
+   deployed for the Elastic logstore to tune to your environment.
 2. The default Helm timeout is 5 minutes, which is acceptable for most managed clouds but tends to be too short for
-single-vm or workstation deployments. Please see the configuration file variables to adjust the helm timeout as
-required for your environment.
+   single-vm or workstation deployments. Please see the configuration file variables to adjust the helm timeout as
+   required for your environment.
 3. When load testing the Bank of Sirius using Locust, you will likely see a high failure rate as you increase the max
-users and spawn rate. This is "normal" and is an area we want to expose and explore for troubleshooting, determining
-which metrics/traces are helpful, etc.
+   users and spawn rate. This is "normal" and is an area we want to expose and explore for troubleshooting, determining
+   which metrics/traces are helpful, etc.
 4. The most common failure modes for non-cloud environments tend towards the following failures:
-   1. Unable to provision persistent storage; correct by ensuring you have a persistent volume provider and can provision a volume.
-   2. Unable to provision an External IP; correct by adding an IP provider such as kubevip or metallb.
-   3. Resource starvation (not enough CPU, Memory); expand the size of the VM or detune the environment.
-   4. Timeouts in helm; increase the helm timeout in the configuration file.
-5. If you are using a cloud provider with timed credentials, such as AWS, one failure mode that can arise is when the
-credentials expire. This will result in a number of strange and seemingly confusing errors. Double check to make sure that
-the credentials are valid.
+   1. Unable to provision persistent storage; correct by ensuring you have a persistent volume provider and can
+      provision a volume.
+   2. Unable to provision an External IP; correct by adding an IP provider such as kubevip or metallb.
+   3. Resource starvation (not enough CPU, Memory); expand the size of the VM or detune the environment.
+   4. Timeouts in helm; increase the helm timeout in the configuration file.
+5. If you are using a cloud provider with timed credentials, such as AWS, one failure mode that can arise is when the
+   credentials expire. This will result in a number of strange and seemingly confusing errors. Double check to make sure
+   that the credentials are valid.
 6. Currently, the build/test process is highly manual. This will be addressed in the future.

From 23a7fb81df6fab419f4145b0074779779145034e Mon Sep 17 00:00:00 2001
From: Jason Schmidt
Date: Tue, 19 Apr 2022 08:15:47 -0600
Subject: [PATCH 4/5] fix: documentation changes suggested in review

---
 docs/status-and-issues.md | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/docs/status-and-issues.md b/docs/status-and-issues.md
index 1016c1d6..c04d427e 100644
--- a/docs/status-and-issues.md
+++ b/docs/status-and-issues.md
@@ -18,19 +18,19 @@ includes the following:
 - FQDN/IP: How does the project handle the IP addressing and FQDN for the certificates?
 - Notes: Any additional information on the provider / project interaction.
-All of these configurations use Pulumi code within Python as the Infrastructure as Code (IaC) manager.
-
-| K8 Provider | Tested | Infrastructure Support | IC Options | FQDN/IP | Notes |
+All of these configurations use Pulumi code within Python as the Infrastructure as Code (IaC) manager.
+
+| K8 Provider | Tested | Infrastructure Support | IC Options | FQDN/IP | Notes |
 |-----------------|--------|-----------------------------|---------------------------------|-----------------|--------------------------------------------------|
+| AWS EKS | Yes | Full Infrastructure Standup | Build, Pull (uses ECR) | Provided | |
+| Digital Ocean | Yes | Full Infrastructure Standup | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | |
+| Azure AKS | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | |
+| Google GKE | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | |
+| MicroK8s | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Storage, DNS, and Metallb need to be Enabled (4) |
+| Harvester/RKE2 | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Needs Storage, K8 LoadBalancer |
+| K3S | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | |
+| Rancher Desktop | No | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Needs Storage, K8 LoadBalancer |
+| Minikube | Yes | Kubeconfig Only (3) | NGINX / NGINX Plus (w/ JWT) (1) | Manual FQDN (2) | Needs Metallb Enabled and configured |
 ### Notes:
@@ -54,11 +54,15 @@ ## Known Issues / Caveats
 1. Currently, the use of the Elastic tooling has shown to be problematic under heavy load, with containers falling over
-   and causing disruptions. Please see the tuning variables in the configuration file to adjust the number of replicas
-   deployed for the Elastic logstore to tune to your environment.
+   and causing disruptions. Please see the [example configuration file](../config/pulumi/Pulumi.stackname.yaml.example)
+   variables to adjust the number of replicas deployed for the Elastic logstore to tune to your environment. These will
+   need to be added/updated in the configuration for your stack, which is located in `./config/pulumi` and
+   is named `Pulumi.$STACK.yaml`.
 2. The default Helm timeout is 5 minutes, which is acceptable for most managed clouds but tends to be too short for
-   single-vm or workstation deployments. Please see the configuration file variables to adjust the helm timeout as
-   required for your environment.
+   single-vm or workstation deployments. Please see
+   the [example configuration file](../config/pulumi/Pulumi.stackname.yaml.example)
+   variables to adjust the helm timeout as required for your environment. These will need to be added/updated in the
+   configuration for your stack, which is located in `./config/pulumi` and is named `Pulumi.$STACK.yaml`.
 3. When load testing the Bank of Sirius using Locust, you will likely see a high failure rate as you increase the max
    users and spawn rate. This is "normal" and is an area we want to expose and explore for troubleshooting, determining
    which metrics/traces are helpful, etc.

From b43d4d99cf036480b183bd4787dca009ce9cf384 Mon Sep 17 00:00:00 2001
From: Jason Schmidt
Date: Tue, 19 Apr 2022 08:23:26 -0600
Subject: [PATCH 5/5] fix: added in some links for concepts as raised in review

---
 docs/status-and-issues.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/docs/status-and-issues.md b/docs/status-and-issues.md
index c04d427e..e997e740 100644
--- a/docs/status-and-issues.md
+++ b/docs/status-and-issues.md
@@ -63,13 +63,16 @@ and configured |
    the [example configuration file](../config/pulumi/Pulumi.stackname.yaml.example)
    variables to adjust the helm timeout as required for your environment. These will need to be added/updated in the
    configuration for your stack, which is located in `./config/pulumi` and is named `Pulumi.$STACK.yaml`.
-3. When load testing the Bank of Sirius using Locust, you will likely see a high failure rate as you increase the max
-   users and spawn rate. This is "normal" and is an area we want to expose and explore for troubleshooting, determining
-   which metrics/traces are helpful, etc.
+3. When load testing the Bank of Sirius using [Locust](https://locust.io/), you will likely see a high failure rate as
+   you increase the max users and spawn rate. This is "normal" and is an area we want to expose and explore for
+   troubleshooting, determining which metrics/traces are helpful, etc.
 4. The most common failure modes for non-cloud environments tend towards the following failures:
-   1. Unable to provision persistent storage; correct by ensuring you have a persistent volume provider and can
-      provision a volume.
-   2. Unable to provision an External IP; correct by adding an IP provider such as kubevip or metallb.
+   1. Unable to provision persistent storage; correct by ensuring you have a
+      [persistent volume provider](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) and can provision a
+      volume.
+   2. Unable to provision an External IP; correct by adding an IP provider such
+      as [kubevip](https://kube-vip.chipzoller.dev/)
+      or [metallb](https://metallb.org/).
    3. Resource starvation (not enough CPU, Memory); expand the size of the VM or detune the environment.
    4. Timeouts in helm; increase the helm timeout in the configuration file.
 5. If you are using a cloud provider with timed credentials, such as AWS, one failure mode that can arise is when the