-
-
Notifications
You must be signed in to change notification settings - Fork 196
/
Copy pathbootstrap-cluster.sh
executable file
·215 lines (165 loc) · 7.56 KB
/
bootstrap-cluster.sh
1
2
3
4
5
6
7
8
9
10
10000
div>
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env bash
set -Eeuo pipefail

# Shared helpers: log, check_env, check_cli, render_template, ...
source "$(dirname "${0}")/lib/common.sh"

export LOG_LEVEL="debug"

# Declare and export separately so a git failure is not masked by 'export'
# (ShellCheck SC2155) and correctly aborts the script under 'set -e'.
ROOT_DIR="$(git rev-parse --show-toplevel)"
export ROOT_DIR
# Apply the Talos configuration to all the nodes
function apply_talos_config() {
    log debug "Applying Talos configuration"
    local talos_controlplane_file="${ROOT_DIR}/talos/controlplane.yaml.j2"
    local talos_worker_file="${ROOT_DIR}/talos/worker.yaml.j2"

    # A controlplane template is mandatory; 'log error' (lib/common.sh) is
    # presumed to abort the script here -- TODO confirm
    if [[ ! -f ${talos_controlplane_file} ]]; then
        log error "No Talos machine files found for controlplane" "file=${talos_controlplane_file}"
    fi

    # Skip worker configuration if no worker file is found
    if [[ ! -f ${talos_worker_file} ]]; then
        log warn "No Talos machine files found for worker" "file=${talos_worker_file}"
        talos_worker_file=""
    fi

    # Apply the Talos configuration to the controlplane and worker nodes.
    # NOTE(review): node discovery is re-run once per template file; harmless
    # but redundant -- could be hoisted above this loop.
    for file in ${talos_controlplane_file} ${talos_worker_file}; do
        if ! nodes=$(talosctl config info --output json 2>/dev/null | jq --exit-status --raw-output '.nodes | join(" ")') || [[ -z "${nodes}" ]]; then
            log error "No Talos nodes found"
        fi
        log debug "Talos nodes discovered" "nodes=${nodes}"

        # Apply the Talos configuration
        for node in ${nodes}; do
            log debug "Applying Talos node configuration" "node=${node}"
            # Render the machine config (shared template + per-node overrides);
            # the render script is expected to log its own error on failure
            if ! machine_config=$(bash "${ROOT_DIR}/scripts/render-machine-config.sh" "${file}" "${ROOT_DIR}/talos/nodes/${node}.yaml.j2") || [[ -z "${machine_config}" ]]; then
                exit 1
            fi
            log info "Talos node configuration rendered successfully" "node=${node}"
            # '--insecure' is needed for the first apply against an unconfigured
            # node; an already-configured node rejects the insecure connection
            # with a "certificate required" error, which is treated as
            # "already configured" and skipped.
            if ! output=$(echo "${machine_config}" | talosctl --nodes "${node}" apply-config --insecure --file /dev/stdin 2>&1);
            then
                if [[ "${output}" == *"certificate required"* ]]; then
                    log warn "Talos node is already configured, skipping apply of config" "node=${node}"
                    continue
                fi
                log error "Failed to apply Talos node configuration" "node=${node}" "output=${output}"
            fi
            log info "Talos node configuration applied successfully" "node=${node}"
        done
    done
}
# Bootstrap Talos on a controller node
function bootstrap_talos() {
    log debug "Bootstrapping Talos"
    local bootstrapped=true

    # Pick one controller endpoint at random from the talosctl config
    if ! controller=$(talosctl config info --output json | jq --exit-status --raw-output '.endpoints[]' | shuf -n 1) || [[ -z "${controller}" ]]; then
        log error "No Talos controller found"
    fi
    log debug "Talos controller discovered" "controller=${controller}"

    # Retry 'talosctl bootstrap' until it succeeds. An "AlreadyExists" error on
    # the FIRST attempt ('bootstrapped' still true) means etcd was bootstrapped
    # by a previous run, so stop immediately.
    # NOTE(review): after the first failed attempt 'bootstrapped' becomes
    # false, so a later "AlreadyExists" response keeps the loop retrying
    # forever -- confirm this is the intended behavior.
    until output=$(talosctl --nodes "${controller}" bootstrap 2>&1); do
        if [[ "${bootstrapped}" == true && "${output}" == *"AlreadyExists"* ]]; then
            log info "Talos is bootstrapped" "controller=${controller}"
            break
        fi
        # Set bootstrapped to false after the first attempt
        bootstrapped=false
        log info "Talos bootstrap failed, retrying in 10 seconds..." "controller=${controller}"
        sleep 10
    done
}
# Fetch the kubeconfig from a controller node
function fetch_kubeconfig() {
    log debug "Fetching kubeconfig"

    # Pick one controller endpoint at random; fail if discovery returns nothing
    controller=$(talosctl config info --output json | jq --exit-status --raw-output '.endpoints[]' | shuf -n 1) \
        && [[ -n "${controller}" ]] \
        || log error "No Talos controller found"

    # Write the kubeconfig (basename of $KUBECONFIG, in the current directory),
    # force-overwriting any existing 'main' context
    talosctl kubeconfig --nodes "${controller}" --force --force-context-name main "$(basename "${KUBECONFIG}")" &>/dev/null \
        || log error "Failed to fetch kubeconfig"

    log info "Kubeconfig fetched successfully"
}
# Talos requires the nodes to be 'Ready=False' before applying resources
function wait_for_nodes() {
    log debug "Waiting for nodes to be available"

    # Nothing to wait for when every node already reports Ready=True
    if kubectl wait nodes --for=condition=Ready=True --all --timeout=10s &>/dev/null; then
        log info "Nodes are available and ready, skipping wait for nodes"
        return
    fi

    # Poll until every node reports the Ready=False condition
    while ! kubectl wait nodes --for=condition=Ready=False --all --timeout=10s &>/dev/null; do
        log info "Nodes are not available, waiting for nodes to be available. Retrying in 10 seconds..."
        sleep 10
    done
}
# Resources to be applied before the helmfile charts are installed
function apply_resources() {
    log debug "Applying resources"
    local -r resources_file="${ROOT_DIR}/bootstrap/resources.yaml.j2"

    # render_template (lib/common.sh) logs its own diagnostics; bail on failure
    if ! output=$(render_template "${resources_file}") || [[ -z "${output}" ]]; then
        exit 1
    fi

    # A clean diff means the cluster already matches the rendered manifests
    if kubectl diff --filename - <<<"${output}" &>/dev/null; then
        log info "Resources are up-to-date"
        return
    fi

    if kubectl apply --server-side --filename - <<<"${output}" &>/dev/null; then
        log info "Resources applied"
    else
        log error "Failed to apply resources"
    fi
}
# Disks in use by rook-ceph must be wiped before Rook is installed
function wipe_rook_disks() {
    log debug "Wiping Rook disks"

    # Skip disk wipe if Rook is detected running in the cluster
    # TODO: Is there a better way to detect Rook / OSDs?
    if kubectl --namespace rook-ceph get kustomization rook-ceph &>/dev/null; then
        log warn "Rook is detected running in the cluster, skipping disk wipe"
        return
    fi

    if ! nodes=$(talosctl config info --output json 2>/dev/null | jq --exit-status --raw-output '.nodes | join(" ")') || [[ -z "${nodes}" ]]; then
        log error "No Talos nodes found"
    fi
    log debug "Talos nodes discovered" "nodes=${nodes}"

    # Wipe disks on each node that match the ROOK_DISK environment variable
    for node in ${nodes}; do
        # Bug fix: the empty-result guard must test ${disks}, not ${nodes} --
        # previously a node with no matching disks slipped past the check
        # with an empty disk list instead of raising an error.
        if ! disks=$(talosctl --nodes "${node}" get disk --output json 2>/dev/null \
            | jq --exit-status --raw-output --slurp '. | map(select(.spec.model == env.ROOK_DISK) | .metadata.id) | join(" ")') || [[ -z "${disks}" ]];
        then
            log error "No disks found" "node=${node}" "model=${ROOK_DISK}"
        fi
        log debug "Talos node and disk discovered" "node=${node}" "disks=${disks}"

        # Wipe each matching disk on the node
        for disk in ${disks}; do
            if talosctl --nodes "${node}" wipe disk "${disk}" &>/dev/null; then
                log info "Disk wiped" "node=${node}" "disk=${disk}"
            else
                log error "Failed to wipe disk" "node=${node}" "disk=${disk}"
            fi
        done
    done
}
# Apply Helm releases using helmfile
function apply_helm_releases() {
    log debug "Applying Helm releases with helmfile"
    local -r helmfile_file="${ROOT_DIR}/bootstrap/helmfile.yaml"

    # The bootstrap helmfile must exist before anything is installed
    [[ -f "${helmfile_file}" ]] \
        || log error "File does not exist" "file=${helmfile_file}"

    # Quiet apply: skip the diff on fresh installs and keep secrets/notes
    # out of the output
    helmfile --file "${helmfile_file}" apply --hide-notes --skip-diff-on-install --suppress-diff --suppress-secrets \
        || log error "Failed to apply Helm releases"

    log info "Helm releases applied successfully"
}
function main() {
    # Fail fast when required environment variables or CLI tools are missing
    check_env KUBECONFIG KUBERNETES_VERSION ROOK_DISK TALOS_VERSION
    check_cli helmfile jq kubectl kustomize minijinja-cli op talosctl yq

    # Templates pull secrets from 1Password; verify the session up front
    op whoami --format=json &>/dev/null \
        || log error "Failed to authenticate with 1Password CLI"

    # Bootstrap the Talos node configuration
    apply_talos_config
    bootstrap_talos
    fetch_kubeconfig

    # Apply resources and Helm releases
    wait_for_nodes
    wipe_rook_disks
    apply_resources
    apply_helm_releases

    log info "Congrats! The cluster is bootstrapped and Flux is syncing the Git repository"
}

main "$@"