Add timeouts for cluster metadata initialization and for init containers (#218)

- Add timeouts for waiting for zk and bk to become available.
- If the waiting gets stuck for some reason, the Pulsar deployment never
  starts the broker services.
  - timeouts will help failures recover eventually
This commit is contained in:
Lari Hotari 2024-06-20 20:07:48 +03:00 committed by GitHub
parent 023f902a02
commit 70f36ffe43
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 67 additions and 27 deletions

View File

@ -93,7 +93,7 @@ Define autorecovery init container : verify cluster id
{{- define "pulsar.autorecovery.init.verify_cluster_id" -}}
bin/apply-config-from-env.py conf/bookkeeper.conf;
{{- include "pulsar.autorecovery.zookeeper.tls.settings" . -}}
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
sleep 3;
done;
{{- end }}

View File

@ -124,7 +124,7 @@ Define bookie init container : verify cluster id
{{- if not (and .Values.volumes.persistence .Values.bookkeeper.volumes.persistence) }}
bin/apply-config-from-env.py conf/bookkeeper.conf;
{{- include "pulsar.bookkeeper.zookeeper.tls.settings" . -}}
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
sleep 3;
done;
bin/bookkeeper shell bookieformat -nonInteractive -force -deleteCookie || true
@ -133,7 +133,7 @@ bin/bookkeeper shell bookieformat -nonInteractive -force -deleteCookie || true
set -e;
bin/apply-config-from-env.py conf/bookkeeper.conf;
{{- include "pulsar.bookkeeper.zookeeper.tls.settings" . -}}
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
sleep 3;
done;
{{- end }}

View File

@ -106,13 +106,14 @@ spec:
terminationGracePeriodSeconds: {{ .Values.autorecovery.gracePeriod }}
serviceAccountName: "{{ template "pulsar.fullname" . }}-{{ .Values.autorecovery.component }}"
initContainers:
{{- if and .Values.autorecovery.waitBookkeeperTimeout (not (eq (.Values.autorecovery.waitBookkeeperTimeout | toString) "0")) }}
# This initContainer will wait for bookkeeper initnewcluster to complete
# before deploying autorecovery
- name: pulsar-bookkeeper-verify-clusterid
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.autorecovery "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.autorecovery "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.autorecovery.waitBookkeeperTimeout }}", "sh", "-c"]
args:
- >
{{- include "pulsar.autorecovery.init.verify_cluster_id" . | nindent 10 }}
@ -121,6 +122,7 @@ spec:
name: "{{ template "pulsar.fullname" . }}-{{ .Values.autorecovery.component }}"
volumeMounts:
{{- include "pulsar.autorecovery.certs.volumeMounts" . | nindent 8 }}
{{- end }}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.autorecovery.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.autorecovery "root" .) }}"

View File

@ -45,16 +45,17 @@ spec:
{{ toYaml .Values.pulsar_metadata.tolerations | indent 8 }}
{{- end }}
initContainers:
{{- if and .Values.bookkeeper.metadata.waitZookeeperTimeout (not (eq (.Values.bookkeeper.metadata.waitZookeeperTimeout | toString) "0")) }}
- name: wait-zookeeper-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.bookie "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.bookie "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.bookkeeper.metadata.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
{{- if $zk:=.Values.pulsar_metadata.userProvidedZookeepers }}
export PULSAR_MEM="-Xmx128M";
until bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
echo "user provided zookeepers {{ $zk }} are unreachable... check in 3 seconds ..." && sleep 3;
done;
{{ else }}
@ -62,6 +63,7 @@ spec:
sleep 3;
done;
{{- end}}
{{- end}}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.bookkeeper.component }}-init"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.bookie "root" .) }}"
@ -70,17 +72,17 @@ spec:
resources:
{{ toYaml .Values.bookkeeper.metadata.resources | indent 10 }}
{{- end }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.bookkeeper.metadata.initTimeout | default 60 }}", "sh", "-c"]
args:
- >
bin/apply-config-from-env.py conf/bookkeeper.conf;
{{- include "pulsar.toolset.zookeeper.tls.settings" . | nindent 12 }}
export BOOKIE_MEM="-Xmx128M";
if bin/bookkeeper shell whatisinstanceid; then
if timeout 15 bin/bookkeeper shell whatisinstanceid; then
echo "bookkeeper cluster already initialized";
else
{{- if not (eq .Values.metadataPrefix "") }}
bin/bookkeeper org.apache.zookeeper.ZooKeeperMain -server {{ template "pulsar.fullname" . }}-{{ .Values.zookeeper.component }} create {{ .Values.metadataPrefix }} && echo 'created for pulsar cluster "{{ template "pulsar.cluster.name" . }}"' &&
bin/pulsar zookeeper-shell -server {{ template "pulsar.fullname" . }}-{{ .Values.zookeeper.component }} create {{ .Values.metadataPrefix }} && echo 'created for pulsar cluster "{{ template "pulsar.cluster.name" . }}"' &&
{{- end }}
bin/bookkeeper shell initnewcluster;
fi

View File

@ -106,6 +106,7 @@ spec:
securityContext:
{{ toYaml .Values.bookkeeper.securityContext | indent 8 }}
{{- end }}
{{- if and .Values.bookkeeper.waitMetadataTimeout (not (eq (.Values.bookkeeper.waitMetadataTimeout | toString) "0")) }}
initContainers:
# This initContainer will wait for bookkeeper initnewcluster to complete
# before deploying the bookies
@ -113,7 +114,7 @@ spec:
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.bookie "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.bookie "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.bookkeeper.waitMetadataTimeout }}", "sh", "-c"]
args:
# only reformat bookie if bookkeeper is running without persistence
- >
@ -127,6 +128,7 @@ spec:
{{- end}}
volumeMounts:
{{- include "pulsar.bookkeeper.certs.volumeMounts" . | nindent 8 }}
{{- end}}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.bookkeeper.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.bookie "root" .) }}"

View File

@ -121,22 +121,23 @@ spec:
{{- end }}
terminationGracePeriodSeconds: {{ .Values.broker.gracePeriod }}
initContainers:
{{- if and .Values.broker.waitZookeeperTimeout (not (eq (.Values.broker.waitZookeeperTimeout | toString) "0")) }}
# This init container will wait for zookeeper to be ready before
# deploying the broker
- name: wait-zookeeper-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.broker "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.broker "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.broker.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
{{- include "pulsar.broker.zookeeper.tls.settings" . | nindent 12 }}
export BOOKIE_MEM="-Xmx128M";
{{- if .Values.pulsar_metadata.configurationStore }}
until bin/bookkeeper org.apache.zookeeper.ZooKeeperMain -server {{ template "pulsar.configurationStore.connect" . }} get {{ .Values.configurationStoreMetadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.configurationStore.connect" . }} get {{ .Values.configurationStoreMetadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
{{- end }}
{{- if not .Values.pulsar_metadata.configurationStore }}
until bin/bookkeeper org.apache.zookeeper.ZooKeeperMain -server {{ template "pulsar.zookeeper.connect" . }} get {{ .Values.metadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ .Values.metadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
{{- end }}
echo "pulsar cluster {{ template "pulsar.cluster.name" . }} isn't initialized yet ... check in 3 seconds ..." && sleep 3;
done;
@ -146,19 +147,21 @@ spec:
{{- end }}
volumeMounts:
{{- include "pulsar.broker.certs.volumeMounts" . | nindent 8 }}
{{- end }}
{{- if and .Values.broker.waitBookkeeperTimeout (not (eq (.Values.broker.waitBookkeeperTimeout | toString) "0")) }}
# This init container will wait for bookkeeper to be ready before
# deploying the broker
- name: wait-bookkeeper-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.broker "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.broker "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.broker.waitBookkeeperTimeout }}", "sh", "-c"]
args:
- >
{{- include "pulsar.broker.zookeeper.tls.settings" . | nindent 12 }}
bin/apply-config-from-env.py conf/bookkeeper.conf;
export BOOKIE_MEM="-Xmx128M";
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
echo "bookkeeper cluster is not initialized yet. backoff for 3 seconds ...";
sleep 3;
done;
@ -179,6 +182,7 @@ spec:
{{- end }}
volumeMounts:
{{- include "pulsar.broker.certs.volumeMounts" . | nindent 10 }}
{{- end }}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.broker.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.broker "root" .) }}"
@ -227,11 +231,11 @@ spec:
bin/gen-yml-from-env.py conf/functions_worker.yml;
echo "OK" > "${statusFilePath:-status}";
{{- include "pulsar.broker.zookeeper.tls.settings" . | nindent 10 }}
bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ template "pulsar.broker.znode" . }};
timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ template "pulsar.broker.znode" . }};
while [ $? -eq 0 ]; do
echo "broker {{ template "pulsar.broker.hostname" . }} znode still exists ... check in 10 seconds ...";
sleep 10;
bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ template "pulsar.broker.znode" . }};
timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.zookeeper.connect" . }} get {{ template "pulsar.broker.znode" . }};
done;
cat conf/pulsar_env.sh;
OPTS="${OPTS} -Dlog4j2.formatMsgNoLookups=true" exec bin/pulsar broker;

View File

@ -105,32 +105,35 @@ spec:
terminationGracePeriodSeconds: {{ .Values.proxy.gracePeriod }}
serviceAccountName: "{{ template "pulsar.fullname" . }}-{{ .Values.proxy.component }}"
initContainers:
{{- if and .Values.proxy.waitZookeeperTimeout (not (eq (.Values.proxy.waitZookeeperTimeout | toString) "0")) }}
# This init container will wait for zookeeper to be ready before
# deploying the proxy
- name: wait-zookeeper-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.proxy "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.proxy "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.proxy.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
export PULSAR_MEM="-Xmx128M";
{{- if $zk:=.Values.pulsar_metadata.userProvidedZookeepers }}
until bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
echo "user provided zookeepers {{ $zk }} are unreachable... check in 3 seconds ..." && sleep 3;
done;
{{ else }}
until bin/pulsar zookeeper-shell -server {{ template "pulsar.configurationStore.service" . }} get {{ .Values.metadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ template "pulsar.configurationStore.service" . }} get {{ .Values.metadataPrefix }}/admin/clusters/{{ template "pulsar.cluster.name" . }}; do
sleep 3;
done;
{{- end}}
{{- end}}
{{- if and .Values.proxy.waitBrokerTimeout (not (eq (.Values.proxy.waitBrokerTimeout | toString) "0")) }}
# This init container will wait for at least one broker to be ready before
# deploying the proxy
- name: wait-broker-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.proxy "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.images.proxy "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.proxy.waitBrokerTimeout }}", "sh", "-c"]
args:
- >-
set -e;
@ -140,6 +143,7 @@ spec:
sleep 10;
brokerServiceNumber="$(nslookup -timeout=10 {{ template "pulsar.fullname" . }}-{{ .Values.broker.component }} | grep Name | wc -l)";
done;
{{- end}}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.proxy.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.images.proxy "root" .) }}"

View File

@ -41,12 +41,13 @@ spec:
{{ toYaml .Values.pulsar_metadata.nodeSelector | indent 8 }}
{{- end }}
initContainers:
{{- if and .Values.pulsar_metadata.waitZookeeperTimeout (not (eq (.Values.pulsar_metadata.waitZookeeperTimeout | toString) "0")) }}
{{- if .Values.pulsar_metadata.configurationStore }}
- name: wait-cs-ready
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.pulsar_metadata.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
until nslookup {{ .Values.pulsar_metadata.configurationStore}}; do
@ -57,12 +58,12 @@ spec:
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.pulsar_metadata.waitZookeeperTimeout }}", "sh", "-c"]
args:
- >-
{{- if $zk:=.Values.pulsar_metadata.userProvidedZookeepers }}
export PULSAR_MEM="-Xmx128M";
until bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
until timeout 15 bin/pulsar zookeeper-shell -server {{ $zk }} ls {{ or .Values.metadataPrefix "/" }}; do
echo "user provided zookeepers {{ $zk }} are unreachable... check in 3 seconds ..." && sleep 3;
done;
{{ else }}
@ -70,13 +71,15 @@ spec:
sleep 3;
done;
{{- end}}
{{- end }}
{{- if and .Values.pulsar_metadata.waitBookkeeperTimeout (not (eq (.Values.pulsar_metadata.waitBookkeeperTimeout | toString) "0")) }}
# This initContainer will wait for bookkeeper initnewcluster to complete
# before initializing pulsar metadata
- name: pulsar-bookkeeper-verify-clusterid
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
imagePullPolicy: "{{ template "pulsar.imagePullPolicy" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
resources: {{ toYaml .Values.initContainer.resources | nindent 10 }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.pulsar_metadata.waitBookkeeperTimeout }}", "sh", "-c"]
args:
- >
bin/apply-config-from-env.py conf/bookkeeper.conf;
@ -84,7 +87,7 @@ spec:
echo Setting the memory to a lower value to avoid OOM as operations below are not memory intensive.;
export BOOKIE_MEM="-Xmx128M";
{{- include "pulsar.toolset.zookeeper.tls.settings" . | nindent 10 }}
until bin/bookkeeper shell whatisinstanceid; do
until timeout 15 bin/bookkeeper shell whatisinstanceid; do
sleep 3;
done;
envFrom:
@ -92,6 +95,7 @@ spec:
name: "{{ template "pulsar.fullname" . }}-{{ .Values.bookkeeper.component }}"
volumeMounts:
{{- include "pulsar.toolset.certs.volumeMounts" . | nindent 8 }}
{{- end }}
containers:
- name: "{{ template "pulsar.fullname" . }}-{{ .Values.pulsar_metadata.component }}"
image: "{{ template "pulsar.imageFullName" (dict "image" .Values.pulsar_metadata.image "root" .) }}"
@ -100,7 +104,7 @@ spec:
resources:
{{ toYaml .Values.pulsar_metadata.resources | indent 10 }}
{{- end }}
command: ["sh", "-c"]
command: ["timeout", "{{ .Values.pulsar_metadata.initTimeout | default 60 }}", "sh", "-c"]
args:
- |
{{- include "pulsar.toolset.zookeeper.tls.settings" . | nindent 12 }}

View File

@ -474,6 +474,10 @@ bookkeeper:
## BookKeeper Cluster Initialize
## templates/bookkeeper-cluster-initialize.yaml
metadata:
## Timeout in seconds for waiting for zookeeper to become available before running metadata initialization.
## Set to 0 to skip the wait-zookeeper-ready init container.
waitZookeeperTimeout: 600
## Timeout in seconds for running metadata initialization (the template falls back to 60 when unset)
initTimeout: 60
## Set the resources used for running `bin/bookkeeper shell initnewcluster`
##
resources:
@ -529,6 +533,8 @@ bookkeeper:
annotations: {}
tolerations: []
gracePeriod: 30
## Timeout in seconds for waiting for bookkeeper cluster metadata to be initialized before starting a bookie.
## Set to 0 to skip the init container that performs this wait.
waitMetadataTimeout: 600
resources:
requests:
memory: 512Mi
@ -714,6 +720,8 @@ autorecovery:
annotations: {}
# tolerations: []
gracePeriod: 30
## Timeout in seconds for waiting for bookkeeper to become available before starting autorecovery.
## Set to 0 to skip the init container that performs this wait.
waitBookkeeperTimeout: 120
resources:
requests:
memory: 64Mi
@ -750,6 +758,12 @@ pulsar_metadata:
# configurationStore:
configurationStoreMetadataPrefix: ""
configurationStorePort: 2181
## Timeout in seconds for waiting for zookeeper to become available before running metadata initialization.
## Set to 0 to skip the zookeeper wait init containers.
waitZookeeperTimeout: 600
## Timeout in seconds for waiting for bookkeeper to be initialized before running metadata initialization.
## Set to 0 to skip the pulsar-bookkeeper-verify-clusterid init container.
waitBookkeeperTimeout: 120
## Timeout in seconds for running metadata initialization (the template falls back to 60 when unset)
initTimeout: 60
# resources for bin/pulsar initialize-cluster-metadata
resources:
@ -834,6 +848,10 @@ broker:
annotations: {}
tolerations: []
gracePeriod: 30
## Timeout in seconds for waiting for zookeeper to become available before starting a broker.
## Set to 0 to skip the wait-zookeeper-ready init container.
waitZookeeperTimeout: 600
## Timeout in seconds for waiting for bookkeeper to become available before starting a broker.
## Set to 0 to skip the wait-bookkeeper-ready init container.
waitBookkeeperTimeout: 120
resources:
requests:
memory: 512Mi
@ -1067,6 +1085,10 @@ proxy:
annotations: {}
tolerations: []
gracePeriod: 30
## Timeout in seconds for waiting for zookeeper to become available before starting a proxy.
## Set to 0 to skip the wait-zookeeper-ready init container.
waitZookeeperTimeout: 600
## Timeout in seconds for waiting for brokers to become available before starting a proxy.
## Set to 0 to skip the wait-broker-ready init container.
waitBrokerTimeout: 120
resources:
requests:
memory: 128Mi