Step: storage-conf-wait-for-csi-migration

The storage-conf-csi-migration-enable step enables alpha feature gates for CSI migration of all in-tree volume plugins we have in OCP and waits for the changes to be reflected in kube-controller-manager and nodes. It expects that there is no volume present on the system that would be affected by the migration!

Container image used for this step: cli

cli resolves to an image built or imported by the ci-operator configuration (documentation).

Environment

Step exposes no environmental variables except the defaults.

Source Code

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/bin/bash
# Fail on use of unset variables and make a pipeline fail when any of its
# stages fails. errexit (-e) is not enabled in this preamble.
set -o nounset
set -o pipefail

# Directory that receives the debug dumps written by the wait loops below.
# Falls back to /tmp when ARTIFACT_DIR is unset or empty.
ARTIFACT_DIR=${ARTIFACT_DIR:-/tmp}

# Prints the names of the pods labelled kube-controller-manager=true in the
# openshift-kube-controller-manager namespace, one per line.
# Outputs: pod names on stdout; oc's own diagnostics stay on stderr.
# Returns: 0 once a query succeeds; retries forever until then.
function list_kcms() {
    local PODS

    # Tolerate errors from API server. Nodes and kube-controller-manager are restarted during this process.
    # The output is buffered so that a query that fails half-way cannot leak a
    # partial list to the caller, and the loop pauses between attempts instead
    # of hammering an API server that is still coming back.
    until PODS=$(oc -n openshift-kube-controller-manager get pod -l "kube-controller-manager=true" -o custom-columns=NAME:.metadata.name --no-headers); do
        sleep 1
    done

    # Print nothing at all (not even an empty line) when no pod matched.
    if [[ -n "$PODS" ]]; then
        printf '%s\n' "$PODS"
    fi
}

# Tests whether one kube-controller-manager pod runs with CSI migration enabled.
# Arguments: $1 - name of a pod in the openshift-kube-controller-manager namespace
# Outputs:   nothing on stdout
# Returns:   0 when the args of the pod's first container contain
#            "--feature-gates=CSIMigrationAWS=true", non-zero otherwise.
function kcm_migrated {
    local POD=$1

    # This will return 0 when the migration is enabled in the KCM pod.
    # Any API server failure results in nonzero exit code, i.e. not migrated KCM.
    # The match output is discarded with a redirect rather than `grep -q`:
    # -q may stop reading at the first match, and if oc is then killed by
    # SIGPIPE a pipefail-enabled shell would report the pipeline as failed.
    # The custom-columns spec is quoted because "[0]" is a glob pattern.
    oc -n openshift-kube-controller-manager get pod "$POD" -o 'custom-columns=NAME:.spec.containers[0].args' --no-headers | grep -F -- "--feature-gates=CSIMigrationAWS=true" > /dev/null
}

# Blocks until every pod reported by list_kcms passes kcm_migrated.
# Polls every 5 seconds with no upper bound on the number of attempts.
# Globals:  ARTIFACT_DIR (read) - directory receiving kcm-<attempt>.yaml dumps
# Outputs:  progress messages on stdout
# NOTE(review): when list_kcms prints nothing the inner loop never runs and
# MIGRATED stays true, so an empty pod list is reported as "All KCMs migrated".
# Confirm that this is acceptable.
function wait_for_kcms() {
    local COUNT=1
    local MIGRATED
    local KCM

    while true; do
        MIGRATED=true
        echo "$(date) waiting for all kube-controller-managers migrated, attempt $COUNT"

        # Relies on word splitting of the newline-separated name list.
        for KCM in $( list_kcms ); do
            if kcm_migrated "$KCM"; then
                echo "$KCM migrated"
            else
                MIGRATED=false
                echo "$KCM not migrated"
            fi
        done

        # For debugging. Best effort: a failed dump must not abort the wait.
        oc -n openshift-kube-controller-manager get pod -l "kube-controller-manager=true" -o yaml &> "$ARTIFACT_DIR/kcm-$COUNT.yaml" || :

        if [[ "$MIGRATED" == true ]]; then
            echo "All KCMs migrated"
            break
        fi
        COUNT=$(( COUNT + 1 ))
        sleep 5
    done
}

# Prints the names of all nodes, one per line.
# Outputs: node names on stdout; oc's own diagnostics stay on stderr.
# Returns: 0 once a query succeeds; retries forever until then.
function list_nodes() {
    local NODES

    # Tolerate errors from API server. Nodes and kube-controller-manager are restarted during this process.
    # The output is buffered so that a query that fails half-way cannot leak a
    # partial list to the caller, and the loop pauses between attempts instead
    # of hammering an API server that is still coming back.
    until NODES=$(oc get node -o custom-columns=NAME:.metadata.name --no-headers); do
        sleep 1
    done

    # Print nothing at all (not even an empty line) when no node was returned.
    if [[ -n "$NODES" ]]; then
        printf '%s\n' "$NODES"
    fi
}

# Tests whether a node reports CSI migration as enabled.
# Arguments: $1 - node name (also the name of its CSINode object)
# Outputs:   nothing on stdout
# Returns:   0 when the CSINode YAML has a
#            storage.alpha.kubernetes.io/migrated-plugins entry mentioning
#            kubernetes.io/aws-ebs, non-zero otherwise.
function node_migrated {
    local NODE=$1

    # This will return 0 when the migration is enabled in the node. Using AWS just as one representative,
    # all plugins should be migrated.
    # Any API server failure results in nonzero exit code, i.e. not migrated node.
    oc get csinode "$NODE" -o yaml | grep -- "storage.alpha.kubernetes.io/migrated-plugins:.*kubernetes.io/aws-ebs" > /dev/null
}

# Blocks until every node reported by list_nodes passes node_migrated.
# Polls every 5 seconds with no upper bound on the number of attempts.
# Globals:  ARTIFACT_DIR (read) - directory receiving csinode-<attempt>.yaml
#           and node-<attempt>.yaml dumps
# Outputs:  progress messages on stdout
# NOTE(review): when list_nodes prints nothing the inner loop never runs and
# MIGRATED stays true, so an empty node list is reported as "All nodes
# migrated". Confirm that this is acceptable.
function wait_for_nodes() {
    local COUNT=1
    local MIGRATED
    local NODE

    while true; do
        MIGRATED=true
        echo "$(date) waiting for all nodes migrated, attempt $COUNT"

        # Relies on word splitting of the newline-separated name list.
        for NODE in $( list_nodes ); do
            if node_migrated "$NODE"; then
                echo "$NODE migrated"
            else
                MIGRATED=false
                echo "$NODE not migrated"
            fi
        done

        # For debugging. Best effort: a failed dump must not abort the wait.
        oc get csinode -o yaml &> "$ARTIFACT_DIR/csinode-$COUNT.yaml" || :
        oc get node -o yaml &> "$ARTIFACT_DIR/node-$COUNT.yaml" || :

        if [[ "$MIGRATED" == true ]]; then
            echo "All nodes migrated"
            break
        fi
        COUNT=$(( COUNT + 1 ))
        sleep 5
    done
}

# Single, non-blocking check that all nodes are Ready and schedulable.
# Outputs: progress messages on stdout, plus any matching "unschedulable"
#          lines from the node YAML.
# Returns: 0 when both checks pass; 1 when a node is not Ready, when the node
#          list cannot be fetched, or when the node YAML mentions
#          "unschedulable".
function nodes_stable() {
    local NODES_YAML

    # Check that the nodes are Ready
    echo "Checking Nodes Ready=True"
    oc wait --for=condition=Ready=True node --all --timeout=0 || return 1
    # Check the nodes are schedulable
    echo "Checking Nodes are schedulable"
    # Fetch first and fail on error: an API failure here must not be mistaken
    # for "no unschedulable node found".
    NODES_YAML=$(oc get node -o yaml) || return 1
    if grep "unschedulable" <<< "$NODES_YAML"; then
        return 1
    fi
}

# Single, non-blocking check of all ClusterOperators.
# Verifies, in this order, Progressing=False, Available=True and
# Degraded=False; announces each check on stdout before running it.
# Returns: 0 when every check passes, 1 as soon as one of them fails.
function cluster_stable() {
    local WANTED

    for WANTED in Progressing=False Available=True Degraded=False; do
        echo "Checking ClusterOperators $WANTED"
        if ! oc wait --all "--for=condition=$WANTED" clusteroperators.config.openshift.io --timeout=0; then
            return 1
        fi
    done
}

# Blocks until nodes_stable and cluster_stable have both succeeded on six
# consecutive polls, 10 seconds apart. Any failed check resets the streak.
# Globals:  ARTIFACT_DIR (read) - directory receiving
#           stability-node-<attempt>.yaml and
#           stability-clusteroperator-<attempt>.yaml dumps
# Outputs:  progress messages on stdout
function wait_for_stable_cluster() {
    # A cluster is considered stable when:
    # - all nodes are Ready
    # - all nodes are schedulable
    # - CVO is Available=true, Progressing=false, Degraded=false
    # - all the checks above are stable for 1 minute
    local COUNT=1
    # Length of the current streak, counting the poll in progress; it is reset
    # to 0 by a failed check and incremented at the end of every iteration.
    local STABLE_COUNT=1
    while true; do
        echo
        echo "$(date) Waiting for the cluster to stabilize, attempt $COUNT"

        if nodes_stable ; then
            echo "Nodes are stable"
        else
            STABLE_COUNT=0
            echo "Nodes are not stable"
        fi

        if cluster_stable; then
            echo "Cluster is stable"
        else
            STABLE_COUNT=0
            echo "Cluster is not stable"
        fi

        # For debugging. Best effort: a failed dump must not abort the wait.
        # Both dumps capture stderr as well, like the other debug dumps.
        oc get node -o yaml &> "$ARTIFACT_DIR/stability-node-$COUNT.yaml" || :
        oc get clusteroperator -o yaml &> "$ARTIFACT_DIR/stability-clusteroperator-$COUNT.yaml" || :

        # Wait until 6 checks pass in a row (at least 1 minute, probably much more)
        if (( STABLE_COUNT >= 6 )); then
            echo "Cluster is stable"
            break
        fi
        COUNT=$(( COUNT + 1 ))
        echo "Current stability: $STABLE_COUNT"
        STABLE_COUNT=$(( STABLE_COUNT + 1 ))
        sleep 10
    done
}

# Run the three waits in order: kube-controller-manager pods first, then the
# nodes, and finally the overall cluster stability check.
wait_for_kcms
wait_for_nodes
wait_for_stable_cluster

Properties

Property Value Description
Resource requests (cpu) 10m Used in .resources.requests of the pod running this step.
Resource requests (memory) 100Mi Used in .resources.requests of the pod running this step.

GitHub Link:

https://github.com/openshift/release/blob/master/ci-operator/step-registry/storage/conf/wait-for-csi-migration/storage-conf-wait-for-csi-migration-ref.yaml

Owners:

Approvers:

Reviewers:

Source code for this page located on GitHub