diff options
author | Karsten Tausche <karsten@fairphone.com> | 2019-01-14 16:32:32 +0100 |
---|---|---|
committer | Chase Qi <chase.qi@linaro.org> | 2019-03-14 10:38:11 +0800 |
commit | dce10089766a0bf0a0224578013058cb76a3ea4e (patch) | |
tree | 3d07f1aa08b3b21d482b2f19f5f2034838f956b0 /automated/android | |
parent | 91c916c00747aa1fa52dbb5707cb5a5b24c0243e (diff) |
MultiNode Tradefed: Reset userdata before retry
If supplied, flash a userdata image to the devices before triggering
Tradefed retry. This helps reduce invalid failures that are caused by
tests bringing devices into a state where other tests cannot pass.
Issue: INFRA-137
Change-Id: I4a6510ceab6aec7ce530b9f8e244e9655dc22b81
Depends-On: Ie21cc417c78ec88da523f0d14ea85c1e9bcb912c
Signed-off-by: Karsten Tausche <karsten@fairphone.com>
Diffstat (limited to 'automated/android')
6 files changed, 144 insertions, 5 deletions
diff --git a/automated/android/multinode/tradefed/tradefed-multinode.sh b/automated/android/multinode/tradefed/tradefed-multinode.sh index 567b09a..39c24d6 100755 --- a/automated/android/multinode/tradefed/tradefed-multinode.sh +++ b/automated/android/multinode/tradefed/tradefed-multinode.sh @@ -32,11 +32,12 @@ $0 [-o timeout_secs] [ -m device_worker_mapping_file] [-c cts_url] [-t test_params] [-u test_retry_params] [-i max_num_runs] [-n runs_if_unchanged] [-p test_path] [-s state_check_frequency_secs] [-r <aggregated|atomic>] [-f failures_printed] [-a <ap_ssid>] [-k <ap_key>] [-j <java_options>] +[-b <userdata_image_file>] heredoc exit 1 } -while getopts ':o:m:c:t:u:i:n:p:s:r:f:a:k:j:' opt; do +while getopts ':o:m:c:t:u:i:n:p:s:r:f:a:k:j:b:' opt; do case "${opt}" in o) TIMEOUT_SECS="${OPTARG}" ;; m) DEVICE_WORKER_MAPPING_FILE="${OPTARG}" ;; @@ -52,6 +53,7 @@ while getopts ':o:m:c:t:u:i:n:p:s:r:f:a:k:j:' opt; do a) AP_SSID="${OPTARG}" ;; k) AP_KEY="${OPTARG}" ;; j) JAVA_OPTIONS="${OPTARG}" ;; + b) USERDATA_IMAGE_FILE="${OPTARG}" ;; *) usage ;; esac done @@ -135,6 +137,7 @@ runner_exited_cleanly="pass" ./tradefed-runner-multinode.py -t "${TEST_PARAMS}" -u "${TEST_RETRY_PARAMS}" -i "${MAX_NUM_RUNS}" \ -n "${RUNS_IF_UNCHANGED}" -p "${TEST_PATH}" -s "${STATE_CHECK_FREQUENCY_SECS}" \ -r "${RESULT_FORMAT}" -f "${FAILURES_PRINTED}" -m "${DEVICE_WORKER_MAPPING_FILE}" \ + --userdata_image_file "${USERDATA_IMAGE_FILE}" \ || runner_exited_cleanly="fail" # "fail" here means that an unexpected error/exception occurred in the runner. diff --git a/automated/android/multinode/tradefed/tradefed-multinode.yaml b/automated/android/multinode/tradefed/tradefed-multinode.yaml index 3e30a21..d6728df 100644 --- a/automated/android/multinode/tradefed/tradefed-multinode.yaml +++ b/automated/android/multinode/tradefed/tradefed-multinode.yaml @@ -51,6 +51,9 @@ params: # For devices locally connected via USB, <device> the serial number of the # device and <workerId> must be empty. 
DEVICE_WORKER_MAPPING_FILE: "/tmp/deviceWorkerMapping" + # Userdata image file that will be used to reset devices to a clean state + # before starting TradeFed reruns. + USERDATA_IMAGE_FILE: "" # Let the whole test run fail if the test runner failed to exit cleanly. RAISE_ON_FAILURE: "true" @@ -85,7 +88,7 @@ run: -s "${STATE_CHECK_FREQUENCY_SECS}" -r "${RESULTS_FORMAT}" \ -m "${DEVICE_WORKER_MAPPING_FILE}" -f "${FAILURES_PRINTED}" \ -a "${AP_SSID}" -k "${AP_KEY}" -j "${JAVA_OPTIONS}" \ - || exec_result=$? + -b "${USERDATA_IMAGE_FILE}" || exec_result=$? # Upload test log and result files to artifactorial. - cp -r ./${TEST_PATH}/results ./output/ || true - cp -r ./${TEST_PATH}/logs ./output/ || true diff --git a/automated/android/multinode/tradefed/tradefed-runner-multinode.py b/automated/android/multinode/tradefed/tradefed-runner-multinode.py index f862057..a28f5af 100755 --- a/automated/android/multinode/tradefed/tradefed-runner-multinode.py +++ b/automated/android/multinode/tradefed/tradefed-runner-multinode.py @@ -63,6 +63,10 @@ parser.add_argument('-f', dest='FAILURES_PRINTED', type=int, required=False, default=0, help="Specify the number of failed test cases to be\ printed, 0 means not print any failures.") +parser.add_argument('--userdata_image_file', dest='USERDATA_IMAGE_FILE', + required=False, help="Userdata image file that will be \ + used to reset devices to a clean state before starting \ + TradeFed reruns.") args = parser.parse_args() @@ -91,7 +95,14 @@ try: device_address = deviceToWorker[0] worker_job_id = (None if (len(deviceToWorker) == 1 or not deviceToWorker[1]) else deviceToWorker[1]) - devices.append(Device(device_address, TRADEFED_LOGCAT % device_address, worker_job_id)) + devices.append( + Device( + serial_or_address=device_address, + logcat_output_filename=TRADEFED_LOGCAT % device_address, + worker_job_id=worker_job_id, + userdata_image_file=args.USERDATA_IMAGE_FILE, + ) + ) except OSError as e: logger.error("Mapping file cannot be opened: 
%s" % args.DEVICE_WORKER_MAPPING_FILE) sys.exit(1) @@ -337,6 +348,19 @@ while child.isalive(): logger.info('NOT retrying TradeFed session as maximum number of retries is reached.') else: logger.info('Retrying with results of session %s' % tradefed_session_id) + logger.info('First resetting the devices to a clean state...') + + unavailable_devices = [] + for device in devices: + if not device.userdata_reset(): + unavailable_devices += [device.serial_or_address] + if unavailable_devices: + logger.warning( + 'Following devices were not reset successfully ' + 'or are not yet available again: %s' + % ', '.join(unavailable_devices) + ) + try: child.expect(prompt, timeout=60) child.sendline('%s --retry %s' % (args.TEST_RETRY_PARAMS, str(tradefed_session_id))) diff --git a/automated/android/multinode/tradefed/utils.py b/automated/android/multinode/tradefed/utils.py index d470885..6fab4d3 100644 --- a/automated/android/multinode/tradefed/utils.py +++ b/automated/android/multinode/tradefed/utils.py @@ -1,9 +1,11 @@ import logging +import os.path import re import shutil import subprocess import sys import time +from typing import Dict sys.path.insert(0, "../../../lib/") from py_util_lib import call_shell_lib # nopep8 @@ -16,7 +18,11 @@ class Device: EXEC_IN_LAVA = shutil.which("lava-send") is not None def __init__( - self, serial_or_address, logcat_output_filename, worker_job_id=None + self, + serial_or_address, + logcat_output_filename, + worker_job_id=None, + userdata_image_file=None, ): self.serial_or_address = serial_or_address self.is_tcpip_device = bool( @@ -29,6 +35,7 @@ class Device: ) self.worker_job_id = worker_job_id self.worker_handshake_iteration = 1 + self.userdata_image_file = userdata_image_file self._is_available = True def ensure_available(self, logger, timeout_secs=30): @@ -111,6 +118,10 @@ class Device: # function will return failure, but the device can still become accessible in the next # iteration of device availability checks. 
+ # `fastboot devices` prints in some versions more debug information + # than `fastboot reboot`, e.g., missing udev rules. + subprocess.run(["fastboot", "devices"]) + # There is no point in waiting longer for `fastboot reboot`: fastbootRebootTimeoutSecs = 10 try: @@ -123,6 +134,8 @@ class Device: # failure. pass + subprocess.run(["fastboot", "devices"]) + bootTimeoutSecs = max( 10, int(reconnectTimeoutSecs) - fastbootRebootTimeoutSecs ) @@ -149,6 +162,11 @@ class Device: if not self.check_available(): return False + + # Ensure that the device screen is on during test runs. + if not self._call_shell_lib("disable_suspend"): + print("WARNING: Disabling device suspend may have failed.") + # reestablish logcat connection self.logcat.kill() self.logcat = subprocess.Popen( @@ -157,6 +175,58 @@ class Device: ) return True + def userdata_reset(self, commandTimeoutSecs=60, reconnectTimeoutSecs=900): + """Reset the device to a clean state. This is equivalent to resetting to + factory settings and applying CTS set-up steps.""" + if not self.userdata_image_file: + print("WARNING: Skipping userdata_reset; no image file provided.") + return True + if not os.path.isfile(self.userdata_image_file): + print( + "WARNING: Skipping userdata_reset; image file not found: %s" + % self.userdata_image_file + ) + + print("Resetting userdata partition on %s" % self.serial_or_address) + + # Reflash the userdata partition. + if self.is_tcpip_device: + self.worker_handshake("userdata_reset") + else: + try: + subprocess.run( + [ + "adb", + "-s", + self.serial_or_address, + "reboot", + "bootloader", + ], + timeout=commandTimeoutSecs, + ) + except subprocess.TimeoutExpired: + # Blocking `adb reboot` does not necessarily indicate a failure. 
+ pass + try: + subprocess.run( + [ + "fastboot", + "-s", + self.serial_or_address, + "flash", + "userdata", + self.userdata_image_file, + ], + timeout=commandTimeoutSecs, + ) + except subprocess.TimeoutExpired as e: + print(e) + return False + + # Reconnect as usual. + if not self.try_reconnect(reconnectTimeoutSecs=reconnectTimeoutSecs): + return False + def release(self): self.logcat.kill() self.logcat_output_file.close() diff --git a/automated/android/multinode/wait-and-keep-local-device-accessible.sh b/automated/android/multinode/wait-and-keep-local-device-accessible.sh index 6776436..aee98f2 100755 --- a/automated/android/multinode/wait-and-keep-local-device-accessible.sh +++ b/automated/android/multinode/wait-and-keep-local-device-accessible.sh @@ -14,6 +14,7 @@ NETWORK_TIMEOUT_SECS=${NETWORK_TIMEOUT_SECS:-300} ADB_TCPIP_ATTEMPTS=${ADB_TCPIP_ATTEMPTS:-5} ADB_CONNECT_TEST_TIMEOUT_SECS=${ADB_CONNECT_TEST_TIMEOUT_SECS:-60} ANDROID_ENABLE_WIFI=${ANDROID_ENABLE_WIFI:-true} +USERDATA_IMAGE_FILE=${USERDATA_IMAGE_FILE:-""} # shellcheck source=automated/lib/sh-test-lib . "${MY_AUTOMATED_DIR}/lib/sh-test-lib" @@ -64,6 +65,34 @@ settings; UI automation failed." fi } +userdata_reset() { + if [ -z "${USERDATA_IMAGE_FILE}" ]; then + warn_msg "Skipping userdata_reset; no image file provided." + return + fi + if [ ! -f "${USERDATA_IMAGE_FILE}" ]; then + warn_msg "Skipping userdata_reset; image file not found: \ +${USERDATA_IMAGE_FILE}." + return + fi + # shellcheck disable=SC2039 + local previousResult="${RESULT}" + RESULT=false + if ! timeout "${ADB_CONNECT_TEST_TIMEOUT_SECS}" adb reboot bootloader; then + warn_msg "Reboot into bootloader failed." + return + fi + if ! fastboot flash userdata "${USERDATA_IMAGE_FILE}"; then + warn_msg "Flashing userdata image failed." + return + fi + if ! timeout 10 fastboot reboot; then + warn_msg "Device did not reboot from fastboot as expected." 
+ return + fi + RESULT="${previousResult}" +} + lava-test-set start keepAlive @@ -89,6 +118,11 @@ while true; do adb devices || true reconnect_device ;; + userdata_reset) + info_msg "Userdata reset requested by master." + userdata_reset + reconnect_device + ;; *) lava-test-raise "Script error. Unexpected message from master to \ worker, command=${command}" diff --git a/automated/android/multinode/wait-and-keep-local-device-accessible.yaml b/automated/android/multinode/wait-and-keep-local-device-accessible.yaml index 35e3cad..720fedf 100644 --- a/automated/android/multinode/wait-and-keep-local-device-accessible.yaml +++ b/automated/android/multinode/wait-and-keep-local-device-accessible.yaml @@ -21,9 +21,14 @@ params: ADB_TCPIP_ATTEMPTS: "5" ADB_CONNECT_TEST_TIMEOUT_SECS: "60" ANDROID_ENABLE_WIFI: "true" + # Userdata image file that will be used to reset devices to a clean state + # before starting TradeFed reruns. + USERDATA_IMAGE_FILE: "" run: steps: - lava-install-packages --no-install-recommends python3-pip python3-setuptools python3-wheel - pip3 install -q uiautomator - - ./automated/android/multinode/wait-and-keep-local-device-accessible.sh + - | + USERDATA_IMAGE_FILE="${USERDATA_IMAGE_FILE}" \ + ./automated/android/multinode/wait-and-keep-local-device-accessible.sh |