ZTS: Add a failsafe callback to run after each test

Tests that get killed do not have an opportunity to clean up. There are many bad states this can leave the system in, but of particular gravity is when zinject has been used to induce bad behavior for one or more of the test disks. Create a failsafe mechanism in test-runner.py that runs a callback script after every test. The script is common to all tests so all tests benefit from the protection. Add an obligatory `zinject -c all` to clear all zinject state after every test case is run. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: John Kennedy <john.kennedy@delphix.com> Signed-off-by: Ryan Moeller <ryan@iXsystems.com> Closes #10096
2020-03-10 14:00:56 -04:00 · 2020-03-10 14:00:56 -04:00 · ddd9ef3a4f
commit ddd9ef3a4f
parent 1dc32a67e9
7 changed files with 145 additions and 48 deletions
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@ -17,6 +17,8 @@ user = root
 timeout = 600
 post_user = root
 post = cleanup
+failsafe_user = root
+failsafe = callbacks/zfs_failsafe
 outputdir = /var/tmp/test_results
 tags = ['functional']

--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@ -17,6 +17,8 @@ user = root
 timeout = 600
 post_user = root
 post = cleanup
+failsafe_user = root
+failsafe = callbacks/zfs_failsafe
 outputdir = /var/tmp/test_results
 tags = ['functional']

--- a/tests/runfiles/sunos.run
+++ b/tests/runfiles/sunos.run
@ -17,6 +17,8 @@ user = root
 timeout = 600
 post_user = root
 post = cleanup
+failsafe_user = root
+failsafe = callbacks/zfs_failsafe
 outputdir = /var/tmp/test_results
 tags = ['functional']

--- a/tests/test-runner/bin/test-runner.py
+++ b/tests/test-runner/bin/test-runner.py
@ -173,10 +173,13 @@ def __init__(self, pathname, identifier=None, outputdir=None,
            self.timeout = 60

    def __str__(self):
-        return "Pathname: %s\nIdentifier: %s\nOutputdir: %s\nTimeout: %d\n" \
-               "User: %s\n" % \
-               (self.pathname, self.identifier, self.outputdir, self.timeout,
-                self.user)
+        return '''\
+Pathname: %s
+Identifier: %s
+Outputdir: %s
+Timeout: %d
+User: %s
+''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user)

    def kill_cmd(self, proc, keyboard_interrupt=False):
        """
@ -305,7 +308,7 @@ def skip(self):
        self.result.runtime = '%02d:%02d' % (m, s)
        self.result.result = 'SKIP'

-    def log(self, options):
+    def log(self, options, suppress_console=False):
        """
        This function is responsible for writing all output. This includes
        the console output, the logfile of all results (with timestamped
@ -328,12 +331,14 @@ def log(self, options):

        # The result line is always written to the log file. If -q was
        # specified only failures are written to the console, otherwise
-        # the result line is written to the console.
+        # the result line is written to the console. The console output
+        # may be suppressed by calling log() with suppress_console=True.
        write_log(bytearray(result_line, encoding='utf-8'), LOG_FILE)
-        if not options.quiet:
-            write_log(result_line, LOG_OUT)
-        elif options.quiet and self.result.result != 'PASS':
-            write_log(result_line, LOG_OUT)
+        if not suppress_console:
+            if not options.quiet:
+                write_log(result_line, LOG_OUT)
+            elif options.quiet and self.result.result != 'PASS':
+                write_log(result_line, LOG_OUT)

        lines = sorted(self.result.stdout + self.result.stderr,
                       key=lambda x: x[0])
@ -361,36 +366,49 @@ def log(self, options):

 class Test(Cmd):
    props = ['outputdir', 'timeout', 'user', 'pre', 'pre_user', 'post',
-             'post_user', 'tags']
+             'post_user', 'failsafe', 'failsafe_user', 'tags']

    def __init__(self, pathname,
-                 pre=None, pre_user=None, post=None, post_user=None, tags=None,
-                 **kwargs):
+                 pre=None, pre_user=None, post=None, post_user=None,
+                 failsafe=None, failsafe_user=None, tags=None, **kwargs):
        super(Test, self).__init__(pathname, **kwargs)
        self.pre = pre or ''
        self.pre_user = pre_user or ''
        self.post = post or ''
        self.post_user = post_user or ''
+        self.failsafe = failsafe or ''
+        self.failsafe_user = failsafe_user or ''
        self.tags = tags or []

    def __str__(self):
-        post_user = pre_user = ''
+        post_user = pre_user = failsafe_user = ''
        if len(self.pre_user):
            pre_user = ' (as %s)' % (self.pre_user)
        if len(self.post_user):
            post_user = ' (as %s)' % (self.post_user)
-        return "Pathname: %s\nIdentifier: %s\nOutputdir: %s\nTimeout: %d\n" \
-               "Pre: %s%s\nPost: %s%s\nUser: %s\nTags: %s\n" % \
-               (self.pathname, self.identifier, self.outputdir, self.timeout,
-                self.pre, pre_user, self.post, post_user, self.user, self.tags)
+        if len(self.failsafe_user):
+            failsafe_user = ' (as %s)' % (self.failsafe_user)
+        return '''\
+Pathname: %s
+Identifier: %s
+Outputdir: %s
+Timeout: %d
+User: %s
+Pre: %s%s
+Post: %s%s
+Failsafe: %s%s
+Tags: %s
+''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user,
+            self.pre, pre_user, self.post, post_user, self.failsafe,
+            failsafe_user, self.tags)

    def verify(self):
        """
-        Check the pre/post scripts, user and Test. Omit the Test from this
-        run if there are any problems.
+        Check the pre/post/failsafe scripts, user and Test. Omit the Test from
+        this run if there are any problems.
        """
-        files = [self.pre, self.pathname, self.post]
-        users = [self.pre_user, self.user, self.post_user]
+        files = [self.pre, self.pathname, self.post, self.failsafe]
+        users = [self.pre_user, self.user, self.post_user, self.failsafe_user]

        for f in [f for f in files if len(f)]:
            if not verify_file(f):
@ -408,8 +426,9 @@ def verify(self):

    def run(self, options):
        """
-        Create Cmd instances for the pre/post scripts. If the pre script
-        doesn't pass, skip this Test. Run the post script regardless.
+        Create Cmd instances for the pre/post/failsafe scripts. If the pre
+        script doesn't pass, skip this Test. Run the post script regardless.
+        If the Test is killed, also run the failsafe script.
        """
        odir = os.path.join(self.outputdir, os.path.basename(self.pre))
        pretest = Cmd(self.pre, identifier=self.identifier, outputdir=odir,
@ -417,6 +436,10 @@ def run(self, options):
        test = Cmd(self.pathname, identifier=self.identifier,
                   outputdir=self.outputdir, timeout=self.timeout,
                   user=self.user)
+        odir = os.path.join(self.outputdir, os.path.basename(self.failsafe))
+        failsafe = Cmd(self.failsafe, identifier=self.identifier,
+                       outputdir=odir, timeout=self.timeout,
+                       user=self.failsafe_user)
        odir = os.path.join(self.outputdir, os.path.basename(self.post))
        posttest = Cmd(self.post, identifier=self.identifier, outputdir=odir,
                       timeout=self.timeout, user=self.post_user)
@ -429,6 +452,9 @@ def run(self, options):

        if cont:
            test.run(options.dryrun)
+            if test.result.result == 'KILLED' and len(failsafe.pathname):
+                failsafe.run(options.dryrun)
+                failsafe.log(options, suppress_console=True)
        else:
            test.skip()

@ -447,35 +473,48 @@ def __init__(self, pathname, tests=None, **kwargs):
        self.tests = tests or []

    def __str__(self):
-        post_user = pre_user = ''
+        post_user = pre_user = failsafe_user = ''
        if len(self.pre_user):
            pre_user = ' (as %s)' % (self.pre_user)
        if len(self.post_user):
            post_user = ' (as %s)' % (self.post_user)
-        return "Pathname: %s\nIdentifier: %s\nOutputdir: %s\nTests: %s\n" \
-               "Timeout: %s\nPre: %s%s\nPost: %s%s\nUser: %s\nTags: %s\n" % \
-               (self.pathname, self.identifier, self.outputdir, self.tests,
-                self.timeout, self.pre, pre_user, self.post, post_user,
-                self.user, self.tags)
+        if len(self.failsafe_user):
+            failsafe_user = ' (as %s)' % (self.failsafe_user)
+        return '''\
+Pathname: %s
+Identifier: %s
+Outputdir: %s
+Tests: %s
+Timeout: %s
+User: %s
+Pre: %s%s
+Post: %s%s
+Failsafe: %s%s
+Tags: %s
+''' % (self.pathname, self.identifier, self.outputdir, self.tests,
+            self.timeout, self.user, self.pre, pre_user, self.post, post_user,
+            self.failsafe, failsafe_user, self.tags)

    def verify(self):
        """
-        Check the pre/post scripts, user and tests in this TestGroup. Omit
-        the TestGroup entirely, or simply delete the relevant tests in the
+        Check the pre/post/failsafe scripts, user and tests in this TestGroup.
+        Omit the TestGroup entirely, or simply delete the relevant tests in the
        group, if that's all that's required.
        """
-        # If the pre or post scripts are relative pathnames, convert to
+        # If the pre/post/failsafe scripts are relative pathnames, convert to
        # absolute, so they stand a chance of passing verification.
        if len(self.pre) and not os.path.isabs(self.pre):
            self.pre = os.path.join(self.pathname, self.pre)
        if len(self.post) and not os.path.isabs(self.post):
            self.post = os.path.join(self.pathname, self.post)
+        if len(self.failsafe) and not os.path.isabs(self.failsafe):
+            self.post = os.path.join(self.pathname, self.post)

-        auxfiles = [self.pre, self.post]
-        users = [self.pre_user, self.user, self.post_user]
+        auxfiles = [self.pre, self.post, self.failsafe]
+        users = [self.pre_user, self.user, self.post_user, self.failsafe_user]

        for f in [f for f in auxfiles if len(f)]:
-            if self.pathname != os.path.dirname(f):
+            if f != self.failsafe and self.pathname != os.path.dirname(f):
                write_log("Warning: TestGroup '%s' not added to this run. "
                          "Auxiliary script '%s' exists in a different "
                          "directory.\n" % (self.pathname, f), LOG_ERR)
@ -505,9 +544,9 @@ def verify(self):

    def run(self, options):
        """
-        Create Cmd instances for the pre/post scripts. If the pre script
-        doesn't pass, skip all the tests in this TestGroup. Run the post
-        script regardless.
+        Create Cmd instances for the pre/post/failsafe scripts. If the pre
+        script doesn't pass, skip all the tests in this TestGroup. Run the
+        post script regardless. Run the failsafe script when a test is killed.
        """
        # tags assigned to this test group also include the test names
        if options.tags and not set(self.tags).intersection(set(options.tags)):
@ -527,12 +566,18 @@ def run(self, options):
            pretest.log(options)

        for fname in self.tests:
-            test = Cmd(os.path.join(self.pathname, fname),
-                       outputdir=os.path.join(self.outputdir, fname),
+            odir = os.path.join(self.outputdir, fname)
+            test = Cmd(os.path.join(self.pathname, fname), outputdir=odir,
                       timeout=self.timeout, user=self.user,
                       identifier=self.identifier)
+            odir = os.path.join(odir, os.path.basename(self.failsafe))
+            failsafe = Cmd(self.failsafe, outputdir=odir, timeout=self.timeout,
+                           user=self.failsafe_user, identifier=self.identifier)
            if cont:
                test.run(options.dryrun)
+                if test.result.result == 'KILLED' and len(failsafe.pathname):
+                    failsafe.run(options.dryrun)
+                    failsafe.log(options, suppress_console=True)
            else:
                test.skip()

@ -562,6 +607,8 @@ def __init__(self, options):
            ('pre_user', ''),
            ('post', ''),
            ('post_user', ''),
+            ('failsafe', ''),
+            ('failsafe_user', ''),
            ('tags', [])
        ]

@ -599,8 +646,8 @@ def addtestgroup(self, dirname, filenames, options):
            for prop in Test.props:
                setattr(testgroup, prop, getattr(options, prop))

-            # Prevent pre/post scripts from running as regular tests
-            for f in [testgroup.pre, testgroup.post]:
+            # Prevent pre/post/failsafe scripts from running as regular tests
+            for f in [testgroup.pre, testgroup.post, testgroup.failsafe]:
                if f in filenames:
                    del filenames[filenames.index(f)]

@ -630,6 +677,8 @@ def read(self, options):
                setattr(self, opt, config.get('DEFAULT', opt))
        self.outputdir = os.path.join(self.outputdir, self.timestamp)

+        testdir = options.testdir
+
        for section in config.sections():
            if 'tests' in config.options(section):
                parts = section.split(':', 1)
@ -637,8 +686,8 @@ def read(self, options):
                identifier = parts[1] if len(parts) == 2 else None
                if os.path.isdir(sectiondir):
                    pathname = sectiondir
-                elif os.path.isdir(os.path.join(options.testdir, sectiondir)):
-                    pathname = os.path.join(options.testdir, sectiondir)
+                elif os.path.isdir(os.path.join(testdir, sectiondir)):
+                    pathname = os.path.join(testdir, sectiondir)
                else:
                    pathname = sectiondir

@ -647,9 +696,13 @@ def read(self, options):
                for prop in TestGroup.props:
                    for sect in ['DEFAULT', section]:
                        if config.has_option(sect, prop):
-                            if prop == "tags":
+                            if prop == 'tags':
                                setattr(testgroup, prop,
                                        eval(config.get(sect, prop)))
+                            elif prop == 'failsafe':
+                                failsafe = config.get(sect, prop)
+                                setattr(testgroup, prop,
+                                        os.path.join(testdir, failsafe))
                            else:
                                setattr(testgroup, prop,
                                        config.get(sect, prop))
@ -664,7 +717,12 @@ def read(self, options):
                for prop in Test.props:
                    for sect in ['DEFAULT', section]:
                        if config.has_option(sect, prop):
-                            setattr(test, prop, config.get(sect, prop))
+                            if prop == 'failsafe':
+                                failsafe = config.get(sect, prop)
+                                setattr(test, prop,
+                                        os.path.join(testdir, failsafe))
+                            else:
+                                setattr(test, prop, config.get(sect, prop))

                if test.verify():
                    self.tests[section] = test
@ -706,7 +764,8 @@ def complete_outputdirs(self):
        outputdir, and are guaranteed uniqueness because a group can only
        contain files in one directory. Pre and post tests will create a
        directory rooted at the outputdir of the Test or TestGroup in
-        question for their output.
+        question for their output. Failsafe scripts will create a directory
+        rooted at the outputdir of each Test for their output.
        """
        done = False
        components = 0
@ -932,6 +991,13 @@ def parse_args():
                      type='string', help='Specify a post script.')
    parser.add_option('-q', action='store_true', default=False, dest='quiet',
                      help='Silence on the console during a test run.')
+    parser.add_option('-s', action='callback', callback=options_cb,
+                      default='', dest='failsafe', metavar='script',
+                      type='string', help='Specify a failsafe script.')
+    parser.add_option('-S', action='callback', callback=options_cb,
+                      default='', dest='failsafe_user',
+                      metavar='failsafe_user', type='string',
+                      help='Specify a user to execute the failsafe script.')
    parser.add_option('-t', action='callback', callback=options_cb, default=60,
                      dest='timeout', metavar='seconds', type='int',
                      help='Timeout (in seconds) for an individual test.')
--- a/tests/test-runner/man/test-runner.1
+++ b/tests/test-runner/man/test-runner.1
@ -251,6 +251,22 @@ Run \fIscript\fR after any test or test group.
 Print only the results summary to the standard output.
 .RE

+.ne 2
+.na
+\fB-s\fR \fIscript\fR
+.ad
+.RS 6n
+Run \fIscript\fR as a failsafe after any test is killed.
+.RE
+
+.ne 2
+.na
+\fB-S\fR \fIusername\fR
+.ad
+.RS 6n
+Execute the failsafe script as \fIusername\fR.
+.RE
+
 .ne 2
 .na
 \fB-t\fR \fIn\fR
--- a/tests/zfs-tests/callbacks/Makefile.am
+++ b/tests/zfs-tests/callbacks/Makefile.am
@ -1,5 +1,6 @@
 pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/callbacks
 dist_pkgdata_SCRIPTS = \
+	zfs_failsafe.ksh \
 	zfs_dbgmsg.ksh \
 	zfs_dmesg.ksh \
 	zfs_mmp.ksh
--- a/tests/zfs-tests/callbacks/zfs_failsafe.ksh
+++ b/tests/zfs-tests/callbacks/zfs_failsafe.ksh
@ -0,0 +1,8 @@
+#!/bin/ksh
+
+# Commands to perform failsafe-critical cleanup after a test is killed.
+#
+# This should only be used to ensure the system is restored to a functional
+# state in the event of tests being killed (preventing normal cleanup).
+
+zinject -c all