diff --git a/sys/contrib/openzfs/.github/CONTRIBUTING.md b/sys/contrib/openzfs/.github/CONTRIBUTING.md index 9bc2e7ef0684..f28a747e82c4 100644 --- a/sys/contrib/openzfs/.github/CONTRIBUTING.md +++ b/sys/contrib/openzfs/.github/CONTRIBUTING.md @@ -126,8 +126,8 @@ feature needed? What problem does it solve? #### General -* All pull requests must be based on the current master branch and apply -without conflicts. +* All pull requests, except backports and releases, must be based on the current master branch +and should apply without conflicts. * Please attempt to limit pull requests to a single commit which resolves one specific issue. * Make sure your commit messages are in the correct format. See the @@ -230,70 +230,6 @@ attempting to solve. Signed-off-by: Contributor ``` -#### OpenZFS Patch Ports -If you are porting OpenZFS patches, the commit message must meet -the following guidelines: -* The first line must be the summary line from the most important OpenZFS commit being ported. -It must begin with `OpenZFS dddd, dddd - ` where `dddd` are OpenZFS issue numbers. -* Provides a `Authored by:` line to attribute each patch for each original author. -* Provides the `Reviewed by:` and `Approved by:` lines from each original -OpenZFS commit. -* Provides a `Ported-by:` line with the developer's name followed by -their email for each OpenZFS commit. -* Provides a `OpenZFS-issue:` line with link for each original illumos -issue. -* Provides a `OpenZFS-commit:` line with link for each original OpenZFS commit. -* If necessary, provide some porting notes to describe any deviations from -the original OpenZFS commits. - -An example OpenZFS patch port commit message for a single patch is provided -below. -``` -OpenZFS 1234 - Summary from the original OpenZFS commit - -Authored by: Original Author -Reviewed by: Reviewer One -Reviewed by: Reviewer Two -Approved by: Approver One -Ported-by: ZFS Contributor - -Provide some porting notes here if necessary. 
- -OpenZFS-issue: https://www.illumos.org/issues/1234 -OpenZFS-commit: https://github.com/openzfs/openzfs/commit/abcd1234 -``` - -If necessary, multiple OpenZFS patches can be combined in a single port. -This is useful when you are porting a new patch and its subsequent bug -fixes. An example commit message is provided below. -``` -OpenZFS 1234, 5678 - Summary of most important OpenZFS commit - -1234 Summary from original OpenZFS commit for 1234 - -Authored by: Original Author -Reviewed by: Reviewer Two -Approved by: Approver One -Ported-by: ZFS Contributor - -Provide some porting notes here for 1234 if necessary. - -OpenZFS-issue: https://www.illumos.org/issues/1234 -OpenZFS-commit: https://github.com/openzfs/openzfs/commit/abcd1234 - -5678 Summary from original OpenZFS commit for 5678 - -Authored by: Original Author2 -Reviewed by: Reviewer One -Approved by: Approver Two -Ported-by: ZFS Contributor - -Provide some porting notes here for 5678 if necessary. - -OpenZFS-issue: https://www.illumos.org/issues/5678 -OpenZFS-commit: https://github.com/openzfs/openzfs/commit/efgh5678 -``` - #### Coverity Defect Fixes If you are submitting a fix to a [Coverity defect](https://scan.coverity.com/projects/zfsonlinux-zfs), diff --git a/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/bug_report.md b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 000000000000..1dbb5f6edb55 --- /dev/null +++ b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,53 @@ +--- +name: Bug report +about: Create a report to help us improve OpenZFS +title: '' +labels: 'Type: Defect, Status: Triage Needed' +assignees: '' + +--- + + + + + +### System information + +Type | Version/Name + --- | --- +Distribution Name | +Distribution Version | +Linux Kernel | +Architecture | +ZFS Version | +SPL Version | + + +### Describe the problem you're observing + +### Describe how to reproduce the problem + +### Include any warning/errors/backtraces from the system logs + + 
diff --git a/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/config.yml b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000000..dd8f0557a30c --- /dev/null +++ b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,14 @@ +blank_issues_enabled: false +contact_links: + - name: OpenZFS Questions + url: https://github.com/openzfs/zfs/discussions/new + about: Ask the community for help + - name: OpenZFS Community Support Mailing list (Linux) + url: https://zfsonlinux.topicbox.com/groups/zfs-discuss + about: Get community support for OpenZFS on Linux + - name: FreeBSD Community Support Mailing list + url: https://lists.freebsd.org/mailman/listinfo/freebsd-fs + about: Get community support for OpenZFS on FreeBSD + - name: OpenZFS on IRC + url: https://webchat.freenode.net/#openzfs + about: Use IRC to get community support for OpenZFS diff --git a/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 000000000000..9b50a4a3d96e --- /dev/null +++ b/sys/contrib/openzfs/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,33 @@ +--- +name: Feature request +about: Suggest a feature for OpenZFS +title: '' +labels: 'Type: Feature' +assignees: '' + +--- + + + +### Describe the feature you would like to see added to OpenZFS + + + +### How will this feature improve OpenZFS? 
+ + + +### Additional context + + diff --git a/sys/contrib/openzfs/.github/codecov.yml b/sys/contrib/openzfs/.github/codecov.yml new file mode 100644 index 000000000000..6d4932680e5c --- /dev/null +++ b/sys/contrib/openzfs/.github/codecov.yml @@ -0,0 +1,25 @@ +codecov: + notify: + require_ci_to_pass: false # always post + after_n_builds: 2 # user and kernel + +coverage: + precision: 0 # 0 decimals of precision + round: nearest # Round to nearest precision point + range: "50...90" # red -> yellow -> green + + status: + project: + default: + threshold: 1% # allow 1% coverage variance + + patch: + default: + threshold: 1% # allow 1% coverage variance + +comment: + layout: "reach, diff, flags, footer" + behavior: once # update if exists; post new; skip if deleted + require_changes: yes # only post when coverage changes + +# ignore: Please place any ignores in config/ax_code_coverage.m4 instead diff --git a/sys/contrib/openzfs/.github/no-response.yml b/sys/contrib/openzfs/.github/no-response.yml new file mode 100644 index 000000000000..ef2656ec96ef --- /dev/null +++ b/sys/contrib/openzfs/.github/no-response.yml @@ -0,0 +1,13 @@ +# Configuration for probot-no-response - https://github.com/probot/no-response + +# Number of days of inactivity before an Issue is closed for lack of response +daysUntilClose: 31 +# Label requiring a response +responseRequiredLabel: "Status: Feedback requested" +# Comment to post when closing an Issue for lack of response. Set to `false` to disable +closeComment: > + This issue has been automatically closed because there has been no response + to our request for more information from the original author. With only the + information that is currently in the issue, we don't have enough information + to take action. Please reach out if you have or find the answers we need so + that we can investigate further. 
diff --git a/sys/contrib/openzfs/.github/stale.yml b/sys/contrib/openzfs/.github/stale.yml new file mode 100644 index 000000000000..895cc8e803b2 --- /dev/null +++ b/sys/contrib/openzfs/.github/stale.yml @@ -0,0 +1,26 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 365 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 90 +# Limit to only `issues` or `pulls` +only: issues +# Issues with these labels will never be considered stale +exemptLabels: + - "Type: Feature" + - "Bot: Not Stale" + - "Status: Work in Progress" +# Set to true to ignore issues in a project (defaults to false) +exemptProjects: true +# Set to true to ignore issues in a milestone (defaults to false) +exemptMilestones: true +# Set to true to ignore issues with an assignee (defaults to false) +exemptAssignees: true +# Label to use when marking an issue as stale +staleLabel: "Status: Stale" +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as "stale" because it has not had + any activity for a while. It will be closed in 90 days if no further activity occurs. + Thank you for your contributions. +# Limit the number of actions per hour, from 1-30. 
Default is 30 +limitPerRun: 6 diff --git a/sys/contrib/openzfs/.github/workflows/checkstyle.yaml b/sys/contrib/openzfs/.github/workflows/checkstyle.yaml new file mode 100644 index 000000000000..1707f5bb21db --- /dev/null +++ b/sys/contrib/openzfs/.github/workflows/checkstyle.yaml @@ -0,0 +1,36 @@ +name: checkstyle + +on: + push: + pull_request: + +jobs: + checkstyle: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install --yes -qq build-essential autoconf libtool gawk alien fakeroot linux-headers-$(uname -r) + sudo apt-get install --yes -qq zlib1g-dev uuid-dev libattr1-dev libblkid-dev libselinux-dev libudev-dev libssl-dev python-dev python-setuptools python-cffi python3 python3-dev python3-setuptools python3-cffi + # packages for tests + sudo apt-get install --yes -qq parted lsscsi ksh attr acl nfs-kernel-server fio + sudo apt-get install --yes -qq mandoc cppcheck pax-utils devscripts abigail-tools + sudo -E pip --quiet install flake8 + - name: Prepare + run: | + sh ./autogen.sh + ./configure + make -j$(nproc) + - name: Checkstyle + run: | + make checkstyle + - name: Lint + run: | + make lint + - name: CheckABI + run: | + make checkabi diff --git a/sys/contrib/openzfs/.github/workflows/zfs-tests.yml b/sys/contrib/openzfs/.github/workflows/zfs-tests.yml new file mode 100644 index 000000000000..b075a78c7729 --- /dev/null +++ b/sys/contrib/openzfs/.github/workflows/zfs-tests.yml @@ -0,0 +1,58 @@ +name: zfs-tests-sanity + +on: + push: + pull_request: + +jobs: + tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install --yes -qq build-essential autoconf libtool gdb lcov \ + git alien fakeroot wget curl bc fio acl \ + sysstat mdadm lsscsi parted gdebi attr dbench 
watchdog ksh \ + nfs-kernel-server samba rng-tools xz-utils \ + zlib1g-dev uuid-dev libblkid-dev libselinux-dev \ + xfslibs-dev libattr1-dev libacl1-dev libudev-dev libdevmapper-dev \ + libssl-dev libffi-dev libaio-dev libelf-dev libmount-dev \ + libpam0g-dev pamtester python-dev python-setuptools python-cffi \ + python3 python3-dev python3-setuptools python3-cffi + - name: Autogen.sh + run: | + sh autogen.sh + - name: Configure + run: | + ./configure --enable-debug --enable-debuginfo + - name: Make + run: | + make --no-print-directory -s pkg-utils pkg-kmod + - name: Install + run: | + sudo dpkg -i *.deb + # Update order of directories to search for modules, otherwise + # Ubuntu will load kernel-shipped ones. + sudo sed -i.bak 's/updates/extra updates/' /etc/depmod.d/ubuntu.conf + sudo depmod + sudo modprobe zfs + - name: Tests + run: | + /usr/share/zfs/zfs-tests.sh -v -s 3G -r sanity + - name: Prepare artifacts + if: failure() + run: | + RESULTS_PATH=$(readlink -f /var/tmp/test_results/current) + sudo dmesg > $RESULTS_PATH/dmesg + sudo cp /var/log/syslog $RESULTS_PATH/ + sudo chmod +r $RESULTS_PATH/* + - uses: actions/upload-artifact@v2 + if: failure() + with: + name: Test logs + path: /var/tmp/test_results/20*/ + if-no-files-found: ignore diff --git a/sys/contrib/openzfs/.github/workflows/zloop.yml b/sys/contrib/openzfs/.github/workflows/zloop.yml new file mode 100644 index 000000000000..30785b14507a --- /dev/null +++ b/sys/contrib/openzfs/.github/workflows/zloop.yml @@ -0,0 +1,67 @@ +name: zloop + +on: + push: + pull_request: + +jobs: + tests: + runs-on: ubuntu-latest + env: + TEST_DIR: /var/tmp/zloop + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install --yes -qq build-essential autoconf libtool gdb \ + git alien fakeroot \ + zlib1g-dev uuid-dev libblkid-dev libselinux-dev \ + xfslibs-dev libattr1-dev libacl1-dev libudev-dev 
libdevmapper-dev \ + libssl-dev libffi-dev libaio-dev libelf-dev libmount-dev \ + libpam0g-dev \ + python-dev python-setuptools python-cffi \ + python3 python3-dev python3-setuptools python3-cffi + - name: Autogen.sh + run: | + sh autogen.sh + - name: Configure + run: | + ./configure --enable-debug --enable-debuginfo + - name: Make + run: | + make --no-print-directory -s pkg-utils pkg-kmod + - name: Install + run: | + sudo dpkg -i *.deb + # Update order of directories to search for modules, otherwise + # Ubuntu will load kernel-shipped ones. + sudo sed -i.bak 's/updates/extra updates/' /etc/depmod.d/ubuntu.conf + sudo depmod + sudo modprobe zfs + - name: Tests + run: | + sudo mkdir -p $TEST_DIR + # run for 20 minutes to have a total runner time of 30 minutes + sudo /usr/share/zfs/zloop.sh -t 1200 -l -m1 + - name: Prepare artifacts + if: failure() + run: | + sudo chmod +r -R $TEST_DIR/ + - uses: actions/upload-artifact@v2 + if: failure() + with: + name: Logs + path: | + /var/tmp/zloop/*/ + !/var/tmp/zloop/*/vdev/ + if-no-files-found: ignore + - uses: actions/upload-artifact@v2 + if: failure() + with: + name: Pool files + path: | + /var/tmp/zloop/*/vdev/ + if-no-files-found: ignore diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META index 87ffae5f4c09..886da443357d 100644 --- a/sys/contrib/openzfs/META +++ b/sys/contrib/openzfs/META @@ -2,9 +2,9 @@ Meta: 1 Name: zfs Branch: 1.0 Version: 2.0.0 -Release: rc3 +Release: rc1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 5.9 +Linux-Maximum: 5.10 Linux-Minimum: 3.10 diff --git a/sys/contrib/openzfs/Makefile.am b/sys/contrib/openzfs/Makefile.am index b409d2196f86..436b78d76282 100644 --- a/sys/contrib/openzfs/Makefile.am +++ b/sys/contrib/openzfs/Makefile.am @@ -136,6 +136,13 @@ shellcheck: echo "skipping shellcheck because shellcheck is not installed"; \ fi +PHONY += checkabi storeabi +checkabi: lib + $(MAKE) -C lib checkabi + +storeabi: lib + $(MAKE) -C lib storeabi + PHONY += 
checkbashisms checkbashisms: @if type checkbashisms > /dev/null 2>&1; then \ @@ -152,9 +159,10 @@ checkbashisms: -o -name 'smart' -prune \ -o -name 'paxcheck.sh' -prune \ -o -name 'make_gitrev.sh' -prune \ + -o -name '90zfs' -prune \ -o -type f ! -name 'config*' \ ! -name 'libtool' \ - -exec bash -c 'awk "NR==1 && /\#\!.*bin\/sh.*/ {print FILENAME;}" "{}"' \;); \ + -exec sh -c 'awk "NR==1 && /\#\!.*bin\/sh.*/ {print FILENAME;}" "{}"' \;); \ else \ echo "skipping checkbashisms because checkbashisms is not installed"; \ fi diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am index 88d32b1c538c..d99d1dc382cc 100644 --- a/sys/contrib/openzfs/cmd/Makefile.am +++ b/sys/contrib/openzfs/cmd/Makefile.am @@ -1,5 +1,6 @@ SUBDIRS = zfs zpool zdb zhack zinject zstream zstreamdump ztest SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path +SUBDIRS += zpool_influxdb if USING_PYTHON SUBDIRS += arcstat arc_summary dbufstat diff --git a/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 b/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 index a925d32788ea..75b5697526f7 100755 --- a/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 +++ b/sys/contrib/openzfs/cmd/arc_summary/arc_summary2 @@ -59,14 +59,20 @@ if sys.platform.startswith('freebsd'): # Requires py27-sysctl on FreeBSD import sysctl + def is_value(ctl): + return ctl.type != sysctl.CTLTYPE_NODE + def load_kstats(namespace): """Collect information on a specific subsystem of the ARC""" base = 'kstat.zfs.misc.%s.' 
% namespace - return [(kstat.name, D(kstat.value)) for kstat in sysctl.filter(base)] + fmt = lambda kstat: (kstat.name, D(kstat.value)) + kstats = sysctl.filter(base) + return [fmt(kstat) for kstat in kstats if is_value(kstat)] def load_tunables(): - return dict((ctl.name, ctl.value) for ctl in sysctl.filter('vfs.zfs')) + ctls = sysctl.filter('vfs.zfs') + return dict((ctl.name, ctl.value) for ctl in ctls if is_value(ctl)) elif sys.platform.startswith('linux'): @@ -219,12 +225,30 @@ def get_arc_summary(Kstat): deleted = Kstat["kstat.zfs.misc.arcstats.deleted"] mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"] evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"] + evict_l2_cached = Kstat["kstat.zfs.misc.arcstats.evict_l2_cached"] + evict_l2_eligible = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible"] + evict_l2_eligible_mfu = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible_mfu"] + evict_l2_eligible_mru = Kstat["kstat.zfs.misc.arcstats.evict_l2_eligible_mru"] + evict_l2_ineligible = Kstat["kstat.zfs.misc.arcstats.evict_l2_ineligible"] + evict_l2_skip = Kstat["kstat.zfs.misc.arcstats.evict_l2_skip"] # ARC Misc. 
output["arc_misc"] = {} output["arc_misc"]["deleted"] = fHits(deleted) - output["arc_misc"]['mutex_miss'] = fHits(mutex_miss) - output["arc_misc"]['evict_skips'] = fHits(evict_skip) + output["arc_misc"]["mutex_miss"] = fHits(mutex_miss) + output["arc_misc"]["evict_skips"] = fHits(evict_skip) + output["arc_misc"]["evict_l2_skip"] = fHits(evict_l2_skip) + output["arc_misc"]["evict_l2_cached"] = fBytes(evict_l2_cached) + output["arc_misc"]["evict_l2_eligible"] = fBytes(evict_l2_eligible) + output["arc_misc"]["evict_l2_eligible_mfu"] = { + 'per': fPerc(evict_l2_eligible_mfu, evict_l2_eligible), + 'num': fBytes(evict_l2_eligible_mfu), + } + output["arc_misc"]["evict_l2_eligible_mru"] = { + 'per': fPerc(evict_l2_eligible_mru, evict_l2_eligible), + 'num': fBytes(evict_l2_eligible_mru), + } + output["arc_misc"]["evict_l2_ineligible"] = fBytes(evict_l2_ineligible) # ARC Sizing arc_size = Kstat["kstat.zfs.misc.arcstats.size"] @@ -340,8 +364,26 @@ def _arc_summary(Kstat): sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted']) sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" % arc['arc_misc']['mutex_miss']) - sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" % + sys.stdout.write("\tEviction Skips:\t\t\t\t%s\n" % arc['arc_misc']['evict_skips']) + sys.stdout.write("\tEviction Skips Due to L2 Writes:\t%s\n" % + arc['arc_misc']['evict_l2_skip']) + sys.stdout.write("\tL2 Cached Evictions:\t\t\t%s\n" % + arc['arc_misc']['evict_l2_cached']) + sys.stdout.write("\tL2 Eligible Evictions:\t\t\t%s\n" % + arc['arc_misc']['evict_l2_eligible']) + sys.stdout.write("\tL2 Eligible MFU Evictions:\t%s\t%s\n" % ( + arc['arc_misc']['evict_l2_eligible_mfu']['per'], + arc['arc_misc']['evict_l2_eligible_mfu']['num'], + ) + ) + sys.stdout.write("\tL2 Eligible MRU Evictions:\t%s\t%s\n" % ( + arc['arc_misc']['evict_l2_eligible_mru']['per'], + arc['arc_misc']['evict_l2_eligible_mru']['num'], + ) + ) + sys.stdout.write("\tL2 Ineligible Evictions:\t\t%s\n" % + 
arc['arc_misc']['evict_l2_ineligible']) sys.stdout.write("\n") # ARC Sizing @@ -677,6 +719,11 @@ def get_l2arc_summary(Kstat): l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"] l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"] l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"] + l2_mfu_asize = Kstat["kstat.zfs.misc.arcstats.l2_mfu_asize"] + l2_mru_asize = Kstat["kstat.zfs.misc.arcstats.l2_mru_asize"] + l2_prefetch_asize = Kstat["kstat.zfs.misc.arcstats.l2_prefetch_asize"] + l2_bufc_data_asize = Kstat["kstat.zfs.misc.arcstats.l2_bufc_data_asize"] + l2_bufc_metadata_asize = Kstat["kstat.zfs.misc.arcstats.l2_bufc_metadata_asize"] l2_access_total = (l2_hits + l2_misses) output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error) @@ -699,7 +746,7 @@ def get_l2arc_summary(Kstat): output["io_errors"] = fHits(l2_io_error) output["l2_arc_size"] = {} - output["l2_arc_size"]["adative"] = fBytes(l2_size) + output["l2_arc_size"]["adaptive"] = fBytes(l2_size) output["l2_arc_size"]["actual"] = { 'per': fPerc(l2_asize, l2_size), 'num': fBytes(l2_asize) @@ -708,6 +755,26 @@ def get_l2arc_summary(Kstat): 'per': fPerc(l2_hdr_size, l2_size), 'num': fBytes(l2_hdr_size), } + output["l2_arc_size"]["mfu_asize"] = { + 'per': fPerc(l2_mfu_asize, l2_asize), + 'num': fBytes(l2_mfu_asize), + } + output["l2_arc_size"]["mru_asize"] = { + 'per': fPerc(l2_mru_asize, l2_asize), + 'num': fBytes(l2_mru_asize), + } + output["l2_arc_size"]["prefetch_asize"] = { + 'per': fPerc(l2_prefetch_asize, l2_asize), + 'num': fBytes(l2_prefetch_asize), + } + output["l2_arc_size"]["bufc_data_asize"] = { + 'per': fPerc(l2_bufc_data_asize, l2_asize), + 'num': fBytes(l2_bufc_data_asize), + } + output["l2_arc_size"]["bufc_metadata_asize"] = { + 'per': fPerc(l2_bufc_metadata_asize, l2_asize), + 'num': fBytes(l2_bufc_metadata_asize), + } output["l2_arc_evicts"] = {} output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry) @@ -772,7 +839,7 @@ def 
_l2arc_summary(Kstat): sys.stdout.write("\n") sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" % - arc["l2_arc_size"]["adative"]) + arc["l2_arc_size"]["adaptive"]) sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % ( arc["l2_arc_size"]["actual"]["per"], arc["l2_arc_size"]["actual"]["num"], @@ -783,11 +850,36 @@ def _l2arc_summary(Kstat): arc["l2_arc_size"]["head_size"]["num"], ) ) + sys.stdout.write("\tMFU Alloc. Size:\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["mfu_asize"]["per"], + arc["l2_arc_size"]["mfu_asize"]["num"], + ) + ) + sys.stdout.write("\tMRU Alloc. Size:\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["mru_asize"]["per"], + arc["l2_arc_size"]["mru_asize"]["num"], + ) + ) + sys.stdout.write("\tPrefetch Alloc. Size:\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["prefetch_asize"]["per"], + arc["l2_arc_size"]["prefetch_asize"]["num"], + ) + ) + sys.stdout.write("\tData (buf content) Alloc. Size:\t%s\t%s\n" % ( + arc["l2_arc_size"]["bufc_data_asize"]["per"], + arc["l2_arc_size"]["bufc_data_asize"]["num"], + ) + ) + sys.stdout.write("\tMetadata (buf content) Size:\t%s\t%s\n" % ( + arc["l2_arc_size"]["bufc_metadata_asize"]["per"], + arc["l2_arc_size"]["bufc_metadata_asize"]["num"], + ) + ) sys.stdout.write("\n") if arc["l2_arc_evicts"]['lock_retries'] != '0' or \ arc["l2_arc_evicts"]["reading"] != '0': - sys.stdout.write("L2 ARC Evicts:\n") + sys.stdout.write("L2 ARC Evictions:\n") sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" % arc["l2_arc_evicts"]['lock_retries']) sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" % diff --git a/sys/contrib/openzfs/cmd/arc_summary/arc_summary3 b/sys/contrib/openzfs/cmd/arc_summary/arc_summary3 index 83cbf0f1728d..96f7990e1726 100755 --- a/sys/contrib/openzfs/cmd/arc_summary/arc_summary3 +++ b/sys/contrib/openzfs/cmd/arc_summary/arc_summary3 @@ -58,7 +58,6 @@ SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats 'vdev': 'vdev_cache_stats', - 'xuio': 'xuio_stats', 'zfetch': 'zfetchstats', 
'zil': 'zil'} @@ -86,16 +85,24 @@ if sys.platform.startswith('freebsd'): VDEV_CACHE_SIZE = 'vdev.cache_size' + def is_value(ctl): + return ctl.type != sysctl.CTLTYPE_NODE + + def namefmt(ctl, base='vfs.zfs.'): + # base is removed from the name + cut = len(base) + return ctl.name[cut:] + def load_kstats(section): base = 'kstat.zfs.misc.{section}.'.format(section=section) - # base is removed from the name - fmt = lambda kstat: '{name} : {value}'.format(name=kstat.name[len(base):], + fmt = lambda kstat: '{name} : {value}'.format(name=namefmt(kstat, base), value=kstat.value) - return [fmt(kstat) for kstat in sysctl.filter(base)] + kstats = sysctl.filter(base) + return [fmt(kstat) for kstat in kstats if is_value(kstat)] def get_params(base): - cut = 8 # = len('vfs.zfs.') - return {ctl.name[cut:]: str(ctl.value) for ctl in sysctl.filter(base)} + ctls = sysctl.filter(base) + return {namefmt(ctl): str(ctl.value) for ctl in ctls if is_value(ctl)} def get_tunable_params(): return get_params('vfs.zfs') @@ -112,25 +119,8 @@ if sys.platform.startswith('freebsd'): return '{} version {}'.format(name, version) def get_descriptions(_request): - # py-sysctl doesn't give descriptions, so we have to shell out. - command = ['sysctl', '-d', 'vfs.zfs'] - - # The recommended way to do this is with subprocess.run(). 
However, - # some installed versions of Python are < 3.5, so we offer them - # the option of doing it the old way (for now) - if 'run' in dir(subprocess): - info = subprocess.run(command, stdout=subprocess.PIPE, - universal_newlines=True) - lines = info.stdout.split('\n') - else: - info = subprocess.check_output(command, universal_newlines=True) - lines = info.split('\n') - - def fmt(line): - name, desc = line.split(':', 1) - return (name.strip(), desc.strip()) - - return dict([fmt(line) for line in lines if len(line) > 0]) + ctls = sysctl.filter('vfs.zfs') + return {namefmt(ctl): ctl.description for ctl in ctls if is_value(ctl)} elif sys.platform.startswith('linux'): @@ -397,8 +387,12 @@ def format_raw_line(name, value): if ARGS.alt: result = '{0}{1}={2}'.format(INDENT, name, value) else: - spc = LINE_LENGTH-(len(INDENT)+len(value)) - result = '{0}{1:<{spc}}{2}'.format(INDENT, name, value, spc=spc) + # Right-align the value within the line length if it fits, + # otherwise just separate it from the name by a single space. 
+ fit = LINE_LENGTH - len(INDENT) - len(name) + overflow = len(value) + 1 + w = max(fit, overflow) + result = '{0}{1}{2:>{w}}'.format(INDENT, name, value, w=w) return result @@ -598,6 +592,20 @@ def section_arc(kstats_dict): prt_i1('Deleted:', f_hits(arc_stats['deleted'])) prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss'])) prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip'])) + prt_i1('Eviction skips due to L2 writes:', + f_hits(arc_stats['evict_l2_skip'])) + prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached'])) + prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible'])) + prt_i2('L2 eligible MFU evictions:', + f_perc(arc_stats['evict_l2_eligible_mfu'], + arc_stats['evict_l2_eligible']), + f_bytes(arc_stats['evict_l2_eligible_mfu'])) + prt_i2('L2 eligible MRU evictions:', + f_perc(arc_stats['evict_l2_eligible_mru'], + arc_stats['evict_l2_eligible']), + f_bytes(arc_stats['evict_l2_eligible_mru'])) + prt_i1('L2 ineligible evictions:', + f_bytes(arc_stats['evict_l2_ineligible'])) print() @@ -736,6 +744,21 @@ def section_l2arc(kstats_dict): prt_i2('Header size:', f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), f_bytes(arc_stats['l2_hdr_size'])) + prt_i2('MFU allocated size:', + f_perc(arc_stats['l2_mfu_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_mfu_asize'])) + prt_i2('MRU allocated size:', + f_perc(arc_stats['l2_mru_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_mru_asize'])) + prt_i2('Prefetch allocated size:', + f_perc(arc_stats['l2_prefetch_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_prefetch_asize'])) + prt_i2('Data (buffer content) allocated size:', + f_perc(arc_stats['l2_bufc_data_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_bufc_data_asize'])) + prt_i2('Metadata (buffer content) allocated size:', + f_perc(arc_stats['l2_bufc_metadata_asize'], arc_stats['l2_asize']), + f_bytes(arc_stats['l2_bufc_metadata_asize'])) print() prt_1('L2ARC breakdown:', 
f_hits(l2_access_total)) diff --git a/sys/contrib/openzfs/cmd/arcstat/arcstat.in b/sys/contrib/openzfs/cmd/arcstat/arcstat.in index c83a1c74599e..9e7c52a6c7a3 100755 --- a/sys/contrib/openzfs/cmd/arcstat/arcstat.in +++ b/sys/contrib/openzfs/cmd/arcstat/arcstat.in @@ -88,6 +88,12 @@ cols = { "mfug": [4, 1000, "MFU ghost list hits per second"], "mrug": [4, 1000, "MRU ghost list hits per second"], "eskip": [5, 1000, "evict_skip per second"], + "el2skip": [7, 1000, "evict skip, due to l2 writes, per second"], + "el2cach": [7, 1024, "Size of L2 cached evictions per second"], + "el2el": [5, 1024, "Size of L2 eligible evictions per second"], + "el2mfu": [6, 1024, "Size of L2 eligible MFU evictions per second"], + "el2mru": [6, 1024, "Size of L2 eligible MRU evictions per second"], + "el2inel": [7, 1024, "Size of L2 ineligible evictions per second"], "mtxmis": [6, 1000, "mutex_miss per second"], "dread": [5, 1000, "Demand accesses per second"], "pread": [5, 1000, "Prefetch accesses per second"], @@ -96,6 +102,16 @@ cols = { "l2read": [6, 1000, "Total L2ARC accesses per second"], "l2hit%": [6, 100, "L2ARC access hit percentage"], "l2miss%": [7, 100, "L2ARC access miss percentage"], + "l2pref": [6, 1024, "L2ARC prefetch allocated size"], + "l2mfu": [5, 1024, "L2ARC MFU allocated size"], + "l2mru": [5, 1024, "L2ARC MRU allocated size"], + "l2data": [6, 1024, "L2ARC data allocated size"], + "l2meta": [6, 1024, "L2ARC metadata allocated size"], + "l2pref%": [7, 100, "L2ARC prefetch percentage"], + "l2mfu%": [6, 100, "L2ARC MFU percentage"], + "l2mru%": [6, 100, "L2ARC MRU percentage"], + "l2data%": [7, 100, "L2ARC data percentage"], + "l2meta%": [7, 100, "L2ARC metadata percentage"], "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], "l2size": [6, 1024, "Size of the L2ARC"], "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"], @@ -118,22 +134,24 @@ opfile = None sep = " " # Default separator is 2 spaces version = "0.4" l2exist = False -cmd = ("Usage: 
arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " +cmd = ("Usage: arcstat [-havxp] [-f fields] [-o file] [-s string] [interval " "[count]]\n") cur = {} d = {} out = None kstat = None +pretty_print = True if sys.platform.startswith('freebsd'): - # Requires py27-sysctl on FreeBSD + # Requires py-sysctl on FreeBSD import sysctl def kstat_update(): global kstat - k = sysctl.filter('kstat.zfs.misc.arcstats') + k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats') + if ctl.type != sysctl.CTLTYPE_NODE] if not k: sys.exit(1) @@ -181,6 +199,7 @@ def detailed_usage(): def usage(): sys.stderr.write("%s\n" % cmd) sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -a : Print all possible stats\n") sys.stderr.write("\t -v : List all possible field headers and definitions" "\n") sys.stderr.write("\t -x : Print extended stats\n") @@ -188,6 +207,7 @@ def usage(): sys.stderr.write("\t -o : Redirect output to the specified file\n") sys.stderr.write("\t -s : Override default field separator with custom " "character or string\n") + sys.stderr.write("\t -p : Disable auto-scaling of numerical fields\n") sys.stderr.write("\nExamples:\n") sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") @@ -246,10 +266,14 @@ def print_values(): global hdr global sep global v + global pretty_print - sys.stdout.write(sep.join( - prettynum(cols[col][0], cols[col][1], v[col]) for col in hdr)) + if pretty_print: + fmt = lambda col: prettynum(cols[col][0], cols[col][1], v[col]) + else: + fmt = lambda col: str(v[col]) + sys.stdout.write(sep.join(fmt(col) for col in hdr)) sys.stdout.write("\n") sys.stdout.flush() @@ -257,9 +281,14 @@ def print_values(): def print_header(): global hdr global sep + global pretty_print - sys.stdout.write(sep.join("%*s" % (cols[col][0], col) for col in hdr)) + if pretty_print: + fmt = lambda col: "%*s" % (cols[col][0], col) + else: + fmt = lambda col: col + 
sys.stdout.write(sep.join(fmt(col) for col in hdr)) sys.stdout.write("\n") @@ -296,8 +325,10 @@ def init(): global sep global out global l2exist + global pretty_print desired_cols = None + aflag = False xflag = False hflag = False vflag = False @@ -306,14 +337,16 @@ def init(): try: opts, args = getopt.getopt( sys.argv[1:], - "xo:hvs:f:", + "axo:hvs:f:p", [ + "all", "extended", "outfile", "help", "verbose", "separator", - "columns" + "columns", + "parsable" ] ) except getopt.error as msg: @@ -322,6 +355,8 @@ def init(): opts = None for opt, arg in opts: + if opt in ('-a', '--all'): + aflag = True if opt in ('-x', '--extended'): xflag = True if opt in ('-o', '--outfile'): @@ -337,6 +372,8 @@ def init(): if opt in ('-f', '--columns'): desired_cols = arg i += 1 + if opt in ('-p', '--parsable'): + pretty_print = False i += 1 argv = sys.argv[i:] @@ -381,6 +418,12 @@ def init(): incompat) usage() + if aflag: + if l2exist: + hdr = cols.keys() + else: + hdr = [col for col in cols.keys() if not col.startswith("l2")] + if opfile: try: out = open(opfile, "w") @@ -436,6 +479,12 @@ def calculate(): v["mrug"] = d["mru_ghost_hits"] / sint v["mfug"] = d["mfu_ghost_hits"] / sint v["eskip"] = d["evict_skip"] / sint + v["el2skip"] = d["evict_l2_skip"] / sint + v["el2cach"] = d["evict_l2_cached"] / sint + v["el2el"] = d["evict_l2_eligible"] / sint + v["el2mfu"] = d["evict_l2_eligible_mfu"] / sint + v["el2mru"] = d["evict_l2_eligible_mru"] / sint + v["el2inel"] = d["evict_l2_ineligible"] / sint v["mtxmis"] = d["mutex_miss"] / sint if l2exist: @@ -449,6 +498,17 @@ def calculate(): v["l2size"] = cur["l2_size"] v["l2bytes"] = d["l2_read_bytes"] / sint + v["l2pref"] = cur["l2_prefetch_asize"] + v["l2mfu"] = cur["l2_mfu_asize"] + v["l2mru"] = cur["l2_mru_asize"] + v["l2data"] = cur["l2_bufc_data_asize"] + v["l2meta"] = cur["l2_bufc_metadata_asize"] + v["l2pref%"] = 100 * v["l2pref"] / v["l2asize"] + v["l2mfu%"] = 100 * v["l2mfu"] / v["l2asize"] + v["l2mru%"] = 100 * v["l2mru"] / 
v["l2asize"] + v["l2data%"] = 100 * v["l2data"] / v["l2asize"] + v["l2meta%"] = 100 * v["l2meta"] / v["l2asize"] + v["grow"] = 0 if cur["arc_no_grow"] else 1 v["need"] = cur["arc_need_free"] v["free"] = cur["memory_free_bytes"] diff --git a/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in b/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in index 1d4eb39d7242..82250353f5eb 100755 --- a/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in +++ b/sys/contrib/openzfs/cmd/dbufstat/dbufstat.in @@ -131,7 +131,7 @@ elif sys.platform.startswith("linux"): def print_incompat_helper(incompat): cnt = 0 for key in sorted(incompat): - if cnt is 0: + if cnt == 0: sys.stderr.write("\t") elif cnt > 8: sys.stderr.write(",\n\t") @@ -662,7 +662,7 @@ def main(): if not ifile: ifile = default_ifile() - if ifile is not "-": + if ifile != "-": try: tmp = open(ifile, "r") sys.stdin = tmp diff --git a/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c b/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c index ed9f167ccac8..ca39d228479e 100644 --- a/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c +++ b/sys/contrib/openzfs/cmd/mount_zfs/mount_zfs.c @@ -43,67 +43,30 @@ libzfs_handle_t *g_zfs; /* - * Return the pool/dataset to mount given the name passed to mount. This - * is expected to be of the form pool/dataset, however may also refer to - * a block device if that device contains a valid zfs label. + * Opportunistically convert a target string into a pool name. If the + * string does not represent a block device with a valid zfs label + * then it is passed through without modification. */ -static char * -parse_dataset(char *dataset) +static void +parse_dataset(const char *target, char **dataset) { - char cwd[PATH_MAX]; - struct stat64 statbuf; - int error; - int len; + /* Assume pool/dataset is more likely */ + strlcpy(*dataset, target, PATH_MAX); - /* - * We expect a pool/dataset to be provided, however if we're - * given a device which is a member of a zpool we attempt to - * extract the pool name stored in the label. 
Given the pool - * name we can mount the root dataset. - */ - error = stat64(dataset, &statbuf); - if (error == 0) { - nvlist_t *config; - char *name; - int fd; + int fd = open(target, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return; - fd = open(dataset, O_RDONLY); - if (fd < 0) - goto out; - - error = zpool_read_label(fd, &config, NULL); - (void) close(fd); - if (error) - goto out; - - error = nvlist_lookup_string(config, - ZPOOL_CONFIG_POOL_NAME, &name); - if (error) { - nvlist_free(config); - } else { - dataset = strdup(name); - nvlist_free(config); - return (dataset); - } + nvlist_t *cfg = NULL; + if (zpool_read_label(fd, &cfg, NULL) == 0) { + char *nm = NULL; + if (!nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &nm)) + strlcpy(*dataset, nm, PATH_MAX); + nvlist_free(cfg); } -out: - /* - * If a file or directory in your current working directory is - * named 'dataset' then mount(8) will prepend your current working - * directory to the dataset. There is no way to prevent this - * behavior so we simply check for it and strip the prepended - * patch when it is added. 
- */ - if (getcwd(cwd, PATH_MAX) == NULL) - return (dataset); - len = strlen(cwd); - - /* Do not add one when cwd already ends in a trailing '/' */ - if (strncmp(cwd, dataset, len) == 0) - return (dataset + len + (cwd[len-1] != '/')); - - return (dataset); + if (close(fd)) + perror("close"); } /* @@ -147,8 +110,8 @@ mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts) if (!fp) { (void) fprintf(stderr, gettext( "filesystem '%s' was mounted, but /etc/mtab " - "could not be opened due to error %d\n"), - dataset, errno); + "could not be opened due to error: %s\n"), + dataset, strerror(errno)); return (MOUNT_FILEIO); } @@ -156,8 +119,8 @@ mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts) if (error) { (void) fprintf(stderr, gettext( "filesystem '%s' was mounted, but /etc/mtab " - "could not be updated due to error %d\n"), - dataset, errno); + "could not be updated due to error: %s\n"), + dataset, strerror(errno)); return (MOUNT_FILEIO); } @@ -176,7 +139,7 @@ main(int argc, char **argv) char badopt[MNT_LINE_MAX] = { '\0' }; char mtabopt[MNT_LINE_MAX] = { '\0' }; char mntpoint[PATH_MAX]; - char *dataset; + char dataset[PATH_MAX], *pdataset = dataset; unsigned long mntflags = 0, zfsflags = 0, remount = 0; int sloppy = 0, fake = 0, verbose = 0, nomtab = 0, zfsutil = 0; int error, c; @@ -232,13 +195,13 @@ main(int argc, char **argv) return (MOUNT_USAGE); } - dataset = parse_dataset(argv[0]); + parse_dataset(argv[0], &pdataset); /* canonicalize the mount point */ if (realpath(argv[1], mntpoint) == NULL) { (void) fprintf(stderr, gettext("filesystem '%s' cannot be " - "mounted at '%s' due to canonicalization error %d.\n"), - dataset, argv[1], errno); + "mounted at '%s' due to canonicalization error: %s\n"), + dataset, argv[1], strerror(errno)); return (MOUNT_SYSERR); } diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c index 8a2cec4ca685..a3446c52c416 100644 --- 
a/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_bench.c @@ -83,8 +83,17 @@ run_gen_bench_impl(const char *impl) /* create suitable raidz_map */ ncols = rto_opts.rto_dcols + fn + 1; zio_bench.io_size = 1ULL << ds; - rm_bench = vdev_raidz_map_alloc(&zio_bench, - BENCH_ASHIFT, ncols, fn+1); + + if (rto_opts.rto_expand) { + rm_bench = vdev_raidz_map_alloc_expanded( + zio_bench.io_abd, + zio_bench.io_size, zio_bench.io_offset, + rto_opts.rto_ashift, ncols+1, ncols, + fn+1, rto_opts.rto_expand_offset); + } else { + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, fn+1); + } /* estimate iteration count */ iter_cnt = GEN_BENCH_MEMORY; @@ -163,8 +172,16 @@ run_rec_bench_impl(const char *impl) (1ULL << BENCH_ASHIFT)) continue; - rm_bench = vdev_raidz_map_alloc(&zio_bench, - BENCH_ASHIFT, ncols, PARITY_PQR); + if (rto_opts.rto_expand) { + rm_bench = vdev_raidz_map_alloc_expanded( + zio_bench.io_abd, + zio_bench.io_size, zio_bench.io_offset, + BENCH_ASHIFT, ncols+1, ncols, + PARITY_PQR, rto_opts.rto_expand_offset); + } else { + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, PARITY_PQR); + } /* estimate iteration count */ iter_cnt = (REC_BENCH_MEMORY); diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c index 66f36b0d56ca..4e2639f3676d 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.c @@ -77,16 +77,20 @@ static void print_opts(raidz_test_opts_t *opts, boolean_t force) (void) fprintf(stdout, DBLSEP "Running with options:\n" " (-a) zio ashift : %zu\n" " (-o) zio offset : 1 << %zu\n" + " (-e) expanded map : %s\n" + " (-r) reflow offset : %llx\n" " (-d) number of raidz data columns : %zu\n" " (-s) size of DATA : 1 << %zu\n" " (-S) sweep parameters : %s \n" " (-v) verbose : %s \n\n", - opts->rto_ashift, /* -a */ - ilog2(opts->rto_offset), /* -o */ - opts->rto_dcols, /* -d */ 
- ilog2(opts->rto_dsize), /* -s */ - opts->rto_sweep ? "yes" : "no", /* -S */ - verbose); /* -v */ + opts->rto_ashift, /* -a */ + ilog2(opts->rto_offset), /* -o */ + opts->rto_expand ? "yes" : "no", /* -e */ + (u_longlong_t)opts->rto_expand_offset, /* -r */ + opts->rto_dcols, /* -d */ + ilog2(opts->rto_dsize), /* -s */ + opts->rto_sweep ? "yes" : "no", /* -S */ + verbose); /* -v */ } } @@ -104,6 +108,8 @@ static void usage(boolean_t requested) "\t[-S parameter sweep (default: %s)]\n" "\t[-t timeout for parameter sweep test]\n" "\t[-B benchmark all raidz implementations]\n" + "\t[-e use expanded raidz map (default: %s)]\n" + "\t[-r expanded raidz map reflow offset (default: %llx)]\n" "\t[-v increase verbosity (default: %zu)]\n" "\t[-h (print help)]\n" "\t[-T test the test, see if failure would be detected]\n" @@ -114,6 +120,8 @@ static void usage(boolean_t requested) o->rto_dcols, /* -d */ ilog2(o->rto_dsize), /* -s */ rto_opts.rto_sweep ? "yes" : "no", /* -S */ + rto_opts.rto_expand ? "yes" : "no", /* -e */ + (u_longlong_t)o->rto_expand_offset, /* -r */ o->rto_v); /* -d */ exit(requested ? 
0 : 1); @@ -128,7 +136,7 @@ static void process_options(int argc, char **argv) bcopy(&rto_opts_defaults, o, sizeof (*o)); - while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) { + while ((opt = getopt(argc, argv, "TDBSvha:er:o:d:s:t:")) != -1) { value = 0; switch (opt) { @@ -136,6 +144,12 @@ static void process_options(int argc, char **argv) value = strtoull(optarg, NULL, 0); o->rto_ashift = MIN(13, MAX(9, value)); break; + case 'e': + o->rto_expand = 1; + break; + case 'r': + o->rto_expand_offset = strtoull(optarg, NULL, 0); + break; case 'o': value = strtoull(optarg, NULL, 0); o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; @@ -179,25 +193,34 @@ static void process_options(int argc, char **argv) } } -#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) -#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) +#define DATA_COL(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_abd) +#define DATA_COL_SIZE(rr, i) ((rr)->rr_col[rr->rr_firstdatacol + (i)].rc_size) -#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) -#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) +#define CODE_COL(rr, i) ((rr)->rr_col[(i)].rc_abd) +#define CODE_COL_SIZE(rr, i) ((rr)->rr_col[(i)].rc_size) static int cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) { - int i, ret = 0; + int r, i, ret = 0; VERIFY(parity >= 1 && parity <= 3); - for (i = 0; i < parity; i++) { - if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) - != 0) { - ret++; - LOG_OPT(D_DEBUG, opts, - "\nParity block [%d] different!\n", i); + for (r = 0; r < rm->rm_nrows; r++) { + raidz_row_t * const rr = rm->rm_row[r]; + raidz_row_t * const rrg = opts->rm_golden->rm_row[r]; + for (i = 0; i < parity; i++) { + if (CODE_COL_SIZE(rrg, i) == 0) { + VERIFY0(CODE_COL_SIZE(rr, i)); + continue; + } + + if (abd_cmp(CODE_COL(rr, i), + CODE_COL(rrg, i)) != 0) { + ret++; + LOG_OPT(D_DEBUG, opts, + "\nParity block [%d] different!\n", i); + } } } return 
(ret); @@ -206,16 +229,26 @@ cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) static int cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) { - int i, ret = 0; - int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); + int r, i, dcols, ret = 0; - for (i = 0; i < dcols; i++) { - if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) - != 0) { - ret++; + for (r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + raidz_row_t *rrg = opts->rm_golden->rm_row[r]; + dcols = opts->rm_golden->rm_row[0]->rr_cols - + raidz_parity(opts->rm_golden); + for (i = 0; i < dcols; i++) { + if (DATA_COL_SIZE(rrg, i) == 0) { + VERIFY0(DATA_COL_SIZE(rr, i)); + continue; + } - LOG_OPT(D_DEBUG, opts, - "\nData block [%d] different!\n", i); + if (abd_cmp(DATA_COL(rrg, i), + DATA_COL(rr, i)) != 0) { + ret++; + + LOG_OPT(D_DEBUG, opts, + "\nData block [%d] different!\n", i); + } } } return (ret); @@ -236,12 +269,13 @@ init_rand(void *data, size_t size, void *private) static void corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) { - int i; - raidz_col_t *col; - - for (i = 0; i < cnt; i++) { - col = &rm->rm_col[tgts[i]]; - abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + for (int i = 0; i < cnt; i++) { + raidz_col_t *col = &rr->rr_col[tgts[i]]; + abd_iterate_func(col->rc_abd, 0, col->rc_size, + init_rand, NULL); + } } } @@ -288,10 +322,22 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) VERIFY0(vdev_raidz_impl_set("original")); - opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, - opts->rto_ashift, total_ncols, parity); - rm_test = vdev_raidz_map_alloc(zio_test, - opts->rto_ashift, total_ncols, parity); + if (opts->rto_expand) { + opts->rm_golden = + vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd, + opts->zio_golden->io_size, opts->zio_golden->io_offset, + opts->rto_ashift, total_ncols+1, 
total_ncols, + parity, opts->rto_expand_offset); + rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd, + zio_test->io_size, zio_test->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + } else { + opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, + opts->rto_ashift, total_ncols, parity); + rm_test = vdev_raidz_map_alloc(zio_test, + opts->rto_ashift, total_ncols, parity); + } VERIFY(opts->zio_golden); VERIFY(opts->rm_golden); @@ -312,6 +358,188 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) return (err); } +/* + * If reflow is not in progress, reflow_offset should be UINT64_MAX. + * For each row, if the row is entirely before reflow_offset, it will + * come from the new location. Otherwise this row will come from the + * old location. Therefore, rows that straddle the reflow_offset will + * come from the old location. + * + * NOTE: Until raidz expansion is implemented this function is only + * needed by raidz_test.c to the multi-row raid_map_t functionality. + */ +raidz_map_t * +vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset, + uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols, + uint64_t nparity, uint64_t reflow_offset) +{ + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = size >> ashift; + uint64_t q, r, bc, devidx, asize = 0, tot; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + * AKA "full rows" + */ + q = s / (logical_cols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (logical_cols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. 
+ */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* How many rows contain data (not skip) */ + uint64_t rows = howmany(tot, logical_cols); + int cols = MIN(tot, logical_cols); + + raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]), + KM_SLEEP); + rm->rm_nrows = rows; + + for (uint64_t row = 0; row < rows; row++) { + raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t, + rr_col[cols]), KM_SLEEP); + rm->rm_row[row] = rr; + + /* The starting RAIDZ (parent) vdev sector of the row. */ + uint64_t b = (offset >> ashift) + row * logical_cols; + + /* + * If we are in the middle of a reflow, and any part of this + * row has not been copied, then use the old location of + * this row. + */ + int row_phys_cols = physical_cols; + if (b + (logical_cols - nparity) > reflow_offset >> ashift) + row_phys_cols--; + + /* starting child of this row */ + uint64_t child_id = b % row_phys_cols; + /* The starting byte offset on each child vdev. */ + uint64_t child_offset = (b / row_phys_cols) << ashift; + + /* + * We set cols to the entire width of the block, even + * if this row is shorter. This is needed because parity + * generation (for Q and R) needs to know the entire width, + * because it treats the short row as though it was + * full-width (and the "phantom" sectors were zero-filled). + * + * Another approach to this would be to set cols shorter + * (to just the number of columns that we might do i/o to) + * and have another mechanism to tell the parity generation + * about the "entire width". Reconstruction (at least + * vdev_raidz_reconstruct_general()) would also need to + * know about the "entire width". 
+ */ + rr->rr_cols = cols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; + + for (int c = 0; c < rr->rr_cols; c++, child_id++) { + if (child_id >= row_phys_cols) { + child_id -= row_phys_cols; + child_offset += 1ULL << ashift; + } + rr->rr_col[c].rc_devidx = child_id; + rr->rr_col[c].rc_offset = child_offset; + rr->rr_col[c].rc_gdata = NULL; + rr->rr_col[c].rc_orig_data = NULL; + rr->rr_col[c].rc_error = 0; + rr->rr_col[c].rc_tried = 0; + rr->rr_col[c].rc_skipped = 0; + rr->rr_col[c].rc_need_orig_restore = B_FALSE; + + uint64_t dc = c - rr->rr_firstdatacol; + if (c < rr->rr_firstdatacol) { + rr->rr_col[c].rc_size = 1ULL << ashift; + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, + B_TRUE); + } else if (row == rows - 1 && bc != 0 && c >= bc) { + /* + * Past the end, this for parity generation. + */ + rr->rr_col[c].rc_size = 0; + rr->rr_col[c].rc_abd = NULL; + } else { + /* + * "data column" (col excluding parity) + * Add an ASCII art diagram here + */ + uint64_t off; + + if (c < bc || r == 0) { + off = dc * rows + row; + } else { + off = r * rows + + (dc - r) * (rows - 1) + row; + } + rr->rr_col[c].rc_size = 1ULL << ashift; + rr->rr_col[c].rc_abd = + abd_get_offset(abd, off << ashift); + } + + asize += rr->rr_col[c].rc_size; + } + /* + * If all data stored spans all columns, there's a danger that + * parity will always be on the same device and, since parity + * isn't read during normal operation, that that device's I/O + * bandwidth won't be used effectively. We therefore switch + * the parity every 1MB. + * + * ...at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices + * evenly, we won't see any benefit. 
Further, occasional writes + * that aren't a multiple of the LCM of the number of children + * and the minimum stripe width are sufficient to avoid pessimal + * behavior. Unfortunately, this decision created an implicit + * on-disk format requirement that we need to support for all + * eternity, but only for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for + * padding we must make sure to note this swap. We will never + * intend to skip the first column since at least one data and + * one parity column must appear in each row. + */ + if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 && + (offset & (1ULL << 20))) { + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); + devidx = rr->rr_col[0].rc_devidx; + uint64_t o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; + } + + } + ASSERT3U(asize, ==, tot << ashift); + + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + static raidz_map_t * init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) { @@ -330,8 +558,15 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) (*zio)->io_abd = raidz_alloc(alloc_dsize); init_zio_abd(*zio); - rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, - total_ncols, parity); + if (opts->rto_expand) { + rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd, + (*zio)->io_size, (*zio)->io_offset, + opts->rto_ashift, total_ncols+1, total_ncols, + parity, opts->rto_expand_offset); + } else { + rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, + total_ncols, parity); + } VERIFY(rm); /* Make sure code columns are destroyed */ @@ -420,7 +655,7 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) if (fn < RAIDZ_REC_PQ) { /* can reconstruct 1 failed data disk */ for (x0 = 0; x0 < 
opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; /* Check if should stop */ @@ -445,10 +680,11 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) } else if (fn < RAIDZ_REC_PQR) { /* can reconstruct 2 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { - if (x1 >= rm->rm_cols - raidz_parity(rm)) + if (x1 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; /* Check if should stop */ @@ -475,14 +711,15 @@ run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) } else { /* can reconstruct 3 failed data disk */ for (x0 = 0; x0 < opts->rto_dcols; x0++) { - if (x0 >= rm->rm_cols - raidz_parity(rm)) + if (x0 >= rm->rm_row[0]->rr_cols - raidz_parity(rm)) continue; for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { - if (x1 >= rm->rm_cols - raidz_parity(rm)) + if (x1 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { - if (x2 >= - rm->rm_cols - raidz_parity(rm)) + if (x2 >= rm->rm_row[0]->rr_cols - + raidz_parity(rm)) continue; /* Check if should stop */ @@ -700,6 +937,8 @@ run_sweep(void) opts->rto_dcols = dcols_v[d]; opts->rto_offset = (1 << ashift_v[a]) * rand(); opts->rto_dsize = size_v[s]; + opts->rto_expand = rto_opts.rto_expand; + opts->rto_expand_offset = rto_opts.rto_expand_offset; opts->rto_v = 0; /* be quiet */ VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, @@ -732,6 +971,7 @@ run_sweep(void) return (sweep_state == SWEEP_ERROR ? 
SWEEP_ERROR : 0); } + int main(int argc, char **argv) { diff --git a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h index 09c825ae43c7..0f7f4cee3eb6 100644 --- a/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h +++ b/sys/contrib/openzfs/cmd/raidz_test/raidz_test.h @@ -44,13 +44,15 @@ static const char *raidz_impl_names[] = { typedef struct raidz_test_opts { size_t rto_ashift; - size_t rto_offset; + uint64_t rto_offset; size_t rto_dcols; size_t rto_dsize; size_t rto_v; size_t rto_sweep; size_t rto_sweep_timeout; size_t rto_benchmark; + size_t rto_expand; + uint64_t rto_expand_offset; size_t rto_sanity; size_t rto_gdb; @@ -69,6 +71,8 @@ static const raidz_test_opts_t rto_opts_defaults = { .rto_v = 0, .rto_sweep = 0, .rto_benchmark = 0, + .rto_expand = 0, + .rto_expand_offset = -1ULL, .rto_sanity = 0, .rto_gdb = 0, .rto_should_stop = B_FALSE @@ -113,4 +117,7 @@ void init_zio_abd(zio_t *zio); void run_raidz_benchmark(void); +struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t, + uint64_t, uint64_t, uint64_t, uint64_t, uint64_t); + #endif /* RAIDZ_TEST_H */ diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index 376b24db1eec..e45bff26944a 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -1642,7 +1642,11 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + if (vd->vdev_ops == &vdev_draid_ops) + ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); + else + ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { @@ -4202,6 +4206,8 @@ dump_l2arc_log_entries(uint64_t log_entries, (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); (void) printf("|\t\t\t\taddress: %llu\n", (u_longlong_t)le[j].le_daddr); + (void) printf("|\t\t\t\tARC state: 
%llu\n", + (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); (void) printf("|\n"); } (void) printf("\n"); @@ -5201,8 +5207,6 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - abd_free(zio->io_abd); - mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); @@ -5229,6 +5233,8 @@ zdb_blkptr_done(zio_t *zio) blkbuf); } mutex_exit(&spa->spa_scrub_lock); + + abd_free(zio->io_abd); } static int @@ -6316,7 +6322,7 @@ dump_block_stats(spa_t *spa) (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); - if (spa_special_class(spa)->mc_rotor != NULL) { + if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_class(spa)); uint64_t space = metaslab_class_get_space( @@ -6327,7 +6333,7 @@ dump_block_stats(spa_t *spa) 100.0 * alloc / space); } - if (spa_dedup_class(spa)->mc_rotor != NULL) { + if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_dedup_class(spa)); uint64_t space = metaslab_class_get_space( @@ -6756,6 +6762,7 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; + boolean_t freecfg = B_FALSE; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); @@ -6774,6 +6781,7 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) "spa_get_stats() failed with error %d\n", poolname, error); } + freecfg = B_TRUE; } if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) @@ -6783,6 +6791,8 @@ import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path) error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | ZFS_IMPORT_SKIP_MMP); + if (freecfg) + nvlist_free(cfg); if (error != 0) { 
fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); @@ -7011,7 +7021,6 @@ verify_checkpoint_blocks(spa_t *spa) spa_t *checkpoint_spa; char *checkpoint_pool; - nvlist_t *config = NULL; int error = 0; /* @@ -7019,7 +7028,7 @@ verify_checkpoint_blocks(spa_t *spa) * name) so we can do verification on it against the current state * of the pool. */ - checkpoint_pool = import_checkpointed_state(spa->spa_name, config, + checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); @@ -8429,6 +8438,11 @@ main(int argc, char **argv) } } + if (searchdirs != NULL) { + umem_free(searchdirs, nsearch * sizeof (char *)); + searchdirs = NULL; + } + /* * import_checkpointed_state makes the assumption that the * target pool that we pass it is already part of the spa @@ -8447,6 +8461,11 @@ main(int argc, char **argv) target = checkpoint_target; } + if (cfg != NULL) { + nvlist_free(cfg); + cfg = NULL; + } + if (target_pool != target) free(target_pool); diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c index 6c40470e83d7..0e1bcf92765b 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_agents.c @@ -181,6 +181,8 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) * from the vdev_disk layer after a hot unplug. Fortunately we do * get an EC_DEV_REMOVE from our disk monitor and it is a suitable * proxy so we remap it here for the benefit of the diagnosis engine. + * Starting in OpenZFS 2.0, we do get FM_RESOURCE_REMOVED from the spa + * layer. Processing multiple FM_RESOURCE_REMOVED events is not harmful. 
*/ if ((strcmp(class, EC_DEV_REMOVE) == 0) && (strcmp(subclass, ESC_DISK) == 0) && diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c index 8190beb0c9e7..4a58e1f1dbd3 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_mod.c @@ -435,7 +435,15 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) return; } - ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); + /* + * Prefer sequential resilvering when supported (mirrors and dRAID), + * otherwise fallback to a traditional healing resilver. + */ + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_TRUE); + if (ret != 0) { + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, + B_TRUE, B_FALSE); + } zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", fullpath, path, (ret == 0) ? "no errors" : diff --git a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c index ba8a6de3a66f..89bb84e489b6 100644 --- a/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c +++ b/sys/contrib/openzfs/cmd/zed/agents/zfs_retire.c @@ -219,12 +219,18 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) * replace it. 
*/ for (s = 0; s < nspares; s++) { - char *spare_name; + boolean_t rebuild = B_FALSE; + char *spare_name, *type; if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, &spare_name) != 0) continue; + /* prefer sequential resilvering for distributed spares */ + if ((nvlist_lookup_string(spares[s], ZPOOL_CONFIG_TYPE, + &type) == 0) && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + rebuild = B_TRUE; + /* if set, add the "ashift" pool property to the spare nvlist */ if (source != ZPROP_SRC_DEFAULT) (void) nvlist_add_uint64(spares[s], @@ -237,7 +243,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) dev_name, basename(spare_name)); if (zpool_vdev_attach(zhp, dev_name, spare_name, - replacement, B_TRUE, B_FALSE) == 0) { + replacement, B_TRUE, rebuild) == 0) { free(dev_name); nvlist_free(replacement); return (B_TRUE); @@ -499,6 +505,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, * Attempt to substitute a hot spare. */ (void) replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); } diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh b/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh index cb9286500136..270b1bc67e5c 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh +++ b/sys/contrib/openzfs/cmd/zed/zed.d/all-syslog.sh @@ -1,14 +1,50 @@ #!/bin/sh +# +# Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. +# Copyright (c) 2020 by Delphix. All rights reserved. +# + # # Log the zevent via syslog. +# [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . 
"${ZED_ZEDLET_DIR}/zed-functions.sh" zed_exit_if_ignoring_this_event -zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \ - "${ZEVENT_POOL_GUID:+"pool_guid=${ZEVENT_POOL_GUID}"}" \ - "${ZEVENT_VDEV_PATH:+"vdev_path=${ZEVENT_VDEV_PATH}"}" \ - "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}" +# build a string of name=value pairs for this event +msg="eid=${ZEVENT_EID} class=${ZEVENT_SUBCLASS}" + +if [ "${ZED_SYSLOG_DISPLAY_GUIDS}" = "1" ]; then + [ -n "${ZEVENT_POOL_GUID}" ] && msg="${msg} pool_guid=${ZEVENT_POOL_GUID}" + [ -n "${ZEVENT_VDEV_GUID}" ] && msg="${msg} vdev_guid=${ZEVENT_VDEV_GUID}" +else + [ -n "${ZEVENT_POOL}" ] && msg="${msg} pool='${ZEVENT_POOL}'" + [ -n "${ZEVENT_VDEV_PATH}" ] && msg="${msg} vdev=$(basename "${ZEVENT_VDEV_PATH}")" +fi + +# log pool state if state is anything other than 'ACTIVE' +[ -n "${ZEVENT_POOL_STATE_STR}" ] && [ "$ZEVENT_POOL_STATE" -ne 0 ] && \ + msg="${msg} pool_state=${ZEVENT_POOL_STATE_STR}" + +# Log the following payload nvpairs if they are present +[ -n "${ZEVENT_VDEV_STATE_STR}" ] && msg="${msg} vdev_state=${ZEVENT_VDEV_STATE_STR}" +[ -n "${ZEVENT_CKSUM_ALGORITHM}" ] && msg="${msg} algorithm=${ZEVENT_CKSUM_ALGORITHM}" +[ -n "${ZEVENT_ZIO_SIZE}" ] && msg="${msg} size=${ZEVENT_ZIO_SIZE}" +[ -n "${ZEVENT_ZIO_OFFSET}" ] && msg="${msg} offset=${ZEVENT_ZIO_OFFSET}" +[ -n "${ZEVENT_ZIO_PRIORITY}" ] && msg="${msg} priority=${ZEVENT_ZIO_PRIORITY}" +[ -n "${ZEVENT_ZIO_ERR}" ] && msg="${msg} err=${ZEVENT_ZIO_ERR}" +[ -n "${ZEVENT_ZIO_FLAGS}" ] && msg="${msg} flags=$(printf '0x%x' "${ZEVENT_ZIO_FLAGS}")" + +# log delays that are >= 10 milisec +[ -n "${ZEVENT_ZIO_DELAY}" ] && [ "$ZEVENT_ZIO_DELAY" -gt 10000000 ] && \ + msg="${msg} delay=$((ZEVENT_ZIO_DELAY / 1000000))ms" + +# list the bookmark data together +[ -n "${ZEVENT_ZIO_OBJSET}" ] && \ + msg="${msg} bookmark=${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}:${ZEVENT_ZIO_LEVEL}:${ZEVENT_ZIO_BLKID}" + +zed_log_msg "${msg}" + exit 0 diff --git 
a/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in index 053b4414a768..bf5a121f6a79 100755 --- a/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in +++ b/sys/contrib/openzfs/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in @@ -13,7 +13,7 @@ FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}" [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" . "${ZED_ZEDLET_DIR}/zed-functions.sh" -zed_exit_if_ignoring_this_event +[ "$ZEVENT_SUBCLASS" != "history_event" ] && exit 0 zed_check_cmd "${ZFS}" sort diff grep # If we are acting on a snapshot, we have nothing to do diff --git a/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc index 1b220d28db20..df560f921e60 100644 --- a/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc +++ b/sys/contrib/openzfs/cmd/zed/zed.d/zed.rc @@ -118,5 +118,10 @@ ZED_USE_ENCLOSURE_LEDS=1 # Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the # matching subclasses are excluded from logging. #ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*" -#ZED_SYSLOG_SUBCLASS_EXCLUDE="statechange|config_*|history_event" +ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" + +## +# Use GUIDs instead of names when logging pool and vdevs +# Disabled by default, 1 to enable and 0 to disable. +#ZED_SYSLOG_DISPLAY_GUIDS=1 diff --git a/sys/contrib/openzfs/cmd/zfs/zfs_main.c b/sys/contrib/openzfs/cmd/zfs/zfs_main.c index 42c180890fec..ab2b006ae460 100644 --- a/sys/contrib/openzfs/cmd/zfs/zfs_main.c +++ b/sys/contrib/openzfs/cmd/zfs/zfs_main.c @@ -270,7 +270,7 @@ get_usage(zfs_help_t idx) return (gettext("\tclone [-p] [-o property=value] ... " " \n")); case HELP_CREATE: - return (gettext("\tcreate [-Pnpv] [-o property=value] ... " + return (gettext("\tcreate [-Pnpuv] [-o property=value] ... " "\n" "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... 
" "-V \n")); @@ -892,6 +892,107 @@ zfs_do_clone(int argc, char **argv) return (-1); } +/* + * Return a default volblocksize for the pool which always uses more than + * half of the data sectors. This primarily applies to dRAID which always + * writes full stripe widths. + */ +static uint64_t +default_volblocksize(zpool_handle_t *zhp, nvlist_t *props) +{ + uint64_t volblocksize, asize = SPA_MINBLOCKSIZE; + nvlist_t *tree, **vdevs; + uint_t nvdevs; + + nvlist_t *config = zpool_get_config(zhp, NULL); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || + nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, + &vdevs, &nvdevs) != 0) { + return (ZVOL_DEFAULT_BLOCKSIZE); + } + + for (int i = 0; i < nvdevs; i++) { + nvlist_t *nv = vdevs[i]; + uint64_t ashift, ndata, nparity; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &ashift) != 0) + continue; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, + &ndata) == 0) { + /* dRAID minimum allocation width */ + asize = MAX(asize, ndata * (1ULL << ashift)); + } else if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, + &nparity) == 0) { + /* raidz minimum allocation width */ + if (nparity == 1) + asize = MAX(asize, 2 * (1ULL << ashift)); + else + asize = MAX(asize, 4 * (1ULL << ashift)); + } else { + /* mirror or (non-redundant) leaf vdev */ + asize = MAX(asize, 1ULL << ashift); + } + } + + /* + * Calculate the target volblocksize such that more than half + * of the asize is used. The following table is for 4k sectors. 
+ * + * n asize blksz used | n asize blksz used + * -------------------------+--------------------------------- + * 1 4,096 8,192 100% | 9 36,864 32,768 88% + * 2 8,192 8,192 100% | 10 40,960 32,768 80% + * 3 12,288 8,192 66% | 11 45,056 32,768 72% + * 4 16,384 16,384 100% | 12 49,152 32,768 66% + * 5 20,480 16,384 80% | 13 53,248 32,768 61% + * 6 24,576 16,384 66% | 14 57,344 32,768 57% + * 7 28,672 16,384 57% | 15 61,440 32,768 53% + * 8 32,768 32,768 100% | 16 65,536 65,536 100% + * + * This is primarily a concern for dRAID which always allocates + * a full stripe width. For dRAID the default stripe width is + * n=8 in which case the volblocksize is set to 32k. Ignoring + * compression there are no unused sectors. This same reasoning + * applies to raidz[2,3] so target 4 sectors to minimize waste. + */ + uint64_t tgt_volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + while (tgt_volblocksize * 2 <= asize) + tgt_volblocksize *= 2; + + const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); + if (nvlist_lookup_uint64(props, prop, &volblocksize) == 0) { + + /* Issue a warning when a non-optimal size is requested. */ + if (volblocksize < ZVOL_DEFAULT_BLOCKSIZE) { + (void) fprintf(stderr, gettext("Warning: " + "volblocksize (%llu) is less than the default " + "minimum block size (%llu).\nTo reduce wasted " + "space a volblocksize of %llu is recommended.\n"), + (u_longlong_t)volblocksize, + (u_longlong_t)ZVOL_DEFAULT_BLOCKSIZE, + (u_longlong_t)tgt_volblocksize); + } else if (volblocksize < tgt_volblocksize) { + (void) fprintf(stderr, gettext("Warning: " + "volblocksize (%llu) is much less than the " + "minimum allocation\nunit (%llu), which wastes " + "at least %llu%% of space. 
To reduce wasted " + "space,\nuse a larger volblocksize (%llu is " + "recommended), fewer dRAID data disks\n" + "per group, or smaller sector size (ashift).\n"), + (u_longlong_t)volblocksize, (u_longlong_t)asize, + (u_longlong_t)((100 * (asize - volblocksize)) / + asize), (u_longlong_t)tgt_volblocksize); + } + } else { + volblocksize = tgt_volblocksize; + fnvlist_add_uint64(props, prop, volblocksize); + } + + return (volblocksize); +} + /* * zfs create [-Pnpv] [-o prop=value] ... fs * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size @@ -911,6 +1012,8 @@ zfs_do_clone(int argc, char **argv) * check of arguments and properties, but does not check for permissions, * available space, etc. * + * The '-u' flag prevents the newly created file system from being mounted. + * * The '-v' flag is for verbose output. * * The '-P' flag is used for parseable output. It implies '-v'. @@ -927,17 +1030,19 @@ zfs_do_create(int argc, char **argv) boolean_t bflag = B_FALSE; boolean_t parents = B_FALSE; boolean_t dryrun = B_FALSE; + boolean_t nomount = B_FALSE; boolean_t verbose = B_FALSE; boolean_t parseable = B_FALSE; int ret = 1; nvlist_t *props; uint64_t intval; + char *strval; if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) nomem(); /* check options */ - while ((c = getopt(argc, argv, ":PV:b:nso:pv")) != -1) { + while ((c = getopt(argc, argv, ":PV:b:nso:puv")) != -1) { switch (c) { case 'V': type = ZFS_TYPE_VOLUME; @@ -984,6 +1089,9 @@ zfs_do_create(int argc, char **argv) case 's': noreserve = B_TRUE; break; + case 'u': + nomount = B_TRUE; + break; case 'v': verbose = B_TRUE; break; @@ -1003,6 +1111,11 @@ zfs_do_create(int argc, char **argv) "used when creating a volume\n")); goto badusage; } + if (nomount && type != ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("'-u' can only be " + "used when creating a filesystem\n")); + goto badusage; + } argc -= optind; argv += optind; @@ -1018,7 +1131,7 @@ zfs_do_create(int argc, char **argv) goto badusage; } - if 
(dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) { + if (dryrun || type == ZFS_TYPE_VOLUME) { char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; char *p; @@ -1040,18 +1153,24 @@ zfs_do_create(int argc, char **argv) } } - /* - * if volsize is not a multiple of volblocksize, round it up to the - * nearest multiple of the volblocksize - */ if (type == ZFS_TYPE_VOLUME) { - uint64_t volblocksize; + const char *prop = zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE); + uint64_t volblocksize = default_volblocksize(zpool_handle, + real_props); - if (nvlist_lookup_uint64(props, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - &volblocksize) != 0) - volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + if (volblocksize != ZVOL_DEFAULT_BLOCKSIZE && + nvlist_lookup_string(props, prop, &strval) != 0) { + if (asprintf(&strval, "%llu", + (u_longlong_t)volblocksize) == -1) + nomem(); + nvlist_add_string(props, prop, strval); + free(strval); + } + /* + * If volsize is not a multiple of volblocksize, round it + * up to the nearest multiple of the volblocksize. 
+ */ if (volsize % volblocksize) { volsize = P2ROUNDUP_TYPED(volsize, volblocksize, uint64_t); @@ -1064,11 +1183,9 @@ zfs_do_create(int argc, char **argv) } } - if (type == ZFS_TYPE_VOLUME && !noreserve) { uint64_t spa_version; zfs_prop_t resv_prop; - char *strval; spa_version = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_VERSION, NULL); @@ -1159,6 +1276,11 @@ zfs_do_create(int argc, char **argv) log_history = B_FALSE; } + if (nomount) { + ret = 0; + goto error; + } + ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); error: nvlist_free(props); @@ -6596,9 +6718,9 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use share(1M) to " - "share this filesystem, or set " - "sharenfs property on\n")); + (void) fprintf(stderr, gettext("use exports(5) or " + "smb.conf(5) to share this filesystem, or set " + "the sharenfs or sharesmb property\n")); return (1); } @@ -6613,7 +6735,7 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, (void) fprintf(stderr, gettext("cannot %s '%s': " "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); - (void) fprintf(stderr, gettext("use %s(1M) to " + (void) fprintf(stderr, gettext("use %s(8) to " "%s this filesystem\n"), cmdname, cmdname); return (1); } @@ -7416,8 +7538,8 @@ unshare_unmount(int op, int argc, char **argv) "unshare '%s': legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " - "unshare(1M) to unshare this " - "filesystem\n")); + "exports(5) or smb.conf(5) to unshare " + "this filesystem\n")); ret = 1; } else if (!zfs_is_shared(zhp)) { (void) fprintf(stderr, gettext("cannot " @@ -7435,7 +7557,7 @@ unshare_unmount(int op, int argc, char **argv) "unmount '%s': legacy " "mountpoint\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use " - "umount(1M) to unmount this " + "umount(8) to unmount this " "filesystem\n")); ret = 1; } 
else if (!zfs_is_mounted(zhp, NULL)) { @@ -8370,7 +8492,7 @@ zfs_do_wait(int argc, char **argv) { boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES]; int error, i; - char c; + int c; /* By default, wait for all types of activity. */ for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) diff --git a/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c b/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c index 6cfaa6f41fa5..80dd5bf2dc2e 100644 --- a/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c +++ b/sys/contrib/openzfs/cmd/zfs_ids_to_path/zfs_ids_to_path.c @@ -44,7 +44,7 @@ int main(int argc, char **argv) { boolean_t verbose = B_FALSE; - char c; + int c; while ((c = getopt(argc, argv, "v")) != -1) { switch (c) { case 'v': diff --git a/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid.c b/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid.c index 562262928c77..50fcf05e420c 100644 --- a/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid.c +++ b/sys/contrib/openzfs/cmd/zgenhostid/zgenhostid.c @@ -47,10 +47,10 @@ usage(void) " -h\t\t print this usage and exit\n" " -o \t write hostid to this file\n\n" "If hostid file is not present, store a hostid in it.\n" - "The optional value must be an 8-digit hex number between" - "1 and 2^32-1.\n" - "If no value is provided, a random one will" - "be generated.\n" + "The optional value should be an 8-digit hex number between" + " 1 and 2^32-1.\n" + "If the value is 0 or no value is provided, a random one" + " will be generated.\n" "The value must be unique among your systems.\n"); exit(EXIT_FAILURE); /* NOTREACHED */ @@ -108,7 +108,7 @@ main(int argc, char **argv) exit(EXIT_FAILURE); } - if (input_i < 0x1 || input_i > UINT32_MAX) { + if (input_i > UINT32_MAX) { fprintf(stderr, "%s\n", strerror(ERANGE)); usage(); } diff --git a/sys/contrib/openzfs/cmd/zhack/zhack.c b/sys/contrib/openzfs/cmd/zhack/zhack.c index 4d958fe4365a..08263120c7c4 100644 --- a/sys/contrib/openzfs/cmd/zhack/zhack.c +++ b/sys/contrib/openzfs/cmd/zhack/zhack.c @@ -150,6 
+150,7 @@ zhack_import(char *target, boolean_t readonly) zfeature_checks_disable = B_TRUE; error = spa_import(target, config, props, (readonly ? ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL)); + fnvlist_free(config); zfeature_checks_disable = B_FALSE; if (error == EEXIST) error = 0; diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c index 5f3153bca2c2..d70d266699cf 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_iter.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_iter.c @@ -56,6 +56,7 @@ typedef struct zpool_node { struct zpool_list { boolean_t zl_findall; + boolean_t zl_literal; uu_avl_t *zl_avl; uu_avl_pool_t *zl_pool; zprop_list_t **zl_proplist; @@ -88,7 +89,9 @@ add_pool(zpool_handle_t *zhp, void *data) uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { if (zlp->zl_proplist && - zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) { + zpool_expand_proplist(zhp, zlp->zl_proplist, + zlp->zl_literal) + != 0) { zpool_close(zhp); free(node); return (-1); @@ -110,7 +113,8 @@ add_pool(zpool_handle_t *zhp, void *data) * line. 
*/ zpool_list_t * -pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) +pool_list_get(int argc, char **argv, zprop_list_t **proplist, + boolean_t literal, int *err) { zpool_list_t *zlp; @@ -128,6 +132,8 @@ pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) zlp->zl_proplist = proplist; + zlp->zl_literal = literal; + if (argc == 0) { (void) zpool_iter(g_zfs, add_pool, zlp); zlp->zl_findall = B_TRUE; @@ -242,12 +248,12 @@ pool_list_count(zpool_list_t *zlp) */ int for_each_pool(int argc, char **argv, boolean_t unavail, - zprop_list_t **proplist, zpool_iter_f func, void *data) + zprop_list_t **proplist, boolean_t literal, zpool_iter_f func, void *data) { zpool_list_t *list; int ret = 0; - if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL) + if ((list = pool_list_get(argc, argv, proplist, literal, &ret)) == NULL) return (1); if (pool_list_iter(list, unavail, func, data) != 0) @@ -711,7 +717,7 @@ all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, vcdl->g_zfs = g_zfs; /* Gather our list of all vdevs in all pools */ - for_each_pool(argc, argv, B_TRUE, NULL, + for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, all_pools_for_each_vdev_gather_cb, vcdl); /* Run command on all vdevs in all pools */ diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_main.c b/sys/contrib/openzfs/cmd/zpool/zpool_main.c index 83a9b5a5ac07..e00fdb7ae1b0 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_main.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_main.c @@ -669,9 +669,16 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, } for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; + uint64_t is_log = B_FALSE, is_hole = B_FALSE; char *class = ""; + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole == B_TRUE) { + continue; + } + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); if (is_log) @@ -692,6 +699,54 @@ print_vdev_tree(zpool_handle_t *zhp, 
const char *name, nvlist_t *nv, int indent, } } +/* + * Print the list of l2cache devices for dry runs. + */ +static void +print_cache_list(nvlist_t *nv, int indent) +{ + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0 && children > 0) { + (void) printf("\t%*s%s\n", indent, "", "cache"); + } else { + return; + } + for (c = 0; c < children; c++) { + char *vname; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); + (void) printf("\t%*s%s\n", indent + 2, "", vname); + free(vname); + } +} + +/* + * Print the list of spares for dry runs. + */ +static void +print_spare_list(nvlist_t *nv, int indent) +{ + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0 && children > 0) { + (void) printf("\t%*s%s\n", indent, "", "spares"); + } else { + return; + } + for (c = 0; c < children; c++) { + char *vname; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], 0); + (void) printf("\t%*s%s\n", indent + 2, "", vname); + free(vname); + } +} + static boolean_t prop_list_contains_feature(nvlist_t *proplist) { @@ -921,16 +976,16 @@ zpool_do_add(int argc, char **argv) if (dryrun) { nvlist_t *poolnvroot; - nvlist_t **l2child; - uint_t l2children, c; + nvlist_t **l2child, **sparechild; + uint_t l2children, sparechildren, c; char *vname; - boolean_t hadcache = B_FALSE; + boolean_t hadcache = B_FALSE, hadspare = B_FALSE; verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &poolnvroot) == 0); (void) printf(gettext("would update '%s' to the following " - "configuration:\n"), zpool_get_name(zhp)); + "configuration:\n\n"), zpool_get_name(zhp)); /* print original main pool and new tree */ print_vdev_tree(zhp, poolname, poolnvroot, 0, "", @@ -991,6 +1046,29 @@ zpool_do_add(int argc, char **argv) free(vname); } } + /* And finally the spares */ + if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_SPARES, + &sparechild, &sparechildren) 
== 0 && sparechildren > 0) { + hadspare = B_TRUE; + (void) printf(gettext("\tspares\n")); + for (c = 0; c < sparechildren; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + sparechild[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &sparechild, &sparechildren) == 0 && sparechildren > 0) { + if (!hadspare) + (void) printf(gettext("\tspares\n")); + for (c = 0; c < sparechildren; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + sparechild[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } ret = 0; } else { @@ -1548,6 +1626,8 @@ zpool_do_create(int argc, char **argv) VDEV_ALLOC_BIAS_SPECIAL, 0); print_vdev_tree(NULL, "logs", nvroot, 0, VDEV_ALLOC_BIAS_LOG, 0); + print_cache_list(nvroot, 0); + print_spare_list(nvroot, 0); ret = 0; } else { @@ -1762,7 +1842,7 @@ zpool_do_export(int argc, char **argv) } return (for_each_pool(argc, argv, B_TRUE, NULL, - zpool_export_one, &cb)); + B_FALSE, zpool_export_one, &cb)); } /* check arguments */ @@ -1771,7 +1851,8 @@ zpool_do_export(int argc, char **argv) usage(B_FALSE); } - ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb); + ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_export_one, + &cb); return (ret); } @@ -2294,7 +2375,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } } - /* Display vdev initialization and trim status for leaves */ + /* Display vdev initialization and trim status for leaves. 
*/ if (children == 0) { print_status_initialize(vs, cb->cb_print_vdev_init); print_status_trim(vs, cb->cb_print_vdev_trim); @@ -3613,7 +3694,8 @@ zpool_do_sync(int argc, char **argv) argv += optind; /* if argc == 0 we will execute zpool_sync_one on all pools */ - ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force); + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, zpool_sync_one, + &force); return (ret); } @@ -4958,7 +5040,7 @@ are_vdevs_in_pool(int argc, char **argv, char *pool_name, /* Is this name a vdev in our pools? */ ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, - is_vdev, cb); + B_FALSE, is_vdev, cb); if (!ret) { /* No match */ break; @@ -4986,7 +5068,8 @@ is_pool_cb(zpool_handle_t *zhp, void *data) static int is_pool(char *name) { - return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); + return (for_each_pool(0, NULL, B_TRUE, NULL, B_FALSE, is_pool_cb, + name)); } /* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ @@ -5438,7 +5521,7 @@ zpool_do_iostat(int argc, char **argv) * Construct the list of all interesting pools. 
*/ ret = 0; - if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) + if ((list = pool_list_get(argc, argv, NULL, parsable, &ret)) == NULL) return (1); if (pool_list_count(list) == 0 && argc != 0) { @@ -6112,7 +6195,7 @@ zpool_do_list(int argc, char **argv) for (;;) { if ((list = pool_list_get(argc, argv, &cb.cb_proplist, - &ret)) == NULL) + cb.cb_literal, &ret)) == NULL) return (1); if (pool_list_count(list) == 0) @@ -6512,6 +6595,10 @@ zpool_do_split(int argc, char **argv) "following layout:\n\n"), newpool); print_vdev_tree(NULL, newpool, config, 0, "", flags.name_flags); + print_vdev_tree(NULL, "dedup", config, 0, + VDEV_ALLOC_BIAS_DEDUP, 0); + print_vdev_tree(NULL, "special", config, 0, + VDEV_ALLOC_BIAS_SPECIAL, 0); } } @@ -6864,7 +6951,7 @@ zpool_do_reopen(int argc, char **argv) argv += optind; /* if argc == 0 we will execute zpool_reopen_one on all pools */ - ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one, + ret = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, zpool_reopen_one, &scrub_restart); return (ret); @@ -6994,12 +7081,13 @@ zpool_do_scrub(int argc, char **argv) usage(B_FALSE); } - error = for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb); + error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + scrub_callback, &cb); if (wait && !error) { zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB; - error = for_each_pool(argc, argv, B_TRUE, NULL, wait_callback, - &act); + error = for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + wait_callback, &act); } return (error); @@ -7037,7 +7125,8 @@ zpool_do_resilver(int argc, char **argv) usage(B_FALSE); } - return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb)); + return (for_each_pool(argc, argv, B_TRUE, NULL, B_FALSE, + scrub_callback, &cb)); } /* @@ -7590,7 +7679,7 @@ print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) vdev_name = zpool_vdev_name(g_zfs, zhp, child[prs->prs_removing_vdev], B_TRUE); - (void) printf(gettext("remove: ")); + 
printf_color(ANSI_BOLD, gettext("remove: ")); start = prs->prs_start_time; end = prs->prs_end_time; @@ -8431,7 +8520,7 @@ zpool_do_status(int argc, char **argv) cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, NULL, NULL, 0, 0); - ret = for_each_pool(argc, argv, B_TRUE, NULL, + ret = for_each_pool(argc, argv, B_TRUE, NULL, cb.cb_literal, status_callback, &cb); if (cb.vcdl != NULL) @@ -8950,7 +9039,7 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext("\n")); } } else { - ret = for_each_pool(argc, argv, B_FALSE, NULL, + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, upgrade_one, &cb); } @@ -9036,6 +9125,12 @@ print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) dump_nvlist(fnvlist_lookup_nvlist(rec, ZPOOL_HIST_OUTPUT_NVL), 8); } + if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_SIZE)) { + (void) printf(" output nvlist omitted; " + "original size: %lldKB\n", + (longlong_t)fnvlist_lookup_int64(rec, + ZPOOL_HIST_OUTPUT_SIZE) / 1024); + } if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(rec, @@ -9133,7 +9228,7 @@ zpool_do_history(int argc, char **argv) argc -= optind; argv += optind; - ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, + ret = for_each_pool(argc, argv, B_FALSE, NULL, B_FALSE, get_history_one, &cbdata); if (argc == 0 && cbdata.first == B_TRUE) { @@ -9696,7 +9791,7 @@ zpool_do_get(int argc, char **argv) cb.cb_proplist = &fake_name; } - ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, + ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, cb.cb_literal, get_callback, &cb); if (cb.cb_proplist == &fake_name) @@ -9766,7 +9861,7 @@ zpool_do_set(int argc, char **argv) *(cb.cb_value) = '\0'; cb.cb_value++; - error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, + error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, B_FALSE, set_callback, &cb); return (error); @@ -9849,7 +9944,8 @@ vdev_any_spare_replacing(nvlist_t *nv) (void) 
nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || - strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) { + strcmp(vdev_type, VDEV_TYPE_SPARE) == 0 || + strcmp(vdev_type, VDEV_TYPE_DRAID_SPARE) == 0) { return (B_TRUE); } @@ -10051,7 +10147,7 @@ int zpool_do_wait(int argc, char **argv) { boolean_t verbose = B_FALSE; - char c; + int c; char *value; int i; unsigned long count; diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_util.h b/sys/contrib/openzfs/cmd/zpool/zpool_util.h index 265aa58953a0..abaa22d78c20 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_util.h +++ b/sys/contrib/openzfs/cmd/zpool/zpool_util.h @@ -64,7 +64,7 @@ nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, * Pool list functions */ int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, - zpool_iter_f, void *); + boolean_t, zpool_iter_f, void *); /* Vdev list functions */ typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *); @@ -72,7 +72,7 @@ int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); typedef struct zpool_list zpool_list_t; -zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); +zpool_list_t *pool_list_get(int, char **, zprop_list_t **, boolean_t, int *); void pool_list_update(zpool_list_t *); int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); void pool_list_free(zpool_list_t *); diff --git a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c index 9aa09b18c4ae..c86081a8153a 100644 --- a/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c +++ b/sys/contrib/openzfs/cmd/zpool/zpool_vdev.c @@ -86,9 +86,6 @@ boolean_t error_seen; boolean_t is_force; - - - /*PRINTFLIKE1*/ void vdev_error(const char *fmt, ...) 
@@ -222,6 +219,9 @@ is_spare(nvlist_t *config, const char *path) uint_t i, nspares; boolean_t inuse; + if (zpool_is_draid_spare(path)) + return (B_TRUE); + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) return (B_FALSE); @@ -267,9 +267,10 @@ is_spare(nvlist_t *config, const char *path) * /dev/xxx Complete disk path * /xxx Full path to file * xxx Shorthand for /xxx + * draid* Virtual dRAID spare */ static nvlist_t * -make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) +make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) { char path[MAXPATHLEN]; struct stat64 statbuf; @@ -309,6 +310,17 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) /* After whole disk check restore original passed path */ strlcpy(path, arg, sizeof (path)); + } else if (zpool_is_draid_spare(arg)) { + if (!is_primary) { + (void) fprintf(stderr, + gettext("cannot open '%s': dRAID spares can only " + "be used to replace primary vdevs\n"), arg); + return (NULL); + } + + wholedisk = B_TRUE; + strlcpy(path, arg, sizeof (path)); + type = VDEV_TYPE_DRAID_SPARE; } else { err = is_shorthand_path(arg, path, sizeof (path), &statbuf, &wholedisk); @@ -337,17 +349,19 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) } } - /* - * Determine whether this is a device or a file. - */ - if (wholedisk || S_ISBLK(statbuf.st_mode)) { - type = VDEV_TYPE_DISK; - } else if (S_ISREG(statbuf.st_mode)) { - type = VDEV_TYPE_FILE; - } else { - (void) fprintf(stderr, gettext("cannot use '%s': must be a " - "block device or regular file\n"), path); - return (NULL); + if (type == NULL) { + /* + * Determine whether this is a device or a file. 
+ */ + if (wholedisk || S_ISBLK(statbuf.st_mode)) { + type = VDEV_TYPE_DISK; + } else if (S_ISREG(statbuf.st_mode)) { + type = VDEV_TYPE_FILE; + } else { + fprintf(stderr, gettext("cannot use '%s': must " + "be a block device or regular file\n"), path); + return (NULL); + } } /* @@ -358,10 +372,7 @@ make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log) verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); - verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0); - if (is_log) - verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS, - VDEV_ALLOC_BIAS_LOG) == 0); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); @@ -432,11 +443,16 @@ typedef struct replication_level { #define ZPOOL_FUZZ (16 * 1024 * 1024) +/* + * N.B. For the purposes of comparing replication levels dRAID can be + * considered functionally equivalent to raidz. + */ static boolean_t is_raidz_mirror(replication_level_t *a, replication_level_t *b, replication_level_t **raidz, replication_level_t **mirror) { - if (strcmp(a->zprl_type, "raidz") == 0 && + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && strcmp(b->zprl_type, "mirror") == 0) { *raidz = a; *mirror = b; @@ -445,6 +461,22 @@ is_raidz_mirror(replication_level_t *a, replication_level_t *b, return (B_FALSE); } +/* + * Comparison for determining if dRAID and raidz were passed in either order. + */ +static boolean_t +is_raidz_draid(replication_level_t *a, replication_level_t *b) +{ + if ((strcmp(a->zprl_type, "raidz") == 0 || + strcmp(a->zprl_type, "draid") == 0) && + (strcmp(b->zprl_type, "raidz") == 0 || + strcmp(b->zprl_type, "draid") == 0)) { + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Given a list of toplevel vdevs, return the current replication level. 
If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then @@ -511,7 +543,8 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) rep.zprl_type = type; rep.zprl_children = 0; - if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_DRAID) == 0) { verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &rep.zprl_parity) == 0); @@ -677,6 +710,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) else return (NULL); } + } else if (is_raidz_draid(&lastrep, &rep)) { + /* + * Accepted raidz and draid when they can + * handle the same number of disk failures. + */ + if (lastrep.zprl_parity != rep.zprl_parity) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: %s and %s vdevs " + "with different " + "redundancy, %llu vs. " + "%llu are present\n"), + lastrep.zprl_type, + rep.zprl_type, + lastrep.zprl_parity, + rep.zprl_parity); + else + return (NULL); + } } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) { if (ret != NULL) @@ -1103,31 +1159,87 @@ is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, return (anyinuse); } +/* + * Returns the parity level extracted from a raidz or draid type. + * If the parity cannot be determined zero is returned. 
+ */ +static int +get_parity(const char *type) +{ + long parity = 0; + const char *p; + + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0) { + p = type + strlen(VDEV_TYPE_RAIDZ); + + if (*p == '\0') { + /* when unspecified default to single parity */ + return (1); + } else if (*p == '0') { + /* no zero prefixes allowed */ + return (0); + } else { + /* 1-3, no suffixes allowed */ + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || *end != '\0' || + parity < 1 || parity > VDEV_RAIDZ_MAXPARITY) { + return (0); + } + } + } else if (strncmp(type, VDEV_TYPE_DRAID, + strlen(VDEV_TYPE_DRAID)) == 0) { + p = type + strlen(VDEV_TYPE_DRAID); + + if (*p == '\0' || *p == ':') { + /* when unspecified default to single parity */ + return (1); + } else if (*p == '0') { + /* no zero prefixes allowed */ + return (0); + } else { + /* 1-3, allowed suffixes: '\0' or ':' */ + char *end; + errno = 0; + parity = strtol(p, &end, 10); + if (errno != 0 || + parity < 1 || parity > VDEV_DRAID_MAXPARITY || + (*end != '\0' && *end != ':')) { + return (0); + } + } + } + + return ((int)parity); +} + +/* + * Assign the minimum and maximum number of devices allowed for + * the specified type. On error NULL is returned, otherwise the + * type prefix is returned (raidz, mirror, etc). 
+ */ static const char * is_grouping(const char *type, int *mindev, int *maxdev) { - if (strncmp(type, "raidz", 5) == 0) { - const char *p = type + 5; - char *end; - long nparity; - - if (*p == '\0') { - nparity = 1; - } else if (*p == '0') { - return (NULL); /* no zero prefixes allowed */ - } else { - errno = 0; - nparity = strtol(p, &end, 10); - if (errno != 0 || nparity < 1 || nparity >= 255 || - *end != '\0') - return (NULL); - } + int nparity; + if (strncmp(type, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0) { + nparity = get_parity(type); + if (nparity == 0) + return (NULL); if (mindev != NULL) *mindev = nparity + 1; if (maxdev != NULL) *maxdev = 255; - return (VDEV_TYPE_RAIDZ); + + if (strncmp(type, VDEV_TYPE_RAIDZ, + strlen(VDEV_TYPE_RAIDZ)) == 0) { + return (VDEV_TYPE_RAIDZ); + } else { + return (VDEV_TYPE_DRAID); + } } if (maxdev != NULL) @@ -1167,6 +1279,163 @@ is_grouping(const char *type, int *mindev, int *maxdev) return (NULL); } +/* + * Extract the configuration parameters encoded in the dRAID type and + * use them to generate a dRAID configuration. The expected format is: + * + * draid[][:][:][:] + * + * The intent is to be able to generate a good configuration when no + * additional information is provided. The only mandatory component + * of the 'type' is the 'draid' prefix. If a value is not provided + * then reasonable defaults are used. The optional components may + * appear in any order but the d/s/c suffix is required. 
+ * + * Valid inputs: + * - data: number of data devices per group (1-255) + * - parity: number of parity blocks per group (1-3) + * - spares: number of distributed spares (0-100) + * - children: total number of devices (1-255) + * + * Examples: + * - zpool create tank draid + * - zpool create tank draid2:8d:51c:2s + */ +static int +draid_config_by_type(nvlist_t *nv, const char *type, uint64_t children) +{ + uint64_t nparity = 1; + uint64_t nspares = 0; + uint64_t ndata = UINT64_MAX; + uint64_t ngroups = 1; + long value; + + if (strncmp(type, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) != 0) + return (EINVAL); + + nparity = (uint64_t)get_parity(type); + if (nparity == 0) + return (EINVAL); + + char *p = (char *)type; + while ((p = strchr(p, ':')) != NULL) { + char *end; + + p = p + 1; + errno = 0; + + if (!isdigit(p[0])) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:] not '%s'\n"), + type); + return (EINVAL); + } + + /* Expected non-zero value with c/d/s suffix */ + value = strtol(p, &end, 10); + char suffix = tolower(*end); + if (errno != 0 || + (suffix != 'c' && suffix != 'd' && suffix != 's')) { + (void) fprintf(stderr, gettext("invalid dRAID " + "syntax; expected [:] not '%s'\n"), + type); + return (EINVAL); + } + + if (suffix == 'c') { + if ((uint64_t)value != children) { + fprintf(stderr, + gettext("invalid number of dRAID children; " + "%llu required but %llu provided\n"), + (u_longlong_t)value, + (u_longlong_t)children); + return (EINVAL); + } + } else if (suffix == 'd') { + ndata = (uint64_t)value; + } else if (suffix == 's') { + nspares = (uint64_t)value; + } else { + verify(0); /* Unreachable */ + } + } + + /* + * When a specific number of data disks is not provided limit a + * redundancy group to 8 data disks. This value was selected to + * provide a reasonable tradeoff between capacity and performance. 
+ */ + if (ndata == UINT64_MAX) { + if (children > nspares + nparity) { + ndata = MIN(children - nspares - nparity, 8); + } else { + fprintf(stderr, gettext("request number of " + "distributed spares %llu and parity level %llu\n" + "leaves no disks available for data\n"), + (u_longlong_t)nspares, (u_longlong_t)nparity); + return (EINVAL); + } + } + + /* Verify the maximum allowed group size is never exceeded. */ + if (ndata == 0 || (ndata + nparity > children - nspares)) { + fprintf(stderr, gettext("requested number of dRAID data " + "disks per group %llu is too high,\nat most %llu disks " + "are available for data\n"), (u_longlong_t)ndata, + (u_longlong_t)(children - nspares - nparity)); + return (EINVAL); + } + + if (nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + fprintf(stderr, + gettext("invalid dRAID parity level %llu; must be " + "between 1 and %d\n"), (u_longlong_t)nparity, + VDEV_DRAID_MAXPARITY); + return (EINVAL); + } + + /* + * Verify the requested number of spares can be satisfied. + * An arbitrary limit of 100 distributed spares is applied. + */ + if (nspares > 100 || nspares > (children - (ndata + nparity))) { + fprintf(stderr, + gettext("invalid number of dRAID spares %llu; additional " + "disks would be required\n"), (u_longlong_t)nspares); + return (EINVAL); + } + + /* Verify the requested number children is sufficient. */ + if (children < (ndata + nparity + nspares)) { + fprintf(stderr, gettext("%llu disks were provided, but at " + "least %llu disks are required for this config\n"), + (u_longlong_t)children, + (u_longlong_t)(ndata + nparity + nspares)); + } + + if (children > VDEV_DRAID_MAX_CHILDREN) { + fprintf(stderr, gettext("%llu disks were provided, but " + "dRAID only supports up to %u disks"), + (u_longlong_t)children, VDEV_DRAID_MAX_CHILDREN); + } + + /* + * Calculate the minimum number of groups required to fill a slice. + * This is the LCM of the stripe width (ndata + nparity) and the + * number of data drives (children - nspares). 
+ */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + + return (0); +} + /* * Construct a syntactically valid vdev specification, * and ensure that all devices and files exist and can be opened. @@ -1178,8 +1447,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; - const char *type; - uint64_t is_log, is_special, is_dedup; + const char *type, *fulltype; + boolean_t is_log, is_special, is_dedup, is_spare; boolean_t seen_logs; top = NULL; @@ -1189,18 +1458,20 @@ construct_spec(nvlist_t *props, int argc, char **argv) nspares = 0; nlogs = 0; nl2cache = 0; - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = is_dedup = is_spare = B_FALSE; seen_logs = B_FALSE; nvroot = NULL; while (argc > 0) { + fulltype = argv[0]; nv = NULL; /* - * If it's a mirror or raidz, the subsequent arguments are - * its leaves -- until we encounter the next mirror or raidz. + * If it's a mirror, raidz, or draid the subsequent arguments + * are its leaves -- until we encounter the next mirror, + * raidz or draid. 
*/ - if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { + if ((type = is_grouping(fulltype, &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; @@ -1212,6 +1483,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } + is_spare = B_TRUE; is_log = is_special = is_dedup = B_FALSE; } @@ -1225,8 +1497,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) } seen_logs = B_TRUE; is_log = B_TRUE; - is_special = B_FALSE; - is_dedup = B_FALSE; + is_special = is_dedup = is_spare = B_FALSE; argc--; argv++; /* @@ -1238,8 +1509,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { is_special = B_TRUE; - is_log = B_FALSE; - is_dedup = B_FALSE; + is_log = is_dedup = is_spare = B_FALSE; argc--; argv++; continue; @@ -1247,8 +1517,7 @@ construct_spec(nvlist_t *props, int argc, char **argv) if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { is_dedup = B_TRUE; - is_log = B_FALSE; - is_special = B_FALSE; + is_log = is_special = is_spare = B_FALSE; argc--; argv++; continue; @@ -1262,7 +1531,8 @@ construct_spec(nvlist_t *props, int argc, char **argv) "specified only once\n")); goto spec_out; } - is_log = is_special = is_dedup = B_FALSE; + is_log = is_special = B_FALSE; + is_dedup = is_spare = B_FALSE; } if (is_log || is_special || is_dedup) { @@ -1280,13 +1550,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) for (c = 1; c < argc; c++) { if (is_grouping(argv[c], NULL, NULL) != NULL) break; + children++; child = realloc(child, children * sizeof (nvlist_t *)); if (child == NULL) zpool_no_memory(); if ((nv = make_leaf_vdev(props, argv[c], - B_FALSE)) == NULL) { + !(is_log || is_special || is_dedup || + is_spare))) == NULL) { for (c = 0; c < children - 1; c++) nvlist_free(child[c]); free(child); @@ -1335,10 +1607,11 @@ construct_spec(nvlist_t *props, int argc, char **argv) type) == 0); verify(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, 
is_log) == 0); - if (is_log) + if (is_log) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, VDEV_ALLOC_BIAS_LOG) == 0); + } if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, @@ -1354,6 +1627,15 @@ construct_spec(nvlist_t *props, int argc, char **argv) ZPOOL_CONFIG_NPARITY, mindev - 1) == 0); } + if (strcmp(type, VDEV_TYPE_DRAID) == 0) { + if (draid_config_by_type(nv, + fulltype, children) != 0) { + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + } verify(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, children) == 0); @@ -1367,12 +1649,19 @@ construct_spec(nvlist_t *props, int argc, char **argv) * We have a device. Pass off to make_leaf_vdev() to * construct the appropriate nvlist describing the vdev. */ - if ((nv = make_leaf_vdev(props, argv[0], - is_log)) == NULL) + if ((nv = make_leaf_vdev(props, argv[0], !(is_log || + is_special || is_dedup || is_spare))) == NULL) goto spec_out; - if (is_log) + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); nlogs++; + } + if (is_special) { verify(nvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/.gitignore b/sys/contrib/openzfs/cmd/zpool_influxdb/.gitignore new file mode 100644 index 000000000000..bd765d188278 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/.gitignore @@ -0,0 +1 @@ +/zpool_influxdb diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/Makefile.am b/sys/contrib/openzfs/cmd/zpool_influxdb/Makefile.am new file mode 100644 index 000000000000..28e94d616e61 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/Makefile.am @@ -0,0 +1,11 @@ +include $(top_srcdir)/config/Rules.am + +zfsexec_PROGRAMS = zpool_influxdb + +zpool_influxdb_SOURCES = \ + zpool_influxdb.c + +zpool_influxdb_LDADD = \ + $(top_builddir)/lib/libspl/libspl.la \ + 
$(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs/libzfs.la diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/README.md b/sys/contrib/openzfs/cmd/zpool_influxdb/README.md new file mode 100644 index 000000000000..864d67498325 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/README.md @@ -0,0 +1,294 @@ +# Influxdb Metrics for ZFS Pools +The _zpool_influxdb_ program produces +[influxdb](https://github.com/influxdata/influxdb) line protocol +compatible metrics from zpools. In the UNIX tradition, _zpool_influxdb_ +does one thing: read statistics from a pool and print them to +stdout. In many ways, this is a metrics-friendly output of +statistics normally observed via the `zpool` command. + +## Usage +When run without arguments, _zpool_influxdb_ runs once, reading data +from all imported pools, and prints to stdout. +```shell +zpool_influxdb [options] [poolname] +``` +If no poolname is specified, then all pools are sampled. + +| option | short option | description | +|---|---|---| +| --execd | -e | For use with telegraf's `execd` plugin. When [enter] is pressed, the pools are sampled. To exit, use [ctrl+D] | +| --no-histogram | -n | Do not print histogram information | +| --signed-int | -i | Use signed integer data type (default=unsigned) | +| --sum-histogram-buckets | -s | Sum histogram bucket values | +| --tags key=value[,key=value...] | -t | Add tags to data points. No tag sanity checking is performed. | +| --help | -h | Print a short usage message | + +#### Histogram Bucket Values +The histogram data collected by ZFS is stored as independent bucket values. +This works well out-of-the-box with an influxdb data source and grafana's +heatmap visualization. The influxdb query for a grafana heatmap +visualization looks like: +``` +field(disk_read) last() non_negative_derivative(1s) +``` + +Another method for storing histogram data sums the values for lower-value +buckets. 
For example, a latency bucket tagged "le=10" includes the values +in the bucket "le=1". +This method is often used for prometheus histograms. +The `zpool_influxdb --sum-histogram-buckets` option presents the data from ZFS +as summed values. + +## Measurements +The following measurements are collected: + +| measurement | description | zpool equivalent | +|---|---|---| +| zpool_stats | general size and data | zpool list | +| zpool_scan_stats | scrub, rebuild, and resilver statistics (omitted if no scan has been requested) | zpool status | +| zpool_vdev_stats | per-vdev statistics | zpool iostat -q | +| zpool_io_size | per-vdev I/O size histogram | zpool iostat -r | +| zpool_latency | per-vdev I/O latency histogram | zpool iostat -w | +| zpool_vdev_queue | per-vdev instantaneous queue depth | zpool iostat -q | + +### zpool_stats Description +zpool_stats contains top-level summary statistics for the pool. +Performance counters measure the I/Os to the pool's devices. + +#### zpool_stats Tags + +| label | description | +|---|---| +| name | pool name | +| path | for leaf vdevs, the pathname | +| state | pool state, as shown by _zpool status_ | +| vdev | vdev name (root = entire pool) | + +#### zpool_stats Fields + +| field | units | description | +|---|---|---| +| alloc | bytes | allocated space | +| free | bytes | unallocated space | +| size | bytes | total pool size | +| read_bytes | bytes | bytes read since pool import | +| read_errors | count | number of read errors | +| read_ops | count | number of read operations | +| write_bytes | bytes | bytes written since pool import | +| write_errors | count | number of write errors | +| write_ops | count | number of write operations | + +### zpool_scan_stats Description +Once a pool has been scrubbed, resilvered, or rebuilt, the zpool_scan_stats +contain information about the status and performance of the operation. 
+Otherwise, the zpool_scan_stats do not exist in the kernel, and therefore +cannot be reported by this collector. + +#### zpool_scan_stats Tags + +| label | description | +|---|---| +| name | pool name | +| function | name of the scan function running or recently completed | +| state | scan state, as shown by _zpool status_ | + +#### zpool_scan_stats Fields + +| field | units | description | +|---|---|---| +| errors | count | number of errors encountered by scan | +| examined | bytes | total data examined during scan | +| to_examine | bytes | prediction of total bytes to be scanned | +| pass_examined | bytes | data examined during current scan pass | +| issued | bytes | size of I/Os issued to disks | +| pass_issued | bytes | size of I/Os issued to disks for current pass | +| processed | bytes | data reconstructed during scan | +| to_process | bytes | total bytes to be repaired | +| rate | bytes/sec | examination rate | +| start_ts | epoch timestamp | start timestamp for scan | +| pause_ts | epoch timestamp | timestamp for a scan pause request | +| end_ts | epoch timestamp | completion timestamp for scan | +| paused_t | seconds | elapsed time while paused | +| remaining_t | seconds | estimate of time remaining for scan | + +### zpool_vdev_stats Description +The ZFS I/O (ZIO) scheduler uses five queues to schedule I/Os to each vdev. +These queues are further divided into active and pending states. +An I/O is pending prior to being issued to the vdev. An active +I/O has been issued to the vdev. The scheduler and its tunable +parameters are described at the +[ZFS documentation for ZIO +Scheduler](https://openzfs.github.io/openzfs-docs/Performance%20and%20Tuning/ZIO%20Scheduler.html). +The ZIO scheduler reports the queue depths as gauges where the value +represents an instantaneous snapshot of the queue depth at +the sample time. Therefore, it is not unusual to see all zeroes +for an idle pool.
+ +#### zpool_vdev_stats Tags +| label | description | +|---|---| +| name | pool name | +| vdev | vdev name (root = entire pool) | + +#### zpool_vdev_stats Fields +| field | units | description | +|---|---|---| +| sync_r_active_queue | entries | synchronous read active queue depth | +| sync_w_active_queue | entries | synchronous write active queue depth | +| async_r_active_queue | entries | asynchronous read active queue depth | +| async_w_active_queue | entries | asynchronous write active queue depth | +| async_scrub_active_queue | entries | asynchronous scrub active queue depth | +| sync_r_pend_queue | entries | synchronous read pending queue depth | +| sync_w_pend_queue | entries | synchronous write pending queue depth | +| async_r_pend_queue | entries | asynchronous read pending queue depth | +| async_w_pend_queue | entries | asynchronous write pending queue depth | +| async_scrub_pend_queue | entries | asynchronous scrub pending queue depth | + +### zpool_latency Histogram +ZFS tracks the latency of each I/O in the ZIO pipeline. This latency can +be useful for observing latency-related issues that are not easily observed +using the averaged latency statistics. + +The histogram fields show cumulative values from lowest to highest. +The largest bucket is tagged "le=+Inf", representing the total count +of I/Os by type and vdev. 
+ +#### zpool_latency Histogram Tags +| label | description | +|---|---| +| le | bucket for histogram, latency is less than or equal to bucket value in seconds | +| name | pool name | +| path | for leaf vdevs, the device path name, otherwise omitted | +| vdev | vdev name (root = entire pool) | + +#### zpool_latency Histogram Fields +| field | units | description | +|---|---|---| +| total_read | operations | read operations of all types | +| total_write | operations | write operations of all types | +| disk_read | operations | disk read operations | +| disk_write | operations | disk write operations | +| sync_read | operations | ZIO sync reads | +| sync_write | operations | ZIO sync writes | +| async_read | operations | ZIO async reads | +| async_write | operations | ZIO async writes | +| scrub | operations | ZIO scrub/scan reads | +| trim | operations | ZIO trim (aka unmap) writes | + +### zpool_io_size Histogram +ZFS tracks I/O throughout the ZIO pipeline. The size of each I/O is used +to create a histogram of the size by I/O type and vdev. For example, a +4KiB write to a mirrored pool will show a 4KiB write to the top-level vdev +(root) and a 4KiB write to each of the mirror leaf vdevs. + +The ZIO pipeline can aggregate I/O operations. For example, a contiguous +series of writes can be aggregated into a single, larger I/O to the leaf +vdev. The independent I/O operations reflect the logical operations and +the aggregated I/O operations reflect the physical operations. + +The histogram fields show cumulative values from lowest to highest. +The largest bucket is tagged "le=+Inf", representing the total count +of I/Os by type and vdev. + +Note: trim I/Os can be larger than 16MiB, but the larger sizes are +accounted for in the 16MiB bucket.
+ +#### zpool_io_size Histogram Tags +| label | description | +|---|---| +| le | bucket for histogram, I/O size is less than or equal to bucket value in bytes | +| name | pool name | +| path | for leaf vdevs, the device path name, otherwise omitted | +| vdev | vdev name (root = entire pool) | + +#### zpool_io_size Histogram Fields +| field | units | description | +|---|---|---| +| sync_read_ind | blocks | independent sync reads | +| sync_write_ind | blocks | independent sync writes | +| async_read_ind | blocks | independent async reads | +| async_write_ind | blocks | independent async writes | +| scrub_read_ind | blocks | independent scrub/scan reads | +| trim_write_ind | blocks | independent trim (aka unmap) writes | +| sync_read_agg | blocks | aggregated sync reads | +| sync_write_agg | blocks | aggregated sync writes | +| async_read_agg | blocks | aggregated async reads | +| async_write_agg | blocks | aggregated async writes | +| scrub_read_agg | blocks | aggregated scrub/scan reads | +| trim_write_agg | blocks | aggregated trim (aka unmap) writes | + +#### About unsigned integers +Telegraf v1.6.2 and later support unsigned 64-bit integers, which more +closely match the uint64_t values used by ZFS. By default, zpool_influxdb +uses ZFS' uint64_t values and the influxdb line protocol unsigned integer type. +If you are using old telegraf or influxdb where unsigned integers are not +available, use the `--signed-int` option. + +## Using _zpool_influxdb_ + +The simplest method is to use the execd input agent in telegraf. For older +versions of telegraf which lack execd, the exec input agent can be used. +For convenience, one of the sample config files below can be placed in the +telegraf config-directory (often /etc/telegraf/telegraf.d). Telegraf can +be restarted to read the config-directory files.
+ +### Example telegraf execd configuration +```toml +# # Read metrics from zpool_influxdb +[[inputs.execd]] +# ## default installation location for zpool_influxdb command + command = ["/usr/libexec/zfs/zpool_influxdb", "--execd"] + + ## Define how the process is signaled on each collection interval. + ## Valid values are: + ## "none" : Do not signal anything. (Recommended for service inputs) + ## The process must output metrics by itself. + ## "STDIN" : Send a newline on STDIN. (Recommended for gather inputs) + ## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended) + ## "SIGUSR1" : Send a USR1 signal. Not available on Windows. + ## "SIGUSR2" : Send a USR2 signal. Not available on Windows. + signal = "STDIN" + + ## Delay before the process is restarted after an unexpected termination + restart_delay = "10s" + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" +``` + +### Example telegraf exec configuration +```toml +# # Read metrics from zpool_influxdb +[[inputs.exec]] +# ## default installation location for zpool_influxdb command + commands = ["/usr/libexec/zfs/zpool_influxdb"] + data_format = "influx" +``` + +## Caveat Emptor +* Like the _zpool_ command, _zpool_influxdb_ takes a reader + lock on spa_config for each imported pool. If this lock blocks, + then the command will also block indefinitely and might be + unkillable. This is not a normal condition, but can occur if + there are bugs in the kernel modules. 
+ For this reason, care should be taken: + * avoid spawning many of these commands hoping that one might + finish + * avoid frequent updates or short sample time + intervals, because the locks can interfere with the performance + of other instances of _zpool_ or _zpool_influxdb_ + +## Other collectors +There are a few other collectors for zpool statistics roaming around +the Internet. Many attempt to screen-scrape `zpool` output in various +ways. The screen-scrape method works poorly for `zpool` output because +of its human-friendly nature. Also, they suffer from the same caveats +as this implementation. This implementation is optimized for directly +collecting the metrics and is much more efficient than the screen-scrapers. + +## Feedback Encouraged +Pull requests and issues are greatly appreciated at +https://github.com/openzfs/zfs diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/dashboards/README.md b/sys/contrib/openzfs/cmd/zpool_influxdb/dashboards/README.md new file mode 100644 index 000000000000..2fdbe49834ff --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/dashboards/README.md @@ -0,0 +1,3 @@ +### Dashboards for zpool_influxdb +This directory contains a collection of dashboards related to ZFS with data +collected from the zpool_influxdb collector. 
diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/dashboards/grafana/ZFS-pool-latency-heatmaps-influxdb.json b/sys/contrib/openzfs/cmd/zpool_influxdb/dashboards/grafana/ZFS-pool-latency-heatmaps-influxdb.json new file mode 100644 index 000000000000..a99f92783bc4 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/dashboards/grafana/ZFS-pool-latency-heatmaps-influxdb.json @@ -0,0 +1,1667 @@ +{ + "__inputs": [ + { + "name": "DS_MACBOOK-INFLUX", + "label": "macbook-influx", + "description": "", + "type": "datasource", + "pluginId": "influxdb", + "pluginName": "InfluxDB" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "6.7.3" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "influxdb", + "name": "InfluxDB", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "jdbranham-diagram-panel", + "name": "Diagram", + "version": "1.4.5" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + } + ], + "annotations": { + "list": [ + { + "$$hashKey": "object:1627", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Top-level ZFS pool latency by ZIO type", + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "iteration": 1590445168391, + "links": [], + "panels": [ + { + "collapsed": false, + "datasource": "${DS_MACBOOK-INFLUX}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "panels": [], + "title": "Total Reads and Writes", + "type": "row" + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": 
"${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the total reads of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 2, + "legend": { + "show": true + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "total_read" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Reads", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the total writes of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + 
}, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 3, + "legend": { + "show": true + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "total_write" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Writes", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "collapsed": false, + "datasource": "${DS_MACBOOK-INFLUX}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 8, + "panels": [], + "title": "ZIO Scheduler Queues for Read Operations", + "type": "row" + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the synchronous reads of a ZFS pool", + 
"fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 0, + "y": 11 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 6, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "sync_read" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Sync Read Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the asynchronous reads of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 5, + "y": 11 + }, + "heatmap": {}, + 
"hideZeroBuckets": false, + "highlightCards": true, + "id": 9, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "async_read" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Async Read Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the scrub or scan reads of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 10, + "y": 11 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 10, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": 
"$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "scrub" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Scrub/Scan Read Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the actual disk reads of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 15, + "y": 11 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 11, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + 
"null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "disk_read" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Read Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "collapsed": false, + "datasource": "${DS_MACBOOK-INFLUX}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 13, + "panels": [], + "title": "ZIO Scheduler Queues for Write Operations", + "type": "row" + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the synchronous writes of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 0, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 14, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, 
+ { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "sync_write" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Sync Write Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the asynchronous writes of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 5, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 15, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + 
"orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "async_write" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Async Write Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the trim or unmap operations of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 10, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 16, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "trim" + ], + 
"type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + { + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Trim Write Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_MACBOOK-INFLUX}", + "description": "Latency histogram for the disk write operations of a ZFS pool", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 15, + "y": 20 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 17, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "alias": "$tag_le", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "le" + ], + "type": "tag" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "measurement": "zpool_latency", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "disk_write" + ], + "type": "field" + }, + { + "params": [], + "type": "last" + }, + { + "params": [ + "1s" + ], + "type": "non_negative_derivative" + } + ] + ], + "tags": [ + 
{ + "key": "host", + "operator": "=~", + "value": "/^$hostname$/" + }, + { + "condition": "AND", + "key": "name", + "operator": "=~", + "value": "/^$poolname$/" + } + ] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Write Queue", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "collapsed": false, + "datasource": "${DS_MACBOOK-INFLUX}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 19, + "panels": [], + "title": "About", + "type": "row" + }, + { + "content": "I/O requests that are satisfied by accessing pool devices are managed by the ZIO scheduler.\nThe total latency is measured from the start of the I/O to completion by the disk.\nLatency through each queue is shown prior to its submission to the disk queue.\n\nThis view is useful for observing the effects of tuning the ZIO scheduler min and max values\n(see zfs-module-parameters(5) and [ZFS on Linux Module Parameters](https://openzfs.github.io/openzfs-docs/Performance%20and%20tuning/ZFS%20on%20Linux%20Module%20Parameters.html)):\n+ *zfs_vdev_max_active* controls the ZIO scheduler's disk queue depth (do not confuse with the block device's nr_requests)\n+ *zfs_vdev_sync_read_min_active* and *zfs_vdev_sync_read_max_active* control the synchronous queue for reads: most reads are sync\n+ *zfs_vdev_sync_write_min_active* and *zfs_vdev_sync_write_max_active* control the synchronous queue for writes: \nusually metadata or user data depending on the \"sync\" property setting or I/Os that are requested to be flushed\n+ *zfs_vdev_async_read_min_active* and *zfs_vdev_async_read_max_active* control the asynchronous queue for reads: usually prefetches\n+ 
*zfs_vdev_async_write_min_active* and *zfs_vdev_async_write_max_active* control the asynchronous queue for writes: \nusually the bulk of all writes at transaction group (txg) commit\n+ *zfs_vdev_scrub_min_active* and *zfs_vdev_scrub_max_active* controls the scan reads: usually scrub or resilver\n\n", + "datasource": "${DS_MACBOOK-INFLUX}", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 16, + "x": 0, + "y": 29 + }, + "id": 21, + "mode": "markdown", + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "About ZFS Pool All Queues Read/Write Latency Histograms", + "type": "text" + }, + { + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "composites": [], + "content": "graph LR\nIO((I/O request)) --> SR(sync read queue)\nIO --> SW(sync write queue)\nIO --> AR(async read queue)\nIO --> AW(async write queue)\nIO --> SCRUB(scrub queue)\nIO --> TRIM(trim queue)\nSR --> DISKQ(disk queue)\nSW --> DISKQ\nAR --> DISKQ\nAW --> DISKQ\nSCRUB --> DISKQ\nTRIM --> DISKQ\nDISKQ --> DISK((disk))\n", + "datasource": "${DS_MACBOOK-INFLUX}", + "decimals": 2, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "none", + "graphId": "diagram_23", + "gridPos": { + "h": 15, + "w": 7, + "x": 16, + "y": 29 + }, + "id": 23, + "init": { + "arrowMarkerAbsolute": true, + "cloneCssStyles": true, + "flowchart": { + "htmlLabels": true, + "useMaxWidth": true + }, + "gantt": { + "barGap": 4, + "barHeight": 20, + "fontFamily": "\"Open-Sans\", \"sans-serif\"", + 
"fontSize": 11, + "gridLineStartPadding": 35, + "leftPadding": 75, + "numberSectionStyles": 3, + "titleTopMargin": 25, + "topPadding": 50 + }, + "logLevel": 3, + "securityLevel": "loose", + "sequence": { + "actorMargin": 50, + "bottomMarginAdj": 1, + "boxMargin": 10, + "boxTextMargin": 5, + "diagramMarginX": 50, + "diagramMarginY": 10, + "height": 65, + "messageMargin": 35, + "mirrorActors": true, + "noteMargin": 10, + "useMaxWidth": true, + "width": 150 + }, + "startOnLoad": false, + "theme": "dark" + }, + "legend": { + "avg": true, + "current": true, + "gradient": { + "enabled": true, + "show": true + }, + "max": true, + "min": true, + "show": false, + "total": true + }, + "mappingType": 1, + "mappingTypes": [ + { + "$$hashKey": "object:155", + "name": "value to text", + "value": 1 + }, + { + "$$hashKey": "object:156", + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxWidth": false, + "mermaidServiceUrl": "", + "metricCharacterReplacements": [], + "moddedSeriesVal": 0, + "mode": "content", + "nullPointMode": "connected", + "seriesOverrides": [], + "style": "", + "styleValues": {}, + "targets": [ + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": true, + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + } + ], + "themes": [ + "default", + "dark", + "forest", + "neutral" + ], + "thresholds": "0,10", + "timeFrom": null, + "timeShift": null, + "title": "Panel Title", + "type": "jdbranham-diagram-panel", + "valueMaps": [ + { + "$$hashKey": "object:151", + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg", + "valueOptions": [ + "avg", + "min", + "max", + "total", + "current" + ] + } + ], + "refresh": false, + "schemaVersion": 22, + "style": 
"dark", + "tags": [ + "ZFS", + "Latency", + "Histogram" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_MACBOOK-INFLUX}", + "definition": "show tag values from \"zpool_latency\" with key = \"host\"", + "hide": 0, + "includeAll": false, + "index": -1, + "label": null, + "multi": false, + "name": "hostname", + "options": [], + "query": "show tag values from \"zpool_latency\" with key = \"host\"", + "refresh": 1, + "regex": "/([-a-zA-Z-0-9]+)/", + "skipUrlSync": false, + "sort": 5, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_MACBOOK-INFLUX}", + "definition": "show tag values from \"zpool_latency\" with key = \"name\" where \"host\" =~ /^$hostname/", + "hide": 0, + "includeAll": false, + "index": -1, + "label": null, + "multi": false, + "name": "poolname", + "options": [], + "query": "show tag values from \"zpool_latency\" with key = \"name\" where \"host\" =~ /^$hostname/", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 5, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "2020-05-25T21:34:30.137Z", + "to": "2020-05-25T21:39:54.445Z" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "ZFS Pool Latency Heatmaps Influxdb", + "uid": "TbB4-DkGz", + "variables": { + "list": [] + }, + "version": 2 +} \ No newline at end of file diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/README.md b/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/README.md new file mode 100644 index 000000000000..74f411a15d34 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/README.md @@ -0,0 +1,7 @@ +This directory contains sample telegraf configurations for +adding `zpool_influxdb` as an input 
plugin. Depending on your +telegraf configuration, the installation can be as simple as +copying one of these to the `/etc/telegraf/telegraf.d` directory +and restarting telegraf with `systemctl restart telegraf` + +See the telegraf docs for more information on input plugins. diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/exec_zpool_influxdb.conf b/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/exec_zpool_influxdb.conf new file mode 100644 index 000000000000..a2efa61892ff --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/exec_zpool_influxdb.conf @@ -0,0 +1,15 @@ +# # Read metrics from zpool_influxdb +[[inputs.exec]] +# ## default installation location for zpool_influxdb command + commands = ["/usr/local/libexec/zfs/zpool_influxdb"] +# ## Timeout for each command to complete. +# timeout = "5s" +# +# ## measurement name suffix (for separating different commands) +# name_suffix = "_mycollector" +# +# ## Data format to consume. +# ## Each data format has its own unique set of configuration options, read +# ## more about them here: +# ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/execd_zpool_influxdb.conf b/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/execd_zpool_influxdb.conf new file mode 100644 index 000000000000..90737b8cb798 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/telegraf.d/execd_zpool_influxdb.conf @@ -0,0 +1,23 @@ +# # Read metrics from zpool_influxdb +[[inputs.execd]] +# ## default installation location for zpool_influxdb command + command = ["/usr/local/libexec/zfs/zpool_influxdb", "--execd"] + + ## Define how the process is signaled on each collection interval. + ## Valid values are: + ## "none" : Do not signal anything. (Recommended for service inputs) + ## The process must output metrics by itself. + ## "STDIN" : Send a newline on STDIN.
(Recommended for gather inputs) + ## "SIGHUP" : Send a HUP signal. Not available on Windows. (not recommended) + ## "SIGUSR1" : Send a USR1 signal. Not available on Windows. + ## "SIGUSR2" : Send a USR2 signal. Not available on Windows. + signal = "STDIN" + + ## Delay before the process is restarted after an unexpected termination + restart_delay = "10s" + + ## Data format to consume. + ## Each data format has its own unique set of configuration options, read + ## more about them here: + ## https://github.com/influxdata/telegraf/blob/master/docs/DATA_FORMATS_INPUT.md + data_format = "influx" diff --git a/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c b/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c new file mode 100644 index 000000000000..71ffcb25381a --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool_influxdb/zpool_influxdb.c @@ -0,0 +1,843 @@ +/* + * Gather top-level ZFS pool and resilver/scan statistics and print using + * influxdb line protocol + * usage: [options] [pool_name] + * where options are: + * --execd, -e run in telegraf execd input plugin mode, [CR] on + * stdin causes a sample to be printed and wait for + * the next [CR] + * --no-histograms, -n don't print histogram data (reduces cardinality + * if you don't care about histograms) + * --sum-histogram-buckets, -s sum histogram bucket values + * + * To integrate into telegraf use one of: + * 1. the `inputs.execd` plugin with the `--execd` option + * 2. the `inputs.exec` plugin to simply run with no options + * + * NOTE: libzfs is an unstable interface. YMMV. + * + * The design goals of this software include: + * + be as lightweight as possible + * + reduce the number of external dependencies as far as possible, hence + * there is no dependency on a client library for managing the metric + * collection -- info is printed, KISS + * + broken pools or kernel bugs can cause this process to hang in an + * unkillable state. 
For this reason, it is best to keep the damage limited + * to a small process like zpool_influxdb rather than a larger collector. + * + * Copyright 2018-2020 Richard Elling + * + * This software is dual-licensed MIT and CDDL. + * + * The MIT License (MIT) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + * + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * CDDL HEADER END + */ +#include +#include +#include +#include +#include +#include + +#define POOL_MEASUREMENT "zpool_stats" +#define SCAN_MEASUREMENT "zpool_scan_stats" +#define VDEV_MEASUREMENT "zpool_vdev_stats" +#define POOL_LATENCY_MEASUREMENT "zpool_latency" +#define POOL_QUEUE_MEASUREMENT "zpool_vdev_queue" +#define MIN_LAT_INDEX 10 /* minimum latency index 10 = 1024ns */ +#define POOL_IO_SIZE_MEASUREMENT "zpool_io_size" +#define MIN_SIZE_INDEX 9 /* minimum size index 9 = 512 bytes */ + +/* global options */ +int execd_mode = 0; +int no_histograms = 0; +int sum_histogram_buckets = 0; +char metric_data_type = 'u'; +uint64_t metric_value_mask = UINT64_MAX; +uint64_t timestamp = 0; +int complained_about_sync = 0; +char *tags = ""; + +typedef int (*stat_printer_f)(nvlist_t *, const char *, const char *); + +/* + * influxdb line protocol rules for escaping are important because the + * zpool name can include characters that need to be escaped + * + * caller is responsible for freeing result + */ +static char * +escape_string(char *s) +{ + char *c, *d; + char *t = (char *)malloc(ZFS_MAX_DATASET_NAME_LEN * 2); + if (t == NULL) { + fprintf(stderr, "error: cannot allocate memory\n"); + exit(1); + } + + for (c = s, d = t; *c != '\0'; c++, d++) { + switch (*c) { + case ' ': + case ',': + case '=': + case '\\': + *d++ = '\\'; + default: + *d = *c; + } + } + *d = '\0'; + return (t); +} + +/* + * print key=value where value is a uint64_t + */ +static void +print_kv(char *key, uint64_t value) +{ + printf("%s=%llu%c", key, + (u_longlong_t)value & metric_value_mask, metric_data_type); +} + +/* + * print_scan_status() prints the details as often seen in the "zpool status" + * output. However, unlike the zpool command, which is intended for humans, + * this output is suitable for long-term tracking in influxdb. 
+ * TODO: update to include issued scan data + */ +static int +print_scan_status(nvlist_t *nvroot, const char *pool_name) +{ + uint_t c; + int64_t elapsed; + uint64_t examined, pass_exam, paused_time, paused_ts, rate; + uint64_t remaining_time; + pool_scan_stat_t *ps = NULL; + double pct_done; + char *state[DSS_NUM_STATES] = { + "none", "scanning", "finished", "canceled"}; + char *func; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c); + + /* + * ignore if there are no stats + */ + if (ps == NULL) + return (0); + + /* + * return error if state is bogus + */ + if (ps->pss_state >= DSS_NUM_STATES || + ps->pss_func >= POOL_SCAN_FUNCS) { + if (complained_about_sync % 1000 == 0) { + fprintf(stderr, "error: cannot decode scan stats: " + "ZFS is out of sync with compiled zpool_influxdb"); + complained_about_sync++; + } + return (1); + } + + switch (ps->pss_func) { + case POOL_SCAN_NONE: + func = "none_requested"; + break; + case POOL_SCAN_SCRUB: + func = "scrub"; + break; + case POOL_SCAN_RESILVER: + func = "resilver"; + break; +#ifdef POOL_SCAN_REBUILD + case POOL_SCAN_REBUILD: + func = "rebuild"; + break; +#endif + default: + func = "scan"; + } + + /* overall progress */ + examined = ps->pss_examined ? ps->pss_examined : 1; + pct_done = 0.0; + if (ps->pss_to_examine > 0) + pct_done = 100.0 * examined / ps->pss_to_examine; + +#ifdef EZFS_SCRUB_PAUSED + paused_ts = ps->pss_pass_scrub_pause; + paused_time = ps->pss_pass_scrub_spent_paused; +#else + paused_ts = 0; + paused_time = 0; +#endif + + /* calculations for this pass */ + if (ps->pss_state == DSS_SCANNING) { + elapsed = (int64_t)time(NULL) - (int64_t)ps->pss_pass_start - + (int64_t)paused_time; + elapsed = (elapsed > 0) ? elapsed : 1; + pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; + rate = pass_exam / elapsed; + rate = (rate > 0) ? 
rate : 1; + remaining_time = ps->pss_to_examine - examined / rate; + } else { + elapsed = + (int64_t)ps->pss_end_time - (int64_t)ps->pss_pass_start - + (int64_t)paused_time; + elapsed = (elapsed > 0) ? elapsed : 1; + pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; + rate = pass_exam / elapsed; + remaining_time = 0; + } + rate = rate ? rate : 1; + + /* influxdb line protocol format: "tags metrics timestamp" */ + printf("%s%s,function=%s,name=%s,state=%s ", + SCAN_MEASUREMENT, tags, func, pool_name, state[ps->pss_state]); + print_kv("end_ts", ps->pss_end_time); + print_kv(",errors", ps->pss_errors); + print_kv(",examined", examined); + print_kv(",issued", ps->pss_issued); + print_kv(",pass_examined", pass_exam); + print_kv(",pass_issued", ps->pss_pass_issued); + print_kv(",paused_ts", paused_ts); + print_kv(",paused_t", paused_time); + printf(",pct_done=%.2f", pct_done); + print_kv(",processed", ps->pss_processed); + print_kv(",rate", rate); + print_kv(",remaining_t", remaining_time); + print_kv(",start_ts", ps->pss_start_time); + print_kv(",to_examine", ps->pss_to_examine); + print_kv(",to_process", ps->pss_to_process); + printf(" %llu\n", (u_longlong_t)timestamp); + return (0); +} + +/* + * get a vdev name that corresponds to the top-level vdev names + * printed by `zpool status` + */ +static char * +get_vdev_name(nvlist_t *nvroot, const char *parent_name) +{ + static char vdev_name[256]; + char *vdev_type = NULL; + uint64_t vdev_id = 0; + + if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, + &vdev_type) != 0) { + vdev_type = "unknown"; + } + if (nvlist_lookup_uint64( + nvroot, ZPOOL_CONFIG_ID, &vdev_id) != 0) { + vdev_id = UINT64_MAX; + } + if (parent_name == NULL) { + (void) snprintf(vdev_name, sizeof (vdev_name), "%s", + vdev_type); + } else { + (void) snprintf(vdev_name, sizeof (vdev_name), + "%s/%s-%llu", + parent_name, vdev_type, (u_longlong_t)vdev_id); + } + return (vdev_name); +} + +/* + * get a string suitable for an influxdb tag that describes 
this vdev + * + * By default only the vdev hierarchical name is shown, separated by '/' + * If the vdev has an associated path, which is typical of leaf vdevs, + * then the path is added. + * It would be nice to have the devid instead of the path, but under + * Linux we cannot be sure a devid will exist and we'd rather have + * something than nothing, so we'll use path instead. + */ +static char * +get_vdev_desc(nvlist_t *nvroot, const char *parent_name) +{ + static char vdev_desc[2 * MAXPATHLEN]; + char *vdev_type = NULL; + uint64_t vdev_id = 0; + char vdev_value[MAXPATHLEN]; + char *vdev_path = NULL; + char *s, *t; + + if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &vdev_type) != 0) { + vdev_type = "unknown"; + } + if (nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_ID, &vdev_id) != 0) { + vdev_id = UINT64_MAX; + } + if (nvlist_lookup_string( + nvroot, ZPOOL_CONFIG_PATH, &vdev_path) != 0) { + vdev_path = NULL; + } + + if (parent_name == NULL) { + s = escape_string(vdev_type); + (void) snprintf(vdev_value, sizeof (vdev_value), "vdev=%s", s); + free(s); + } else { + s = escape_string((char *)parent_name); + t = escape_string(vdev_type); + (void) snprintf(vdev_value, sizeof (vdev_value), + "vdev=%s/%s-%llu", s, t, (u_longlong_t)vdev_id); + free(s); + free(t); + } + if (vdev_path == NULL) { + (void) snprintf(vdev_desc, sizeof (vdev_desc), "%s", + vdev_value); + } else { + s = escape_string(vdev_path); + (void) snprintf(vdev_desc, sizeof (vdev_desc), "path=%s,%s", + s, vdev_value); + free(s); + } + return (vdev_desc); +} + +/* + * vdev summary stats are a combination of the data shown by + * `zpool status` and `zpool list -v` + */ +static int +print_summary_stats(nvlist_t *nvroot, const char *pool_name, + const char *parent_name) +{ + uint_t c; + vdev_stat_t *vs; + char *vdev_desc = NULL; + vdev_desc = get_vdev_desc(nvroot, parent_name); + if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) != 0) { + return (1); + } + 
printf("%s%s,name=%s,state=%s,%s ", POOL_MEASUREMENT, tags, + pool_name, zpool_state_to_name((vdev_state_t)vs->vs_state, + (vdev_aux_t)vs->vs_aux), vdev_desc); + print_kv("alloc", vs->vs_alloc); + print_kv(",free", vs->vs_space - vs->vs_alloc); + print_kv(",size", vs->vs_space); + print_kv(",read_bytes", vs->vs_bytes[ZIO_TYPE_READ]); + print_kv(",read_errors", vs->vs_read_errors); + print_kv(",read_ops", vs->vs_ops[ZIO_TYPE_READ]); + print_kv(",write_bytes", vs->vs_bytes[ZIO_TYPE_WRITE]); + print_kv(",write_errors", vs->vs_write_errors); + print_kv(",write_ops", vs->vs_ops[ZIO_TYPE_WRITE]); + print_kv(",checksum_errors", vs->vs_checksum_errors); + print_kv(",fragmentation", vs->vs_fragmentation); + printf(" %llu\n", (u_longlong_t)timestamp); + return (0); +} + +/* + * vdev latency stats are histograms stored as nvlist arrays of uint64. + * Latency stats include the ZIO scheduler classes plus lower-level + * vdev latencies. + * + * In many cases, the top-level "root" view obscures the underlying + * top-level vdev operations. For example, if a pool has a log, special, + * or cache device, then each can behave very differently. It is useful + * to see how each is responding. 
+ */
+static int
+print_vdev_latency_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	uint_t c, nbuckets = 0;
+	nvlist_t *nv_ex;
+	char *vdev_desc = NULL;
+
+	/* short_names become part of the metric name and are influxdb-ready */
+	struct lat_lookup {
+		char *name;
+		char *short_name;
+		uint64_t sum;
+		uint64_t *array;
+	};
+	struct lat_lookup lat_type[] = {
+		{ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, "total_read", 0},
+		{ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, "total_write", 0},
+		{ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, "disk_read", 0},
+		{ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, "disk_write", 0},
+		{ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, "sync_read", 0},
+		{ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, "sync_write", 0},
+		{ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, "async_read", 0},
+		{ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, "async_write", 0},
+		{ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, "scrub", 0},
+#ifdef ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO
+		{ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, "trim", 0},
+#endif
+		{NULL, NULL}
+	};
+
+	/* 6: this vdev has no extended-stats nvlist */
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
+		return (6);
+	}
+
+	vdev_desc = get_vdev_desc(nvroot, parent_name);
+
+	for (int i = 0; lat_type[i].name; i++) {
+		/* 3: one of the expected histogram arrays is missing */
+		if (nvlist_lookup_uint64_array(nv_ex,
+		    lat_type[i].name, &lat_type[i].array, &c) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    lat_type[i].name);
+			return (3);
+		}
+		/*
+		 * All of the arrays are expected to be the same size; keep
+		 * the smallest count seen so that no array is ever indexed
+		 * past its end.
+		 */
+		if (i == 0 || c < nbuckets)
+			nbuckets = c;
+	}
+
+	/*
+	 * Loop on the bucket count instead of a "last index" of c - 1, which
+	 * would wrap around (c is unsigned) if a histogram were empty.
+	 */
+	for (int bucket = 0; (uint_t)bucket < nbuckets; bucket++) {
+		if (bucket < MIN_LAT_INDEX) {
+			/* don't print, but collect the sum */
+			for (int i = 0; lat_type[i].name; i++) {
+				lat_type[i].sum += lat_type[i].array[bucket];
+			}
+			continue;
+		}
+		/* every bucket but the last is labelled with its upper bound */
+		if ((uint_t)bucket + 1 < nbuckets) {
+			printf("%s%s,le=%0.6f,name=%s,%s ",
+			    POOL_LATENCY_MEASUREMENT, tags,
+			    (float)(1ULL << bucket) * 1e-9,
+			    pool_name, vdev_desc);
+		} else {
+			printf("%s%s,le=+Inf,name=%s,%s ",
+			    POOL_LATENCY_MEASUREMENT, tags, pool_name,
+			    vdev_desc);
+		}
+		for (int i = 0; lat_type[i].name; i++) {
+			/*
+			 * The first printed bucket always carries the sum of
+			 * the skipped ones; later buckets are cumulative only
+			 * when sum_histogram_buckets is set.
+			 */
+			if (bucket <= MIN_LAT_INDEX || sum_histogram_buckets) {
+				lat_type[i].sum += lat_type[i].array[bucket];
+			} else {
+				lat_type[i].sum = lat_type[i].array[bucket];
+			}
+			print_kv(lat_type[i].short_name, lat_type[i].sum);
+			if (lat_type[i + 1].name != NULL) {
+				printf(",");
+			}
+		}
+		printf(" %llu\n", (u_longlong_t)timestamp);
+	}
+	return (0);
+}
+
+/*
+ * vdev request size stats are histograms stored as nvlist arrays of uint64.
+ * Request size stats include the ZIO scheduler classes plus lower-level
+ * vdev sizes. Both independent (ind) and aggregated (agg) sizes are reported.
+ *
+ * In many cases, the top-level "root" view obscures the underlying
+ * top-level vdev operations. For example, if a pool has a log, special,
+ * or cache device, then each can behave very differently. It is useful
+ * to see how each is responding.
+ */
+static int
+print_vdev_size_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	uint_t c, nbuckets = 0;
+	nvlist_t *nv_ex;
+	char *vdev_desc = NULL;
+
+	/* short_names become the field name */
+	struct size_lookup {
+		char *name;
+		char *short_name;
+		uint64_t sum;
+		uint64_t *array;
+	};
+	struct size_lookup size_type[] = {
+		{ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, "sync_read_ind"},
+		{ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, "sync_write_ind"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, "async_read_ind"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, "async_write_ind"},
+		{ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, "scrub_read_ind"},
+		{ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, "sync_read_agg"},
+		{ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, "sync_write_agg"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, "async_read_agg"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, "async_write_agg"},
+		{ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, "scrub_read_agg"},
+#ifdef ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO
+		{ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, "trim_write_ind"},
+		{ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, "trim_write_agg"},
+#endif
+		{NULL, NULL}
+	};
+
+	/* 6: this vdev has no extended-stats nvlist */
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
+		return (6);
+	}
+
+	vdev_desc = get_vdev_desc(nvroot, parent_name);
+
+	for (int i = 0; size_type[i].name; i++) {
+		/* 3: one of the expected histogram arrays is missing */
+		if (nvlist_lookup_uint64_array(nv_ex, size_type[i].name,
+		    &size_type[i].array, &c) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    size_type[i].name);
+			return (3);
+		}
+		/*
+		 * All of the arrays are expected to be the same size; keep
+		 * the smallest count seen so that no array is ever indexed
+		 * past its end.
+		 */
+		if (i == 0 || c < nbuckets)
+			nbuckets = c;
+	}
+
+	/*
+	 * Loop on the bucket count instead of a "last index" of c - 1, which
+	 * would wrap around (c is unsigned) if a histogram were empty.
+	 */
+	for (int bucket = 0; (uint_t)bucket < nbuckets; bucket++) {
+		if (bucket < MIN_SIZE_INDEX) {
+			/* don't print, but collect the sum */
+			for (int i = 0; size_type[i].name; i++) {
+				size_type[i].sum += size_type[i].array[bucket];
+			}
+			continue;
+		}
+
+		/* every bucket but the last is labelled with its upper bound */
+		if ((uint_t)bucket + 1 < nbuckets) {
+			printf("%s%s,le=%llu,name=%s,%s ",
+			    POOL_IO_SIZE_MEASUREMENT, tags, 1ULL << bucket,
+			    pool_name, vdev_desc);
+		} else {
+			printf("%s%s,le=+Inf,name=%s,%s ",
+			    POOL_IO_SIZE_MEASUREMENT, tags, pool_name,
+			    vdev_desc);
+		}
+		for (int i = 0; size_type[i].name; i++) {
+			/*
+			 * The first printed bucket always carries the sum of
+			 * the skipped ones; later buckets are cumulative only
+			 * when sum_histogram_buckets is set.
+			 */
+			if (bucket <= MIN_SIZE_INDEX || sum_histogram_buckets) {
+				size_type[i].sum += size_type[i].array[bucket];
+			} else {
+				size_type[i].sum = size_type[i].array[bucket];
+			}
+			print_kv(size_type[i].short_name, size_type[i].sum);
+			if (size_type[i + 1].name != NULL) {
+				printf(",");
+			}
+		}
+		printf(" %llu\n", (u_longlong_t)timestamp);
+	}
+	return (0);
+}
+
+/*
+ * ZIO scheduler queue stats are stored as gauges. This is unfortunate
+ * because the values can change very rapidly and any point-in-time
+ * value will quickly be obsoleted. It is also not easy to downsample.
+ * Thus only the top-level queue stats might be beneficial... maybe.
+ */
+static int
+print_queue_stats(nvlist_t *nvroot, const char *pool_name,
+    const char *parent_name)
+{
+	nvlist_t *nv_ex;
+	uint64_t value;
+
+	/* short_names are used for the field name */
+	struct queue_lookup {
+		char *name;
+		char *short_name;
+	};
+	struct queue_lookup queue_type[] = {
+		{ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, "sync_r_active"},
+		{ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, "sync_w_active"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, "async_r_active"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, "async_w_active"},
+		{ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, "async_scrub_active"},
+		{ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, "sync_r_pend"},
+		{ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, "sync_w_pend"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, "async_r_pend"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, "async_w_pend"},
+		{ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, "async_scrub_pend"},
+		{NULL, NULL}
+	};
+
+	/* 6: this vdev has no extended-stats nvlist */
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
+		return (6);
+	}
+
+	/* one output line: measurement and tags, then the fields, then time */
+	printf("%s%s,name=%s,%s ", POOL_QUEUE_MEASUREMENT, tags, pool_name,
+	    get_vdev_desc(nvroot, parent_name));
+	for (int i = 0; queue_type[i].name; i++) {
+		/* 3: an expected queue counter is missing */
+		if (nvlist_lookup_uint64(nv_ex,
+		    queue_type[i].name, &value) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    queue_type[i].name);
+			return (3);
+		}
+		print_kv(queue_type[i].short_name, value);
+		if (queue_type[i + 1].name != NULL) {
+			printf(",");
+		}
+	}
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * top-level vdev stats are at the pool level
+ */
+static int
+print_top_level_vdev_stats(nvlist_t *nvroot, const char *pool_name)
+{
+	nvlist_t *nv_ex;
+	uint64_t value;
+
+	/* short_names become part of the metric name */
+	struct queue_lookup {
+		char *name;
+		char *short_name;
+	};
+	struct queue_lookup queue_type[] = {
+		{ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, "sync_r_active_queue"},
+		{ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, "sync_w_active_queue"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, "async_r_active_queue"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, "async_w_active_queue"},
+		{ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, "async_scrub_active_queue"},
+		{ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, "sync_r_pend_queue"},
+		{ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, "sync_w_pend_queue"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, "async_r_pend_queue"},
+		{ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, "async_w_pend_queue"},
+		{ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, "async_scrub_pend_queue"},
+		{NULL, NULL}
+	};
+
+	/* 6: the root vdev has no extended-stats nvlist */
+	if (nvlist_lookup_nvlist(nvroot,
+	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
+		return (6);
+	}
+
+	printf("%s%s,name=%s,vdev=root ", VDEV_MEASUREMENT, tags,
+	    pool_name);
+	for (int i = 0; queue_type[i].name; i++) {
+		/* 3: an expected queue counter is missing */
+		if (nvlist_lookup_uint64(nv_ex,
+		    queue_type[i].name, &value) != 0) {
+			fprintf(stderr, "error: can't get %s\n",
+			    queue_type[i].name);
+			return (3);
+		}
+		if (i > 0)
+			printf(",");
+		print_kv(queue_type[i].short_name, value);
+	}
+
+	printf(" %llu\n", (u_longlong_t)timestamp);
+	return (0);
+}
+
+/*
+ * recursive stats printer
+ *
+ * Calls func on nvroot and then, when descend is non-zero, on every vdev
+ * listed under ZPOOL_CONFIG_CHILDREN, passing this vdev's name down as the
+ * children's parent_name. The first non-zero value returned by func stops
+ * the walk and is handed back to the caller; 0 means every visited vdev
+ * was printed successfully.
+ */
+static int
+print_recursive_stats(stat_printer_f func, nvlist_t *nvroot,
+    const char *pool_name, const char *parent_name, int descend)
+{
+	uint_t c, children;
+	nvlist_t **child;
+	char vdev_name[256];
+	int err;
+
+	err = func(nvroot, pool_name, parent_name);
+	if (err)
+		return (err);
+
+	if (descend && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &child, &children) == 0) {
+		/* copy the name: it is reused for every child below */
+		(void) strncpy(vdev_name, get_vdev_name(nvroot, parent_name),
+		    sizeof (vdev_name));
+		vdev_name[sizeof (vdev_name) - 1] = '\0';
+
+		for (c = 0; c < children; c++) {
+			/* do not swallow a child's failure */
+			err = print_recursive_stats(func, child[c], pool_name,
+			    vdev_name, descend);
+			if (err)
+				return (err);
+		}
+	}
+	return (0);
+}
+
+/*
+ * call-back to print the stats from the pool config
+ *
+ * Note: if the pool is broken, this can hang indefinitely and perhaps in an
+ * unkillable state.
+ */
+static int
+print_stats(zpool_handle_t *zhp, void *data)
+{
+	uint_t c;
+	int err;
+	boolean_t missing;
+	nvlist_t *config, *nvroot;
+	vdev_stat_t *vs;
+	struct timespec tv;
+	char *pool_name;
+
+	/* if not this pool return quickly */
+	if (data &&
+	    strncmp(data, zhp->zpool_name, ZFS_MAX_DATASET_NAME_LEN) != 0) {
+		zpool_close(zhp);
+		return (0);
+	}
+
+	if (zpool_refresh_stats(zhp, &missing) != 0) {
+		zpool_close(zhp);
+		return (1);
+	}
+
+	config = zpool_get_config(zhp, NULL);
+	/* nanosecond timestamp; fall back to whole seconds from time() */
+	if (clock_gettime(CLOCK_REALTIME, &tv) != 0)
+		timestamp = (uint64_t)time(NULL) * 1000000000;
+	else
+		timestamp =
+		    ((uint64_t)tv.tv_sec * 1000000000) + (uint64_t)tv.tv_nsec;
+
+	if (nvlist_lookup_nvlist(
+	    config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) {
+		zpool_close(zhp);
+		return (2);
+	}
+	/* vs and c are not used afterwards; the lookup is a presence check */
+	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t **)&vs, &c) != 0) {
+		zpool_close(zhp);
+		return (3);
+	}
+
+	pool_name = escape_string(zhp->zpool_name);
+	err = print_recursive_stats(print_summary_stats, nvroot,
+	    pool_name, NULL, 1);
+	/* if any of these return an error, skip the rest */
+	if (err == 0)
+		err = print_top_level_vdev_stats(nvroot, pool_name);
+
+	if (no_histograms == 0) {
+		if (err == 0)
+			err = print_recursive_stats(print_vdev_latency_stats, nvroot,
+			    pool_name, NULL, 1);
+		if (err == 0)
+			err = print_recursive_stats(print_vdev_size_stats, nvroot,
+			    pool_name, NULL, 1);
+		if (err == 0)
+			err = print_recursive_stats(print_queue_stats, nvroot,
+			    pool_name, NULL, 0);
+	}
+	if (err == 0)
+		err = print_scan_status(nvroot, pool_name);
+
+	free(pool_name);
+	zpool_close(zhp);
+	return (err);
+}
+
+/*
+ * Print the command synopsis to stderr and exit with EXIT_FAILURE.
+ * Every long option accepted by main() is listed, including --tags.
+ */
+static void
+usage(char *name)
+{
+	fprintf(stderr, "usage: %s [--execd] [--no-histograms] "
+	    "[--sum-histogram-buckets] [--signed-int] [--tags tags] "
+	    "[poolname]\n", name);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Parse the options, then print stats for every pool (or only the pool
+ * named on the command line). In --execd mode one full report is printed
+ * for each line read from stdin. The exit status is the value returned by
+ * the last zpool_iter() call, or 8 if it was never called.
+ */
+int
+main(int argc, char *argv[])
+{
+	int opt;
+	int ret = 8;
+	char *line = NULL;
+	size_t len = 0, tagslen = 0;
+	struct option long_options[] = {
+		{"execd", no_argument, NULL, 'e'},
+		{"help", no_argument, NULL, 'h'},
+		{"no-histograms", no_argument, NULL, 'n'},
+		{"signed-int", no_argument, NULL, 'i'},
+		{"sum-histogram-buckets", no_argument, NULL, 's'},
+		{"tags", required_argument, NULL, 't'},
+		{0, 0, 0, 0}
+	};
+	while ((opt = getopt_long(
+	    argc, argv, "ehinst:", long_options, NULL)) != -1) {
+		switch (opt) {
+		case 'e':
+			execd_mode = 1;
+			break;
+		case 'i':
+			metric_data_type = 'i';
+			metric_value_mask = INT64_MAX;
+			break;
+		case 'n':
+			no_histograms = 1;
+			break;
+		case 's':
+			sum_histogram_buckets = 1;
+			break;
+		case 't':
+			/* room for the leading ',' and the trailing NUL */
+			tagslen = strlen(optarg) + 2;
+			tags = calloc(tagslen, 1);
+			if (tags == NULL) {
+				fprintf(stderr,
+				    "error: cannot allocate memory "
+				    "for tags\n");
+				exit(1);
+			}
+			(void) snprintf(tags, tagslen, ",%s", optarg);
+			break;
+		default:
+			/* 'h' and unknown options both end up here */
+			usage(argv[0]);
+		}
+	}
+
+	libzfs_handle_t *g_zfs;
+	if ((g_zfs = libzfs_init()) == NULL) {
+		fprintf(stderr,
+		    "error: cannot initialize libzfs. "
+		    "Is the zfs module loaded or zrepl running?\n");
+		exit(EXIT_FAILURE);
+	}
+	if (execd_mode == 0) {
+		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
+	} else {
+		while (getline(&line, &len, stdin) != -1) {
+			ret = zpool_iter(g_zfs, print_stats, argv[optind]);
+			fflush(stdout);
+		}
+		/* getline() allocates the buffer; release it when done */
+		free(line);
+	}
+	libzfs_fini(g_zfs);
+	return (ret);
+}
diff --git a/sys/contrib/openzfs/cmd/zstream/zstream_redup.c b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c index 379025ce59e5..41f1068e3dfc 100644 --- a/sys/contrib/openzfs/cmd/zstream/zstream_redup.c +++ b/sys/contrib/openzfs/cmd/zstream/zstream_redup.c @@ -421,7 +421,7 @@ int zstream_do_redup(int argc, char *argv[]) { boolean_t verbose = B_FALSE; - char c; + int c; while ((c = getopt(argc, argv, "v")) != -1) { switch (c) { diff --git a/sys/contrib/openzfs/cmd/ztest/ztest.c b/sys/contrib/openzfs/cmd/ztest/ztest.c index 31205a5bf8cf..f66772fa7285 100644 --- a/sys/contrib/openzfs/cmd/ztest/ztest.c +++ b/sys/contrib/openzfs/cmd/ztest/ztest.c @@ -104,6 +104,7 @@ #include #include #include +#include #include #include
#include @@ -167,8 +168,11 @@ typedef struct ztest_shared_opts { size_t zo_vdev_size; int zo_ashift; int zo_mirrors; - int zo_raidz; - int zo_raidz_parity; + int zo_raid_children; + int zo_raid_parity; + char zo_raid_type[8]; + int zo_draid_data; + int zo_draid_spares; int zo_datasets; int zo_threads; uint64_t zo_passtime; @@ -191,9 +195,12 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_vdevs = 5, .zo_ashift = SPA_MINBLOCKSHIFT, .zo_mirrors = 2, - .zo_raidz = 4, - .zo_raidz_parity = 1, + .zo_raid_children = 4, + .zo_raid_parity = 1, + .zo_raid_type = VDEV_TYPE_RAIDZ, .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ + .zo_draid_data = 4, /* data drives */ + .zo_draid_spares = 1, /* distributed spares */ .zo_datasets = 7, .zo_threads = 23, .zo_passtime = 60, /* 60 seconds */ @@ -232,7 +239,7 @@ static ztest_shared_ds_t *ztest_shared_ds; #define BT_MAGIC 0x123456789abcdefULL #define MAXFAULTS(zs) \ - (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) + (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raid_parity + 1) - 1) enum ztest_io_type { ZTEST_IO_WRITE_TAG, @@ -689,8 +696,11 @@ usage(boolean_t requested) "\t[-s size_of_each_vdev (default: %s)]\n" "\t[-a alignment_shift (default: %d)] use 0 for random\n" "\t[-m mirror_copies (default: %d)]\n" - "\t[-r raidz_disks (default: %d)]\n" - "\t[-R raidz_parity (default: %d)]\n" + "\t[-r raidz_disks / draid_disks (default: %d)]\n" + "\t[-R raid_parity (default: %d)]\n" + "\t[-K raid_kind (default: random)] raidz|draid|random\n" + "\t[-D draid_data (default: %d)] in config\n" + "\t[-S draid_spares (default: %d)]\n" "\t[-d datasets (default: %d)]\n" "\t[-t threads (default: %d)]\n" "\t[-g gang_block_threshold (default: %s)]\n" @@ -716,8 +726,10 @@ usage(boolean_t requested) nice_vdev_size, /* -s */ zo->zo_ashift, /* -a */ zo->zo_mirrors, /* -m */ - zo->zo_raidz, /* -r */ - zo->zo_raidz_parity, /* -R */ + zo->zo_raid_children, /* -r */ + zo->zo_raid_parity, /* -R */ + zo->zo_draid_data, /* 
-D */ + zo->zo_draid_spares, /* -S */ zo->zo_datasets, /* -d */ zo->zo_threads, /* -t */ nice_force_ganging, /* -g */ @@ -731,6 +743,21 @@ usage(boolean_t requested) exit(requested ? 0 : 1); } +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + ASSERT3S(ztest_fd_rand, >=, 0); + + if (range == 0) + return (0); + + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} static void ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) @@ -780,11 +807,12 @@ process_options(int argc, char **argv) int opt; uint64_t value; char altdir[MAXNAMELEN] = { 0 }; + char raid_kind[8] = { "random" }; bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); while ((opt = getopt(argc, argv, - "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { + "v:s:a:m:r:R:K:D:S:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { value = 0; switch (opt) { case 'v': @@ -793,6 +821,8 @@ process_options(int argc, char **argv) case 'm': case 'r': case 'R': + case 'D': + case 'S': case 'd': case 't': case 'g': @@ -817,10 +847,19 @@ process_options(int argc, char **argv) zo->zo_mirrors = value; break; case 'r': - zo->zo_raidz = MAX(1, value); + zo->zo_raid_children = MAX(1, value); break; case 'R': - zo->zo_raidz_parity = MIN(MAX(value, 1), 3); + zo->zo_raid_parity = MIN(MAX(value, 1), 3); + break; + case 'K': + (void) strlcpy(raid_kind, optarg, sizeof (raid_kind)); + break; + case 'D': + zo->zo_draid_data = MAX(1, value); + break; + case 'S': + zo->zo_draid_spares = MAX(1, value); break; case 'd': zo->zo_datasets = MAX(1, value); @@ -895,7 +934,54 @@ process_options(int argc, char **argv) } } - zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); + /* When raid choice is 'random' add a draid pool 50% of the time */ + if (strcmp(raid_kind, "random") == 0) { + (void) strlcpy(raid_kind, (ztest_random(2) == 0) ? 
+ "draid" : "raidz", sizeof (raid_kind)); + + if (ztest_opts.zo_verbose >= 3) + (void) printf("choosing RAID type '%s'\n", raid_kind); + } + + if (strcmp(raid_kind, "draid") == 0) { + uint64_t min_devsize; + + /* With fewer disk use 256M, otherwise 128M is OK */ + min_devsize = (ztest_opts.zo_raid_children < 16) ? + (256ULL << 20) : (128ULL << 20); + + /* No top-level mirrors with dRAID for now */ + zo->zo_mirrors = 0; + + /* Use more appropriate defaults for dRAID */ + if (zo->zo_vdevs == ztest_opts_defaults.zo_vdevs) + zo->zo_vdevs = 1; + if (zo->zo_raid_children == + ztest_opts_defaults.zo_raid_children) + zo->zo_raid_children = 16; + if (zo->zo_ashift < 12) + zo->zo_ashift = 12; + if (zo->zo_vdev_size < min_devsize) + zo->zo_vdev_size = min_devsize; + + if (zo->zo_draid_data + zo->zo_raid_parity > + zo->zo_raid_children - zo->zo_draid_spares) { + (void) fprintf(stderr, "error: too few draid " + "children (%d) for stripe width (%d)\n", + zo->zo_raid_children, + zo->zo_draid_data + zo->zo_raid_parity); + usage(B_FALSE); + } + + (void) strlcpy(zo->zo_raid_type, VDEV_TYPE_DRAID, + sizeof (zo->zo_raid_type)); + + } else /* using raidz */ { + ASSERT0(strcmp(raid_kind, "raidz")); + + zo->zo_raid_parity = MIN(zo->zo_raid_parity, + zo->zo_raid_children - 1); + } zo->zo_vdevtime = (zo->zo_vdevs > 0 ? 
zo->zo_time * NANOSEC / zo->zo_vdevs : @@ -966,22 +1052,6 @@ ztest_kill(ztest_shared_t *zs) (void) kill(getpid(), SIGKILL); } -static uint64_t -ztest_random(uint64_t range) -{ - uint64_t r; - - ASSERT3S(ztest_fd_rand, >=, 0); - - if (range == 0) - return (0); - - if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) - fatal(1, "short read from /dev/urandom"); - - return (r % range); -} - /* ARGSUSED */ static void ztest_record_enospc(const char *s) @@ -997,12 +1067,27 @@ ztest_get_ashift(void) return (ztest_opts.zo_ashift); } +static boolean_t +ztest_is_draid_spare(const char *name) +{ + uint64_t spare_id = 0, parity = 0, vdev_id = 0; + + if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", + (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, + (u_longlong_t *)&spare_id) == 3) { + return (B_TRUE); + } + + return (B_FALSE); +} + static nvlist_t * make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) { char *pathbuf; uint64_t vdev; nvlist_t *file; + boolean_t draid_spare = B_FALSE; pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); @@ -1024,9 +1109,11 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) ztest_dev_template, ztest_opts.zo_dir, pool == NULL ? ztest_opts.zo_pool : pool, vdev); } + } else { + draid_spare = ztest_is_draid_spare(path); } - if (size != 0) { + if (size != 0 && !draid_spare) { int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); if (fd == -1) fatal(1, "can't open %s", path); @@ -1035,20 +1122,21 @@ make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) (void) close(fd); } - VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); - VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); - VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); + VERIFY0(nvlist_alloc(&file, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, + draid_spare ? 
VDEV_TYPE_DRAID_SPARE : VDEV_TYPE_FILE)); + VERIFY0(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path)); + VERIFY0(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift)); umem_free(pathbuf, MAXPATHLEN); return (file); } static nvlist_t * -make_vdev_raidz(char *path, char *aux, char *pool, size_t size, +make_vdev_raid(char *path, char *aux, char *pool, size_t size, uint64_t ashift, int r) { - nvlist_t *raidz, **child; + nvlist_t *raid, **child; int c; if (r < 2) @@ -1058,20 +1146,41 @@ make_vdev_raidz(char *path, char *aux, char *pool, size_t size, for (c = 0; c < r; c++) child[c] = make_vdev_file(path, aux, pool, size, ashift); - VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); - VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, - VDEV_TYPE_RAIDZ) == 0); - VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, - ztest_opts.zo_raidz_parity) == 0); - VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, - child, r) == 0); + VERIFY0(nvlist_alloc(&raid, NV_UNIQUE_NAME, 0)); + VERIFY0(nvlist_add_string(raid, ZPOOL_CONFIG_TYPE, + ztest_opts.zo_raid_type)); + VERIFY0(nvlist_add_uint64(raid, ZPOOL_CONFIG_NPARITY, + ztest_opts.zo_raid_parity)); + VERIFY0(nvlist_add_nvlist_array(raid, ZPOOL_CONFIG_CHILDREN, + child, r)); + + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) { + uint64_t ndata = ztest_opts.zo_draid_data; + uint64_t nparity = ztest_opts.zo_raid_parity; + uint64_t nspares = ztest_opts.zo_draid_spares; + uint64_t children = ztest_opts.zo_raid_children; + uint64_t ngroups = 1; + + /* + * Calculate the minimum number of groups required to fill a + * slice. This is the LCM of the stripe width (data + parity) + * and the number of data drives (children - spares). + */ + while (ngroups * (ndata + nparity) % (children - nspares) != 0) + ngroups++; + + /* Store the basic dRAID configuration. 
*/ + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NDATA, ndata); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NSPARES, nspares); + fnvlist_add_uint64(raid, ZPOOL_CONFIG_DRAID_NGROUPS, ngroups); + } for (c = 0; c < r; c++) nvlist_free(child[c]); umem_free(child, r * sizeof (nvlist_t *)); - return (raidz); + return (raid); } static nvlist_t * @@ -1082,12 +1191,12 @@ make_vdev_mirror(char *path, char *aux, char *pool, size_t size, int c; if (m < 1) - return (make_vdev_raidz(path, aux, pool, size, ashift, r)); + return (make_vdev_raid(path, aux, pool, size, ashift, r)); child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); for (c = 0; c < m; c++) - child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); + child[c] = make_vdev_raid(path, aux, pool, size, ashift, r); VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, @@ -1332,7 +1441,11 @@ ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); - dsl_crypto_params_free(dcp, B_FALSE); + /* + * Note: if there was an error loading, the wkey was not + * consumed, and needs to be freed. + */ + dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err == EINVAL) { @@ -2809,6 +2922,10 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) if (ztest_opts.zo_mmp_test) return; + /* dRAID added after feature flags, skip upgrade test. 
*/ + if (strcmp(ztest_opts.zo_raid_type, VDEV_TYPE_DRAID) == 0) + return; + mutex_enter(&ztest_vdev_lock); name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); @@ -2818,13 +2935,13 @@ ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) (void) spa_destroy(name); nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + NULL, ztest_opts.zo_raid_children, ztest_opts.zo_mirrors, 1); /* * If we're configuring a RAIDZ device then make sure that the * initial version is capable of supporting that feature. */ - switch (ztest_opts.zo_raidz_parity) { + switch (ztest_opts.zo_raid_parity) { case 0: case 1: initial_version = SPA_VERSION_INITIAL; @@ -2970,7 +3087,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) return; mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * + ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -2985,7 +3103,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) /* * find the first real slog in log allocation class */ - mg = spa_log_class(spa)->mc_rotor; + mg = spa_log_class(spa)->mc_allocator[0].mca_rotor; while (!mg->mg_vd->vdev_islog) mg = mg->mg_next; @@ -3024,7 +3142,8 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) */ nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? 
- "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, + 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -3078,14 +3197,15 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) return; } - leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * + ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; spa_config_exit(spa, SCL_VDEV, FTAG); nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); error = spa_vdev_add(spa, nvroot); nvlist_free(nvroot); @@ -3134,7 +3254,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) char *aux; char *path; uint64_t guid = 0; - int error; + int error, ignore_err = 0; if (ztest_opts.zo_mmp_test) return; @@ -3157,7 +3277,13 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) /* * Pick a random device to remove. */ - guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + vdev_t *svd = sav->sav_vdevs[ztest_random(sav->sav_count)]; + + /* dRAID spares cannot be removed; try anyways to see ENOTSUP */ + if (strstr(svd->vdev_path, VDEV_TYPE_DRAID) != NULL) + ignore_err = ENOTSUP; + + guid = svd->vdev_guid; } else { /* * Find an unused device we can add. 
@@ -3214,7 +3340,9 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) case ZFS_ERR_DISCARDING_CHECKPOINT: break; default: - fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); + if (error != ignore_err) + fatal(0, "spa_vdev_remove(%llu) = %d", guid, + error); } } @@ -3243,7 +3371,7 @@ ztest_split_pool(ztest_ds_t *zd, uint64_t id) mutex_enter(&ztest_vdev_lock); /* ensure we have a usable config; mirrors of raidz aren't supported */ - if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { + if (zs->zs_mirrors < 3 || ztest_opts.zo_raid_children > 1) { mutex_exit(&ztest_vdev_lock); return; } @@ -3343,6 +3471,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) int replacing; int oldvd_has_siblings = B_FALSE; int newvd_is_spare = B_FALSE; + int newvd_is_dspare = B_FALSE; int oldvd_is_log; int error, expected_error; @@ -3353,7 +3482,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); mutex_enter(&ztest_vdev_lock); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); @@ -3365,8 +3494,7 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) */ if (ztest_device_removal_active) { spa_config_exit(spa, SCL_ALL, FTAG); - mutex_exit(&ztest_vdev_lock); - return; + goto out; } /* @@ -3393,14 +3521,17 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (zs->zs_mirrors >= 1) { ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); ASSERT(oldvd->vdev_children >= zs->zs_mirrors); - oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; + oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raid_children]; } /* pick a child out of the raidz group */ - if (ztest_opts.zo_raidz > 1) { - ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); - ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); - oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; + if (ztest_opts.zo_raid_children > 1) { + if 
(strcmp(oldvd->vdev_ops->vdev_op_type, "raidz") == 0) + ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); + else + ASSERT(oldvd->vdev_ops == &vdev_draid_ops); + ASSERT(oldvd->vdev_children == ztest_opts.zo_raid_children); + oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raid_children]; } /* @@ -3447,6 +3578,10 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) if (sav->sav_count != 0 && ztest_random(3) == 0) { newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; newvd_is_spare = B_TRUE; + + if (newvd->vdev_ops == &vdev_draid_spare_ops) + newvd_is_dspare = B_TRUE; + (void) strcpy(newpath, newvd->vdev_path); } else { (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, @@ -3480,6 +3615,9 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) * If newvd is already part of the pool, it should fail with EBUSY. * * If newvd is too small, it should fail with EOVERFLOW. + * + * If newvd is a distributed spare and it's being attached to a + * dRAID which is not its parent it should fail with ENOTSUP. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops && (!replacing || @@ -3492,10 +3630,12 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) expected_error = replacing ?
0 : EBUSY; else if (vdev_lookup_by_path(rvd, newpath) != NULL) expected_error = EBUSY; - else if (newsize < oldsize) + else if (!newvd_is_dspare && newsize < oldsize) expected_error = EOVERFLOW; else if (ashift > oldvd->vdev_top->vdev_ashift) expected_error = EDOM; + else if (newvd_is_dspare && pvd != vdev_draid_spare_get_parent(newvd)) + expected_error = ENOTSUP; else expected_error = 0; @@ -4880,13 +5020,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - VERIFY(0 == dmu_read(os, packobj, packoff, + VERIFY0(dmu_read(os, packobj, packoff, packsize, packcheck, DMU_READ_PREFETCH)); - VERIFY(0 == dmu_read(os, bigobj, bigoff, + VERIFY0(dmu_read(os, bigobj, bigoff, bigsize, bigcheck, DMU_READ_PREFETCH)); - ASSERT(bcmp(packbuf, packcheck, packsize) == 0); - ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + ASSERT0(bcmp(packbuf, packcheck, packsize)); + ASSERT0(bcmp(bigbuf, bigcheck, bigsize)); umem_free(packcheck, packsize); umem_free(bigcheck, bigsize); @@ -5761,7 +5901,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) } maxfaults = MAXFAULTS(zs); - leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raid_children; mirror_save = zs->zs_mirrors; mutex_exit(&ztest_vdev_lock); @@ -6011,7 +6151,7 @@ ztest_fault_inject(ztest_ds_t *zd, uint64_t id) /* * By design ztest will never inject uncorrectable damage in to the pool. * Issue a scrub, wait for it to complete, and verify there is never any - * any persistent damage. + * persistent damage. * * Only after a full scrub has been completed is it safe to start injecting * data corruption. See the comment in zfs_fault_inject(). 
@@ -7016,6 +7156,7 @@ ztest_import_impl(ztest_shared_t *zs) VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, &libzpool_config_ops)); VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); + fnvlist_free(cfg); } /* @@ -7347,7 +7488,7 @@ ztest_init(ztest_shared_t *zs) zs->zs_splits = 0; zs->zs_mirrors = ztest_opts.zo_mirrors; nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, - NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); props = make_random_props(); /* @@ -7683,10 +7824,12 @@ main(int argc, char **argv) if (ztest_opts.zo_verbose >= 1) { (void) printf("%llu vdevs, %d datasets, %d threads," - " %llu seconds...\n", + " %d %s disks, %llu seconds...\n\n", (u_longlong_t)ztest_opts.zo_vdevs, ztest_opts.zo_datasets, ztest_opts.zo_threads, + ztest_opts.zo_raid_children, + ztest_opts.zo_raid_type, (u_longlong_t)ztest_opts.zo_time); } diff --git a/sys/contrib/openzfs/config/Abigail.am b/sys/contrib/openzfs/config/Abigail.am new file mode 100644 index 000000000000..599f611942b0 --- /dev/null +++ b/sys/contrib/openzfs/config/Abigail.am @@ -0,0 +1,29 @@ +# +# When performing an ABI check the following options are applied: +# +# --no-unreferenced-symbols: Exclude symbols which are not referenced by +# any debug information. Without this _init() and _fini() are incorrectly +# reported on CentOS7 for libuutil.so. +# +# --headers-dir1: Limit ABI checks to public OpenZFS headers, otherwise +# changes in public system headers are also reported. +# +# --suppressions: Honor a suppressions file for each library to provide +# a mechanism for suppressing harmless warnings. 
+# + +PHONY += checkabi storeabi + +checkabi: + for lib in $(lib_LTLIBRARIES) ; do \ + abidiff --no-unreferenced-symbols \ + --headers-dir1 ../../include \ + --suppressions $${lib%.la}.suppr \ + $${lib%.la}.abi .libs/$${lib%.la}.so ; \ + done + +storeabi: + cd .libs ; \ + for lib in $(lib_LTLIBRARIES) ; do \ + abidw $${lib%.la}.so > ../$${lib%.la}.abi ; \ + done diff --git a/sys/contrib/openzfs/config/always-python.m4 b/sys/contrib/openzfs/config/always-python.m4 index c01e631a8f4f..76b06fcd8488 100644 --- a/sys/contrib/openzfs/config/always-python.m4 +++ b/sys/contrib/openzfs/config/always-python.m4 @@ -7,7 +7,7 @@ dnl # set the PYTHON environment variable accordingly. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYTHON], [ AC_ARG_WITH([python], - AC_HELP_STRING([--with-python[=VERSION]], + AS_HELP_STRING([--with-python[=VERSION]], [default system python version @<:@default=check@:>@]), [with_python=$withval], [with_python=check]) diff --git a/sys/contrib/openzfs/config/always-pyzfs.m4 b/sys/contrib/openzfs/config/always-pyzfs.m4 index f620a8f9a18b..76e07b593df2 100644 --- a/sys/contrib/openzfs/config/always-pyzfs.m4 +++ b/sys/contrib/openzfs/config/always-pyzfs.m4 @@ -22,7 +22,7 @@ dnl # Determines if pyzfs can be built, requires Python 2.7 or later. 
dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ AC_ARG_ENABLE([pyzfs], - AC_HELP_STRING([--enable-pyzfs], + AS_HELP_STRING([--enable-pyzfs], [install libzfs_core python bindings @<:@default=check@:>@]), [enable_pyzfs=$enableval], [enable_pyzfs=check]) diff --git a/sys/contrib/openzfs/config/always-sed.m4 b/sys/contrib/openzfs/config/always-sed.m4 index 19633e118aed..3d7ae285ba1b 100644 --- a/sys/contrib/openzfs/config/always-sed.m4 +++ b/sys/contrib/openzfs/config/always-sed.m4 @@ -4,7 +4,7 @@ dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_SED], [ AC_REQUIRE([AC_PROG_SED])dnl AC_CACHE_CHECK([for sed --in-place], [ac_cv_inplace], [ - tmpfile=$(mktemp conftest.XXX) + tmpfile=$(mktemp conftest.XXXXXX) echo foo >$tmpfile AS_IF([$SED --in-place 's#foo#bar#' $tmpfile 2>/dev/null], [ac_cv_inplace="--in-place"], diff --git a/sys/contrib/openzfs/config/deb.am b/sys/contrib/openzfs/config/deb.am index 79063e407fe3..639a46efddbf 100644 --- a/sys/contrib/openzfs/config/deb.am +++ b/sys/contrib/openzfs/config/deb.am @@ -41,11 +41,11 @@ deb-utils: deb-local rpm-utils-initramfs arch=`$(RPM) -qp $${name}-$${version}.src.rpm --qf %{arch} | tail -1`; \ debarch=`$(DPKG) --print-architecture`; \ pkg1=$${name}-$${version}.$${arch}.rpm; \ - pkg2=libnvpair1-$${version}.$${arch}.rpm; \ - pkg3=libuutil1-$${version}.$${arch}.rpm; \ - pkg4=libzfs2-$${version}.$${arch}.rpm; \ - pkg5=libzpool2-$${version}.$${arch}.rpm; \ - pkg6=libzfs2-devel-$${version}.$${arch}.rpm; \ + pkg2=libnvpair3-$${version}.$${arch}.rpm; \ + pkg3=libuutil3-$${version}.$${arch}.rpm; \ + pkg4=libzfs4-$${version}.$${arch}.rpm; \ + pkg5=libzpool4-$${version}.$${arch}.rpm; \ + pkg6=libzfs4-devel-$${version}.$${arch}.rpm; \ pkg7=$${name}-test-$${version}.$${arch}.rpm; \ pkg8=$${name}-dracut-$${version}.noarch.rpm; \ pkg9=$${name}-initramfs-$${version}.$${arch}.rpm; \ @@ -53,10 +53,10 @@ deb-utils: deb-local rpm-utils-initramfs ## Arguments need to be passed to dh_shlibdeps. 
Alien provides no mechanism ## to do this, so we install a shim onto the path which calls the real ## dh_shlibdeps with the required arguments. - path_prepend=`mktemp -d /tmp/intercept.XXX`; \ + path_prepend=`mktemp -d /tmp/intercept.XXXXXX`; \ echo "#$(SHELL)" > $${path_prepend}/dh_shlibdeps; \ echo "`which dh_shlibdeps` -- \ - -xlibuutil1linux -xlibnvpair1linux -xlibzfs2linux -xlibzpool2linux" \ + -xlibuutil3linux -xlibnvpair3linux -xlibzfs4linux -xlibzpool4linux" \ >> $${path_prepend}/dh_shlibdeps; \ ## These -x arguments are passed to dpkg-shlibdeps, which exclude the ## Debianized packages from the auto-generated dependencies of the new debs, diff --git a/sys/contrib/openzfs/config/kernel-acl.m4 b/sys/contrib/openzfs/config/kernel-acl.m4 index 0f1c24656730..e02ce665323f 100644 --- a/sys/contrib/openzfs/config/kernel-acl.m4 +++ b/sys/contrib/openzfs/config/kernel-acl.m4 @@ -11,7 +11,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_RELEASE], [ ], [ struct posix_acl *tmp = posix_acl_alloc(1, 0); posix_acl_release(tmp); - ], [], [$ZFS_META_LICENSE]) + ], [], [ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_RELEASE], [ @@ -50,7 +50,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_CACHED_ACL_USABLE], [ struct posix_acl *acl = posix_acl_alloc(1, 0); set_cached_acl(ip, ACL_TYPE_ACCESS, acl); forget_cached_acl(ip, ACL_TYPE_ACCESS); - ], [], [$ZFS_META_LICENSE]) + ], [], [ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE], [ diff --git a/sys/contrib/openzfs/config/kernel-bio.m4 b/sys/contrib/openzfs/config/kernel-bio.m4 index 534282780d3e..0c533531dceb 100644 --- a/sys/contrib/openzfs/config/kernel-bio.m4 +++ b/sys/contrib/openzfs/config/kernel-bio.m4 @@ -188,7 +188,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_SET_DEV], [ struct block_device *bdev = NULL; struct bio *bio = NULL; bio_set_dev(bio, bdev); - ], [], [$ZFS_META_LICENSE]) + ], [], [ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_KERNEL_BIO_SET_DEV], [ @@ -347,7 +347,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKG_TRYGET], [ 
struct blkcg_gq blkg __attribute__ ((unused)) = {}; bool rc __attribute__ ((unused)); rc = blkg_tryget(&blkg); - ], [], [$ZFS_META_LICENSE]) + ], [], [ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKG_TRYGET], [ diff --git a/sys/contrib/openzfs/config/kernel-blk-queue.m4 b/sys/contrib/openzfs/config/kernel-blk-queue.m4 index 382ebefd34a3..ff2da92e9ee1 100644 --- a/sys/contrib/openzfs/config/kernel-blk-queue.m4 +++ b/sys/contrib/openzfs/config/kernel-blk-queue.m4 @@ -179,7 +179,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH], [ ], [ struct request_queue *q = NULL; (void) blk_queue_flush(q, REQ_FLUSH); - ], [$NO_UNUSED_BUT_SET_VARIABLE], [$ZFS_META_LICENSE]) + ], [$NO_UNUSED_BUT_SET_VARIABLE], [ZFS_META_LICENSE]) ZFS_LINUX_TEST_SRC([blk_queue_write_cache], [ #include @@ -187,7 +187,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH], [ ], [ struct request_queue *q = NULL; blk_queue_write_cache(q, true, true); - ], [$NO_UNUSED_BUT_SET_VARIABLE], [$ZFS_META_LICENSE]) + ], [$NO_UNUSED_BUT_SET_VARIABLE], [ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLUSH], [ diff --git a/sys/contrib/openzfs/config/kernel-blkdev.m4 b/sys/contrib/openzfs/config/kernel-blkdev.m4 index 2644555f5524..4b80d4dd29a5 100644 --- a/sys/contrib/openzfs/config/kernel-blkdev.m4 +++ b/sys/contrib/openzfs/config/kernel-blkdev.m4 @@ -77,6 +77,59 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ ]) ]) +dnl # +dnl # check_disk_change() was removed in 5.10 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE], [ + ZFS_LINUX_TEST_SRC([check_disk_change], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + bool error; + + error = check_disk_change(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ + AC_MSG_CHECKING([whether check_disk_change() exists]) + ZFS_LINUX_TEST_RESULT([check_disk_change], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CHECK_DISK_CHANGE, 1, + [check_disk_change() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 5.10 API, 
check_disk_change() is removed, in favor of +dnl # bdev_check_media_change(), which doesn't force revalidation +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ + ZFS_LINUX_TEST_SRC([bdev_check_media_change], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + int error; + + error = bdev_check_media_change(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ + AC_MSG_CHECKING([whether bdev_check_media_change() exists]) + ZFS_LINUX_TEST_RESULT([bdev_check_media_change], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_CHECK_MEDIA_CHANGE, 1, + [bdev_check_media_change() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # 2.6.22 API change dnl # Single argument invalidate_bdev() @@ -101,42 +154,69 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV], [ ]) dnl # -dnl # 2.6.27, lookup_bdev() was exported. -dnl # 4.4.0-6.21 - lookup_bdev() takes 2 arguments. +dnl # 5.11 API, lookup_bdev() takes dev_t argument. +dnl # 2.6.27 API, lookup_bdev() was first exported. +dnl # 4.4.0-6.21 API, lookup_bdev() on Ubuntu takes mode argument. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV], [ + ZFS_LINUX_TEST_SRC([lookup_bdev_devt], [ + #include + ], [ + int error __attribute__ ((unused)); + const char path[] = "/example/path"; + dev_t dev; + + error = lookup_bdev(path, &dev); + ]) + ZFS_LINUX_TEST_SRC([lookup_bdev_1arg], [ #include #include ], [ - lookup_bdev(NULL); + struct block_device *bdev __attribute__ ((unused)); + const char path[] = "/example/path"; + + bdev = lookup_bdev(path); ]) - ZFS_LINUX_TEST_SRC([lookup_bdev_2args], [ + ZFS_LINUX_TEST_SRC([lookup_bdev_mode], [ #include ], [ - lookup_bdev(NULL, FMODE_READ); + struct block_device *bdev __attribute__ ((unused)); + const char path[] = "/example/path"; + + bdev = lookup_bdev(path, FMODE_READ); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV], [ - AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) - ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_1arg], + AC_MSG_CHECKING([whether lookup_bdev() wants dev_t arg]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_devt], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, - [lookup_bdev() wants 1 arg]) + AC_DEFINE(HAVE_DEVT_LOOKUP_BDEV, 1, + [lookup_bdev() wants dev_t arg]) ], [ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether lookup_bdev() wants 2 args]) - ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_2args], + AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_1arg], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_2ARGS_LOOKUP_BDEV, 1, - [lookup_bdev() wants 2 args]) + AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, + [lookup_bdev() wants 1 arg]) ], [ - ZFS_LINUX_TEST_ERROR([lookup_bdev()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether lookup_bdev() wants mode arg]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_mode], + [lookup_bdev], [fs/block_dev.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MODE_LOOKUP_BDEV, 1, + [lookup_bdev() wants mode arg]) + ], [ + ZFS_LINUX_TEST_ERROR([lookup_bdev()]) + ]) ]) ]) ]) @@ 
-191,6 +271,29 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ ]) ]) +dnl # +dnl # 5.11 API change +dnl # Added bdev_whole() helper. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE], [ + ZFS_LINUX_TEST_SRC([bdev_whole], [ + #include + ],[ + struct block_device *bdev = NULL; + bdev = bdev_whole(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [ + AC_MSG_CHECKING([whether bdev_whole() is available]) + ZFS_LINUX_TEST_RESULT([bdev_whole], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_WHOLE, 1, [bdev_whole() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT @@ -199,6 +302,9 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE + ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ @@ -209,4 +315,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE + ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE + ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE + ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE ]) diff --git a/sys/contrib/openzfs/config/kernel-config-defined.m4 b/sys/contrib/openzfs/config/kernel-config-defined.m4 index fe778e649454..9b9468269ca3 100644 --- a/sys/contrib/openzfs/config/kernel-config-defined.m4 +++ b/sys/contrib/openzfs/config/kernel-config-defined.m4 @@ -86,7 +86,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_DEBUG_LOCK_ALLOC], [ mutex_init(&lock); mutex_lock(&lock); mutex_unlock(&lock); - ], [], [$ZFS_META_LICENSE]) + ], [], [ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC], [ diff --git a/sys/contrib/openzfs/config/kernel-fpu.m4 b/sys/contrib/openzfs/config/kernel-fpu.m4 index 
3c7933413d18..4d6fe052289c 100644 --- a/sys/contrib/openzfs/config/kernel-fpu.m4 +++ b/sys/contrib/openzfs/config/kernel-fpu.m4 @@ -42,7 +42,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [ ], [ kernel_fpu_begin(); kernel_fpu_end(); - ], [], [$ZFS_META_LICENSE]) + ], [], [ZFS_META_LICENSE]) ZFS_LINUX_TEST_SRC([__kernel_fpu], [ #include @@ -55,7 +55,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [ ], [ __kernel_fpu_begin(); __kernel_fpu_end(); - ], [], [$ZFS_META_LICENSE]) + ], [], [ZFS_META_LICENSE]) ZFS_LINUX_TEST_SRC([fpu_internal], [ #if defined(__x86_64) || defined(__x86_64__) || \ diff --git a/sys/contrib/openzfs/config/kernel-generic_io_acct.m4 b/sys/contrib/openzfs/config/kernel-generic_io_acct.m4 index 423b3e5a3521..e4ab503d5e1c 100644 --- a/sys/contrib/openzfs/config/kernel-generic_io_acct.m4 +++ b/sys/contrib/openzfs/config/kernel-generic_io_acct.m4 @@ -2,6 +2,16 @@ dnl # dnl # Check for generic io accounting interface. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT], [ + ZFS_LINUX_TEST_SRC([bio_io_acct], [ + #include + ], [ + struct bio *bio = NULL; + unsigned long start_time; + + start_time = bio_start_io_acct(bio); + bio_end_io_acct(bio, start_time); + ]) + ZFS_LINUX_TEST_SRC([generic_acct_3args], [ #include @@ -29,36 +39,49 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT], [ AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT], [ dnl # - dnl # 3.19 API addition + dnl # 5.7 API, dnl # - dnl # torvalds/linux@394ffa50 allows us to increment iostat - dnl # counters without generic_make_request(). + dnl # Added bio_start_io_acct() and bio_end_io_acct() helpers. 
dnl # - AC_MSG_CHECKING([whether generic IO accounting wants 3 args]) - ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_3args], - [generic_start_io_acct], [block/bio.c], [ + AC_MSG_CHECKING([whether generic bio_*_io_acct() are available]) + ZFS_LINUX_TEST_RESULT([bio_io_acct], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_IO_ACCT_3ARG, 1, - [generic_start_io_acct()/generic_end_io_acct() available]) + AC_DEFINE(HAVE_BIO_IO_ACCT, 1, [bio_*_io_acct() available]) ], [ AC_MSG_RESULT(no) dnl # - dnl # Linux 4.14 API, + dnl # 4.14 API, dnl # dnl # generic_start_io_acct/generic_end_io_acct now require dnl # request_queue to be provided. No functional changes, dnl # but preparation for inflight accounting. dnl # - AC_MSG_CHECKING([whether generic IO accounting wants 4 args]) + AC_MSG_CHECKING([whether generic_*_io_acct wants 4 args]) ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_4args], [generic_start_io_acct], [block/bio.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_GENERIC_IO_ACCT_4ARG, 1, - [generic_start_io_acct()/generic_end_io_acct() ] - [4 arg available]) + [generic_*_io_acct() 4 arg available]) ], [ AC_MSG_RESULT(no) + + dnl # + dnl # 3.19 API addition + dnl # + dnl # torvalds/linux@394ffa50 allows us to increment + dnl # iostat counters without generic_make_request(). + dnl # + AC_MSG_CHECKING( + [whether generic_*_io_acct wants 3 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_3args], + [generic_start_io_acct], [block/bio.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_IO_ACCT_3ARG, 1, + [generic_*_io_acct() 3 arg available]) + ], [ + AC_MSG_RESULT(no) + ]) ]) ]) ]) diff --git a/sys/contrib/openzfs/config/kernel-get-disk-and-module.m4 b/sys/contrib/openzfs/config/kernel-get-disk-and-module.m4 index 51cf7743cf0b..e69de29bb2d1 100644 --- a/sys/contrib/openzfs/config/kernel-get-disk-and-module.m4 +++ b/sys/contrib/openzfs/config/kernel-get-disk-and-module.m4 @@ -1,24 +0,0 @@ -dnl # -dnl # 4.16 API change -dnl # Verify if get_disk_and_module() symbol is available. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_DISK_AND_MODULE], [ - ZFS_LINUX_TEST_SRC([get_disk_and_module], [ - #include - ], [ - struct gendisk *disk = NULL; - (void) get_disk_and_module(disk); - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_GET_DISK_AND_MODULE], [ - AC_MSG_CHECKING([whether get_disk_and_module() is available]) - ZFS_LINUX_TEST_RESULT_SYMBOL([get_disk_and_module], - [get_disk_and_module], [block/genhd.c], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GET_DISK_AND_MODULE, - 1, [get_disk_and_module() is available]) - ], [ - AC_MSG_RESULT(no) - ]) -]) diff --git a/sys/contrib/openzfs/config/kernel-hotplug.m4 b/sys/contrib/openzfs/config/kernel-hotplug.m4 new file mode 100644 index 000000000000..e796a6d2e8e8 --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-hotplug.m4 @@ -0,0 +1,26 @@ +dnl # +dnl # 4.6 API change +dnl # Added CPU hotplug APIs +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CPU_HOTPLUG], [ + ZFS_LINUX_TEST_SRC([cpu_hotplug], [ + #include + ],[ + enum cpuhp_state state = CPUHP_ONLINE; + int (*fp)(unsigned int, struct hlist_node *) = NULL; + cpuhp_state_add_instance_nocalls(0, (struct hlist_node *)NULL); + cpuhp_state_remove_instance_nocalls(0, (struct hlist_node *)NULL); + cpuhp_setup_state_multi(state, "", fp, fp); + cpuhp_remove_multi_state(0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CPU_HOTPLUG], [ + AC_MSG_CHECKING([whether CPU hotplug APIs exist]) + ZFS_LINUX_TEST_RESULT([cpu_hotplug], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CPU_HOTPLUG, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel-make-request-fn.m4 b/sys/contrib/openzfs/config/kernel-make-request-fn.m4 index 1576fece1368..290ef6b8da7d 100644 --- a/sys/contrib/openzfs/config/kernel-make-request-fn.m4 +++ b/sys/contrib/openzfs/config/kernel-make-request-fn.m4 @@ -27,6 +27,15 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ q = blk_alloc_queue(make_request, NUMA_NO_NODE); ]) + ZFS_LINUX_TEST_SRC([blk_alloc_queue_request_fn_rh], [ + #include + blk_qc_t 
make_request(struct request_queue *q, + struct bio *bio) { return (BLK_QC_T_NONE); } + ],[ + struct request_queue *q __attribute__ ((unused)); + q = blk_alloc_queue_rh(make_request, NUMA_NO_NODE); + ]) + ZFS_LINUX_TEST_SRC([block_device_operations_submit_bio], [ #include ],[ @@ -47,7 +56,9 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ AC_DEFINE(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS, 1, [submit_bio is member of struct block_device_operations]) - ],[ + ],[ + AC_MSG_RESULT(no) + dnl # Checked as part of the blk_alloc_queue_request_fn test dnl # dnl # Linux 5.7 API Change @@ -55,6 +66,9 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ dnl # AC_MSG_CHECKING([whether blk_alloc_queue() expects request function]) ZFS_LINUX_TEST_RESULT([blk_alloc_queue_request_fn], [ + AC_MSG_RESULT(yes) + + dnl # This is currently always the case. AC_MSG_CHECKING([whether make_request_fn() returns blk_qc_t]) AC_MSG_RESULT(yes) @@ -66,34 +80,59 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ [Noting that make_request_fn() returns blk_qc_t]) ],[ dnl # - dnl # Linux 3.2 API Change - dnl # make_request_fn returns void. + dnl # CentOS Stream 4.18.0-257 API Change + dnl # The Linux 5.7 blk_alloc_queue() change was back- + dnl # ported and the symbol renamed blk_alloc_queue_rh(). + dnl # As of this kernel version they're not providing + dnl # any compatibility code in the kernel for this. dnl # - AC_MSG_CHECKING([whether make_request_fn() returns void]) - ZFS_LINUX_TEST_RESULT([make_request_fn_void], [ + ZFS_LINUX_TEST_RESULT([blk_alloc_queue_request_fn_rh], [ AC_MSG_RESULT(yes) - AC_DEFINE(MAKE_REQUEST_FN_RET, void, + + dnl # This is currently always the case. 
+ AC_MSG_CHECKING([whether make_request_fn_rh() returns blk_qc_t]) + AC_MSG_RESULT(yes) + + AC_DEFINE(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH, 1, + [blk_alloc_queue_rh() expects request function]) + AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t, [make_request_fn() return type]) - AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_VOID, 1, - [Noting that make_request_fn() returns void]) + AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1, + [Noting that make_request_fn() returns blk_qc_t]) ],[ AC_MSG_RESULT(no) dnl # - dnl # Linux 4.4 API Change - dnl # make_request_fn returns blk_qc_t. + dnl # Linux 3.2 API Change + dnl # make_request_fn returns void. dnl # AC_MSG_CHECKING( - [whether make_request_fn() returns blk_qc_t]) - ZFS_LINUX_TEST_RESULT([make_request_fn_blk_qc_t], [ + [whether make_request_fn() returns void]) + ZFS_LINUX_TEST_RESULT([make_request_fn_void], [ AC_MSG_RESULT(yes) - AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t, + AC_DEFINE(MAKE_REQUEST_FN_RET, void, [make_request_fn() return type]) - AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1, - [Noting that make_request_fn() ] - [returns blk_qc_t]) + AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_VOID, 1, + [Noting that make_request_fn() returns void]) ],[ - ZFS_LINUX_TEST_ERROR([make_request_fn]) + AC_MSG_RESULT(no) + + dnl # + dnl # Linux 4.4 API Change + dnl # make_request_fn returns blk_qc_t. 
+ dnl # + AC_MSG_CHECKING( + [whether make_request_fn() returns blk_qc_t]) + ZFS_LINUX_TEST_RESULT([make_request_fn_blk_qc_t], [ + AC_MSG_RESULT(yes) + AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t, + [make_request_fn() return type]) + AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1, + [Noting that make_request_fn() ] + [returns blk_qc_t]) + ],[ + ZFS_LINUX_TEST_ERROR([make_request_fn]) + ]) ]) ]) ]) diff --git a/sys/contrib/openzfs/config/kernel-objtool.m4 b/sys/contrib/openzfs/config/kernel-objtool.m4 index c560c41954c6..f9f9d657d805 100644 --- a/sys/contrib/openzfs/config/kernel-objtool.m4 +++ b/sys/contrib/openzfs/config/kernel-objtool.m4 @@ -1,3 +1,24 @@ +dnl # +dnl # Detect objtool functionality. +dnl # + +dnl # +dnl # Kernel 5.10: linux/frame.h was renamed linux/objtool.h +dnl # +AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL_HEADER], [ + AC_MSG_CHECKING([whether objtool header is available]) + ZFS_LINUX_TRY_COMPILE([ + #include + ],[ + ],[ + AC_DEFINE(HAVE_KERNEL_OBJTOOL_HEADER, 1, + [kernel has linux/objtool.h]) + AC_MSG_RESULT(linux/objtool.h) + ],[ + AC_MSG_RESULT(linux/frame.h) + ]) +]) + dnl # dnl # Check for objtool support. dnl # @@ -16,7 +37,11 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_OBJTOOL], [ dnl # 4.6 API added STACK_FRAME_NON_STANDARD macro ZFS_LINUX_TEST_SRC([stack_frame_non_standard], [ + #ifdef HAVE_KERNEL_OBJTOOL_HEADER + #include + #else #include + #endif ],[ #if !defined(STACK_FRAME_NON_STANDARD) #error "STACK_FRAME_NON_STANDARD is not defined." 
diff --git a/sys/contrib/openzfs/config/kernel-percpu.m4 b/sys/contrib/openzfs/config/kernel-percpu.m4 index e9654a69ee0a..700d97a25853 100644 --- a/sys/contrib/openzfs/config/kernel-percpu.m4 +++ b/sys/contrib/openzfs/config/kernel-percpu.m4 @@ -25,10 +25,36 @@ AC_DEFUN([ZFS_AC_KERNEL_PERCPU_COUNTER_INIT], [ ]) ]) +dnl # +dnl # 5.10 API change, +dnl # The "count" was moved into ref->data, from ref +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU_REF_COUNT_IN_DATA], [ + ZFS_LINUX_TEST_SRC([percpu_ref_count_in_data], [ + #include + ],[ + struct percpu_ref_data d; + + atomic_long_set(&d.count, 1L); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_PERCPU_REF_COUNT_IN_DATA], [ + AC_MSG_CHECKING([whether count is inside percpu_ref.data]) + ZFS_LINUX_TEST_RESULT([percpu_ref_count_in_data], [ + AC_MSG_RESULT(yes) + AC_DEFINE(ZFS_PERCPU_REF_COUNT_IN_DATA, 1, + [count is located in percpu_ref.data]) + ],[ + AC_MSG_RESULT(no) + ]) +]) AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU], [ ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_INIT + ZFS_AC_KERNEL_SRC_PERCPU_REF_COUNT_IN_DATA ]) AC_DEFUN([ZFS_AC_KERNEL_PERCPU], [ ZFS_AC_KERNEL_PERCPU_COUNTER_INIT + ZFS_AC_KERNEL_PERCPU_REF_COUNT_IN_DATA ]) diff --git a/sys/contrib/openzfs/config/kernel-revalidate-disk-size.m4 b/sys/contrib/openzfs/config/kernel-revalidate-disk-size.m4 new file mode 100644 index 000000000000..a7d0cb3cdab4 --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-revalidate-disk-size.m4 @@ -0,0 +1,46 @@ +dnl # +dnl # 5.11 API change +dnl # revalidate_disk_size() has been removed entirely. 
+dnl # +dnl # 5.10 API change +dnl # revalidate_disk() was replaced by revalidate_disk_size() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REVALIDATE_DISK], [ + + ZFS_LINUX_TEST_SRC([revalidate_disk_size], [ + #include + ], [ + struct gendisk *disk = NULL; + (void) revalidate_disk_size(disk, false); + ]) + + ZFS_LINUX_TEST_SRC([revalidate_disk], [ + #include + ], [ + struct gendisk *disk = NULL; + (void) revalidate_disk(disk); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REVALIDATE_DISK], [ + + AC_MSG_CHECKING([whether revalidate_disk_size() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([revalidate_disk_size], + [revalidate_disk_size], [block/genhd.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REVALIDATE_DISK_SIZE, 1, + [revalidate_disk_size() is available]) + ], [ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether revalidate_disk() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([revalidate_disk], + [revalidate_disk], [block/genhd.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REVALIDATE_DISK, 1, + [revalidate_disk() is available]) + ], [ + AC_MSG_RESULT(no) + ]) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel-rwsem.m4 b/sys/contrib/openzfs/config/kernel-rwsem.m4 index 824f4a3ffd41..d3a64a8efa19 100644 --- a/sys/contrib/openzfs/config/kernel-rwsem.m4 +++ b/sys/contrib/openzfs/config/kernel-rwsem.m4 @@ -1,29 +1,3 @@ -dnl # -dnl # 3.1 API Change -dnl # -dnl # The rw_semaphore.wait_lock member was changed from spinlock_t to -dnl # raw_spinlock_t at commit ddb6c9b58a19edcfac93ac670b066c836ff729f1. 
-dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM_SPINLOCK_IS_RAW], [ - ZFS_LINUX_TEST_SRC([rwsem_spinlock_is_raw], [ - #include - ],[ - struct rw_semaphore dummy_semaphore __attribute__ ((unused)); - raw_spinlock_t dummy_lock __attribute__ ((unused)) = - __RAW_SPIN_LOCK_INITIALIZER(dummy_lock); - dummy_semaphore.wait_lock = dummy_lock; - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_RWSEM_SPINLOCK_IS_RAW], [ - AC_MSG_CHECKING([whether struct rw_semaphore member wait_lock is raw]) - ZFS_LINUX_TEST_RESULT([rwsem_spinlock_is_raw], [ - AC_MSG_RESULT(yes) - ],[ - ZFS_LINUX_TEST_ERROR([rwsem_spinlock_is_raw]) - ]) -]) - dnl # dnl # 3.16 API Change dnl # @@ -76,13 +50,11 @@ AC_DEFUN([ZFS_AC_KERNEL_RWSEM_ATOMIC_LONG_COUNT], [ ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM], [ - ZFS_AC_KERNEL_SRC_RWSEM_SPINLOCK_IS_RAW ZFS_AC_KERNEL_SRC_RWSEM_ACTIVITY ZFS_AC_KERNEL_SRC_RWSEM_ATOMIC_LONG_COUNT ]) AC_DEFUN([ZFS_AC_KERNEL_RWSEM], [ - ZFS_AC_KERNEL_RWSEM_SPINLOCK_IS_RAW ZFS_AC_KERNEL_RWSEM_ACTIVITY ZFS_AC_KERNEL_RWSEM_ATOMIC_LONG_COUNT ]) diff --git a/sys/contrib/openzfs/config/kernel-vfs-iov_iter.m4 b/sys/contrib/openzfs/config/kernel-vfs-iov_iter.m4 new file mode 100644 index 000000000000..69db11b6882b --- /dev/null +++ b/sys/contrib/openzfs/config/kernel-vfs-iov_iter.m4 @@ -0,0 +1,206 @@ +dnl # +dnl # Check for available iov_iter functionality. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ + ZFS_LINUX_TEST_SRC([iov_iter_types], [ + #include + #include + ],[ + int type __attribute__ ((unused)) = + ITER_IOVEC | ITER_KVEC | ITER_BVEC | ITER_PIPE; + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_init], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + struct iovec iov; + unsigned long nr_segs = 1; + size_t count = 1024; + + iov_iter_init(&iter, WRITE, &iov, nr_segs, count); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_init_legacy], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + struct iovec iov; + unsigned long nr_segs = 1; + size_t count = 1024; + size_t written = 0; + + iov_iter_init(&iter, &iov, nr_segs, count, written); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_advance], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + size_t advance = 512; + + iov_iter_advance(&iter, advance); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_revert], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + size_t revert = 512; + + iov_iter_revert(&iter, revert); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_fault_in_readable], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + size_t size = 512; + int error __attribute__ ((unused)); + + error = iov_iter_fault_in_readable(&iter, size); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_count], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + size_t bytes __attribute__ ((unused)); + + bytes = iov_iter_count(&iter); + ]) + + ZFS_LINUX_TEST_SRC([copy_to_iter], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + char buf[512] = { 0 }; + size_t size = 512; + size_t bytes __attribute__ ((unused)); + + bytes = copy_to_iter((const void *)&buf, size, &iter); + ]) + + ZFS_LINUX_TEST_SRC([copy_from_iter], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + char buf[512] = { 0 }; + size_t size = 512; + size_t bytes __attribute__ ((unused)); + + bytes = copy_from_iter((void *)&buf, size, &iter); + ]) +]) + 
+AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ + enable_vfs_iov_iter="yes" + + AC_MSG_CHECKING([whether iov_iter types are available]) + ZFS_LINUX_TEST_RESULT([iov_iter_types], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_TYPES, 1, + [iov_iter types are available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + dnl # + dnl # 'iov_iter_init' available in Linux 3.16 and newer. + dnl # 'iov_iter_init_legacy' available in Linux 3.15 and older. + dnl # + AC_MSG_CHECKING([whether iov_iter_init() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_init], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_INIT, 1, + [iov_iter_init() is available]) + ],[ + ZFS_LINUX_TEST_RESULT([iov_iter_init_legacy], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_INIT_LEGACY, 1, + [iov_iter_init() is available]) + ],[ + ZFS_LINUX_TEST_ERROR([iov_iter_init()]) + ]) + ]) + + AC_MSG_CHECKING([whether iov_iter_advance() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_advance], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_ADVANCE, 1, + [iov_iter_advance() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether iov_iter_revert() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_revert], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_REVERT, 1, + [iov_iter_revert() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether iov_iter_fault_in_readable() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_fault_in_readable], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_FAULT_IN_READABLE, 1, + [iov_iter_fault_in_readable() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether iov_iter_count() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_count], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_COUNT, 1, + [iov_iter_count() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether copy_to_iter() is 
available]) + ZFS_LINUX_TEST_RESULT([copy_to_iter], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_COPY_TO_ITER, 1, + [copy_to_iter() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + AC_MSG_CHECKING([whether copy_from_iter() is available]) + ZFS_LINUX_TEST_RESULT([copy_from_iter], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_COPY_FROM_ITER, 1, + [copy_from_iter() is available]) + ],[ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + + dnl # + dnl # As of the 4.9 kernel support is provided for iovecs, kvecs, + dnl # bvecs and pipes in the iov_iter structure. As long as the + dnl # other support interfaces are all available the iov_iter can + dnl # be correctly used in the uio structure. + dnl # + AS_IF([test "x$enable_vfs_iov_iter" = "xyes"], [ + AC_DEFINE(HAVE_VFS_IOV_ITER, 1, + [All required iov_iter interfaces are available]) + ]) +]) diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 index ec52f014a7a3..f31be845f5d9 100644 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -13,6 +13,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ dnl # Sequential ZFS_LINUX_TRY_COMPILE tests ZFS_AC_KERNEL_FPU_HEADER + ZFS_AC_KERNEL_OBJTOOL_HEADER ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T ZFS_AC_KERNEL_MISC_MINOR ZFS_AC_KERNEL_DECLARE_EVENT_CLASS @@ -60,7 +61,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_BIO ZFS_AC_KERNEL_SRC_BLKDEV ZFS_AC_KERNEL_SRC_BLK_QUEUE - ZFS_AC_KERNEL_SRC_GET_DISK_AND_MODULE + ZFS_AC_KERNEL_SRC_REVALIDATE_DISK ZFS_AC_KERNEL_SRC_GET_DISK_RO ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY @@ -104,6 +105,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS + ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN @@ -122,6 +124,7 @@ 
AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES ZFS_AC_KERNEL_SRC_KSTRTOUL ZFS_AC_KERNEL_SRC_PERCPU + ZFS_AC_KERNEL_SRC_CPU_HOTPLUG AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) @@ -156,7 +159,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_BIO ZFS_AC_KERNEL_BLKDEV ZFS_AC_KERNEL_BLK_QUEUE - ZFS_AC_KERNEL_GET_DISK_AND_MODULE + ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY @@ -200,6 +203,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS + ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN @@ -218,6 +222,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_TOTALHIGH_PAGES ZFS_AC_KERNEL_KSTRTOUL ZFS_AC_KERNEL_PERCPU + ZFS_AC_KERNEL_CPU_HOTPLUG ]) dnl # @@ -317,19 +322,15 @@ AC_DEFUN([ZFS_AC_KERNEL], [ utsrelease2=$kernelbuild/include/linux/utsrelease.h utsrelease3=$kernelbuild/include/generated/utsrelease.h AS_IF([test -r $utsrelease1 && fgrep -q UTS_RELEASE $utsrelease1], [ - utsrelease=linux/version.h + utsrelease=$utsrelease1 ], [test -r $utsrelease2 && fgrep -q UTS_RELEASE $utsrelease2], [ - utsrelease=linux/utsrelease.h + utsrelease=$utsrelease2 ], [test -r $utsrelease3 && fgrep -q UTS_RELEASE $utsrelease3], [ - utsrelease=generated/utsrelease.h + utsrelease=$utsrelease3 ]) - AS_IF([test "$utsrelease"], [ - kernsrcver=`(echo "#include <$utsrelease>"; - echo "kernsrcver=UTS_RELEASE") | - ${CPP} -I $kernelbuild/include - | - grep "^kernsrcver=" | cut -d \" -f 2` - + AS_IF([test -n "$utsrelease"], [ + kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease) AS_IF([test -z "$kernsrcver"], [ AC_MSG_RESULT([Not found]) AC_MSG_ERROR([ @@ -536,7 +537,9 @@ dnl # dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) dnl # 
m4_define([ZFS_LINUX_TEST_PROGRAM], [ +#include $1 + int main (void) { @@ -544,6 +547,11 @@ $2 ; return 0; } + +MODULE_DESCRIPTION("conftest"); +MODULE_AUTHOR(ZFS_META_AUTHOR); +MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +MODULE_LICENSE($3); ]) dnl # @@ -683,19 +691,21 @@ dnl # $3 - source dnl # $4 - extra cflags dnl # $5 - check license-compatibility dnl # +dnl # Check if the test source is buildable at all and then if it is +dnl # license compatible. +dnl # dnl # N.B because all of the test cases are compiled in parallel they dnl # must never depend on the results of previous tests. Each test dnl # needs to be entirely independent. dnl # AC_DEFUN([ZFS_LINUX_TEST_SRC], [ - ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]])], [$1]) + ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]], + [["Dual BSD/GPL"]])], [$1]) ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4]) AS_IF([ test -n "$5" ], [ - ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[ - #include - MODULE_LICENSE("$5"); - $2]], [[$3]])], [$1_license]) + ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM( + [[$2]], [[$3]], [[$5]])], [$1_license]) ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4]) ]) ]) @@ -785,11 +795,13 @@ dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( - [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]])], + [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], + [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4]) ], [ ZFS_LINUX_COMPILE_IFELSE( - [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]])], + [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], + [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4]) ]) ]) @@ -855,7 +867,7 @@ dnl # provided via the fifth parameter dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ ZFS_LINUX_COMPILE_IFELSE( - [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]])], + [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4], [$5]) 
]) diff --git a/sys/contrib/openzfs/config/mount-helper.m4 b/sys/contrib/openzfs/config/mount-helper.m4 index 0a6c7670840b..e559b9ab2734 100644 --- a/sys/contrib/openzfs/config/mount-helper.m4 +++ b/sys/contrib/openzfs/config/mount-helper.m4 @@ -1,6 +1,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_MOUNT_HELPER], [ AC_ARG_WITH(mounthelperdir, - AC_HELP_STRING([--with-mounthelperdir=DIR], + AS_HELP_STRING([--with-mounthelperdir=DIR], [install mount.zfs in dir [[/sbin]]]), mounthelperdir=$withval,mounthelperdir=/sbin) diff --git a/sys/contrib/openzfs/config/user-dracut.m4 b/sys/contrib/openzfs/config/user-dracut.m4 index 95f800bda47a..b9705297f744 100644 --- a/sys/contrib/openzfs/config/user-dracut.m4 +++ b/sys/contrib/openzfs/config/user-dracut.m4 @@ -1,7 +1,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_DRACUT], [ AC_MSG_CHECKING(for dracut directory) AC_ARG_WITH([dracutdir], - AC_HELP_STRING([--with-dracutdir=DIR], + AS_HELP_STRING([--with-dracutdir=DIR], [install dracut helpers @<:@default=check@:>@]), [dracutdir=$withval], [dracutdir=check]) diff --git a/sys/contrib/openzfs/config/user-libexec.m4 b/sys/contrib/openzfs/config/user-libexec.m4 index 31bcea3fcfd3..5379c25b4a0c 100644 --- a/sys/contrib/openzfs/config/user-libexec.m4 +++ b/sys/contrib/openzfs/config/user-libexec.m4 @@ -1,6 +1,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_ZFSEXEC], [ AC_ARG_WITH(zfsexecdir, - AC_HELP_STRING([--with-zfsexecdir=DIR], + AS_HELP_STRING([--with-zfsexecdir=DIR], [install scripts [[@<:@libexecdir@:>@/zfs]]]), [zfsexecdir=$withval], [zfsexecdir="${libexecdir}/zfs"]) diff --git a/sys/contrib/openzfs/config/user-makedev.m4 b/sys/contrib/openzfs/config/user-makedev.m4 index 4383681a8f4c..8986107aef80 100644 --- a/sys/contrib/openzfs/config/user-makedev.m4 +++ b/sys/contrib/openzfs/config/user-makedev.m4 @@ -3,13 +3,12 @@ dnl # glibc 2.25 dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS], [ AC_MSG_CHECKING([makedev() is declared in sys/sysmacros.h]) - AC_TRY_COMPILE( - [ + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ 
#include - ],[ + ]], [[ int k; k = makedev(0,0); - ],[ + ]])],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_MAKEDEV_IN_SYSMACROS, 1, [makedev() is declared in sys/sysmacros.h]) @@ -23,13 +22,12 @@ dnl # glibc X < Y < 2.25 dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV], [ AC_MSG_CHECKING([makedev() is declared in sys/mkdev.h]) - AC_TRY_COMPILE( - [ + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[ #include - ],[ + ]], [[ int k; k = makedev(0,0); - ],[ + ]])],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_MAKEDEV_IN_MKDEV, 1, [makedev() is declared in sys/mkdev.h]) diff --git a/sys/contrib/openzfs/config/user-systemd.m4 b/sys/contrib/openzfs/config/user-systemd.m4 index 3e6a4a281f3c..63f02ad2a82b 100644 --- a/sys/contrib/openzfs/config/user-systemd.m4 +++ b/sys/contrib/openzfs/config/user-systemd.m4 @@ -1,27 +1,27 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_SYSTEMD], [ AC_ARG_ENABLE(systemd, - AC_HELP_STRING([--enable-systemd], + AS_HELP_STRING([--enable-systemd], [install systemd unit/preset files [[default: yes]]]), [enable_systemd=$enableval], [enable_systemd=check]) AC_ARG_WITH(systemdunitdir, - AC_HELP_STRING([--with-systemdunitdir=DIR], + AS_HELP_STRING([--with-systemdunitdir=DIR], [install systemd unit files in dir [[/usr/lib/systemd/system]]]), systemdunitdir=$withval,systemdunitdir=/usr/lib/systemd/system) AC_ARG_WITH(systemdpresetdir, - AC_HELP_STRING([--with-systemdpresetdir=DIR], + AS_HELP_STRING([--with-systemdpresetdir=DIR], [install systemd preset files in dir [[/usr/lib/systemd/system-preset]]]), systemdpresetdir=$withval,systemdpresetdir=/usr/lib/systemd/system-preset) AC_ARG_WITH(systemdmodulesloaddir, - AC_HELP_STRING([--with-systemdmodulesloaddir=DIR], + AS_HELP_STRING([--with-systemdmodulesloaddir=DIR], [install systemd module load files into dir [[/usr/lib/modules-load.d]]]), systemdmodulesloaddir=$withval,systemdmodulesloaddir=/usr/lib/modules-load.d) AC_ARG_WITH(systemdgeneratordir, - AC_HELP_STRING([--with-systemdgeneratordir=DIR], + 
AS_HELP_STRING([--with-systemdgeneratordir=DIR], [install systemd generators in dir [[/usr/lib/systemd/system-generators]]]), systemdgeneratordir=$withval,systemdgeneratordir=/usr/lib/systemd/system-generators) diff --git a/sys/contrib/openzfs/config/user-sysvinit.m4 b/sys/contrib/openzfs/config/user-sysvinit.m4 index 65dcc3819231..b6b63f1cfa36 100644 --- a/sys/contrib/openzfs/config/user-sysvinit.m4 +++ b/sys/contrib/openzfs/config/user-sysvinit.m4 @@ -1,6 +1,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_SYSVINIT], [ AC_ARG_ENABLE(sysvinit, - AC_HELP_STRING([--enable-sysvinit], + AS_HELP_STRING([--enable-sysvinit], [install SysV init scripts [default: yes]]), [],enable_sysvinit=yes) diff --git a/sys/contrib/openzfs/config/user-udev.m4 b/sys/contrib/openzfs/config/user-udev.m4 index 65dc79fb4847..e6120fc8fef6 100644 --- a/sys/contrib/openzfs/config/user-udev.m4 +++ b/sys/contrib/openzfs/config/user-udev.m4 @@ -1,7 +1,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_UDEV], [ AC_MSG_CHECKING(for udev directories) AC_ARG_WITH(udevdir, - AC_HELP_STRING([--with-udevdir=DIR], + AS_HELP_STRING([--with-udevdir=DIR], [install udev helpers @<:@default=check@:>@]), [udevdir=$withval], [udevdir=check]) @@ -18,7 +18,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_UDEV], [ ]) AC_ARG_WITH(udevruledir, - AC_HELP_STRING([--with-udevruledir=DIR], + AS_HELP_STRING([--with-udevruledir=DIR], [install udev rules [[UDEVDIR/rules.d]]]), [udevruledir=$withval], [udevruledir="${udevdir}/rules.d"]) diff --git a/sys/contrib/openzfs/config/zfs-build.m4 b/sys/contrib/openzfs/config/zfs-build.m4 index 7754eda3f6a2..f0eb47035d1e 100644 --- a/sys/contrib/openzfs/config/zfs-build.m4 +++ b/sys/contrib/openzfs/config/zfs-build.m4 @@ -180,7 +180,7 @@ AC_DEFUN([ZFS_AC_CONFIG], [ [Config file 'kernel|user|all|srpm']), [ZFS_CONFIG="$withval"]) AC_ARG_ENABLE([linux-builtin], - [AC_HELP_STRING([--enable-linux-builtin], + [AS_HELP_STRING([--enable-linux-builtin], [Configure for builtin in-tree kernel modules @<:@default=no@:>@])], [], 
[enable_linux_builtin=no]) diff --git a/sys/contrib/openzfs/configure.ac b/sys/contrib/openzfs/configure.ac index 9323aa7a0c28..4520a290a9a5 100644 --- a/sys/contrib/openzfs/configure.ac +++ b/sys/contrib/openzfs/configure.ac @@ -36,7 +36,7 @@ AC_LANG(C) ZFS_AC_META AC_CONFIG_AUX_DIR([config]) AC_CONFIG_MACRO_DIR([config]) -AC_CANONICAL_SYSTEM +AC_CANONICAL_TARGET AM_MAINTAINER_MODE m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AM_INIT_AUTOMAKE([subdir-objects]) @@ -45,9 +45,9 @@ AC_CONFIG_HEADERS([zfs_config.h], [ awk -f ${ac_srcdir}/config/config.awk zfs_config.h.tmp >zfs_config.h && rm zfs_config.h.tmp) || exit 1]) +LT_INIT AC_PROG_INSTALL AC_PROG_CC -AC_PROG_LIBTOOL PKG_PROG_PKG_CONFIG AM_PROG_AS AM_PROG_CC_C_O @@ -86,6 +86,7 @@ AC_CONFIG_FILES([ cmd/ztest/Makefile cmd/zvol_id/Makefile cmd/zvol_wait/Makefile + cmd/zpool_influxdb/Makefile contrib/Makefile contrib/bash_completion.d/Makefile contrib/bpftrace/Makefile @@ -208,6 +209,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/cmd/btree_test/Makefile tests/zfs-tests/cmd/chg_usr_exec/Makefile tests/zfs-tests/cmd/devname2devid/Makefile + tests/zfs-tests/cmd/draid/Makefile tests/zfs-tests/cmd/dir_rd_update/Makefile tests/zfs-tests/cmd/file_check/Makefile tests/zfs-tests/cmd/file_trunc/Makefile @@ -342,6 +344,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/inheritance/Makefile tests/zfs-tests/tests/functional/inuse/Makefile tests/zfs-tests/tests/functional/io/Makefile + tests/zfs-tests/tests/functional/l2arc/Makefile tests/zfs-tests/tests/functional/large_files/Makefile tests/zfs-tests/tests/functional/largest_pool/Makefile tests/zfs-tests/tests/functional/libzfs/Makefile @@ -358,7 +361,6 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/nopwrite/Makefile tests/zfs-tests/tests/functional/online_offline/Makefile tests/zfs-tests/tests/functional/pam/Makefile - tests/zfs-tests/tests/functional/persist_l2arc/Makefile tests/zfs-tests/tests/functional/pool_checkpoint/Makefile 
tests/zfs-tests/tests/functional/pool_names/Makefile tests/zfs-tests/tests/functional/poolversion/Makefile @@ -394,6 +396,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/vdev_zaps/Makefile tests/zfs-tests/tests/functional/write_dirs/Makefile tests/zfs-tests/tests/functional/xattr/Makefile + tests/zfs-tests/tests/functional/zpool_influxdb/Makefile tests/zfs-tests/tests/functional/zvol/Makefile tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/export-zfs.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/export-zfs.sh.in index 09e4a3cc0e5e..892650383475 100755 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/export-zfs.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/export-zfs.sh.in @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh . /lib/dracut-zfs-lib.sh diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in index db5670cd5253..42afda60278c 100755 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/module-setup.sh.in @@ -85,7 +85,13 @@ install() { fi # Synchronize initramfs and system hostid - zgenhostid -o "${initdir}/etc/hostid" "$(hostid)" + if [ -f @sysconfdir@/hostid ]; then + inst @sysconfdir@/hostid + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @sysconfdir@/hostid + elif HOSTID="$(hostid 2>/dev/null)" && [ "${HOSTID}" != "00000000" ]; then + zgenhostid -o "${initdir}@sysconfdir@/hostid" "${HOSTID}" + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @sysconfdir@/hostid + fi if dracut_module_included "systemd"; then mkdir -p "${initdir}/$systemdsystemunitdir/zfs-import.target.wants" diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/mount-zfs.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/mount-zfs.sh.in index f5b3d9056c17..4a892e9382cf 100755 --- 
a/sys/contrib/openzfs/contrib/dracut/90zfs/mount-zfs.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/mount-zfs.sh.in @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh . /lib/dracut-zfs-lib.sh @@ -58,7 +58,7 @@ ZFS_POOL="${ZFS_DATASET%%/*}" if import_pool "${ZFS_POOL}" ; then # Load keys if we can or if we need to - if [ $(zpool list -H -o feature@encryption $(echo "${ZFS_POOL}" | awk -F\/ '{print $1}')) = 'active' ]; then + if [ "$(zpool list -H -o feature@encryption "$(echo "${ZFS_POOL}" | awk -F/ '{print $1}')")" = 'active' ]; then # if the root dataset has encryption enabled ENCRYPTIONROOT="$(zfs get -H -o value encryptionroot "${ZFS_DATASET}")" if ! [ "${ENCRYPTIONROOT}" = "-" ]; then diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/parse-zfs.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/parse-zfs.sh.in index 2ff76d8fa080..768de9dd2512 100755 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/parse-zfs.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/parse-zfs.sh.in @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh . /lib/dracut-lib.sh diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-generator.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-generator.sh.in index 120b9ecf957e..59cdadcbeae5 100755 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-generator.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-generator.sh.in @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh echo "zfs-generator: starting" >> /dev/kmsg @@ -11,7 +11,7 @@ GENERATOR_DIR="$1" [ -f /lib/dracut-lib.sh ] && dracutlib=/lib/dracut-lib.sh [ -f /usr/lib/dracut/modules.d/99base/dracut-lib.sh ] && dracutlib=/usr/lib/dracut/modules.d/99base/dracut-lib.sh -type getarg >/dev/null 2>&1 || { +command -v getarg >/dev/null 2>&1 || { echo "zfs-generator: loading Dracut library from $dracutlib" >> /dev/kmsg . "$dracutlib" } @@ -22,16 +22,17 @@ type getarg >/dev/null 2>&1 || { # If root is not ZFS= or zfs: or rootfstype is not zfs # then we are not supposed to handle it. 
-[ "${root##zfs:}" = "${root}" -a "${root##ZFS=}" = "${root}" -a "$rootfstype" != "zfs" ] && exit 0 +[ "${root##zfs:}" = "${root}" ] && + [ "${root##ZFS=}" = "${root}" ] && + [ "$rootfstype" != "zfs" ] && + exit 0 rootfstype=zfs -if echo "${rootflags}" | grep -Eq '^zfsutil$|^zfsutil,|,zfsutil$|,zfsutil,' ; then - true -elif test -n "${rootflags}" ; then - rootflags="zfsutil,${rootflags}" -else - rootflags=zfsutil -fi +case ",${rootflags}," in + *,zfsutil,*) ;; + ,,) rootflags=zfsutil ;; + *) rootflags="zfsutil,${rootflags}" ;; +esac echo "zfs-generator: writing extension for sysroot.mount to $GENERATOR_DIR"/sysroot.mount.d/zfs-enhancement.conf >> /dev/kmsg @@ -58,4 +59,4 @@ echo "zfs-generator: writing extension for sysroot.mount to $GENERATOR_DIR"/sysr [ -d "$GENERATOR_DIR"/initrd-root-fs.target.requires ] || mkdir -p "$GENERATOR_DIR"/initrd-root-fs.target.requires ln -s ../sysroot.mount "$GENERATOR_DIR"/initrd-root-fs.target.requires/sysroot.mount -echo "zfs-generator: finished" >> /dev/kmsg \ No newline at end of file +echo "zfs-generator: finished" >> /dev/kmsg diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-lib.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-lib.sh.in index f470bfcc54ae..c39cc5cfff1f 100755 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-lib.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-lib.sh.in @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh command -v getarg >/dev/null || . /lib/dracut-lib.sh command -v getargbool >/dev/null || { @@ -144,7 +144,7 @@ ask_for_password() { { flock -s 9; # Prompt for password with plymouth, if installed and running. 
- if type plymouth >/dev/null 2>&1 && plymouth --ping 2>/dev/null; then + if plymouth --ping 2>/dev/null; then plymouth ask-for-password \ --prompt "$ply_prompt" --number-of-tries="$ply_tries" \ --command="$ply_cmd" diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-load-key.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-load-key.sh.in index ff586ef654b8..e29501418919 100755 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-load-key.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-load-key.sh.in @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # only run this on systemd systems, we handle the decrypt in mount-zfs.sh in the mount hook otherwise [ -e /bin/systemctl ] || return 0 @@ -17,10 +17,8 @@ [ "${root##zfs:}" = "${root}" ] && [ "${root##ZFS=}" = "${root}" ] && [ "$rootfstype" != "zfs" ] && exit 0 # There is a race between the zpool import and the pre-mount hooks, so we wait for a pool to be imported -while true; do - zpool list -H | grep -q -v '^$' && break - [ "$(systemctl is-failed zfs-import-cache.service)" = 'failed' ] && exit 1 - [ "$(systemctl is-failed zfs-import-scan.service)" = 'failed' ] && exit 1 +while [ "$(zpool list -H)" = "" ]; do + systemctl is-failed --quiet zfs-import-cache.service zfs-import-scan.service && exit 1 sleep 0.1s done @@ -34,11 +32,11 @@ else fi # if pool encryption is active and the zfs command understands '-o encryption' -if [ "$(zpool list -H -o feature@encryption $(echo "${BOOTFS}" | awk -F\/ '{print $1}'))" = 'active' ]; then +if [ "$(zpool list -H -o feature@encryption "$(echo "${BOOTFS}" | awk -F/ '{print $1}')")" = 'active' ]; then # if the root dataset has encryption enabled - ENCRYPTIONROOT=$(zfs get -H -o value encryptionroot "${BOOTFS}") + ENCRYPTIONROOT="$(zfs get -H -o value encryptionroot "${BOOTFS}")" # where the key is stored (in a file or loaded via prompt) - KEYLOCATION=$(zfs get -H -o value keylocation "${ENCRYPTIONROOT}") + KEYLOCATION="$(zfs get -H -o value keylocation "${ENCRYPTIONROOT}")" if ! 
[ "${ENCRYPTIONROOT}" = "-" ]; then KEYSTATUS="$(zfs get -H -o value keystatus "${ENCRYPTIONROOT}")" # continue only if the key needs to be loaded diff --git a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-needshutdown.sh.in b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-needshutdown.sh.in index ddd3edae0014..dd6de30c2704 100755 --- a/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-needshutdown.sh.in +++ b/sys/contrib/openzfs/contrib/dracut/90zfs/zfs-needshutdown.sh.in @@ -1,6 +1,6 @@ -#!/bin/bash +#!/bin/sh -type getarg >/dev/null 2>&1 || . /lib/dracut-lib.sh +command -v getarg >/dev/null 2>&1 || . /lib/dracut-lib.sh if zpool list 2>&1 | grep -q 'no pools available' ; then info "ZFS: No active pools, no need to export anything." diff --git a/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in b/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in index c8ae86363981..d451726545db 100644 --- a/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in +++ b/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in @@ -15,4 +15,4 @@ esac . 
/usr/share/initramfs-tools/hook-functions -copy_exec /usr/share/initramfs-tools/zfsunlock /usr/bin +copy_exec /usr/share/initramfs-tools/zfsunlock /usr/bin/zfsunlock diff --git a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c index 0a96f19a3cd0..4cafc37b9b47 100644 --- a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c +++ b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c @@ -386,6 +386,8 @@ unmount_unload(pam_handle_t *pamh, const char *ds_name) typedef struct { char *homes_prefix; char *runstatedir; + char *homedir; + char *dsname; uid_t uid; const char *username; int unmount_and_unload; @@ -423,6 +425,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->uid = entry->pw_uid; config->username = name; config->unmount_and_unload = 1; + config->dsname = NULL; + config->homedir = NULL; for (int c = 0; c < argc; c++) { if (strncmp(argv[c], "homes=", 6) == 0) { free(config->homes_prefix); @@ -432,6 +436,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->runstatedir = strdup(argv[c] + 12); } else if (strcmp(argv[c], "nounmount") == 0) { config->unmount_and_unload = 0; + } else if (strcmp(argv[c], "prop_mountpoint") == 0) { + config->homedir = strdup(entry->pw_dir); } } return (0); @@ -441,11 +447,59 @@ static void zfs_key_config_free(zfs_key_config_t *config) { free(config->homes_prefix); + free(config->runstatedir); + free(config->homedir); + free(config->dsname); +} + +static int +find_dsname_by_prop_value(zfs_handle_t *zhp, void *data) +{ + zfs_type_t type = zfs_get_type(zhp); + zfs_key_config_t *target = data; + char mountpoint[ZFS_MAXPROPLEN]; + + /* Skip any datasets whose type does not match */ + if ((type & ZFS_TYPE_FILESYSTEM) == 0) { + zfs_close(zhp); + return (0); + } + + /* Skip any datasets whose mountpoint does not match */ + (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, B_FALSE); + 
if (strcmp(target->homedir, mountpoint) != 0) { + zfs_close(zhp); + return (0); + } + + target->dsname = strdup(zfs_get_name(zhp)); + zfs_close(zhp); + return (1); } static char * zfs_key_config_get_dataset(zfs_key_config_t *config) { + if (config->homedir != NULL && + config->homes_prefix != NULL) { + zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, + ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) { + pam_syslog(NULL, LOG_ERR, "dataset %s not found", + config->homes_prefix); + /* zfs_open() failed, so zhp is NULL: there is no handle to close. */ + return (NULL); + } + + (void) zfs_iter_filesystems(zhp, find_dsname_by_prop_value, + config); + zfs_close(zhp); + char *dsname = config->dsname; + config->dsname = NULL; + return (dsname); + } + size_t len = ZFS_MAX_DATASET_NAME_LEN; size_t total_len = strlen(config->homes_prefix) + 1 + strlen(config->username); diff --git a/sys/contrib/openzfs/etc/systemd/system/zfs-share.service.in b/sys/contrib/openzfs/etc/systemd/system/zfs-share.service.in index b720085874e5..745077513c30 100644 --- a/sys/contrib/openzfs/etc/systemd/system/zfs-share.service.in +++ b/sys/contrib/openzfs/etc/systemd/system/zfs-share.service.in @@ -8,6 +8,7 @@ Wants=zfs-mount.service After=zfs-mount.service PartOf=nfs-server.service nfs-kernel-server.service PartOf=smb.service +ConditionPathIsDirectory=/sys/module/zfs [Service] Type=oneshot diff --git a/sys/contrib/openzfs/etc/systemd/system/zfs-volume-wait.service.in b/sys/contrib/openzfs/etc/systemd/system/zfs-volume-wait.service.in index 75bd9fcdd56c..4c77724d8bbb 100644 --- a/sys/contrib/openzfs/etc/systemd/system/zfs-volume-wait.service.in +++ b/sys/contrib/openzfs/etc/systemd/system/zfs-volume-wait.service.in @@ -3,6 +3,7 @@ Description=Wait for ZFS Volume (zvol) links in /dev DefaultDependencies=no After=systemd-udev-settle.service After=zfs-import.target +ConditionPathIsDirectory=/sys/module/zfs [Service] Type=oneshot diff --git a/sys/contrib/openzfs/etc/systemd/system/zfs-zed.service.in 
b/sys/contrib/openzfs/etc/systemd/system/zfs-zed.service.in index f4313625ee5e..008075138f02 100644 --- a/sys/contrib/openzfs/etc/systemd/system/zfs-zed.service.in +++ b/sys/contrib/openzfs/etc/systemd/system/zfs-zed.service.in @@ -1,6 +1,7 @@ [Unit] Description=ZFS Event Daemon (zed) Documentation=man:zed(8) +ConditionPathIsDirectory=/sys/module/zfs [Service] ExecStart=@sbindir@/zed -F diff --git a/sys/contrib/openzfs/include/libzfs.h b/sys/contrib/openzfs/include/libzfs.h index e0b2676a441f..66cedd0ee0fe 100644 --- a/sys/contrib/openzfs/include/libzfs.h +++ b/sys/contrib/openzfs/include/libzfs.h @@ -88,8 +88,8 @@ typedef enum zfs_error { EZFS_ZONED, /* used improperly in local zone */ EZFS_MOUNTFAILED, /* failed to mount dataset */ EZFS_UMOUNTFAILED, /* failed to unmount dataset */ - EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ - EZFS_SHARENFSFAILED, /* share(1M) failed */ + EZFS_UNSHARENFSFAILED, /* failed to unshare over nfs */ + EZFS_SHARENFSFAILED, /* failed to share over nfs */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_FAULT, /* bad address */ @@ -455,6 +455,7 @@ extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, nvlist_t *); extern int zpool_checkpoint(zpool_handle_t *); extern int zpool_discard_checkpoint(zpool_handle_t *); +extern boolean_t zpool_is_draid_spare(const char *); /* * Basic handle manipulations. 
These functions do not create or destroy the @@ -556,7 +557,7 @@ extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); /* * zpool property management */ -extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); +extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **, boolean_t); extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, size_t); extern const char *zpool_prop_default_string(zpool_prop_t); diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h index e39bb07b2f4c..3481507d2c33 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/misc.h @@ -30,6 +30,7 @@ #define _OPENSOLARIS_SYS_MISC_H_ #include +#include #define MAXUID UID_MAX @@ -40,8 +41,8 @@ #define _FIOGDIO (INT_MIN+1) #define _FIOSDIO (INT_MIN+2) -#define _FIO_SEEK_DATA FIOSEEKDATA -#define _FIO_SEEK_HOLE FIOSEEKHOLE +#define F_SEEK_DATA FIOSEEKDATA +#define F_SEEK_HOLE FIOSEEKHOLE struct opensolaris_utsname { char *sysname; @@ -53,4 +54,7 @@ struct opensolaris_utsname { extern char hw_serial[11]; +#define task_io_account_read(n) +#define task_io_account_write(n) + #endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod_os.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod_os.h index ec1da1a46ae6..5b3b3271e39e 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/mod_os.h @@ -57,6 +57,8 @@ #define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, func, _, perm, desc) \ ZFS_MODULE_PARAM_CALL_IMPL(_vfs_ ## scope_prefix, name, perm, func ## _args(name_prefix ## name), desc) +#define ZFS_MODULE_VIRTUAL_PARAM_CALL ZFS_MODULE_PARAM_CALL + #define param_set_arc_long_args(var) \ CTLTYPE_ULONG, &var, 0, param_set_arc_long, "LU" @@ -84,6 +86,9 @@ #define param_set_max_auto_ashift_args(var) \ CTLTYPE_U64, &var, 
0, param_set_max_auto_ashift, "QU" +#define fletcher_4_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" + #include #define module_init(fn) \ static void \ @@ -93,6 +98,13 @@ wrap_ ## fn(void *dummy __unused) \ } \ SYSINIT(zfs_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) +#define module_init_early(fn) \ +static void \ +wrap_ ## fn(void *dummy __unused) \ +{ \ + fn(); \ +} \ +SYSINIT(zfs_ ## fn, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST, wrap_ ## fn, NULL) #define module_exit(fn) \ static void \ diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/policy.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/policy.h index 3a05da12b3aa..909ae3886e9c 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/policy.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/policy.h @@ -34,6 +34,7 @@ #include struct mount; struct vattr; +struct znode; int secpolicy_nfs(cred_t *cr); int secpolicy_zfs(cred_t *crd); @@ -57,7 +58,7 @@ int secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, int unlocked_access(void *, int, cred_t *), void *node); int secpolicy_vnode_create_gid(cred_t *cr); int secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid); -int secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr, +int secpolicy_vnode_setid_retain(struct znode *zp, cred_t *cr, boolean_t issuidroot); void secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr); int secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/sysmacros.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/sysmacros.h index 2d0164cb129f..7e3ab8915542 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/sysmacros.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/sysmacros.h @@ -80,6 +80,7 @@ extern "C" { #define kpreempt_disable() critical_enter() #define kpreempt_enable() critical_exit() #define CPU_SEQID curcpu +#define CPU_SEQID_UNSTABLE curcpu #define 
is_system_labeled() 0 /* * Convert a single byte to/from binary-coded decimal (BCD). diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/types.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/types.h index 3f895362881a..ecb91fd1bb89 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/types.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/types.h @@ -64,7 +64,7 @@ typedef u_int uint_t; typedef u_char uchar_t; typedef u_short ushort_t; typedef u_long ulong_t; -typedef u_int minor_t; +typedef int minor_t; /* END CSTYLED */ #ifndef _OFF64_T_DECLARED #define _OFF64_T_DECLARED diff --git a/sys/contrib/openzfs/include/os/freebsd/spl/sys/uio.h b/sys/contrib/openzfs/include/os/freebsd/spl/sys/uio.h index cb577df105e9..11b2189cda45 100644 --- a/sys/contrib/openzfs/include/os/freebsd/spl/sys/uio.h +++ b/sys/contrib/openzfs/include/os/freebsd/spl/sys/uio.h @@ -43,27 +43,6 @@ typedef struct uio uio_t; typedef struct iovec iovec_t; typedef enum uio_seg uio_seg_t; -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY -} xuio_type_t; - -typedef struct xuio { - uio_t xu_uio; - - /* Extended uio fields */ - enum xuio_type xu_type; /* What kind of uio structure? 
*/ - union { - struct { - int xu_zc_rw; - void *xu_zc_priv; - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - static __inline int zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio) { @@ -82,6 +61,8 @@ void uioskip(uio_t *uiop, size_t n); #define uio_iovcnt(uio) (uio)->uio_iovcnt #define uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len #define uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base +#define uio_fault_disable(uio, set) +#define uio_prefaultpages(size, uio) (0) static inline void uio_iov_at_index(uio_t *uio, uint_t idx, void **base, uint64_t *len) diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/Makefile.am b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/Makefile.am index bf5cc39eba74..392bb4ae3477 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/Makefile.am +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/Makefile.am @@ -8,7 +8,7 @@ KERNEL_H = \ zfs_dir.h \ zfs_ioctl_compat.h \ zfs_vfsops_os.h \ - zfs_vnops.h \ + zfs_vnops_os.h \ zfs_znode_impl.h \ zpl.h diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_context_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_context_os.h index 0316f93b27ec..8dbe907d098c 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_context_os.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -56,7 +56,6 @@ #define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) #define fm_panic panic -#define cond_resched() kern_yield(PRI_USER) extern int zfs_debug_level; extern struct mtx zfs_debug_mtx; #define ZFS_LOG(lvl, ...) 
do { \ diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vnops.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vnops.h index 587650af6ce3..bf5e03b24c06 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vnops.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vnops.h @@ -26,8 +26,9 @@ * $FreeBSD$ */ -#ifndef _SYS_ZFS_VNOPS_H_ -#define _SYS_ZFS_VNOPS_H_ +#ifndef _SYS_FS_ZFS_VNOPS_OS_H +#define _SYS_FS_ZFS_VNOPS_OS_H + int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h index ff61935e741e..ac2625d9a8ab 100644 --- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h +++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -39,6 +39,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -113,7 +114,10 @@ extern minor_t zfsdev_minor_alloc(void); #define Z_ISBLK(type) ((type) == VBLK) #define Z_ISCHR(type) ((type) == VCHR) #define Z_ISLNK(type) ((type) == VLNK) +#define Z_ISDIR(type) ((type) == VDIR) +#define zn_has_cached_data(zp) vn_has_cached_data(ZTOV(zp)) +#define zn_rlimit_fsize(zp, uio, td) vn_rlimit_fsize(ZTOV(zp), (uio), (td)) /* Called on entry to each ZFS vnode and vfs operation */ #define ZFS_ENTER(zfsvfs) \ @@ -169,13 +173,12 @@ extern void zfs_tstamp_update_setup_ext(struct znode *, uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx); extern void zfs_znode_free(struct znode *); -extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; extern int zfsfstype; extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, char *buf); - +extern void zfs_inode_update(struct znode *); #ifdef __cplusplus } #endif diff --git 
a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h index 1cdc300a6f85..e41b248b0405 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h @@ -99,14 +99,6 @@ blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) #endif } -#if !defined(HAVE_GET_DISK_AND_MODULE) -static inline struct kobject * -get_disk_and_module(struct gendisk *disk) -{ - return (get_disk(disk)); -} -#endif - #ifdef HAVE_BIO_BVEC_ITER #define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector #define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size @@ -268,12 +260,48 @@ bio_set_bi_error(struct bio *bio, int error) * * For older kernels trigger a re-reading of the partition table by calling * check_disk_change() which calls flush_disk() to invalidate the device. + * + * For newer kernels (as of 5.10), bdev_check_media_change is used, in favor of + * check_disk_change(), with the modification that invalidation is no longer + * forced. 
*/ +#ifdef HAVE_CHECK_DISK_CHANGE +#define zfs_check_media_change(bdev) check_disk_change(bdev) #ifdef HAVE_BLKDEV_REREAD_PART #define vdev_bdev_reread_part(bdev) blkdev_reread_part(bdev) #else #define vdev_bdev_reread_part(bdev) check_disk_change(bdev) #endif /* HAVE_BLKDEV_REREAD_PART */ +#else +#ifdef HAVE_BDEV_CHECK_MEDIA_CHANGE +static inline int +zfs_check_media_change(struct block_device *bdev) +{ + struct gendisk *gd = bdev->bd_disk; + const struct block_device_operations *bdo = gd->fops; + + if (!bdev_check_media_change(bdev)) + return (0); + + /* + * Force revalidation, to mimic the old behavior of + * check_disk_change() + */ + if (bdo->revalidate_disk) + bdo->revalidate_disk(gd); + + return (0); +} +#define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) +#else +/* + * This is encountered if check_disk_change() and bdev_check_media_change() + * are not available in the kernel - likely due to an API change that needs + * to be chased down. + */ +#error "Unsupported kernel: no usable disk change check" +#endif /* HAVE_BDEV_CHECK_MEDIA_CHANGE */ +#endif /* HAVE_CHECK_DISK_CHANGE */ /* * 2.6.27 API change @@ -282,16 +310,38 @@ bio_set_bi_error(struct bio *bio, int error) * * 4.4.0-6.21 API change for Ubuntu * lookup_bdev() gained a second argument, FMODE_*, to check inode permissions. + * + * 5.11 API change + * Changed to take a dev_t argument which is set on success and return a + * non-zero error code on failure. 
*/ -#ifdef HAVE_1ARG_LOOKUP_BDEV -#define vdev_lookup_bdev(path) lookup_bdev(path) -#else -#ifdef HAVE_2ARGS_LOOKUP_BDEV -#define vdev_lookup_bdev(path) lookup_bdev(path, 0) +static inline int +vdev_lookup_bdev(const char *path, dev_t *dev) +{ +#if defined(HAVE_DEVT_LOOKUP_BDEV) + return (lookup_bdev(path, dev)); +#elif defined(HAVE_1ARG_LOOKUP_BDEV) + struct block_device *bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return (PTR_ERR(bdev)); + + *dev = bdev->bd_dev; + bdput(bdev); + + return (0); +#elif defined(HAVE_MODE_LOOKUP_BDEV) + struct block_device *bdev = lookup_bdev(path, FMODE_READ); + if (IS_ERR(bdev)) + return (PTR_ERR(bdev)); + + *dev = bdev->bd_dev; + bdput(bdev); + + return (0); #else #error "Unsupported kernel" -#endif /* HAVE_2ARGS_LOOKUP_BDEV */ -#endif /* HAVE_1ARG_LOOKUP_BDEV */ +#endif +} /* * Kernels without bio_set_op_attrs use bi_rw for the bio flags. @@ -465,25 +515,38 @@ blk_queue_discard_secure(struct request_queue *q) */ #define VDEV_HOLDER ((void *)0x2401de7) -static inline void -blk_generic_start_io_acct(struct request_queue *q, int rw, - unsigned long sectors, struct hd_struct *part) +static inline unsigned long +blk_generic_start_io_acct(struct request_queue *q __attribute__((unused)), + struct gendisk *disk __attribute__((unused)), + int rw __attribute__((unused)), struct bio *bio) { -#if defined(HAVE_GENERIC_IO_ACCT_3ARG) - generic_start_io_acct(rw, sectors, part); +#if defined(HAVE_BIO_IO_ACCT) + return (bio_start_io_acct(bio)); +#elif defined(HAVE_GENERIC_IO_ACCT_3ARG) + unsigned long start_time = jiffies; + generic_start_io_acct(rw, bio_sectors(bio), &disk->part0); + return (start_time); #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) - generic_start_io_acct(q, rw, sectors, part); + unsigned long start_time = jiffies; + generic_start_io_acct(q, rw, bio_sectors(bio), &disk->part0); + return (start_time); +#else + /* Unsupported */ + return (0); #endif } static inline void -blk_generic_end_io_acct(struct request_queue *q, int rw, - 
struct hd_struct *part, unsigned long start_time) +blk_generic_end_io_acct(struct request_queue *q __attribute__((unused)), + struct gendisk *disk __attribute__((unused)), + int rw __attribute__((unused)), struct bio *bio, unsigned long start_time) { -#if defined(HAVE_GENERIC_IO_ACCT_3ARG) - generic_end_io_acct(rw, part, start_time); +#if defined(HAVE_BIO_IO_ACCT) + bio_end_io_acct(bio, start_time); +#elif defined(HAVE_GENERIC_IO_ACCT_3ARG) + generic_end_io_acct(rw, &disk->part0, start_time); #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) - generic_end_io_acct(q, rw, part, start_time); + generic_end_io_acct(q, rw, &disk->part0, start_time); #endif } @@ -493,6 +556,8 @@ blk_generic_alloc_queue(make_request_fn make_request, int node_id) { #if defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN) return (blk_alloc_queue(make_request, node_id)); +#elif defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH) + return (blk_alloc_queue_rh(make_request, node_id)); #else struct request_queue *q = blk_alloc_queue(GFP_KERNEL); if (q != NULL) diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h index 1c48df5cbd81..e96e95313009 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/mod_compat.h @@ -74,6 +74,7 @@ enum scope_prefix_types { zfs_vdev_cache, zfs_vdev_file, zfs_vdev_mirror, + zfs_vnops, zfs_zevent, zfs_zio, zfs_zil @@ -143,6 +144,17 @@ enum scope_prefix_types { MODULE_PARM_DESC(name_prefix ## name, desc) /* END CSTYLED */ +/* + * As above, but there is no variable with the name name_prefix ## name, + * so NULL is passed to module_param_call instead. 
+ */ +/* BEGIN CSTYLED */ +#define ZFS_MODULE_VIRTUAL_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, getfunc, perm, desc) \ + CTASSERT_GLOBAL((sizeof (scope_prefix) == sizeof (enum scope_prefix_types))); \ + module_param_call(name_prefix ## name, setfunc, getfunc, NULL, perm); \ + MODULE_PARM_DESC(name_prefix ## name, desc) +/* END CSTYLED */ + #define ZFS_MODULE_PARAM_ARGS const char *buf, zfs_kernel_param_t *kp #define ZFS_MODULE_DESCRIPTION(s) MODULE_DESCRIPTION(s) @@ -150,4 +162,6 @@ enum scope_prefix_types { #define ZFS_MODULE_LICENSE(s) MODULE_LICENSE(s) #define ZFS_MODULE_VERSION(s) MODULE_VERSION(s) +#define module_init_early(fn) module_init(fn) + #endif /* _MOD_COMPAT_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h b/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h index eb3494bc7904..98d1ab1d7f8a 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/sysmacros.h @@ -76,6 +76,7 @@ #define max_ncpus num_possible_cpus() #define boot_ncpus num_online_cpus() #define CPU_SEQID smp_processor_id() +#define CPU_SEQID_UNSTABLE raw_smp_processor_id() #define is_system_labeled() 0 #ifndef RLIM64_INFINITY diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h index 16f4349e78e4..b50175a10873 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/taskq.h @@ -84,6 +84,8 @@ typedef struct taskq { int tq_nthreads; /* # of existing threads */ int tq_nspawn; /* # of threads being spawned */ int tq_maxthreads; /* # of threads maximum */ + /* If PERCPU flag is set, percent of NCPUs to have as threads */ + int tq_cpu_pct; int tq_pri; /* priority */ int tq_minalloc; /* min taskq_ent_t pool size */ int tq_maxalloc; /* max taskq_ent_t pool size */ @@ -99,6 +101,9 @@ typedef struct taskq { spl_wait_queue_head_t tq_work_waitq; /* new work waitq */ 
spl_wait_queue_head_t tq_wait_waitq; /* wait waitq */ tq_lock_role_t tq_lock_class; /* class when taking tq_lock */ + /* list node for the cpu hotplug callback */ + struct hlist_node tq_hp_cb_node; + boolean_t tq_hp_support; } taskq_t; typedef struct taskq_ent { diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h index abcd90dd570c..6e850c5fe7b1 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/uio.h @@ -44,14 +44,19 @@ typedef enum uio_rw { typedef enum uio_seg { UIO_USERSPACE = 0, UIO_SYSSPACE = 1, - UIO_USERISPACE = 2, - UIO_BVEC = 3, + UIO_BVEC = 2, +#if defined(HAVE_VFS_IOV_ITER) + UIO_ITER = 3, +#endif } uio_seg_t; typedef struct uio { union { const struct iovec *uio_iov; const struct bio_vec *uio_bvec; +#if defined(HAVE_VFS_IOV_ITER) + struct iov_iter *uio_iter; +#endif }; int uio_iovcnt; offset_t uio_loffset; @@ -59,60 +64,17 @@ typedef struct uio { boolean_t uio_fault_disable; uint16_t uio_fmode; uint16_t uio_extflg; - offset_t uio_limit; ssize_t uio_resid; size_t uio_skip; } uio_t; -typedef struct aio_req { - uio_t *aio_uio; - void *aio_private; -} aio_req_t; - -typedef enum xuio_type { - UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY, -} xuio_type_t; - - -#define UIOA_IOV_MAX 16 - -typedef struct uioa_page_s { - int uioa_pfncnt; - void **uioa_ppp; - caddr_t uioa_base; - size_t uioa_len; -} uioa_page_t; - -typedef struct xuio { - uio_t xu_uio; - enum xuio_type xu_type; - union { - struct { - uint32_t xu_a_state; - ssize_t xu_a_mbytes; - uioa_page_t *xu_a_lcur; - void **xu_a_lppp; - void *xu_a_hwst[4]; - uioa_page_t xu_a_locked[UIOA_IOV_MAX]; - } xu_aio; - - struct { - int xu_zc_rw; - void *xu_zc_priv; - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - #define uio_segflg(uio) (uio)->uio_segflg #define uio_offset(uio) (uio)->uio_loffset 
#define uio_resid(uio) (uio)->uio_resid #define uio_iovcnt(uio) (uio)->uio_iovcnt #define uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len #define uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base +#define uio_fault_disable(uio, set) (uio)->uio_fault_disable = set static inline void uio_iov_at_index(uio_t *uio, uint_t idx, void **base, uint64_t *len) @@ -140,4 +102,65 @@ uio_index_at_offset(uio_t *uio, offset_t off, uint_t *vec_idx) return (off); } +static inline void +iov_iter_init_compat(struct iov_iter *iter, unsigned int dir, + const struct iovec *iov, unsigned long nr_segs, size_t count) +{ +#if defined(HAVE_IOV_ITER_INIT) + iov_iter_init(iter, dir, iov, nr_segs, count); +#elif defined(HAVE_IOV_ITER_INIT_LEGACY) + iov_iter_init(iter, iov, nr_segs, count, 0); +#else +#error "Unsupported kernel" +#endif +} + +static inline void +uio_iovec_init(uio_t *uio, const struct iovec *iov, unsigned long nr_segs, + offset_t offset, uio_seg_t seg, ssize_t resid, size_t skip) +{ + ASSERT(seg == UIO_USERSPACE || seg == UIO_SYSSPACE); + + uio->uio_iov = iov; + uio->uio_iovcnt = nr_segs; + uio->uio_loffset = offset; + uio->uio_segflg = seg; + uio->uio_fault_disable = B_FALSE; + uio->uio_fmode = 0; + uio->uio_extflg = 0; + uio->uio_resid = resid; + uio->uio_skip = skip; +} + +static inline void +uio_bvec_init(uio_t *uio, struct bio *bio) +{ + uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; + uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); + uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; + uio->uio_segflg = UIO_BVEC; + uio->uio_fault_disable = B_FALSE; + uio->uio_fmode = 0; + uio->uio_extflg = 0; + uio->uio_resid = BIO_BI_SIZE(bio); + uio->uio_skip = BIO_BI_SKIP(bio); +} + +#if defined(HAVE_VFS_IOV_ITER) +static inline void +uio_iov_iter_init(uio_t *uio, struct iov_iter *iter, offset_t offset, + ssize_t resid, size_t skip) +{ + uio->uio_iter = iter; + uio->uio_iovcnt = iter->nr_segs; + uio->uio_loffset = offset; + uio->uio_segflg = UIO_ITER; + uio->uio_fault_disable = 
B_FALSE; + uio->uio_fmode = 0; + uio->uio_extflg = 0; + uio->uio_resid = resid; + uio->uio_skip = skip; +} +#endif + #endif /* SPL_UIO_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/vnode.h b/sys/contrib/openzfs/include/os/linux/spl/sys/vnode.h index 6f17db89fe53..64c270650225 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/vnode.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/vnode.h @@ -52,6 +52,12 @@ #define F_FREESP 11 /* Free file space */ + +#if defined(SEEK_HOLE) && defined(SEEK_DATA) +#define F_SEEK_DATA SEEK_DATA +#define F_SEEK_HOLE SEEK_HOLE +#endif + /* * The vnode AT_ flags are mapped to the Linux ATTR_* flags. * This allows them to be used safely with an iattr structure. diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/Makefile.am b/sys/contrib/openzfs/include/os/linux/zfs/sys/Makefile.am index a5f2502d20e8..a075db476e40 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/Makefile.am +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/Makefile.am @@ -21,7 +21,7 @@ KERNEL_H = \ zfs_ctldir.h \ zfs_dir.h \ zfs_vfsops_os.h \ - zfs_vnops.h \ + zfs_vnops_os.h \ zfs_znode_impl.h \ zpl.h diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/policy.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/policy.h index 77a73ad149c5..61afc3765504 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/policy.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/policy.h @@ -35,6 +35,8 @@ #include #include +struct znode; + int secpolicy_nfs(const cred_t *); int secpolicy_sys_config(const cred_t *, boolean_t); int secpolicy_vnode_access2(const cred_t *, struct inode *, @@ -44,7 +46,7 @@ int secpolicy_vnode_chown(const cred_t *, uid_t); int secpolicy_vnode_create_gid(const cred_t *); int secpolicy_vnode_remove(const cred_t *); int secpolicy_vnode_setdac(const cred_t *, uid_t); -int secpolicy_vnode_setid_retain(const cred_t *, boolean_t); +int secpolicy_vnode_setid_retain(struct znode *, const cred_t *, boolean_t); int 
secpolicy_vnode_setids_setgids(const cred_t *, gid_t); int secpolicy_zinject(const cred_t *); int secpolicy_zfs(const cred_t *); diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h index 083560952f0b..4707fc6f4112 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/trace_acl.h @@ -52,7 +52,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __field(uint8_t, z_unlinked) __field(uint8_t, z_atime_dirty) __field(uint8_t, z_zn_prefetch) - __field(uint8_t, z_moved) __field(uint_t, z_blksz) __field(uint_t, z_seq) __field(uint64_t, z_mapcnt) @@ -86,7 +85,6 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->z_unlinked = zn->z_unlinked; __entry->z_atime_dirty = zn->z_atime_dirty; __entry->z_zn_prefetch = zn->z_zn_prefetch; - __entry->z_moved = zn->z_moved; __entry->z_blksz = zn->z_blksz; __entry->z_seq = zn->z_seq; __entry->z_mapcnt = zn->z_mapcnt; @@ -116,7 +114,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class, __entry->mask_matched = mask_matched; ), TP_printk("zn { id %llu unlinked %u atime_dirty %u " - "zn_prefetch %u moved %u blksz %u seq %u " + "zn_prefetch %u blksz %u seq %u " "mapcnt %llu size %llu pflags %llu " "sync_cnt %u mode 0x%x is_sa %d " "is_mapped %d is_ctldir %d is_stale %d inode { " @@ -124,7 +122,7 @@ DECLARE_EVENT_CLASS(zfs_ace_class, "blkbits %u bytes %u mode 0x%x generation %x } } " "ace { type %u flags %u access_mask %u } mask_matched %u", __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty, - __entry->z_zn_prefetch, __entry->z_moved, __entry->z_blksz, + __entry->z_zn_prefetch, __entry->z_blksz, __entry->z_seq, __entry->z_mapcnt, __entry->z_size, __entry->z_pflags, __entry->z_sync_cnt, __entry->z_mode, __entry->z_is_sa, __entry->z_is_mapped, diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vnops.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vnops.h index 2b41f3863425..df307fc0350d 100644 --- 
a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vnops.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_vnops.h @@ -22,8 +22,8 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ -#ifndef _SYS_FS_ZFS_VNOPS_H -#define _SYS_FS_ZFS_VNOPS_H +#ifndef _SYS_FS_ZFS_VNOPS_OS_H +#define _SYS_FS_ZFS_VNOPS_OS_H #include #include @@ -40,12 +40,8 @@ extern "C" { extern int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr); extern int zfs_close(struct inode *ip, int flag, cred_t *cr); -extern int zfs_holey(struct inode *ip, int cmd, loff_t *off); -extern int zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr); -extern int zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr); extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *resid); -extern int zfs_access(struct inode *ip, int mode, int flag, cred_t *cr); extern int zfs_lookup(znode_t *dzp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, @@ -58,7 +54,6 @@ extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags); extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr); -extern int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr); extern int zfs_getattr_fast(struct inode *ip, struct kstat *sp); extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, @@ -72,10 +67,6 @@ extern void zfs_inactive(struct inode *ip); extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr); extern int zfs_fid(struct inode *ip, fid_t *fidp); -extern int zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, - cred_t *cr); -extern int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, - cred_t *cr); 
extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); extern int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc); diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h index e010970dd540..13e5fb653f5b 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -68,6 +68,10 @@ extern "C" { #define Z_ISCHR(type) S_ISCHR(type) #define Z_ISLNK(type) S_ISLNK(type) #define Z_ISDEV(type) (S_ISCHR(type) || S_ISBLK(type) || S_ISFIFO(type)) +#define Z_ISDIR(type) S_ISDIR(type) + +#define zn_has_cached_data(zp) ((zp)->z_is_mapped) +#define zn_rlimit_fsize(zp, uio, td) (0) #define zhold(zp) igrab(ZTOI((zp))) #define zrele(zp) iput(ZTOI((zp))) @@ -90,7 +94,11 @@ do { \ zfs_exit_fs(zfsvfs); \ ZFS_EXIT_READ(zfsvfs, FTAG); \ } while (0) -#define ZPL_EXIT(zfsvfs) ZFS_EXIT(zfsvfs) + +#define ZPL_EXIT(zfsvfs) \ +do { \ + rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ +} while (0) /* Verifies the znode is valid. 
*/ #define ZFS_VERIFY_ZP_ERROR(zp, error) \ @@ -143,6 +151,8 @@ do { \ } while (0) #endif /* HAVE_INODE_TIMESPEC64_TIMES */ +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) + struct znode; extern int zfs_sync(struct super_block *, int, cred_t *); @@ -157,7 +167,6 @@ extern caddr_t zfs_map_page(page_t *, enum seg_rw); extern void zfs_unmap_page(page_t *, caddr_t); #endif /* HAVE_UIO_RW */ -extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; extern int zfsfstype; diff --git a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h index ef5a0b842d09..b0bb9c29c0b4 100644 --- a/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h +++ b/sys/contrib/openzfs/include/os/linux/zfs/sys/zpl.h @@ -46,15 +46,6 @@ extern const struct inode_operations zpl_dir_inode_operations; extern const struct inode_operations zpl_symlink_inode_operations; extern const struct inode_operations zpl_special_inode_operations; extern dentry_operations_t zpl_dentry_operations; - -/* zpl_file.c */ -extern ssize_t zpl_read_common(struct inode *ip, const char *buf, - size_t len, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr); -extern ssize_t zpl_write_common(struct inode *ip, const char *buf, - size_t len, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr); - extern const struct address_space_operations zpl_address_space_operations; extern const struct file_operations zpl_file_operations; extern const struct file_operations zpl_dir_file_operations; diff --git a/sys/contrib/openzfs/include/sys/Makefile.am b/sys/contrib/openzfs/include/sys/Makefile.am index a944c5ea834d..c3ebf17b5288 100644 --- a/sys/contrib/openzfs/include/sys/Makefile.am +++ b/sys/contrib/openzfs/include/sys/Makefile.am @@ -82,6 +82,7 @@ COMMON_H = \ vdev_disk.h \ vdev_file.h \ vdev.h \ + vdev_draid.h \ vdev_impl.h \ vdev_indirect_births.h \ vdev_indirect_mapping.h \ @@ -117,6 +118,7 @@ COMMON_H = \ zfs_stat.h \ zfs_sysfs.h \ zfs_vfsops.h 
\ + zfs_vnops.h \ zfs_znode.h \ zil.h \ zil_impl.h \ diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h index a0852b4d5a70..f58fa53b6003 100644 --- a/sys/contrib/openzfs/include/sys/arc.h +++ b/sys/contrib/openzfs/include/sys/arc.h @@ -154,6 +154,11 @@ typedef enum arc_flags */ ARC_FLAG_CACHED_ONLY = 1 << 22, + /* + * Don't instantiate an arc_buf_t for arc_read_done. + */ + ARC_FLAG_NO_BUF = 1 << 23, + /* * The arc buffer's compression mode is stored in the top 7 bits of the * flags field, so these dummy flags are included so that MDB can @@ -305,6 +310,7 @@ int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); uint64_t arc_all_memory(void); uint64_t arc_default_max(uint64_t min, uint64_t allmem); uint64_t arc_target_bytes(void); +void arc_set_limits(uint64_t); void arc_init(void); void arc_fini(void); diff --git a/sys/contrib/openzfs/include/sys/arc_impl.h b/sys/contrib/openzfs/include/sys/arc_impl.h index c5061695d944..94123fc10e67 100644 --- a/sys/contrib/openzfs/include/sys/arc_impl.h +++ b/sys/contrib/openzfs/include/sys/arc_impl.h @@ -99,6 +99,7 @@ struct arc_callback { boolean_t acb_encrypted; boolean_t acb_compressed; boolean_t acb_noauth; + boolean_t acb_nobuf; zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; @@ -350,6 +351,8 @@ typedef struct l2arc_lb_ptr_buf { #define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) #define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) #define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) +#define L2BLK_GET_STATE(field) BF64_GET((field), 57, 4) +#define L2BLK_SET_STATE(field, x) BF64_SET((field), 57, 4, x) #define PTR_SWAP(x, y) \ do { \ @@ -445,6 +448,7 @@ typedef struct l2arc_buf_hdr { l2arc_dev_t *b_dev; /* L2ARC device */ uint64_t b_daddr; /* disk address, offset byte */ uint32_t b_hits; + arc_state_type_t b_arcs_state; list_node_t b_l2node; } l2arc_buf_hdr_t; @@ -546,6 +550,8 @@ typedef struct arc_stats { kstat_named_t 
arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_eligible_mfu; + kstat_named_t arcstat_evict_l2_eligible_mru; kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; @@ -744,6 +750,18 @@ typedef struct arc_stats { kstat_named_t arcstat_mfu_ghost_evictable_metadata; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; + /* + * Allocated size (in bytes) of L2ARC cached buffers by ARC state. + */ + kstat_named_t arcstat_l2_prefetch_asize; + kstat_named_t arcstat_l2_mru_asize; + kstat_named_t arcstat_l2_mfu_asize; + /* + * Allocated size (in bytes) of L2ARC cached buffers by buffer content + * type. + */ + kstat_named_t arcstat_l2_bufc_data_asize; + kstat_named_t arcstat_l2_bufc_metadata_asize; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; kstat_named_t arcstat_l2_read_bytes; @@ -909,6 +927,8 @@ extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); extern uint64_t arc_free_memory(void); extern int64_t arc_available_memory(void); extern void arc_tuning_update(boolean_t); +extern void arc_register_hotplug(void); +extern void arc_unregister_hotplug(void); extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS); extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); diff --git a/sys/contrib/openzfs/include/sys/dbuf.h b/sys/contrib/openzfs/include/sys/dbuf.h index 04338b2c491b..d221eac4c816 100644 --- a/sys/contrib/openzfs/include/sys/dbuf.h +++ b/sys/contrib/openzfs/include/sys/dbuf.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
*/ @@ -130,6 +130,16 @@ typedef struct dbuf_dirty_record { /* list link for dbuf dirty records */ list_node_t dr_dbuf_node; + /* + * The dnode we are part of. Note that the dnode can not be moved or + * evicted due to the hold that's added by dnode_setdirty() or + * dmu_objset_sync_dnodes(), and released by dnode_rele_task() or + * userquota_updates_task(). This hold is necessary for + * dirty_lightweight_leaf-type dirty records, which don't have a hold + * on a dbuf. + */ + dnode_t *dr_dnode; + /* pointer to parent dirty record */ struct dbuf_dirty_record *dr_parent; @@ -171,6 +181,17 @@ typedef struct dbuf_dirty_record { uint8_t dr_iv[ZIO_DATA_IV_LEN]; uint8_t dr_mac[ZIO_DATA_MAC_LEN]; } dl; + struct dirty_lightweight_leaf { + /* + * This dirty record refers to a leaf (level=0) + * block, whose dbuf has not been instantiated for + * performance reasons. + */ + uint64_t dr_blkid; + abd_t *dr_abd; + zio_prop_t dr_props; + enum zio_flag dr_flags; + } dll; } dt; } dbuf_dirty_record_t; @@ -309,6 +330,8 @@ typedef struct dbuf_hash_table { kmutex_t hash_mutexes[DBUF_MUTEXES]; } dbuf_hash_table_t; +typedef void (*dbuf_prefetch_fn)(void *, boolean_t); + uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level, const uint64_t offset); @@ -324,7 +347,10 @@ int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, boolean_t fail_uncached, void *tag, dmu_buf_impl_t **dbp); -void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, +int dbuf_prefetch_impl(struct dnode *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, + void *arg); +int dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, zio_priority_t prio, arc_flags_t aflags); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); @@ -344,11 +370,16 @@ void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, 
dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, + dmu_tx_t *tx); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); +int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, + const struct zio_prop *zp, enum zio_flag flags, dmu_tx_t *tx); + void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx); void dbuf_destroy(dmu_buf_impl_t *db); diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h index 54fdbc9ad227..0c50d0409b2b 100644 --- a/sys/contrib/openzfs/include/sys/dmu.h +++ b/sys/contrib/openzfs/include/sys/dmu.h @@ -864,18 +864,6 @@ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf -#ifdef HAVE_UIO_ZEROCOPY -int dmu_xuio_init(struct xuio *uio, int niov); -void dmu_xuio_fini(struct xuio *uio); -int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, - size_t n); -int dmu_xuio_cnt(struct xuio *uio); -struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); -void dmu_xuio_clear(struct xuio *uio, int i); -#endif /* HAVE_UIO_ZEROCOPY */ -void xuio_stat_wbuf_copied(void); -void xuio_stat_wbuf_nocopy(void); - extern int zfs_prefetch_disable; extern int zfs_max_recordsize; diff --git a/sys/contrib/openzfs/include/sys/dmu_impl.h b/sys/contrib/openzfs/include/sys/dmu_impl.h index 0c6273a3a727..def4aadba1d0 100644 --- a/sys/contrib/openzfs/include/sys/dmu_impl.h +++ b/sys/contrib/openzfs/include/sys/dmu_impl.h @@ -237,13 +237,6 @@ extern "C" { struct objset; struct dmu_pool; -typedef struct dmu_xuio { - int next; - int cnt; - struct arc_buf **bufs; - iovec_t *iovp; 
-} dmu_xuio_t; - typedef struct dmu_sendstatus { list_node_t dss_link; int dss_outfd; diff --git a/sys/contrib/openzfs/include/sys/dmu_objset.h b/sys/contrib/openzfs/include/sys/dmu_objset.h index 1af69832c5d3..a8cb812714ec 100644 --- a/sys/contrib/openzfs/include/sys/dmu_objset.h +++ b/sys/contrib/openzfs/include/sys/dmu_objset.h @@ -242,10 +242,10 @@ objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_t **osp); void dmu_objset_evict(objset_t *os); -void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); +void dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx); void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); boolean_t dmu_objset_userused_enabled(objset_t *os); -int dmu_objset_userspace_upgrade(objset_t *os); +void dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); boolean_t dmu_objset_userobjused_enabled(objset_t *os); boolean_t dmu_objset_userobjspace_upgradable(objset_t *os); diff --git a/sys/contrib/openzfs/include/sys/dmu_zfetch.h b/sys/contrib/openzfs/include/sys/dmu_zfetch.h index 4303ab314ced..34b711fc06e5 100644 --- a/sys/contrib/openzfs/include/sys/dmu_zfetch.h +++ b/sys/contrib/openzfs/include/sys/dmu_zfetch.h @@ -40,6 +40,13 @@ extern unsigned long zfetch_array_rd_sz; struct dnode; /* so we can reference dnode */ +typedef struct zfetch { + kmutex_t zf_lock; /* protects zfetch structure */ + list_t zf_stream; /* list of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ + int zf_numstreams; /* number of zstream_t's */ +} zfetch_t; + typedef struct zstream { uint64_t zs_blkid; /* expect next access at this blkid */ uint64_t zs_pf_blkid; /* next block to prefetch */ @@ -52,15 +59,12 @@ typedef struct zstream { kmutex_t zs_lock; /* protects stream */ hrtime_t zs_atime; /* time last prefetch issued */ + hrtime_t zs_start_time; /* start of last prefetch 
*/ list_node_t zs_node; /* link for zf_stream */ + zfetch_t *zs_fetch; /* parent fetch */ + zfs_refcount_t zs_blocks; /* number of pending blocks in the stream */ } zstream_t; -typedef struct zfetch { - kmutex_t zf_lock; /* protects zfetch structure */ - list_t zf_stream; /* list of zstream_t's */ - struct dnode *zf_dnode; /* dnode that owns this zfetch */ -} zfetch_t; - void zfetch_init(void); void zfetch_fini(void); diff --git a/sys/contrib/openzfs/include/sys/dsl_dataset.h b/sys/contrib/openzfs/include/sys/dsl_dataset.h index f5816a934c5f..ed934f969e92 100644 --- a/sys/contrib/openzfs/include/sys/dsl_dataset.h +++ b/sys/contrib/openzfs/include/sys/dsl_dataset.h @@ -316,6 +316,7 @@ typedef struct dsl_dataset_snapshot_arg { /* flags for holding the dataset */ typedef enum ds_hold_flags { + DS_HOLD_FLAG_NONE = 0 << 0, DS_HOLD_FLAG_DECRYPT = 1 << 0 /* needs access to encrypted data */ } ds_hold_flags_t; diff --git a/sys/contrib/openzfs/include/sys/dsl_scan.h b/sys/contrib/openzfs/include/sys/dsl_scan.h index 8f929207d2d7..19c3dd599b10 100644 --- a/sys/contrib/openzfs/include/sys/dsl_scan.h +++ b/sys/contrib/openzfs/include/sys/dsl_scan.h @@ -163,6 +163,7 @@ typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; void scan_init(void); void scan_fini(void); int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +void dsl_scan_setup_sync(void *, dmu_tx_t *); void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); int dsl_scan_cancel(struct dsl_pool *); diff --git a/sys/contrib/openzfs/include/sys/frame.h b/sys/contrib/openzfs/include/sys/frame.h index b6bbaa79b2f8..caae851421d8 100644 --- a/sys/contrib/openzfs/include/sys/frame.h +++ b/sys/contrib/openzfs/include/sys/frame.h @@ -25,7 +25,11 @@ extern "C" { #if defined(__KERNEL__) && defined(HAVE_KERNEL_OBJTOOL) && \ defined(HAVE_STACK_FRAME_NON_STANDARD) +#if defined(HAVE_KERNEL_OBJTOOL_HEADER) +#include +#else #include +#endif #else #define STACK_FRAME_NON_STANDARD(func) #endif diff 
--git a/sys/contrib/openzfs/include/sys/fs/zfs.h b/sys/contrib/openzfs/include/sys/fs/zfs.h index fe63d735babc..60c1b84602a3 100644 --- a/sys/contrib/openzfs/include/sys/fs/zfs.h +++ b/sys/contrib/openzfs/include/sys/fs/zfs.h @@ -249,7 +249,7 @@ typedef enum { ZPOOL_NUM_PROPS } zpool_prop_t; -/* Small enough to not hog a whole line of printout in zpool(1M). */ +/* Small enough to not hog a whole line of printout in zpool(8). */ #define ZPROP_MAX_COMMENT 32 #define ZPROP_VALUE "value" @@ -617,6 +617,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" #define ZPOOL_CONFIG_PATH "path" #define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_SPARE_ID "spareid" #define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" #define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" #define ZPOOL_CONFIG_ASHIFT "ashift" @@ -757,10 +758,17 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" #define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" +/* dRAID configuration */ +#define ZPOOL_CONFIG_DRAID_NDATA "draid_ndata" +#define ZPOOL_CONFIG_DRAID_NSPARES "draid_nspares" +#define ZPOOL_CONFIG_DRAID_NGROUPS "draid_ngroups" + #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" #define VDEV_TYPE_REPLACING "replacing" #define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DRAID "draid" +#define VDEV_TYPE_DRAID_SPARE "dspare" #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" @@ -770,6 +778,12 @@ typedef struct zpool_load_policy { #define VDEV_TYPE_L2CACHE "l2cache" #define VDEV_TYPE_INDIRECT "indirect" +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_DRAID_MAXPARITY 3 +#define VDEV_DRAID_MIN_CHILDREN 2 +#define VDEV_DRAID_MAX_CHILDREN UINT8_MAX + /* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. 
*/ #define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ "com.delphix:indirect_obsolete_sm" @@ -1438,6 +1452,7 @@ typedef enum { #define ZPOOL_HIST_IOCTL "ioctl" #define ZPOOL_HIST_INPUT_NVL "in_nvl" #define ZPOOL_HIST_OUTPUT_NVL "out_nvl" +#define ZPOOL_HIST_OUTPUT_SIZE "out_size" #define ZPOOL_HIST_DSNAME "dsname" #define ZPOOL_HIST_DSID "dsid" #define ZPOOL_HIST_ERRNO "errno" diff --git a/sys/contrib/openzfs/include/sys/metaslab.h b/sys/contrib/openzfs/include/sys/metaslab.h index b3b7f865536e..ecff65f13de5 100644 --- a/sys/contrib/openzfs/include/sys/metaslab.h +++ b/sys/contrib/openzfs/include/sys/metaslab.h @@ -78,6 +78,7 @@ uint64_t metaslab_largest_allocatable(metaslab_t *); #define METASLAB_DONT_THROTTLE 0x10 #define METASLAB_MUST_RESERVE 0x20 #define METASLAB_FASTWRITE 0x40 +#define METASLAB_ZIL 0x80 int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *, diff --git a/sys/contrib/openzfs/include/sys/metaslab_impl.h b/sys/contrib/openzfs/include/sys/metaslab_impl.h index 4a7475256a2b..3be0c466c403 100644 --- a/sys/contrib/openzfs/include/sys/metaslab_impl.h +++ b/sys/contrib/openzfs/include/sys/metaslab_impl.h @@ -136,6 +136,29 @@ typedef enum trace_alloc_type { #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) +/* + * Per-allocator data structure. + */ +typedef struct metaslab_class_allocator { + metaslab_group_t *mca_rotor; + uint64_t mca_aliquot; + + /* + * The allocation throttle works on a reservation system. Whenever + * an asynchronous zio wants to perform an allocation it must + * first reserve the number of blocks that it wants to allocate. + * If there aren't sufficient slots available for the pending zio + * then that I/O is throttled until more slots free up. The current + * number of reserved allocations is maintained by the mca_alloc_slots + * refcount. 
The mca_alloc_max_slots value determines the maximum + * number of allocations that the system allows. Gang blocks are + * allowed to reserve slots even if we've reached the maximum + * number of allocations allowed. + */ + uint64_t mca_alloc_max_slots; + zfs_refcount_t mca_alloc_slots; +} metaslab_class_allocator_t; + /* * A metaslab class encompasses a category of allocatable top-level vdevs. * Each top-level vdev is associated with a metaslab group which defines @@ -145,7 +168,7 @@ typedef enum trace_alloc_type { * When a block allocation is requested from the SPA it is associated with a * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging * to the class can be used to satisfy that request. Allocations are done - * by traversing the metaslab groups that are linked off of the mc_rotor field. + * by traversing the metaslab groups that are linked off of the mca_rotor field. * This rotor points to the next metaslab group where allocations will be * attempted. Allocating a block is a 3 step process -- select the metaslab * group, select the metaslab, and then allocate the block. The metaslab @@ -156,9 +179,7 @@ typedef enum trace_alloc_type { struct metaslab_class { kmutex_t mc_lock; spa_t *mc_spa; - metaslab_group_t *mc_rotor; metaslab_ops_t *mc_ops; - uint64_t mc_aliquot; /* * Track the number of metaslab groups that have been initialized @@ -173,21 +194,6 @@ struct metaslab_class { */ boolean_t mc_alloc_throttle_enabled; - /* - * The allocation throttle works on a reservation system. Whenever - * an asynchronous zio wants to perform an allocation it must - * first reserve the number of blocks that it wants to allocate. - * If there aren't sufficient slots available for the pending zio - * then that I/O is throttled until more slots free up. The current - * number of reserved allocations is maintained by the mc_alloc_slots - * refcount. The mc_alloc_max_slots value determines the maximum - * number of allocations that the system allows. 
Gang blocks are - * allowed to reserve slots even if we've reached the maximum - * number of allocations allowed. - */ - uint64_t *mc_alloc_max_slots; - zfs_refcount_t *mc_alloc_slots; - uint64_t mc_alloc_groups; /* # of allocatable groups */ uint64_t mc_alloc; /* total allocated space */ @@ -201,6 +207,8 @@ struct metaslab_class { * recent use. */ multilist_t *mc_metaslab_txg_list; + + metaslab_class_allocator_t mc_allocator[]; }; /* @@ -258,7 +266,7 @@ struct metaslab_group { * * Each allocator in each metaslab group has a current queue depth * (mg_alloc_queue_depth[allocator]) and a current max queue depth - * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group + * (mga_cur_max_alloc_queue_depth[allocator]), and each metaslab group * has an absolute max queue depth (mg_max_alloc_queue_depth). We * add IOs to an allocator until the mg_alloc_queue_depth for that * allocator hits the cur_max. Every time an IO completes for a given @@ -271,8 +279,7 @@ struct metaslab_group { * groups are unable to handle their share of allocations. */ uint64_t mg_max_alloc_queue_depth; - int mg_allocators; - metaslab_group_allocator_t *mg_allocator; /* array */ + /* * A metalab group that can no longer allocate the minimum block * size will set mg_no_free_space. 
Once a metaslab group is out @@ -290,6 +297,9 @@ struct metaslab_group { boolean_t mg_disabled_updating; kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; + + int mg_allocators; + metaslab_group_allocator_t mg_allocator[]; }; /* diff --git a/sys/contrib/openzfs/include/sys/spa_impl.h b/sys/contrib/openzfs/include/sys/spa_impl.h index 3eb87d2bb220..a3afaef38721 100644 --- a/sys/contrib/openzfs/include/sys/spa_impl.h +++ b/sys/contrib/openzfs/include/sys/spa_impl.h @@ -240,8 +240,9 @@ struct spa { kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ - int spa_min_ashift; /* of vdevs in normal class */ - int spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_min_ashift; /* of vdevs in normal class */ + uint64_t spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_min_alloc; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ diff --git a/sys/contrib/openzfs/include/sys/txg.h b/sys/contrib/openzfs/include/sys/txg.h index 260a3b43cfe8..22158bd1a5e6 100644 --- a/sys/contrib/openzfs/include/sys/txg.h +++ b/sys/contrib/openzfs/include/sys/txg.h @@ -41,6 +41,7 @@ extern "C" { #define TXG_MASK (TXG_SIZE - 1) /* mask for size */ #define TXG_INITIAL TXG_SIZE /* initial txg */ #define TXG_IDX (txg & TXG_MASK) +#define TXG_UNKNOWN 0 /* Number of txgs worth of frees we defer adding to in-core spacemaps */ #define TXG_DEFER_SIZE 2 diff --git a/sys/contrib/openzfs/include/sys/vdev.h b/sys/contrib/openzfs/include/sys/vdev.h index 309ce33be067..7bc72a03db1c 100644 --- a/sys/contrib/openzfs/include/sys/vdev.h +++ b/sys/contrib/openzfs/include/sys/vdev.h @@ -49,10 +49,13 @@ typedef enum vdev_dtl_type { extern int zfs_nocacheflush; +typedef boolean_t vdev_open_children_func_t(vdev_t *vd); + extern 
void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); extern void vdev_dbgmsg_print_tree(vdev_t *, int); extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); +extern void vdev_open_children_subset(vdev_t *, vdev_open_children_func_t *); extern int vdev_validate(vdev_t *); extern int vdev_copy_path_strict(vdev_t *, vdev_t *); extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); @@ -71,7 +74,10 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); -extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); +extern boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done); extern boolean_t vdev_dtl_required(vdev_t *vd); @@ -97,8 +103,14 @@ extern void vdev_metaslab_set_size(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd, char *tag); + +typedef void vdev_xlate_func_t(void *arg, range_seg64_t *physical_rs); + +extern boolean_t vdev_xlate_is_empty(range_seg64_t *rs); extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs); + range_seg64_t *physical_rs, range_seg64_t *remain_rs); +extern void vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg); extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); diff --git a/sys/contrib/openzfs/include/sys/vdev_draid.h b/sys/contrib/openzfs/include/sys/vdev_draid.h new file mode 100644 index 
000000000000..65417a93c4ed --- /dev/null +++ b/sys/contrib/openzfs/include/sys/vdev_draid.h @@ -0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_VDEV_DRAID_H +#define _SYS_VDEV_DRAID_H + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Constants required to generate and use dRAID permutations. + */ +#define VDEV_DRAID_SEED 0xd7a1d5eed +#define VDEV_DRAID_MAX_MAPS 254 +#define VDEV_DRAID_ROWSHIFT SPA_MAXBLOCKSHIFT +#define VDEV_DRAID_ROWHEIGHT (1ULL << VDEV_DRAID_ROWSHIFT) +#define VDEV_DRAID_REFLOW_RESERVE (2 * VDEV_DRAID_ROWHEIGHT) + +/* + * dRAID permutation map. + */ +typedef struct draid_map { + uint64_t dm_children; /* # of permutation columns */ + uint64_t dm_nperms; /* # of permutation rows */ + uint64_t dm_seed; /* dRAID map seed */ + uint64_t dm_checksum; /* Checksum of generated map */ + uint8_t *dm_perms; /* base permutation array */ +} draid_map_t; + +/* + * dRAID configuration.
+ */ +typedef struct vdev_draid_config { + /* + * Values read from the dRAID nvlist configuration. + */ + uint64_t vdc_ndata; /* # of data devices in group */ + uint64_t vdc_nparity; /* # of parity devices in group */ + uint64_t vdc_nspares; /* # of distributed spares */ + uint64_t vdc_children; /* # of children */ + uint64_t vdc_ngroups; /* # of groups per slice */ + + /* + * Immutable derived constants. + */ + uint8_t *vdc_perms; /* permutation array */ + uint64_t vdc_nperms; /* # of permutations */ + uint64_t vdc_groupwidth; /* = data + parity */ + uint64_t vdc_ndisks; /* = children - spares */ + uint64_t vdc_groupsz; /* = groupwidth * VDEV_DRAID_ROWHEIGHT */ + uint64_t vdc_devslicesz; /* = (groupsz * groups) / ndisks */ +} vdev_draid_config_t; + +/* + * Functions for handling dRAID permutation maps. + */ +extern uint64_t vdev_draid_rand(uint64_t *); +extern int vdev_draid_lookup_map(uint64_t, const draid_map_t **); +extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **); + +/* + * General dRAID support functions. + */ +extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); +extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); +extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t); +extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *); +extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); + +/* Functions for dRAID distributed spares.
*/ +extern vdev_t *vdev_draid_spare_get_child(vdev_t *, uint64_t); +extern vdev_t *vdev_draid_spare_get_parent(vdev_t *); +extern int vdev_draid_spare_create(nvlist_t *, vdev_t *, uint64_t *, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_DRAID_H */ diff --git a/sys/contrib/openzfs/include/sys/vdev_impl.h b/sys/contrib/openzfs/include/sys/vdev_impl.h index 3c4c3fb5a279..fc169842a86b 100644 --- a/sys/contrib/openzfs/include/sys/vdev_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_impl.h @@ -68,14 +68,19 @@ extern uint32_t zfs_vdev_async_write_max_active; /* * Virtual device operations */ +typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); +typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef uint64_t vdev_min_asize_func_t(vdev_t *vd); +typedef uint64_t vdev_min_alloc_func_t(vdev_t *vd); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); -typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); +typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, const dva_t *dva, + size_t psize, uint64_t phys_birth); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); @@ -87,13 +92,24 @@ typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, * Given a target vdev, translates the logical range "in" to the physical * range "res" */ -typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in, - range_seg64_t *res); +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *logical, + range_seg64_t *physical, range_seg64_t *remain); +typedef uint64_t vdev_rebuild_asize_func_t(vdev_t *vd, uint64_t start, + uint64_t size, uint64_t 
max_segment); +typedef void vdev_metaslab_init_func_t(vdev_t *vd, uint64_t *startp, + uint64_t *sizep); +typedef void vdev_config_generate_func_t(vdev_t *vd, nvlist_t *nv); +typedef uint64_t vdev_nparity_func_t(vdev_t *vd); +typedef uint64_t vdev_ndisks_func_t(vdev_t *vd); typedef const struct vdev_ops { + vdev_init_func_t *vdev_op_init; + vdev_fini_func_t *vdev_op_fini; vdev_open_func_t *vdev_op_open; vdev_close_func_t *vdev_op_close; vdev_asize_func_t *vdev_op_asize; + vdev_min_asize_func_t *vdev_op_min_asize; + vdev_min_alloc_func_t *vdev_op_min_alloc; vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; @@ -101,11 +117,12 @@ typedef const struct vdev_ops { vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; - /* - * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. - * Used when initializing vdevs. Isn't used by leaf ops. - */ vdev_xlation_func_t *vdev_op_xlate; + vdev_rebuild_asize_func_t *vdev_op_rebuild_asize; + vdev_metaslab_init_func_t *vdev_op_metaslab_init; + vdev_config_generate_func_t *vdev_op_config_generate; + vdev_nparity_func_t *vdev_op_nparity; + vdev_ndisks_func_t *vdev_op_ndisks; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -148,6 +165,9 @@ struct vdev_queue { avl_tree_t vq_write_offset_tree; avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; + zio_priority_t vq_last_prio; /* Last sent I/O priority. */ + uint32_t vq_ia_active; /* Active interactive I/Os. */ + uint32_t vq_nia_credit; /* Non-interactive I/Os credit. 
*/ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ @@ -325,16 +345,13 @@ struct vdev { kthread_t *vdev_rebuild_thread; vdev_rebuild_t vdev_rebuild_config; - /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */ + /* For limiting outstanding I/Os (initialize, TRIM) */ kmutex_t vdev_initialize_io_lock; kcondvar_t vdev_initialize_io_cv; uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; uint64_t vdev_trim_inflight[3]; - kmutex_t vdev_rebuild_io_lock; - kcondvar_t vdev_rebuild_io_cv; - uint64_t vdev_rebuild_inflight; /* * Values stored in the config for an indirect or removing vdev. @@ -392,7 +409,6 @@ struct vdev { uint64_t vdev_removed; /* persistent removed state */ uint64_t vdev_resilver_txg; /* persistent resilvering state */ uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ - uint64_t vdev_nparity; /* number of parity devices for raidz */ char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ @@ -445,8 +461,6 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; }; -#define VDEV_RAIDZ_MAXPARITY 3 - #define VDEV_PAD_SIZE (8 << 10) /* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 @@ -532,6 +546,9 @@ typedef struct vdev_label { #define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) #define VDEV_LABELS 4 #define VDEV_BEST_LABEL VDEV_LABELS +#define VDEV_OFFSET_IS_LABEL(vd, off) \ + (((off) < VDEV_LABEL_START_SIZE) || \ + ((off) >= ((vd)->vdev_psize - VDEV_LABEL_END_SIZE))) #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 @@ -577,6 +594,8 @@ extern vdev_ops_t vdev_root_ops; extern vdev_ops_t vdev_mirror_ops; extern vdev_ops_t vdev_replacing_ops; extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_draid_ops; +extern vdev_ops_t vdev_draid_spare_ops; extern vdev_ops_t vdev_disk_ops; 
extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; @@ -587,11 +606,15 @@ extern vdev_ops_t vdev_indirect_ops; /* * Common size functions */ -extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, - range_seg64_t *out); +extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs); extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_default_min_asize(vdev_t *vd); extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); +extern uint64_t vdev_get_min_alloc(vdev_t *vd); +extern uint64_t vdev_get_nparity(vdev_t *vd); +extern uint64_t vdev_get_ndisks(vdev_t *vd); /* * Global variables diff --git a/sys/contrib/openzfs/include/sys/vdev_raidz.h b/sys/contrib/openzfs/include/sys/vdev_raidz.h index 0ce2b5ea1d67..029fdef5f4f8 100644 --- a/sys/contrib/openzfs/include/sys/vdev_raidz.h +++ b/sys/contrib/openzfs/include/sys/vdev_raidz.h @@ -32,6 +32,7 @@ extern "C" { #endif struct zio; +struct raidz_row; struct raidz_map; #if !defined(_KERNEL) struct kernel_param {}; @@ -43,8 +44,11 @@ struct kernel_param {}; struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t, uint64_t); void vdev_raidz_map_free(struct raidz_map *); +void vdev_raidz_generate_parity_row(struct raidz_map *, struct raidz_row *); void vdev_raidz_generate_parity(struct raidz_map *); -int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); +void vdev_raidz_child_done(zio_t *); +void vdev_raidz_io_done(zio_t *); /* * vdev_raidz_math interface @@ -52,11 +56,16 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_math_init(void); void vdev_raidz_math_fini(void); const struct raidz_impl_ops *vdev_raidz_math_get_ops(void); -int vdev_raidz_math_generate(struct raidz_map *); -int vdev_raidz_math_reconstruct(struct raidz_map *, 
const int *, const int *, - const int); +int vdev_raidz_math_generate(struct raidz_map *, struct raidz_row *); +int vdev_raidz_math_reconstruct(struct raidz_map *, struct raidz_row *, + const int *, const int *, const int); int vdev_raidz_impl_set(const char *); +typedef struct vdev_raidz { + int vd_logical_width; + int vd_nparity; +} vdev_raidz_t; + #ifdef __cplusplus } #endif diff --git a/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h b/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h index 8492daedb6f8..38d4f9e0bd48 100644 --- a/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h +++ b/sys/contrib/openzfs/include/sys/vdev_raidz_impl.h @@ -29,6 +29,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -106,30 +107,45 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ abd_t *rc_abd; /* I/O data */ - void *rc_gdata; /* used to store the "good" version */ + void *rc_orig_data; /* pre-reconstruction */ + abd_t *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? */ + uint8_t rc_need_orig_restore; /* need to restore from orig_data? 
*/ + uint8_t rc_repair; /* Write good data to this column */ } raidz_col_t; +typedef struct raidz_row { + uint64_t rr_cols; /* Regular column count */ + uint64_t rr_scols; /* Count including skipped columns */ + uint64_t rr_bigcols; /* Remainder data column count */ + uint64_t rr_missingdata; /* Count of missing data devices */ + uint64_t rr_missingparity; /* Count of missing parity devices */ + uint64_t rr_firstdatacol; /* First data column/parity count */ + abd_t *rr_abd_copy; /* buffer of copied data */ + abd_t *rr_abd_empty; /* dRAID empty sector buffer */ + int rr_nempty; /* empty sectors included in parity */ + int rr_code; /* reconstruction code (unused) */ +#ifdef ZFS_DEBUG + uint64_t rr_offset; /* Logical offset for *_io_verify() */ + uint64_t rr_size; /* Physical size for *_io_verify() */ +#endif + raidz_col_t rr_col[0]; /* Flexible array of I/O columns */ +} raidz_row_t; + typedef struct raidz_map { - uint64_t rm_cols; /* Regular column count */ - uint64_t rm_scols; /* Count including skipped columns */ - uint64_t rm_bigcols; /* Number of oversized columns */ - uint64_t rm_asize; /* Actual total I/O size */ - uint64_t rm_missingdata; /* Count of missing data devices */ - uint64_t rm_missingparity; /* Count of missing parity devices */ - uint64_t rm_firstdatacol; /* First data column/parity count */ - uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ - abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ - uint8_t rm_freed; /* map no longer has referencing ZIO */ - uint8_t rm_ecksuminjected; /* checksum error was injected */ + boolean_t rm_freed; /* map no longer has referencing ZIO */ + boolean_t rm_ecksuminjected; /* checksum error was injected */ + int rm_nrows; /* Regular row count */ + int rm_nskip; /* RAIDZ sectors skipped for padding */ + int rm_skipstart; /* Column index of padding start */ const
raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ - raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ + raidz_row_t *rm_row[0]; /* flexible array of rows */ } raidz_map_t; + #define RAIDZ_ORIGINAL_IMPL (INT_MAX) extern const raidz_impl_ops_t vdev_raidz_scalar_impl; @@ -163,14 +179,15 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; * * raidz_parity Returns parity of the RAIDZ block * raidz_ncols Returns number of columns the block spans + * Note, all rows have the same number of columns. * raidz_nbigcols Returns number of big columns * raidz_col_p Returns pointer to a column * raidz_col_size Returns size of a column * raidz_big_size Returns size of big columns * raidz_short_size Returns size of short columns */ -#define raidz_parity(rm) ((rm)->rm_firstdatacol) -#define raidz_ncols(rm) ((rm)->rm_cols) +#define raidz_parity(rm) ((rm)->rm_row[0]->rr_firstdatacol) +#define raidz_ncols(rm) ((rm)->rm_row[0]->rr_cols) #define raidz_nbigcols(rm) ((rm)->rm_bigcols) #define raidz_col_p(rm, c) ((rm)->rm_col + (c)) #define raidz_col_size(rm, c) ((rm)->rm_col[c].rc_size) @@ -185,10 +202,10 @@ extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; */ #define _RAIDZ_GEN_WRAP(code, impl) \ static void \ -impl ## _gen_ ## code(void *rmp) \ +impl ## _gen_ ## code(void *rrp) \ { \ - raidz_map_t *rm = (raidz_map_t *)rmp; \ - raidz_generate_## code ## _impl(rm); \ + raidz_row_t *rr = (raidz_row_t *)rrp; \ + raidz_generate_## code ## _impl(rr); \ } /* @@ -199,10 +216,10 @@ impl ## _gen_ ## code(void *rmp) \ */ #define _RAIDZ_REC_WRAP(code, impl) \ static int \ -impl ## _rec_ ## code(void *rmp, const int *tgtidx) \ +impl ## _rec_ ## code(void *rrp, const int *tgtidx) \ { \ - raidz_map_t *rm = (raidz_map_t *)rmp; \ - return (raidz_reconstruct_## code ## _impl(rm, tgtidx)); \ + raidz_row_t *rr = (raidz_row_t *)rrp; \ + return (raidz_reconstruct_## code ## _impl(rr, tgtidx)); \ } /* diff --git a/sys/contrib/openzfs/include/sys/vdev_rebuild.h 
b/sys/contrib/openzfs/include/sys/vdev_rebuild.h index 3d4b8cc46836..61ae15c5d09a 100644 --- a/sys/contrib/openzfs/include/sys/vdev_rebuild.h +++ b/sys/contrib/openzfs/include/sys/vdev_rebuild.h @@ -66,10 +66,14 @@ typedef struct vdev_rebuild { vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + kmutex_t vr_io_lock; /* inflight IO lock */ + kcondvar_t vr_io_cv; /* inflight IO cv */ /* In-core state and progress */ uint64_t vr_scan_offset[TXG_SIZE]; uint64_t vr_prev_scan_time_ms; /* any previous scan time */ + uint64_t vr_bytes_inflight_max; /* maximum bytes inflight */ + uint64_t vr_bytes_inflight; /* current bytes inflight */ /* Per-rebuild pass statistics for calculating bandwidth */ uint64_t vr_pass_start_time; diff --git a/sys/contrib/openzfs/include/sys/zfs_context.h b/sys/contrib/openzfs/include/sys/zfs_context.h index 9f637036ee71..ee3216d6763a 100644 --- a/sys/contrib/openzfs/include/sys/zfs_context.h +++ b/sys/contrib/openzfs/include/sys/zfs_context.h @@ -626,6 +626,7 @@ extern void delay(clock_t ticks); #define defclsyspri 0 #define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) +#define CPU_SEQID_UNSTABLE CPU_SEQID #define kcred NULL #define CRED() NULL diff --git a/sys/contrib/openzfs/include/sys/zfs_vnops.h b/sys/contrib/openzfs/include/sys/zfs_vnops.h new file mode 100644 index 000000000000..6bf077b4bf79 --- /dev/null +++ b/sys/contrib/openzfs/include/sys/zfs_vnops.h @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_VNOPS_H +#define _SYS_FS_ZFS_VNOPS_H +#include <sys/zfs_vnops_os.h> + +extern int zfs_fsync(znode_t *, int, cred_t *); +extern int zfs_read(znode_t *, uio_t *, int, cred_t *); +extern int zfs_write(znode_t *, uio_t *, int, cred_t *); +extern int zfs_holey(znode_t *, ulong_t, loff_t *); +extern int zfs_access(znode_t *, int, int, cred_t *); + +extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *); +extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *); + +extern int mappedread(znode_t *, int, uio_t *); +extern int mappedread_sf(znode_t *, int, uio_t *); +extern void update_pages(znode_t *, int64_t, int, objset_t *); + +/* + * Platform code that asynchronously drops zp's inode / vnode_t. + * + * Asynchronous dropping ensures that the caller will never drop the + * last reference on an inode / vnode_t in the current context. + * Doing so while holding open a tx could result in a deadlock if + * the platform calls into filesystem again in the implementation + * of inode / vnode_t dropping (e.g. call from iput_final()). 
+ */ +extern void zfs_zrele_async(znode_t *zp); + +extern zil_get_data_t zfs_get_data; + +#endif diff --git a/sys/contrib/openzfs/include/sys/zfs_znode.h b/sys/contrib/openzfs/include/sys/zfs_znode.h index 4138f6eba0a0..1ae1520e0736 100644 --- a/sys/contrib/openzfs/include/sys/zfs_znode.h +++ b/sys/contrib/openzfs/include/sys/zfs_znode.h @@ -187,7 +187,6 @@ typedef struct znode { boolean_t z_unlinked; /* file has been unlinked */ boolean_t z_atime_dirty; /* atime needs to be synced */ boolean_t z_zn_prefetch; /* Prefetch znodes? */ - boolean_t z_moved; /* Has this znode been moved? */ boolean_t z_is_sa; /* are we native sa? */ boolean_t z_is_mapped; /* are we mmap'ed */ boolean_t z_is_ctldir; /* are we .zfs entry */ diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index 4959831716b5..334ca064b371 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -372,6 +372,7 @@ struct zio_cksum_report { nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_sector; uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; diff --git a/sys/contrib/openzfs/include/sys/zvol_impl.h b/sys/contrib/openzfs/include/sys/zvol_impl.h index 36199c311d07..5137d2172088 100644 --- a/sys/contrib/openzfs/include/sys/zvol_impl.h +++ b/sys/contrib/openzfs/include/sys/zvol_impl.h @@ -46,6 +46,7 @@ typedef struct zvol_state { uint32_t zv_flags; /* ZVOL_* flags */ uint32_t zv_open_count; /* open counts */ uint32_t zv_changed; /* disk changed */ + uint32_t zv_volmode; /* volmode */ zilog_t *zv_zilog; /* ZIL handle */ zfs_rangelock_t zv_rangelock; /* for range locking */ dnode_t *zv_dn; /* dnode hold */ @@ -88,6 +89,7 @@ int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio); int zvol_init_impl(void); void zvol_fini_impl(void); +void zvol_wait_close(zvol_state_t *zv); /* * platform dependent functions exported to platform 
independent code diff --git a/sys/contrib/openzfs/include/zfeature_common.h b/sys/contrib/openzfs/include/zfeature_common.h index db0138ae8e39..cf05bad76c3c 100644 --- a/sys/contrib/openzfs/include/zfeature_common.h +++ b/sys/contrib/openzfs/include/zfeature_common.h @@ -76,6 +76,7 @@ typedef enum spa_feature { SPA_FEATURE_LIVELIST, SPA_FEATURE_DEVICE_REBUILD, SPA_FEATURE_ZSTD_COMPRESS, + SPA_FEATURE_DRAID, SPA_FEATURES } spa_feature_t; diff --git a/sys/contrib/openzfs/lib/Makefile.am b/sys/contrib/openzfs/lib/Makefile.am index f049288a1ae7..685c7b6695c6 100644 --- a/sys/contrib/openzfs/lib/Makefile.am +++ b/sys/contrib/openzfs/lib/Makefile.am @@ -15,4 +15,21 @@ SUBDIRS += libzutil libunicode # These five libraries, which are installed as the final build product, # incorporate the eight convenience libraries given above. -SUBDIRS += libuutil libzfs_core libzfs libzpool libzfsbootenv +DISTLIBS = libuutil libzfs_core libzfs libzpool libzfsbootenv +SUBDIRS += $(DISTLIBS) +DISTLIBS += libnvpair + +# An ABI is stored for each of these libraries. Note that libzpool.so +# is only linked against by ztest and zdb and no stable ABI is provided. 
+ABILIBS = libnvpair libuutil libzfs_core libzfs libzfsbootenv + +PHONY = checkabi storeabi +checkabi: $(ABILIBS) + set -e ; for dir in $(ABILIBS) ; do \ + $(MAKE) -C $$dir checkabi ; \ + done + +storeabi: $(ABILIBS) + set -e ; for dir in $(ABILIBS) ; do \ + $(MAKE) -C $$dir storeabi ; \ + done diff --git a/sys/contrib/openzfs/lib/libnvpair/Makefile.am b/sys/contrib/openzfs/lib/libnvpair/Makefile.am index ec16c5d526c5..7840e099bbd4 100644 --- a/sys/contrib/openzfs/lib/libnvpair/Makefile.am +++ b/sys/contrib/openzfs/lib/libnvpair/Makefile.am @@ -1,4 +1,5 @@ include $(top_srcdir)/config/Rules.am +PHONY = VPATH = \ $(top_srcdir)/module/nvpair \ @@ -10,6 +11,8 @@ AM_CFLAGS += $(FRAME_LARGER_THAN) $(LIBTIRPC_CFLAGS) lib_LTLIBRARIES = libnvpair.la +include $(top_srcdir)/config/Abigail.am + USER_C = \ libnvpair.c \ libnvpair_json.c \ @@ -37,8 +40,7 @@ if !ASAN_ENABLED libnvpair_la_LDFLAGS += -Wl,-z,defs endif -if BUILD_FREEBSD libnvpair_la_LDFLAGS += -version-info 3:0:0 -else -libnvpair_la_LDFLAGS += -version-info 1:1:0 -endif + +# Library ABI +EXTRA_DIST = libnvpair.abi libnvpair.suppr diff --git a/sys/contrib/openzfs/lib/libnvpair/libnvpair.abi b/sys/contrib/openzfs/lib/libnvpair/libnvpair.abi new file mode 100644 index 000000000000..c1b50a8aa39c --- /dev/null +++ b/sys/contrib/openzfs/lib/libnvpair/libnvpair.abi @@ -0,0 +1,2805 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/sys/contrib/openzfs/lib/libnvpair/libnvpair.suppr b/sys/contrib/openzfs/lib/libnvpair/libnvpair.suppr new file mode 100644 index 000000000000..f4db8a49e4f2 --- /dev/null +++ b/sys/contrib/openzfs/lib/libnvpair/libnvpair.suppr @@ -0,0 +1,2 @@ +[suppress_type] + name = FILE* diff --git a/sys/contrib/openzfs/lib/libspl/include/sys/uio.h b/sys/contrib/openzfs/lib/libspl/include/sys/uio.h index 3a834b996add..1d56b5b18baf 100644 --- a/sys/contrib/openzfs/lib/libspl/include/sys/uio.h +++ b/sys/contrib/openzfs/lib/libspl/include/sys/uio.h @@ -59,7 +59,6 @@ typedef enum uio_rw { typedef enum uio_seg { UIO_USERSPACE = 0, UIO_SYSSPACE = 1, - UIO_USERISPACE = 2, } uio_seg_t; #elif defined(__FreeBSD__) @@ -73,49 +72,9 @@ typedef struct uio { uio_seg_t uio_segflg; /* address space (kernel or user) */ uint16_t uio_fmode; /* file mode flags */ uint16_t uio_extflg; /* extended flags */ - offset_t uio_limit; /* u-limit (maximum byte offset) */ ssize_t uio_resid; /* residual count */ } uio_t; -typedef enum xuio_type { - 
UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY, -} xuio_type_t; - -#define UIOA_IOV_MAX 16 - -typedef struct uioa_page_s { /* locked uio_iov state */ - int uioa_pfncnt; /* count of pfn_t(s) in *uioa_ppp */ - void **uioa_ppp; /* page_t or pfn_t array */ - caddr_t uioa_base; /* address base */ - size_t uioa_len; /* span length */ -} uioa_page_t; - -typedef struct xuio { - uio_t xu_uio; /* embedded UIO structure */ - - /* Extended uio fields */ - enum xuio_type xu_type; /* uio type */ - union { - struct { - uint32_t xu_a_state; /* state of async i/o */ - ssize_t xu_a_mbytes; /* bytes moved */ - uioa_page_t *xu_a_lcur; /* uioa_locked[] pointer */ - void **xu_a_lppp; /* lcur->uioa_pppp[] pointer */ - void *xu_a_hwst[4]; /* opaque hardware state */ - uioa_page_t xu_a_locked[UIOA_IOV_MAX]; - } xu_aio; - - struct { - int xu_zc_rw; /* read or write buffer */ - void *xu_zc_priv; /* fs specific */ - } xu_zc; - } xu_ext; -} xuio_t; - -#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv -#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw - #define uio_segflg(uio) (uio)->uio_segflg #define uio_offset(uio) (uio)->uio_loffset #define uio_resid(uio) (uio)->uio_resid diff --git a/sys/contrib/openzfs/lib/libuutil/Makefile.am b/sys/contrib/openzfs/lib/libuutil/Makefile.am index 8d9b32e44802..5e7cf5419d6c 100644 --- a/sys/contrib/openzfs/lib/libuutil/Makefile.am +++ b/sys/contrib/openzfs/lib/libuutil/Makefile.am @@ -1,7 +1,10 @@ include $(top_srcdir)/config/Rules.am +PHONY = lib_LTLIBRARIES = libuutil.la +include $(top_srcdir)/config/Abigail.am + USER_C = \ uu_alloc.c \ uu_avl.c \ @@ -27,8 +30,7 @@ if !ASAN_ENABLED libuutil_la_LDFLAGS += -Wl,-z,defs endif -if BUILD_FREEBSD libuutil_la_LDFLAGS += -version-info 3:0:0 -else -libuutil_la_LDFLAGS += -version-info 1:1:0 -endif + +# Library ABI +EXTRA_DIST = libuutil.abi libuutil.suppr diff --git a/sys/contrib/openzfs/lib/libuutil/libuutil.abi b/sys/contrib/openzfs/lib/libuutil/libuutil.abi new file mode 100644 index 
000000000000..c152289089c4 --- /dev/null +++ b/sys/contrib/openzfs/lib/libuutil/libuutil.abi @@ -0,0 +1,1608 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/sys/contrib/openzfs/lib/libuutil/libuutil.suppr b/sys/contrib/openzfs/lib/libuutil/libuutil.suppr new file mode 100644 index 000000000000..f4db8a49e4f2 --- /dev/null +++ b/sys/contrib/openzfs/lib/libuutil/libuutil.suppr @@ -0,0 +1,2 @@ +[suppress_type] + name = FILE* diff --git a/sys/contrib/openzfs/lib/libzfs/Makefile.am b/sys/contrib/openzfs/lib/libzfs/Makefile.am index f88fb828d53d..cd80ef7195fd 100644 --- a/sys/contrib/openzfs/lib/libzfs/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfs/Makefile.am @@ -1,4 +1,5 @@ include $(top_srcdir)/config/Rules.am +PHONY = VPATH = \ $(top_srcdir)/module/icp \ @@ -13,6 +14,8 @@ pkgconfig_DATA = libzfs.pc 
lib_LTLIBRARIES = libzfs.la +include $(top_srcdir)/config/Abigail.am + USER_C = \ libzfs_changelist.c \ libzfs_config.c \ @@ -58,7 +61,6 @@ KERNEL_C = \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ - zfs_uio.c \ zpool_prop.c \ zprop_common.c @@ -84,10 +86,12 @@ endif if BUILD_FREEBSD libzfs_la_LIBADD += -lutil -lgeom -libzfs_la_LDFLAGS += -version-info 4:0:0 -else -libzfs_la_LDFLAGS += -version-info 2:0:0 endif +libzfs_la_LDFLAGS += -version-info 4:0:0 + +# Library ABI +EXTRA_DIST = libzfs.abi libzfs.suppr + # Licensing data -EXTRA_DIST = THIRDPARTYLICENSE.openssl THIRDPARTYLICENSE.openssl.descrip +EXTRA_DIST += THIRDPARTYLICENSE.openssl THIRDPARTYLICENSE.openssl.descrip diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.abi b/sys/contrib/openzfs/lib/libzfs/libzfs.abi new file mode 100644 index 000000000000..911b6d00c4a4 --- /dev/null +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.abi @@ -0,0 +1,4879 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs.suppr b/sys/contrib/openzfs/lib/libzfs/libzfs.suppr new file mode 100644 index 000000000000..d55b5b728116 --- /dev/null +++ b/sys/contrib/openzfs/lib/libzfs/libzfs.suppr @@ -0,0 +1,13 @@ +[suppress_type] + name = FILE* + +[suppress_type] + type_kind = typedef + name = SHA256_CTX + +[suppress_type] + type_kind = typedef + name = SHA2_CTX + +[suppress_variable] + name = zfs_deleg_perm_tab diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c index 1eaed435c156..47418b3237bb 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_dataset.c @@ -5336,6 +5336,16 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in * the 128k block example above. * + * The situtation is slightly different for dRAID since the minimum allocation + * size is the full group width. 
The same 8K block above would be written as + * follows in a dRAID group: + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D1 | S0 | S1 | + * +-------+-------+-------+-------+-------+ + * * Compression may lead to a variety of block sizes being written for the same * volume or file. There is no clear way to reserve just the amount of space * that will be required, so the worst case (no compression) is assumed. @@ -5365,6 +5375,23 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, return (asize); } +/* + * Derived from function of same name in module/zfs/vdev_draid.c. Returns the + * amount of space (in bytes) that will be allocated for the specified block + * size. + */ +static uint64_t +vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, + uint64_t blksize) +{ + ASSERT3U(ndisks, >, nparity); + uint64_t ndata = ndisks - nparity; + uint64_t rows = ((blksize - 1) / (ndata << ashift)) + 1; + uint64_t asize = (rows * ndisks) << ashift; + + return (asize); +} + /* * Determine how much space will be allocated if it lands on the most space- * inefficient top-level vdev. 
Returns the size in bytes required to store one @@ -5374,7 +5401,7 @@ static uint64_t volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) { nvlist_t *config, *tree, **vdevs; - uint_t nvdevs, v; + uint_t nvdevs; uint64_t ret = 0; config = zpool_get_config(zhp, NULL); @@ -5384,33 +5411,61 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) return (nblocks * blksize); } - for (v = 0; v < nvdevs; v++) { + for (int v = 0; v < nvdevs; v++) { char *type; uint64_t nparity, ashift, asize, tsize; - nvlist_t **disks; - uint_t ndisks; uint64_t volsize; if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, - &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 || - nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, - &nparity) != 0 || - nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, - &ashift) != 0 || - nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, - &disks, &ndisks) != 0) { + &type) != 0) continue; + + if (strcmp(type, VDEV_TYPE_RAIDZ) != 0 && + strcmp(type, VDEV_TYPE_DRAID) != 0) + continue; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_NPARITY, &nparity) != 0) + continue; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_ASHIFT, &ashift) != 0) + continue; + + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + nvlist_t **disks; + uint_t ndisks; + + if (nvlist_lookup_nvlist_array(vdevs[v], + ZPOOL_CONFIG_CHILDREN, &disks, &ndisks) != 0) + continue; + + /* allocation size for the "typical" 128k block */ + tsize = vdev_raidz_asize(ndisks, nparity, ashift, + SPA_OLD_MAXBLOCKSIZE); + + /* allocation size for the blksize block */ + asize = vdev_raidz_asize(ndisks, nparity, ashift, + blksize); + } else { + uint64_t ndata; + + if (nvlist_lookup_uint64(vdevs[v], + ZPOOL_CONFIG_DRAID_NDATA, &ndata) != 0) + continue; + + /* allocation size for the "typical" 128k block */ + tsize = vdev_draid_asize(ndata + nparity, nparity, + ashift, SPA_OLD_MAXBLOCKSIZE); + + /* allocation size for the blksize block */ + asize = 
vdev_draid_asize(ndata + nparity, nparity, + ashift, blksize); } - /* allocation size for the "typical" 128k block */ - tsize = vdev_raidz_asize(ndisks, nparity, ashift, - SPA_OLD_MAXBLOCKSIZE); - /* allocation size for the blksize block */ - asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); - /* - * Scale this size down as a ratio of 128k / tsize. See theory - * statement above. + * Scale this size down as a ratio of 128k / tsize. + * See theory statement above. */ volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; if (volsize > ret) { diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_import.c b/sys/contrib/openzfs/lib/libzfs/libzfs_import.c index 6c5f61836978..44d3ade49644 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_import.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_import.c @@ -112,7 +112,6 @@ refresh_config_libzfs(void *handle, nvlist_t *tryconfig) return (refresh_config((libzfs_handle_t *)handle, tryconfig)); } - static int pool_active_libzfs(void *handle, const char *name, uint64_t guid, boolean_t *isactive) diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c index 00b0b6faf2e2..c661ab3131b0 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_pool.c @@ -42,10 +42,10 @@ #include #include #include +#include #include #include #include - #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" @@ -481,7 +481,8 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, if (err != 0) { ASSERT3U(err, ==, ENOENT); zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid feature '%s'"), fname); + "feature '%s' unsupported by kernel"), + fname); (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } @@ -783,7 +784,8 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) } int -zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) +zpool_expand_proplist(zpool_handle_t *zhp, 
zprop_list_t **plp, + boolean_t literal) { libzfs_handle_t *hdl = zhp->zpool_hdl; zprop_list_t *entry; @@ -862,13 +864,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) } for (entry = *plp; entry != NULL; entry = entry->pl_next) { - - if (entry->pl_fixed) + if (entry->pl_fixed && !literal) continue; if (entry->pl_prop != ZPROP_INVAL && zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), - NULL, B_FALSE) == 0) { + NULL, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } @@ -960,6 +961,7 @@ zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) if (ret == 0 && !isopen && (strncmp(pool, "mirror", 6) == 0 || strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "draid", 5) == 0 || strncmp(pool, "spare", 5) == 0 || strcmp(pool, "log") == 0)) { if (hdl != NULL) @@ -1186,6 +1188,37 @@ zpool_has_special_vdev(nvlist_t *nvroot) return (B_FALSE); } +/* + * Output a dRAID top-level vdev name in to the provided buffer. + */ +static char * +zpool_draid_name(char *name, int len, uint64_t data, uint64_t parity, + uint64_t spares, uint64_t children) +{ + snprintf(name, len, "%s%llu:%llud:%lluc:%llus", + VDEV_TYPE_DRAID, (u_longlong_t)parity, (u_longlong_t)data, + (u_longlong_t)children, (u_longlong_t)spares); + + return (name); +} + +/* + * Return B_TRUE if the provided name is a dRAID spare name. + */ +boolean_t +zpool_is_draid_spare(const char *name) +{ + uint64_t spare_id, parity, vdev_id; + + if (sscanf(name, VDEV_TYPE_DRAID "%llu-%llu-%llu", + (u_longlong_t *)&parity, (u_longlong_t *)&vdev_id, + (u_longlong_t *)&spare_id) == 3) { + return (B_TRUE); + } + + return (B_FALSE); +} + /* * Create the named pool, using the provided vdev list. 
It is assumed * that the consumer has already validated the contents of the nvlist, so we @@ -2668,6 +2701,11 @@ zpool_vdev_is_interior(const char *name) VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); + + if (strncmp(name, VDEV_TYPE_DRAID, strlen(VDEV_TYPE_DRAID)) == 0 && + !zpool_is_draid_spare(name)) + return (B_TRUE); + return (B_FALSE); } @@ -3101,7 +3139,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_SPARE) == 0 && + if ((strcmp(type, VDEV_TYPE_SPARE) == 0 || + strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) && children == 2 && child[which] == tgt) return (B_TRUE); @@ -3216,8 +3255,12 @@ zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, "cannot replace a log with a spare")); } else if (rebuild) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "only mirror vdevs support sequential " - "reconstruction")); + "only mirror and dRAID vdevs support " + "sequential reconstruction")); + } else if (zpool_is_draid_spare(new_disk)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID spares can only replace child " + "devices in their parent's dRAID vdev")); } else if (version >= SPA_VERSION_MULTI_REPLACE) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "already in replacing/spare config; wait " @@ -3388,7 +3431,7 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, nvlist_t *props, splitflags_t flags) { zfs_cmd_t zc = {"\0"}; - char msg[1024]; + char msg[1024], *bias; nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; nvlist_t **varray = NULL, *zc_props = NULL; uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; @@ -3446,6 +3489,7 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, for (c = 0; c < children; c++) { uint64_t is_log = B_FALSE, is_hole = B_FALSE; + boolean_t is_special = B_FALSE, 
is_dedup = B_FALSE; char *type; nvlist_t **mchild, *vdev; uint_t mchildren; @@ -3492,6 +3536,13 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, goto out; } + if (nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { + if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) + is_special = B_TRUE; + else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) + is_dedup = B_TRUE; + } verify(nvlist_lookup_nvlist_array(child[c], ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); @@ -3509,6 +3560,20 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) goto out; + + if (flags.dryrun != 0) { + if (is_dedup == B_TRUE) { + if (nvlist_add_string(varray[vcount - 1], + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_DEDUP) != 0) + goto out; + } else if (is_special == B_TRUE) { + if (nvlist_add_string(varray[vcount - 1], + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_SPECIAL) != 0) + goto out; + } + } } /* did we find every disk the user specified? */ @@ -3618,6 +3683,12 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); + if (zpool_is_draid_spare(path)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dRAID spares cannot be removed")); + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + } + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, &islog)) == NULL) @@ -3955,9 +4026,10 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, } /* - * Remove the partition from the path it this is a whole disk. + * Remove the partition from the path if this is a whole disk. 
*/ - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) + if (strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0 && + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value) == 0 && value && !(name_flags & VDEV_NAME_PATH)) { return (zfs_strip_partition(path)); } @@ -3975,6 +4047,27 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, path = buf; } + /* + * If it's a dRAID device, we add parity, groups, and spares. + */ + if (strcmp(path, VDEV_TYPE_DRAID) == 0) { + uint64_t ndata, nparity, nspares; + nvlist_t **child; + uint_t children; + + verify(nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_NPARITY, &nparity) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_DRAID_NDATA, &ndata) == 0); + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_DRAID_NSPARES, &nspares) == 0); + + path = zpool_draid_name(buf, sizeof (buf), ndata, + nparity, nspares, children); + } + /* * We identify each top-level vdev by using a * naming convention. 
diff --git a/sys/contrib/openzfs/lib/libzfs/libzfs_util.c b/sys/contrib/openzfs/lib/libzfs/libzfs_util.c index a457fbfd0639..95cb32957218 100644 --- a/sys/contrib/openzfs/lib/libzfs/libzfs_util.c +++ b/sys/contrib/openzfs/lib/libzfs/libzfs_util.c @@ -148,15 +148,15 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_MOUNTFAILED: return (dgettext(TEXT_DOMAIN, "mount failed")); case EZFS_UMOUNTFAILED: - return (dgettext(TEXT_DOMAIN, "umount failed")); + return (dgettext(TEXT_DOMAIN, "unmount failed")); case EZFS_UNSHARENFSFAILED: - return (dgettext(TEXT_DOMAIN, "unshare(1M) failed")); + return (dgettext(TEXT_DOMAIN, "NFS share removal failed")); case EZFS_SHARENFSFAILED: - return (dgettext(TEXT_DOMAIN, "share(1M) failed")); + return (dgettext(TEXT_DOMAIN, "NFS share creation failed")); case EZFS_UNSHARESMBFAILED: - return (dgettext(TEXT_DOMAIN, "smb remove share failed")); + return (dgettext(TEXT_DOMAIN, "SMB share removal failed")); case EZFS_SHARESMBFAILED: - return (dgettext(TEXT_DOMAIN, "smb add share failed")); + return (dgettext(TEXT_DOMAIN, "SMB share creation failed")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: diff --git a/sys/contrib/openzfs/lib/libzfs_core/Makefile.am b/sys/contrib/openzfs/lib/libzfs_core/Makefile.am index e94ba85d275c..760cadddeb94 100644 --- a/sys/contrib/openzfs/lib/libzfs_core/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfs_core/Makefile.am @@ -1,9 +1,12 @@ include $(top_srcdir)/config/Rules.am +PHONY = pkgconfig_DATA = libzfs_core.pc lib_LTLIBRARIES = libzfs_core.la +include $(top_srcdir)/config/Abigail.am + USER_C = \ libzfs_core.c @@ -23,7 +26,9 @@ endif if BUILD_FREEBSD libzfs_core_la_LIBADD += -lutil -lgeom -libzfs_core_la_LDFLAGS += -version-info 3:0:0 -else -libzfs_core_la_LDFLAGS += -version-info 1:0:0 endif + +libzfs_core_la_LDFLAGS += -version-info 3:0:0 + +# Library ABI +EXTRA_DIST = libzfs_core.abi libzfs_core.suppr diff --git 
a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi new file mode 100644 index 000000000000..02627e229417 --- /dev/null +++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.abi @@ -0,0 +1,2820 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.suppr 
b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.suppr new file mode 100644 index 000000000000..109d331cfd51 --- /dev/null +++ b/sys/contrib/openzfs/lib/libzfs_core/libzfs_core.suppr @@ -0,0 +1,5 @@ +[suppress_type] + name = FILE* + +[suppress_type] + name = pthread_cond_t diff --git a/sys/contrib/openzfs/lib/libzfsbootenv/Makefile.am b/sys/contrib/openzfs/lib/libzfsbootenv/Makefile.am index 6b9a8f0137a2..51ab48f543b8 100644 --- a/sys/contrib/openzfs/lib/libzfsbootenv/Makefile.am +++ b/sys/contrib/openzfs/lib/libzfsbootenv/Makefile.am @@ -1,9 +1,12 @@ include $(top_srcdir)/config/Rules.am +PHONY = pkgconfig_DATA = libzfsbootenv.pc lib_LTLIBRARIES = libzfsbootenv.la +include $(top_srcdir)/config/Abigail.am + if BUILD_FREEBSD DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs endif @@ -30,3 +33,6 @@ libzfsbootenv_la_LDFLAGS += -Wl,-z,defs endif libzfsbootenv_la_LDFLAGS += -version-info 1:0:0 + +# Library ABI +EXTRA_DIST = libzfsbootenv.abi libzfsbootenv.suppr diff --git a/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.abi b/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.abi new file mode 100644 index 000000000000..8ef242d2f5ac --- /dev/null +++ b/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.abi @@ -0,0 +1,212 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.pc.in b/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.pc.in index 50865050bbfb..986286d9bc8c 100644 --- a/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.pc.in +++ b/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.pc.in @@ -7,6 +7,6 @@ 
Name: libzfsbootenv Description: LibZFSBootENV library Version: @VERSION@ URL: https://github.com/openzfs/zfs -Requires: libzfs libnvpair +Requires: libzfs Cflags: -I${includedir} Libs: -L${libdir} -lzfsbootenv diff --git a/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.suppr b/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.suppr new file mode 100644 index 000000000000..f4db8a49e4f2 --- /dev/null +++ b/sys/contrib/openzfs/lib/libzfsbootenv/libzfsbootenv.suppr @@ -0,0 +1,2 @@ +[suppress_type] + name = FILE* diff --git a/sys/contrib/openzfs/lib/libzpool/Makefile.am b/sys/contrib/openzfs/lib/libzpool/Makefile.am index 992c21cc1560..7aa7e80985aa 100644 --- a/sys/contrib/openzfs/lib/libzpool/Makefile.am +++ b/sys/contrib/openzfs/lib/libzpool/Makefile.am @@ -47,7 +47,6 @@ KERNEL_C = \ zfs_fletcher_superscalar4.c \ zfs_namecheck.c \ zfs_prop.c \ - zfs_uio.c \ zpool_prop.c \ zprop_common.c \ abd.c \ @@ -124,6 +123,8 @@ KERNEL_C = \ unique.c \ vdev.c \ vdev_cache.c \ + vdev_draid.c \ + vdev_draid_rand.c \ vdev_file.c \ vdev_indirect_births.c \ vdev_indirect.c \ @@ -216,7 +217,7 @@ libzpool_la_LIBADD = \ $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ $(abs_top_builddir)/lib/libzstd/libzstd.la -libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl +libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl -lm libzpool_la_LDFLAGS = -pthread @@ -226,11 +227,10 @@ endif if BUILD_FREEBSD libzpool_la_LIBADD += -lgeom -libzpool_la_LDFLAGS += -version-info 4:0:0 -else -libzpool_la_LDFLAGS += -version-info 2:0:0 endif +libzpool_la_LDFLAGS += -version-info 4:0:0 + if TARGET_CPU_POWERPC vdev_raidz_math_powerpc_altivec.$(OBJEXT): CFLAGS += -maltivec vdev_raidz_math_powerpc_altivec.l$(OBJEXT): CFLAGS += -maltivec diff --git a/sys/contrib/openzfs/lib/libzutil/zutil_import.c b/sys/contrib/openzfs/lib/libzutil/zutil_import.c index b8cdc118b263..3a1827294502 100644 --- a/sys/contrib/openzfs/lib/libzutil/zutil_import.c +++ b/sys/contrib/openzfs/lib/libzutil/zutil_import.c 
@@ -1539,7 +1539,7 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp, nvlist_t *pools; nvlist_t *match = NULL; nvlist_t *config = NULL; - char *name = NULL, *sepp = NULL; + char *sepp = NULL; char sep = '\0'; int count = 0; char *targetdup = strdup(target); @@ -1563,11 +1563,11 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp, /* multiple matches found */ continue; } else { - match = config; - name = nvpair_name(elem); + match = fnvlist_dup(config); } } } + fnvlist_free(pools); } if (count == 0) { @@ -1577,6 +1577,7 @@ zpool_find_config(void *hdl, const char *target, nvlist_t **configp, if (count > 1) { free(targetdup); + fnvlist_free(match); return (EINVAL); } diff --git a/sys/contrib/openzfs/man/man1/arcstat.1 b/sys/contrib/openzfs/man/man1/arcstat.1 index 7fe1e0bfb14a..de63206e279f 100644 --- a/sys/contrib/openzfs/man/man1/arcstat.1 +++ b/sys/contrib/openzfs/man/man1/arcstat.1 @@ -13,13 +13,13 @@ .\" Copyright (c) 2015 by Delphix. All rights reserved. .\" Copyright (c) 2020 by AJ Jordan. All rights reserved. 
.\" -.TH ARCSTAT 1 "Aug 24, 2020" OpenZFS +.TH ARCSTAT 1 "Oct 20, 2020" OpenZFS .SH NAME arcstat \- report ZFS ARC and L2ARC statistics .SH SYNOPSIS .LP .nf -\fBarcstat\fR [\fB-hvx\fR] [\fB-f field[,field]...\fR] [\fB-o file\fR] [\fB-s string\fR] [\fBinterval\fR [\fBcount\fR]] +\fBarcstat\fR [\fB-havxp\fR] [\fB-f field[,field]...\fR] [\fB-o file\fR] [\fB-s string\fR] [\fBinterval\fR [\fBcount\fR]] .fi .SH DESCRIPTION @@ -332,6 +332,96 @@ L2ARC misses per second Total L2ARC accesses per second .RE +.sp +.ne 2 +.na +\fBl2pref \fR +.ad +.RS 14n +L2ARC prefetch allocated size per second +.RE + +.sp +.ne 2 +.na +\fBl2pref% \fR +.ad +.RS 14n +L2ARC prefetch allocated size percentage +.RE + +.sp +.ne 2 +.na +\fBl2mfu \fR +.ad +.RS 14n +L2ARC MFU allocated size per second +.RE + +.sp +.ne 2 +.na +\fBl2mfu% \fR +.ad +.RS 14n +L2ARC MFU allocated size percentage +.RE + +.sp +.ne 2 +.na +\fBl2mru \fR +.ad +.RS 14n +L2ARC MRU allocated size per second +.RE + +.sp +.ne 2 +.na +\fBl2mru% \fR +.ad +.RS 14n +L2ARC MRU allocated size percentage +.RE + +.sp +.ne 2 +.na +\fBl2data \fR +.ad +.RS 14n +L2ARC data (buf content) allocated size per second +.RE + +.sp +.ne 2 +.na +\fBl2data% \fR +.ad +.RS 14n +L2ARC data (buf content) allocated size percentage +.RE + +.sp +.ne 2 +.na +\fBl2meta \fR +.ad +.RS 14n +L2ARC metadata (buf content) allocated size per second +.RE + +.sp +.ne 2 +.na +\fBl2meta% \fR +.ad +.RS 14n +L2ARC metadata (buf content) allocated size percentage +.RE + .sp .ne 2 .na @@ -420,6 +510,15 @@ May temporarily be negative, in which case the ARC will reduce the target size \ .LP The following options are supported: +.sp +.ne 2 +.na +\fB\fB-a\fR\fR +.ad +.RS 12n +Print all possible stats. +.RE + .sp .ne 2 .na @@ -447,6 +546,15 @@ Display help message. Report statistics to a file instead of the standard output. .RE +.sp +.ne 2 +.na +\fB\fB-p\fR\fR +.ad +.RS 12n +Disable auto-scaling of numerical fields (for raw, machine-parsable values). 
+.RE + .sp .ne 2 .na diff --git a/sys/contrib/openzfs/man/man1/raidz_test.1 b/sys/contrib/openzfs/man/man1/raidz_test.1 index 94e48bf49bd7..26e6b24ad815 100644 --- a/sys/contrib/openzfs/man/man1/raidz_test.1 +++ b/sys/contrib/openzfs/man/man1/raidz_test.1 @@ -61,6 +61,11 @@ during testing. .IP Size of data for raidz block. Size is 1 << (zio_size_shift). .HP +.BI "\-r" " reflow_offset" " (default: uint max)" +.IP +Set raidz expansion offset. The expanded raidz map allocation function will +produce different map configurations depending on this value. +.HP .BI "\-S(weep)" .IP Sweep parameter space while verifying the raidz implementations. This option @@ -77,6 +82,10 @@ This options starts the benchmark mode. All implementations are benchmarked using increasing per disk data size. Results are given as throughput per disk, measured in MiB/s. .HP +.BI "\-e(xpansion)" +.IP +Use expanded raidz map allocation function. +.HP .BI "\-v(erbose)" .IP Increase verbosity. diff --git a/sys/contrib/openzfs/man/man1/ztest.1 b/sys/contrib/openzfs/man/man1/ztest.1 index 68c978ca0968..3f30b3ed743d 100644 --- a/sys/contrib/openzfs/man/man1/ztest.1 +++ b/sys/contrib/openzfs/man/man1/ztest.1 @@ -23,6 +23,7 @@ .\" Copyright (c) 2009 Oracle and/or its affiliates. All rights reserved. .\" Copyright (c) 2009 Michael Gebetsroither . All rights .\" reserved. +.\" Copyright (c) 2017, Intel Corporation. .\" .TH ZTEST 1 "Aug 24, 2020" OpenZFS @@ -82,13 +83,29 @@ Used alignment in test. .IP Number of mirror copies. .HP -.BI "\-r" " raidz_disks" " (default: 4)" +.BI "\-r" " raidz_disks / draid_disks" " (default: 4 / 16)" .IP Number of raidz disks. .HP -.BI "\-R" " raidz_parity" " (default: 1)" +.BI "\-R" " raid_parity" " (default: 1)" .IP -Raidz parity. +Raid parity (raidz & draid). +.HP +.BI "\-K" " raid_kind" " (default: 'random') raidz|draid|random" +.IP +The kind of RAID config to use. With 'random' the kind alternates between raidz and draid. 
+.HP +.BI "\-D" " draid_data" " (default: 4)" +.IP +Number of data disks in a dRAID redundancy group. +.HP +.BI "\-S" " draid_spares" " (default: 1)" +.IP +Number of dRAID distributed spare disks. +.HP +.BI "\-C" " vdev_class_state" " (default: random)" +.IP +The vdev allocation class state: special=on|off|random. .HP .BI "\-d" " datasets" " (default: 7)" .IP diff --git a/sys/contrib/openzfs/man/man5/zfs-module-parameters.5 b/sys/contrib/openzfs/man/man5/zfs-module-parameters.5 index 1b1a0d56a3ab..41e8ffa79585 100644 --- a/sys/contrib/openzfs/man/man5/zfs-module-parameters.5 +++ b/sys/contrib/openzfs/man/man5/zfs-module-parameters.5 @@ -209,7 +209,19 @@ This may be desired to avoid wasting space on L2ARC when reading/writing large amounts of data that are not expected to be accessed more than once. The default is \fB0\fR, meaning both MRU and MFU data and metadata are cached. When turning off (\fB0\fR) this feature some MRU buffers will still be present -in ARC and eventually cached on L2ARC. +in ARC and eventually cached on L2ARC. If \fBl2arc_noprefetch\fR is set to 0, +some prefetched buffers will be cached to L2ARC, and those might later +transition to MRU, in which case the \fBl2arc_mru_asize\fR arcstat will not +be 0. Regardless of \fBl2arc_noprefetch\fR, some MFU buffers might be evicted +from ARC, accessed later on as prefetches and transition to MRU as prefetches. +If accessed again they are counted as MRU and the \fBl2arc_mru_asize\fR arcstat +will not be 0. The ARC status of L2ARC buffers when they were first cached in +L2ARC can be seen in the \fBl2arc_mru_asize\fR, \fBl2arc_mfu_asize\fR and +\fBl2arc_prefetch_asize\fR arcstats when importing the pool or onlining a cache +device if persistent L2ARC is enabled. 
The \fBevicted_l2_eligible_mru\fR +arcstat does not take into account if this option is enabled as the information +provided by the evicted_l2_eligible_* arcstats can be used to decide if +toggling this option is appropriate for the current workload. .sp Use \fB0\fR for no (default) and \fB1\fR for yes. .RE @@ -254,9 +266,15 @@ Default value: \fB0\fR%. .ad .RS 12n Do not write buffers to L2ARC if they were prefetched but not used by -applications. +applications. In case there are prefetched buffers in L2ARC and this option +is later set to \fB1\fR, we do not read the prefetched buffers from L2ARC. +Setting this option to \fB0\fR is useful for caching sequential reads from the +disks to L2ARC and serving those reads from L2ARC later on. This may be beneficial +in case the L2ARC device is significantly faster in sequential reads than the +disks of the pool. .sp -Use \fB1\fR for yes (default) and \fB0\fR to disable. +Use \fB1\fR to disable (default) and \fB0\fR to enable caching/reading +prefetches to/from L2ARC. .RE .sp @@ -361,6 +379,20 @@ by the test suite to facilitate testing. Default value: \fB16,777,217\fR. .RE +.sp +.ne 2 +.na +\fBzfs_history_output_max\fR (int) +.ad +.RS 12n +When attempting to log the output nvlist of an ioctl in the on-disk history, the +output will not be stored if it is larger than size (in bytes). This must be +less than DMU_MAX_ACCESS (64MB). This applies primarily to +zfs_ioc_channel_program(). +.sp +Default value: \fB1MB\fR. +.RE + .sp .ne 2 .na @@ -494,6 +526,40 @@ memory that is the threshold. Default value: \fB25 percent\fR .RE +.sp +.ne 2 +.na +\fBzfs_metaslab_try_hard_before_gang\fR (int) +.ad +.RS 12n +If not set (the default), we will first try normal allocation. +If that fails then we will do a gang allocation. +If that fails then we will do a "try hard" gang allocation. +If that fails then we will have a multi-layer gang block. +.sp +If set, we will first try normal allocation.
+If that fails then we will do a "try hard" allocation. +If that fails we will do a gang allocation. +If that fails we will do a "try hard" gang allocation. +If that fails then we will have a multi-layer gang block. +.sp +Default value: \fB0 (false)\fR +.RE + +.sp +.ne 2 +.na +\fBzfs_metaslab_find_max_tries\fR (int) +.ad +.RS 12n +When not trying hard, we only consider this number of the best metaslabs. +This improves performance, especially when there are many metaslabs per vdev +and the allocation can't actually be satisfied (so we would otherwise iterate +all the metaslabs). +.sp +Default value: \fB100\fR +.RE + .sp .ne 2 .na @@ -1997,8 +2063,7 @@ Default value: \fB1\fR. .ad .RS 12n The maximum number of I/Os active to each device. Ideally, this will be >= -the sum of each queue's max_active. It must be at least the sum of each -queue's min_active. See the section "ZFS I/O SCHEDULER". +the sum of each queue's max_active. See the section "ZFS I/O SCHEDULER". .sp Default value: \fB1,000\fR. .RE @@ -2147,6 +2212,42 @@ See the section "ZFS I/O SCHEDULER". Default value: \fB1\fR. .RE +.sp +.ne 2 +.na +\fBzfs_vdev_nia_delay\fR (int) +.ad +.RS 12n +For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), +the number of concurrently-active I/O's is limited to *_min_active, unless +the vdev is "idle". When there are no interactive I/Os active (sync or +async), and zfs_vdev_nia_delay I/Os have completed since the last +interactive I/O, then the vdev is considered to be "idle", and the number +of concurrently-active non-interactive I/O's is increased to *_max_active. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB5\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_nia_credit\fR (int) +.ad +.RS 12n +Some HDDs tend to prioritize sequential I/O so high, that concurrent +random I/O latency reaches several seconds. On some HDDs it happens +even if sequential I/Os are submitted one at a time, and so setting +*_max_active to 1 does not help. 
To prevent non-interactive I/Os, like +scrub, from monopolizing the device no more than zfs_vdev_nia_credit +I/Os can be sent while there are outstanding incomplete interactive +I/Os. This enforced wait ensures the HDD services the interactive I/O +within a reasonable amount of time. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB5\fR. +.RE + .sp .ne 2 .na @@ -2884,6 +2985,31 @@ top-level vdev. Default value: \fB1,048,576\fR. .RE +.sp +.ne 2 +.na +\fBzfs_rebuild_scrub_enabled\fR (int) +.ad +.RS 12n +Automatically start a pool scrub when the last active sequential resilver +completes in order to verify the checksums of all blocks which have been +resilvered. This option is enabled by default and is strongly recommended. +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_rebuild_vdev_limit\fR (ulong) +.ad +.RS 12n +Maximum amount of i/o that can be concurrently issued for a sequential +resilver per leaf device, given in bytes. +.sp +Default value: \fB33,554,432\fR. +.RE + .sp .ne 2 .na diff --git a/sys/contrib/openzfs/man/man5/zpool-features.5 b/sys/contrib/openzfs/man/man5/zpool-features.5 index 36c4343a1388..2e5ab4c37642 100644 --- a/sys/contrib/openzfs/man/man5/zpool-features.5 +++ b/sys/contrib/openzfs/man/man5/zpool-features.5 @@ -306,6 +306,30 @@ This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is used on a top-level vdev, and will never return to being \fBenabled\fR. .RE +.sp +.ne 2 +.na +\fBdraid\fR +.ad +.RS 4n +.TS +l l . +GUID org.openzfs:draid +READ\-ONLY COMPATIBLE no +DEPENDENCIES none +.TE + +This feature enables use of the \fBdraid\fR vdev type. dRAID is a variant +of raidz which provides integrated distributed hot spares that allow faster +resilvering while retaining the benefits of raidz. Data, parity, and spare +space are organized in redundancy groups and distributed evenly over all of +the devices. 
+ +This feature becomes \fBactive\fR when creating a pool which uses the +\fBdraid\fR vdev type, or when adding a new \fBdraid\fR vdev to an +existing pool. +.RE + .sp .ne 2 .na @@ -676,11 +700,11 @@ When the \fBmulti_vdev_crash_dump\fR feature is set to \fBenabled\fR, the administrator can use the \fBdumpadm\fR(1M) command to configure a dump device on a pool comprised of multiple vdevs. -Under Linux this feature is registered for compatibility but not used. -New pools created under Linux will have the feature \fBenabled\fR but -will never transition to \fB\fBactive\fR. This functionality is not -required in order to support crash dumps under Linux. Existing pools -where this feature is \fB\fBactive\fR can be imported. +Under FreeBSD and Linux this feature is registered for compatibility but not +used. New pools created under FreeBSD and Linux will have the feature +\fBenabled\fR but will never transition to \fBactive\fR. This functionality +is not required in order to support crash dumps under FreeBSD and Linux. +Existing pools where this feature is \fBactive\fR can be imported. .RE .sp diff --git a/sys/contrib/openzfs/man/man8/Makefile.am b/sys/contrib/openzfs/man/man8/Makefile.am index 07f6aefa68e6..602645180beb 100644 --- a/sys/contrib/openzfs/man/man8/Makefile.am +++ b/sys/contrib/openzfs/man/man8/Makefile.am @@ -82,7 +82,8 @@ dist_man_MANS = \ zpool-upgrade.8 \ zpool-wait.8 \ zstream.8 \ - zstreamdump.8 + zstreamdump.8 \ + zpool_influxdb.8 nodist_man_MANS = \ zed.8 \ diff --git a/sys/contrib/openzfs/man/man8/zfs-allow.8 b/sys/contrib/openzfs/man/man8/zfs-allow.8 index f32b29a72661..ac4bf658b92a 100644 --- a/sys/contrib/openzfs/man/man8/zfs-allow.8 +++ b/sys/contrib/openzfs/man/man8/zfs-allow.8 @@ -34,57 +34,57 @@ .Dt ZFS-ALLOW 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm allow +.Nm zfs-allow .Nd Delegates ZFS administration permission for the file systems to non-privileged users.
.Sh SYNOPSIS -.Nm +.Nm zfs .Cm allow .Op Fl dglu .Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns ... .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm allow .Op Fl dl .Fl e Ns | Ns Sy everyone .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns ... .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm allow .Fl c .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns ... .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm allow .Fl s No @ Ns Ar setname .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns ... .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm unallow .Op Fl dglru .Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns ... Oc .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm unallow .Op Fl dlr .Fl e Ns | Ns Sy everyone .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns ... Oc .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm unallow .Op Fl r .Fl c .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns ... Oc .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm unallow .Op Fl r .Fl s No @ Ns Ar setname @@ -94,7 +94,7 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm allow .Ar filesystem Ns | Ns Ar volume .Xc @@ -116,7 +116,7 @@ These permissions cannot be delegated because the Linux .Xr mount 8 command restricts modifications of the global namespace to the root user. .It Xo -.Nm +.Nm zfs .Cm allow .Op Fl dglu .Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... @@ -125,7 +125,7 @@ command restricts modifications of the global namespace to the root user. 
.Ar filesystem Ns | Ns Ar volume .Xc .It Xo -.Nm +.Nm zfs .Cm allow .Op Fl dl .Fl e Ns | Ns Sy everyone @@ -271,7 +271,7 @@ xattr property zoned property .Ed .It Xo -.Nm +.Nm zfs .Cm allow .Fl c .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns @@ -285,7 +285,7 @@ These permissions are granted .Pq locally to the creator of any newly-created descendent file system. .It Xo -.Nm +.Nm zfs .Cm allow .Fl s No @ Ns Ar setname .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns @@ -302,7 +302,7 @@ name must begin with .Sy @ , and can be no more than 64 characters long. .It Xo -.Nm +.Nm zfs .Cm unallow .Op Fl dglru .Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... @@ -311,7 +311,7 @@ and can be no more than 64 characters long. .Ar filesystem Ns | Ns Ar volume .Xc .It Xo -.Nm +.Nm zfs .Cm unallow .Op Fl dlr .Fl e Ns | Ns Sy everyone @@ -320,7 +320,7 @@ and can be no more than 64 characters long. .Ar filesystem Ns | Ns Ar volume .Xc .It Xo -.Nm +.Nm zfs .Cm unallow .Op Fl r .Fl c @@ -358,7 +358,7 @@ options. Recursively remove the permissions from this file system and all descendents. .El .It Xo -.Nm +.Nm zfs .Cm unallow .Op Fl r .Fl s No @ Ns Ar setname diff --git a/sys/contrib/openzfs/man/man8/zfs-bookmark.8 b/sys/contrib/openzfs/man/man8/zfs-bookmark.8 index 3b2af815920d..042ddf504435 100644 --- a/sys/contrib/openzfs/man/man8/zfs-bookmark.8 +++ b/sys/contrib/openzfs/man/man8/zfs-bookmark.8 @@ -35,13 +35,13 @@ .Dt ZFS-BOOKMARK 8 SMM .Os .Sh NAME -.Nm zfs Ns Pf - Cm bookmark +.Nm zfs-bookmark .Nd Creates a bookmark of the given snapshot. 
.Sh SYNOPSIS .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm bookmark .Ar snapshot Ns | Ns Ar bookmark newbookmark .Xc diff --git a/sys/contrib/openzfs/man/man8/zfs-clone.8 b/sys/contrib/openzfs/man/man8/zfs-clone.8 index 352a2392ff40..9cb84d3c56d6 100644 --- a/sys/contrib/openzfs/man/man8/zfs-clone.8 +++ b/sys/contrib/openzfs/man/man8/zfs-clone.8 @@ -34,10 +34,10 @@ .Dt ZFS-CLONE 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm clone +.Nm zfs-clone .Nd Creates a clone of the given snapshot. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm clone .Op Fl p .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... @@ -45,7 +45,7 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm clone .Op Fl p .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... diff --git a/sys/contrib/openzfs/man/man8/zfs-create.8 b/sys/contrib/openzfs/man/man8/zfs-create.8 index c37d63305a4e..5a4f9a32afa5 100644 --- a/sys/contrib/openzfs/man/man8/zfs-create.8 +++ b/sys/contrib/openzfs/man/man8/zfs-create.8 @@ -30,19 +30,19 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd June 30, 2019 +.Dd December 1, 2020 .Dt ZFS-CREATE 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm create +.Nm zfs-create .Nd Creates a new ZFS file system. .Sh SYNOPSIS .Nm zfs .Cm create -.Op Fl Pnpv +.Op Fl Pnpuv .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem -.Nm +.Nm zfs .Cm create .Op Fl ps .Op Fl b Ar blocksize @@ -51,16 +51,18 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm create -.Op Fl Pnpv +.Op Fl Pnpuv .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem .Xc Creates a new ZFS file system. The file system is automatically mounted according to the .Sy mountpoint -property inherited from the parent. +property inherited from the parent, unless the +.Fl u +option is used. .Bl -tag -width "-o" .It Fl o Ar property Ns = Ns Ar value Sets the specified property as if the command @@ -122,11 +124,13 @@ to due to the use of the .Fl o option. 
+.It Fl u +Do not mount the newly created file system. .It Fl v Print verbose information about the created dataset. .El .It Xo -.Nm +.Nm zfs .Cm create .Op Fl ps .Op Fl b Ar blocksize diff --git a/sys/contrib/openzfs/man/man8/zfs-destroy.8 b/sys/contrib/openzfs/man/man8/zfs-destroy.8 index 99ae33d5ecf4..b0365cc82a91 100644 --- a/sys/contrib/openzfs/man/man8/zfs-destroy.8 +++ b/sys/contrib/openzfs/man/man8/zfs-destroy.8 @@ -34,25 +34,25 @@ .Dt ZFS-DESTROY 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm destroy +.Nm zfs-destroy .Nd Destroys the given dataset(s), snapshot(s), or bookmark. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm destroy .Op Fl Rfnprv .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm destroy .Op Fl Rdnprv .Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns .Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns ... -.Nm +.Nm zfs .Cm destroy .Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm destroy .Op Fl Rfnprv .Ar filesystem Ns | Ns Ar volume @@ -96,7 +96,7 @@ or the options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .It Xo -.Nm +.Nm zfs .Cm destroy .Op Fl Rdnprv .Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns @@ -167,7 +167,7 @@ options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .El .It Xo -.Nm +.Nm zfs .Cm destroy .Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark .Xc diff --git a/sys/contrib/openzfs/man/man8/zfs-diff.8 b/sys/contrib/openzfs/man/man8/zfs-diff.8 index bcc12f7cbba7..c7b9e138d849 100644 --- a/sys/contrib/openzfs/man/man8/zfs-diff.8 +++ b/sys/contrib/openzfs/man/man8/zfs-diff.8 @@ -34,17 +34,17 @@ .Dt ZFS-DIFF 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm diff +.Nm zfs-diff .Nd Display the difference between two snapshots of a given filesystem. 
.Sh SYNOPSIS -.Nm +.Nm zfs .Cm diff .Op Fl FHt .Ar snapshot Ar snapshot Ns | Ns Ar filesystem .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm diff .Op Fl FHt .Ar snapshot Ar snapshot Ns | Ns Ar filesystem @@ -66,7 +66,7 @@ R The path has been renamed .Bl -tag -width "-F" .It Fl F Display an indication of the type of file, in a manner similar to the -.Fl +.Fl F option of .Xr ls 1 . .Bd -literal diff --git a/sys/contrib/openzfs/man/man8/zfs-hold.8 b/sys/contrib/openzfs/man/man8/zfs-hold.8 index de30caab153a..ac56fc4a434a 100644 --- a/sys/contrib/openzfs/man/man8/zfs-hold.8 +++ b/sys/contrib/openzfs/man/man8/zfs-hold.8 @@ -34,25 +34,25 @@ .Dt ZFS-HOLD 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm hold +.Nm zfs-hold .Nd Hold a snapshot to prevent it being removed with the zfs destroy command. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm hold .Op Fl r .Ar tag Ar snapshot Ns ... -.Nm +.Nm zfs .Cm holds .Op Fl rH .Ar snapshot Ns ... -.Nm +.Nm zfs .Cm release .Op Fl r .Ar tag Ar snapshot Ns ... .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm hold .Op Fl r .Ar tag Ar snapshot Ns ... @@ -73,7 +73,7 @@ Specifies that a hold with the given tag is applied recursively to the snapshots of all descendent file systems. .El .It Xo -.Nm +.Nm zfs .Cm holds .Op Fl rH .Ar snapshot Ns ... @@ -87,7 +87,7 @@ listing the holds on the named snapshot. Do not print headers, use tab-delimited output. .El .It Xo -.Nm +.Nm zfs .Cm release .Op Fl r .Ar tag Ar snapshot Ns ... diff --git a/sys/contrib/openzfs/man/man8/zfs-jail.8 b/sys/contrib/openzfs/man/man8/zfs-jail.8 index 8274179bb089..4c439d53f0d6 100644 --- a/sys/contrib/openzfs/man/man8/zfs-jail.8 +++ b/sys/contrib/openzfs/man/man8/zfs-jail.8 @@ -41,7 +41,7 @@ .Dt ZFS-JAIL 8 .Os FreeBSD .Sh NAME -.Nm zfs Ns Pf - Cm jail +.Nm zfs-jail .Nd Attaches and detaches ZFS filesystems from FreeBSD jails. 
.No A Tn ZFS dataset can be attached to a jail by using the @@ -72,16 +72,16 @@ After a dataset is attached to a jail and the jailed property is set, a jailed file system cannot be mounted outside the jail, since the jail administrator might have set the mount point to an unacceptable value. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm jail .Ar jailid Ns | Ns Ar jailname filesystem -.Nm +.Nm zfs .Cm unjail .Ar jailid Ns | Ns Ar jailname filesystem .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm jail .Ar jailid filesystem .Xc @@ -104,7 +104,7 @@ See .Xr jail 8 for more information on managing jails and configuring the parameters above. .It Xo -.Nm +.Nm zfs .Cm unjail .Ar jailid filesystem .Xc diff --git a/sys/contrib/openzfs/man/man8/zfs-list.8 b/sys/contrib/openzfs/man/man8/zfs-list.8 index ad2b57e6d664..e6db73631f5c 100644 --- a/sys/contrib/openzfs/man/man8/zfs-list.8 +++ b/sys/contrib/openzfs/man/man8/zfs-list.8 @@ -34,10 +34,10 @@ .Dt ZFS-LIST 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm list +.Nm zfs-list .Nd Lists the property information for the given datasets in tabular form. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm list .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp @@ -49,7 +49,7 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm list .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp diff --git a/sys/contrib/openzfs/man/man8/zfs-load-key.8 b/sys/contrib/openzfs/man/man8/zfs-load-key.8 index 72248b6962d9..7d273ddd53b0 100644 --- a/sys/contrib/openzfs/man/man8/zfs-load-key.8 +++ b/sys/contrib/openzfs/man/man8/zfs-load-key.8 @@ -34,26 +34,26 @@ .Dt ZFS-LOAD-KEY 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm load-key +.Nm zfs-load-key .Nd Load, unload, or change the encryption key used to access a dataset. 
.Sh SYNOPSIS -.Nm +.Nm zfs .Cm load-key .Op Fl nr .Op Fl L Ar keylocation .Fl a | Ar filesystem -.Nm +.Nm zfs .Cm unload-key .Op Fl r .Fl a | Ar filesystem -.Nm +.Nm zfs .Cm change-key .Op Fl l .Op Fl o Ar keylocation Ns = Ns Ar value .Op Fl o Ar keyformat Ns = Ns Ar value .Op Fl o Ar pbkdf2iters Ns = Ns Ar value .Ar filesystem -.Nm +.Nm zfs .Cm change-key .Fl i .Op Fl l @@ -61,7 +61,7 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm load-key .Op Fl nr .Op Fl L Ar keylocation @@ -118,7 +118,7 @@ may only be given as .Sy prompt . .El .It Xo -.Nm +.Nm zfs .Cm unload-key .Op Fl r .Fl a | Ar filesystem @@ -139,7 +139,7 @@ encryption roots. Unloads the keys for all encryption roots in all imported pools. .El .It Xo -.Nm +.Nm zfs .Cm change-key .Op Fl l .Op Fl o Ar keylocation Ns = Ns Ar value @@ -148,7 +148,7 @@ Unloads the keys for all encryption roots in all imported pools. .Ar filesystem .Xc .It Xo -.Nm +.Nm zfs .Cm change-key .Fl i .Op Fl l diff --git a/sys/contrib/openzfs/man/man8/zfs-mount.8 b/sys/contrib/openzfs/man/man8/zfs-mount.8 index feddafb28e2f..00fb37c786e1 100644 --- a/sys/contrib/openzfs/man/man8/zfs-mount.8 +++ b/sys/contrib/openzfs/man/man8/zfs-mount.8 @@ -34,29 +34,29 @@ .Dt ZFS-MOUNT 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm mount +.Nm zfs-mount .Nd Manage mount state of ZFS file systems. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm mount -.Nm +.Nm zfs .Cm mount .Op Fl Oflv .Op Fl o Ar options .Fl a | Ar filesystem -.Nm +.Nm zfs .Cm unmount .Op Fl fu .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm mount .Xc Displays all ZFS file systems currently mounted. .It Xo -.Nm +.Nm zfs .Cm mount .Op Fl Oflv .Op Fl o Ar options @@ -105,7 +105,7 @@ Report mount progress. Attempt to force mounting of all filesystems, even those that couldn't normally be mounted (e.g. redacted datasets). 
.El .It Xo -.Nm +.Nm zfs .Cm unmount .Op Fl fu .Fl a | Ar filesystem Ns | Ns Ar mountpoint diff --git a/sys/contrib/openzfs/man/man8/zfs-program.8 b/sys/contrib/openzfs/man/man8/zfs-program.8 index 41d38587e547..b08c94916de6 100644 --- a/sys/contrib/openzfs/man/man8/zfs-program.8 +++ b/sys/contrib/openzfs/man/man8/zfs-program.8 @@ -16,10 +16,11 @@ .Dt ZFS-PROGRAM 8 .Os .Sh NAME -.Nm zfs program +.Nm zfs-program .Nd executes ZFS channel programs .Sh SYNOPSIS -.Cm "zfs program" +.Nm zfs +.Cm program .Op Fl jn .Op Fl t Ar instruction-limit .Op Fl m Ar memory-limit diff --git a/sys/contrib/openzfs/man/man8/zfs-project.8 b/sys/contrib/openzfs/man/man8/zfs-project.8 index d3171c05c5bc..21c300f83df1 100644 --- a/sys/contrib/openzfs/man/man8/zfs-project.8 +++ b/sys/contrib/openzfs/man/man8/zfs-project.8 @@ -34,26 +34,26 @@ .Dt ZFS-PROJECT 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm project +.Nm zfs-project .Nd List, set, or clear project ID and/or inherit flag on the file(s) or directories. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm project .Oo Fl d Ns | Ns Fl r Ns Oc .Ar file Ns | Ns Ar directory Ns ... -.Nm +.Nm zfs .Cm project .Fl C .Oo Fl kr Ns Oc .Ar file Ns | Ns Ar directory Ns ... -.Nm +.Nm zfs .Cm project .Fl c .Oo Fl 0 Ns Oc .Oo Fl d Ns | Ns Fl r Ns Oc .Op Fl p Ar id .Ar file Ns | Ns Ar directory Ns ... -.Nm +.Nm zfs .Cm project .Op Fl p Ar id .Oo Fl rs Ns Oc @@ -61,7 +61,7 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm project .Oo Fl d Ns | Ns Fl r Ns Oc .Ar file Ns | Ns Ar directory Ns ... @@ -79,7 +79,7 @@ Show on subdirectories recursively. It will overwrite the former specified option. .El .It Xo -.Nm +.Nm zfs .Cm project .Fl C .Oo Fl kr Ns Oc @@ -94,7 +94,7 @@ as zero. Clear on subdirectories recursively. .El .It Xo -.Nm +.Nm zfs .Cm project .Fl c .Oo Fl 0 Ns Oc @@ -126,7 +126,7 @@ Check on subdirectories recursively. It will overwrite the former specified option. 
.El .It Xo -.Nm +.Nm zfs .Cm project .Op Fl p Ar id .Oo Fl rs Ns Oc diff --git a/sys/contrib/openzfs/man/man8/zfs-promote.8 b/sys/contrib/openzfs/man/man8/zfs-promote.8 index 08cd8b2b94f2..64c124c11b61 100644 --- a/sys/contrib/openzfs/man/man8/zfs-promote.8 +++ b/sys/contrib/openzfs/man/man8/zfs-promote.8 @@ -34,16 +34,16 @@ .Dt ZFS-PROMOTE 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm promote +.Nm zfs-promote .Nd Promotes a clone file system to no longer be dependent on its origin snapshot. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm promote .Ar clone-filesystem .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm promote .Ar clone-filesystem .Xc diff --git a/sys/contrib/openzfs/man/man8/zfs-receive.8 b/sys/contrib/openzfs/man/man8/zfs-receive.8 index 3cd17fea4a2d..ed5cbbdf0b79 100644 --- a/sys/contrib/openzfs/man/man8/zfs-receive.8 +++ b/sys/contrib/openzfs/man/man8/zfs-receive.8 @@ -34,17 +34,17 @@ .Dt ZFS-RECEIVE 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm receive +.Nm zfs-receive .Nd Creates a snapshot whose contents are as specified in the stream provided on standard input. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm receive .Op Fl FhMnsuv .Op Fl o Sy origin Ns = Ns Ar snapshot .Op Fl o Ar property Ns = Ns Ar value .Op Fl x Ar property .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm +.Nm zfs .Cm receive .Op Fl FhMnsuv .Op Fl d Ns | Ns Fl e @@ -52,14 +52,14 @@ .Op Fl o Ar property Ns = Ns Ar value .Op Fl x Ar property .Ar filesystem -.Nm +.Nm zfs .Cm receive .Fl A .Ar filesystem Ns | Ns Ar volume .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm receive .Op Fl FhMnsuv .Op Fl o Sy origin Ns = Ns Ar snapshot @@ -68,7 +68,7 @@ .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo -.Nm +.Nm zfs .Cm receive .Op Fl FhMnsuv .Op Fl d Ns | Ns Fl e @@ -290,7 +290,7 @@ receive. This allows unencrypted streams to be received as encrypted datasets. 
To cause the received dataset (or root dataset of a recursive stream) to be received as an encryption root, specify encryption properties in the same manner as is required for -.Nm +.Nm zfs .Cm create . For instance: .Bd -literal @@ -301,7 +301,7 @@ Note that .Op Fl o Ar keylocation Ns = Ns Ar prompt may not be specified here, since stdin is already being utilized for the send stream. Once the receive has completed, you can use -.Nm +.Nm zfs .Cm set to change this setting after the fact. Similarly, you can receive a dataset as an encrypted child by specifying @@ -361,7 +361,7 @@ restrictions (e.g. set-once) apply equally to .Fl x . .El .It Xo -.Nm +.Nm zfs .Cm receive .Fl A .Ar filesystem Ns | Ns Ar volume diff --git a/sys/contrib/openzfs/man/man8/zfs-rename.8 b/sys/contrib/openzfs/man/man8/zfs-rename.8 index 78bd8934a00f..f57bcd8441f4 100644 --- a/sys/contrib/openzfs/man/man8/zfs-rename.8 +++ b/sys/contrib/openzfs/man/man8/zfs-rename.8 @@ -34,40 +34,40 @@ .Dt ZFS-RENAME 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm rename +.Nm zfs-rename .Nd Renames the given dataset (filesystem or snapshot). .Sh SYNOPSIS -.Nm +.Nm zfs .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm +.Nm zfs .Cm rename .Fl p .Op Fl f .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume -.Nm +.Nm zfs .Cm rename .Fl u .Op Fl f .Ar filesystem Ar filesystem -.Nm +.Nm zfs .Cm rename .Fl r .Ar snapshot Ar snapshot .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo -.Nm +.Nm zfs .Cm rename .Fl p .Op Fl f @@ -75,7 +75,7 @@ .Ar filesystem Ns | Ns Ar volume .Xc .It Xo -.Nm +.Nm zfs .Cm rename .Fl u .Op Fl f @@ -112,7 +112,7 @@ or the file system is not unmounted even if this option is not given. 
.El .It Xo -.Nm +.Nm zfs .Cm rename .Fl r .Ar snapshot Ar snapshot diff --git a/sys/contrib/openzfs/man/man8/zfs-rollback.8 b/sys/contrib/openzfs/man/man8/zfs-rollback.8 index 1078efd8a4b7..8a7cb6621fae 100644 --- a/sys/contrib/openzfs/man/man8/zfs-rollback.8 +++ b/sys/contrib/openzfs/man/man8/zfs-rollback.8 @@ -34,17 +34,17 @@ .Dt ZFS-ROLLBACK 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm rollback +.Nm zfs-rollback .Nd Roll back the given dataset to a previous snapshot. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm rollback .Op Fl Rfr .Ar snapshot .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm rollback .Op Fl Rfr .Ar snapshot diff --git a/sys/contrib/openzfs/man/man8/zfs-send.8 b/sys/contrib/openzfs/man/man8/zfs-send.8 index 8b7f940eb793..1a241a52d54b 100644 --- a/sys/contrib/openzfs/man/man8/zfs-send.8 +++ b/sys/contrib/openzfs/man/man8/zfs-send.8 @@ -34,43 +34,43 @@ .Dt ZFS-SEND 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm send +.Nm zfs-send .Nd Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm send .Op Fl DLPRbcehnpvw .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot -.Nm +.Nm zfs .Cm send .Op Fl DLPRcenpvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot -.Nm +.Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPcenpv .br .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot -.Nm +.Nm zfs .Cm send .Op Fl Penv .Fl t .Ar receive_resume_token -.Nm +.Nm zfs .Cm send .Op Fl Pnv .Fl S Ar filesystem -.Nm +.Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns ... .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm send .Op Fl DLPRbcehnpvw .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot @@ -266,7 +266,7 @@ The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. 
.El .It Xo -.Nm +.Nm zfs .Cm send .Op Fl DLPRcenpvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark @@ -391,7 +391,7 @@ Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. .El .It Xo -.Nm +.Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPcenpv @@ -482,7 +482,7 @@ These are stored with the redacted snapshot, and are used to detect and correctly handle the cases above. Note that for technical reasons, raw sends and redacted sends cannot be combined at this time. .It Xo -.Nm +.Nm zfs .Cm send .Op Fl Penv .Fl t @@ -497,7 +497,7 @@ See the documentation for .Sy zfs receive -s for more details. .It Xo -.Nm +.Nm zfs .Cm send .Op Fl Pnv .Op Fl i Ar snapshot Ns | Ns Ar bookmark @@ -514,7 +514,7 @@ always use the last fully received snapshot as the incremental source if it exists. .El .It Xo -.Nm +.Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns ... diff --git a/sys/contrib/openzfs/man/man8/zfs-set.8 b/sys/contrib/openzfs/man/man8/zfs-set.8 index 5b4348ba94c0..74a7a61d0e29 100644 --- a/sys/contrib/openzfs/man/man8/zfs-set.8 +++ b/sys/contrib/openzfs/man/man8/zfs-set.8 @@ -34,14 +34,14 @@ .Dt ZFS-SET 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm set +.Nm zfs-set .Nd Sets the property or list of properties to the given value(s) for each dataset. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm set .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... -.Nm +.Nm zfs .Cm get .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp @@ -50,14 +50,14 @@ .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc .Cm all | Ar property Ns Oo , Ns Ar property Oc Ns ... .Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns ... -.Nm +.Nm zfs .Cm inherit .Op Fl rS .Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... 
.Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm set .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... @@ -79,7 +79,7 @@ For more information, see the section of .Xr zfsprops 8 . .It Xo -.Nm +.Nm zfs .Cm get .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp @@ -160,7 +160,7 @@ or .Sy all . .El .It Xo -.Nm +.Nm zfs .Cm inherit .Op Fl rS .Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... diff --git a/sys/contrib/openzfs/man/man8/zfs-share.8 b/sys/contrib/openzfs/man/man8/zfs-share.8 index cb013103371c..ce35accdbf7e 100644 --- a/sys/contrib/openzfs/man/man8/zfs-share.8 +++ b/sys/contrib/openzfs/man/man8/zfs-share.8 @@ -34,19 +34,19 @@ .Dt ZFS-SHARE 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm share +.Nm zfs-share .Nd Shares and unshares available ZFS filesystems. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm share .Fl a | Ar filesystem -.Nm +.Nm zfs .Cm unshare .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm share .Fl a | Ar filesystem .Xc @@ -68,7 +68,7 @@ or property is set. .El .It Xo -.Nm +.Nm zfs .Cm unshare .Fl a | Ar filesystem Ns | Ns Ar mountpoint .Xc diff --git a/sys/contrib/openzfs/man/man8/zfs-snapshot.8 b/sys/contrib/openzfs/man/man8/zfs-snapshot.8 index 618d41d22fe1..b677bc73d2bd 100644 --- a/sys/contrib/openzfs/man/man8/zfs-snapshot.8 +++ b/sys/contrib/openzfs/man/man8/zfs-snapshot.8 @@ -34,10 +34,10 @@ .Dt ZFS-SNAPSHOT 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm snapshot +.Nm zfs-snapshot .Nd Creates snapshots with the given names. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm snapshot .Op Fl r .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... @@ -45,7 +45,7 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm snapshot .Op Fl r .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... 
diff --git a/sys/contrib/openzfs/man/man8/zfs-upgrade.8 b/sys/contrib/openzfs/man/man8/zfs-upgrade.8 index 742d5d7bd9e7..6a79f3ea77fd 100644 --- a/sys/contrib/openzfs/man/man8/zfs-upgrade.8 +++ b/sys/contrib/openzfs/man/man8/zfs-upgrade.8 @@ -34,15 +34,15 @@ .Dt ZFS-UPGRADE 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm upgrade +.Nm zfs-upgrade .Nd Manage upgrading the on-disk version of filesystems. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm upgrade -.Nm +.Nm zfs .Cm upgrade .Fl v -.Nm +.Nm zfs .Cm upgrade .Op Fl r .Op Fl V Ar version @@ -50,18 +50,18 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm upgrade .Xc Displays a list of file systems that are not the most recent version. .It Xo -.Nm +.Nm zfs .Cm upgrade .Fl v .Xc Displays a list of currently supported file system versions. .It Xo -.Nm +.Nm zfs .Cm upgrade .Op Fl r .Op Fl V Ar version diff --git a/sys/contrib/openzfs/man/man8/zfs-userspace.8 b/sys/contrib/openzfs/man/man8/zfs-userspace.8 index 9c103bf48c04..0a9977a61c6a 100644 --- a/sys/contrib/openzfs/man/man8/zfs-userspace.8 +++ b/sys/contrib/openzfs/man/man8/zfs-userspace.8 @@ -34,10 +34,10 @@ .Dt ZFS-USERSPACE 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm userspace +.Nm zfs-userspace .Nd Displays space consumed by, and quotas on, each user or group in the specified filesystem or snapshot. .Sh SYNOPSIS -.Nm +.Nm zfs .Cm userspace .Op Fl Hinp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc @@ -45,7 +45,7 @@ .Oo Fl S Ar field Oc Ns ... .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc .Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path -.Nm +.Nm zfs .Cm groupspace .Op Fl Hinp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc @@ -53,7 +53,7 @@ .Oo Fl S Ar field Oc Ns ... .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc .Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path -.Nm +.Nm zfs .Cm projectspace .Op Fl Hp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... 
Oc @@ -63,7 +63,7 @@ .Sh DESCRIPTION .Bl -tag -width "" .It Xo -.Nm +.Nm zfs .Cm userspace .Op Fl Hinp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc @@ -146,7 +146,7 @@ The default is The default can be changed to include group types. .El .It Xo -.Nm +.Nm zfs .Cm groupspace .Op Fl Hinp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc @@ -162,7 +162,7 @@ This subcommand is identical to except that the default types to display are .Fl t Sy posixgroup Ns \&, Ns Sy smbgroup . .It Xo -.Nm +.Nm zfs .Cm projectspace .Op Fl Hp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc diff --git a/sys/contrib/openzfs/man/man8/zfs-wait.8 b/sys/contrib/openzfs/man/man8/zfs-wait.8 index e7e3decb9da1..c3a85e29c6c0 100644 --- a/sys/contrib/openzfs/man/man8/zfs-wait.8 +++ b/sys/contrib/openzfs/man/man8/zfs-wait.8 @@ -31,17 +31,17 @@ .Dt ZFS-WAIT 8 .Os .Sh NAME -.Nm zfs Ns Pf - Cm wait +.Nm zfs-wait .Nd Wait for background activity to stop in a ZFS filesystem .Sh SYNOPSIS -.Nm +.Nm zfs .Cm wait .Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns ... .Ar fs .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zfs .Cm wait .Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns ... .Ar fs diff --git a/sys/contrib/openzfs/man/man8/zgenhostid.8 b/sys/contrib/openzfs/man/man8/zgenhostid.8 index ff198443dd2a..14264cb8cdc3 100644 --- a/sys/contrib/openzfs/man/man8/zgenhostid.8 +++ b/sys/contrib/openzfs/man/man8/zgenhostid.8 @@ -54,7 +54,8 @@ instead of default .It Ar hostid Specifies the value to be placed in .Pa /etc/hostid . -It must be a number with a value between 1 and 2^32-1. +It should be a number with a value between 1 and 2^32-1. +If it is 0, zgenhostid will generate a random hostid. This value .Sy must be unique among your systems. 
diff --git a/sys/contrib/openzfs/man/man8/zpool-add.8 b/sys/contrib/openzfs/man/man8/zpool-add.8 index e8adc353ac9c..cf1630435eb4 100644 --- a/sys/contrib/openzfs/man/man8/zpool-add.8 +++ b/sys/contrib/openzfs/man/man8/zpool-add.8 @@ -30,10 +30,10 @@ .Dt ZPOOL-ADD 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm add +.Nm zpool-add .Nd Adds specified virtual devices to a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm add .Op Fl fgLnP .Oo Fl o Ar property Ns = Ns Ar value Oc @@ -41,7 +41,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm add .Op Fl fgLnP .Oo Fl o Ar property Ns = Ns Ar value Oc diff --git a/sys/contrib/openzfs/man/man8/zpool-attach.8 b/sys/contrib/openzfs/man/man8/zpool-attach.8 index 03e04436df81..41b6a6b613da 100644 --- a/sys/contrib/openzfs/man/man8/zpool-attach.8 +++ b/sys/contrib/openzfs/man/man8/zpool-attach.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-ATTACH 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm attach +.Nm zpool-attach .Nd Attach a new device to an existing ZFS virtual device (vdev). 
.Sh SYNOPSIS -.Nm +.Nm zpool .Cm attach .Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc @@ -42,7 +42,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm attach .Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc diff --git a/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 b/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 index df04c8819146..128970ee66ac 100644 --- a/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 +++ b/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 @@ -31,17 +31,17 @@ .Dt ZPOOL-CHECKPOINT 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm checkpoint +.Nm zpool-checkpoint .Nd Checkpoints the current state of a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm checkpoint .Op Fl d, -discard Oo Fl w, -wait Oc .Ar pool .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm checkpoint .Op Fl d, -discard Oo Fl w, -wait Oc .Ar pool diff --git a/sys/contrib/openzfs/man/man8/zpool-clear.8 b/sys/contrib/openzfs/man/man8/zpool-clear.8 index ee7a6a255905..e00eb760af61 100644 --- a/sys/contrib/openzfs/man/man8/zpool-clear.8 +++ b/sys/contrib/openzfs/man/man8/zpool-clear.8 @@ -31,17 +31,17 @@ .Dt ZPOOL-CLEAR 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm clear +.Nm zpool-clear .Nd Clears device errors in a ZFS storage pool. 
.Sh SYNOPSIS -.Nm +.Nm zpool .Cm clear .Ar pool .Op Ar device .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm clear .Ar pool .Op Ar device diff --git a/sys/contrib/openzfs/man/man8/zpool-create.8 b/sys/contrib/openzfs/man/man8/zpool-create.8 index 0676a67f9fc6..7406a493e377 100644 --- a/sys/contrib/openzfs/man/man8/zpool-create.8 +++ b/sys/contrib/openzfs/man/man8/zpool-create.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-CREATE 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm create +.Nm zpool-create .Nd Creates a new ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm create .Op Fl dfn .Op Fl m Ar mountpoint @@ -46,7 +46,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm create .Op Fl dfn .Op Fl m Ar mountpoint @@ -73,12 +73,14 @@ and period The pool names .Sy mirror , .Sy raidz , +.Sy draid , .Sy spare and .Sy log are reserved, as are names beginning with .Sy mirror , .Sy raidz , +.Sy draid , .Sy spare , and the pattern .Sy c[0-9] . diff --git a/sys/contrib/openzfs/man/man8/zpool-destroy.8 b/sys/contrib/openzfs/man/man8/zpool-destroy.8 index 9eace1983ebb..d4fc21eaec02 100644 --- a/sys/contrib/openzfs/man/man8/zpool-destroy.8 +++ b/sys/contrib/openzfs/man/man8/zpool-destroy.8 @@ -31,17 +31,17 @@ .Dt ZPOOL-DESTROY 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm destroy +.Nm zpool-destroy .Nd Destroys the given ZFS storage pool, freeing up any devices for other use .Sh SYNOPSIS -.Nm +.Nm zpool .Cm destroy .Op Fl f .Ar pool .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm destroy .Op Fl f .Ar pool diff --git a/sys/contrib/openzfs/man/man8/zpool-detach.8 b/sys/contrib/openzfs/man/man8/zpool-detach.8 index dab8871cea87..75a5786d5a3d 100644 --- a/sys/contrib/openzfs/man/man8/zpool-detach.8 +++ b/sys/contrib/openzfs/man/man8/zpool-detach.8 @@ -31,16 +31,16 @@ .Dt ZPOOL-DETACH 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm detach +.Nm zpool-detach .Nd Detaches a device from a ZFS mirror vdev (virtual device) .Sh SYNOPSIS -.Nm +.Nm zpool .Cm detach .Ar pool device .Sh 
DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm detach .Ar pool device .Xc diff --git a/sys/contrib/openzfs/man/man8/zpool-events.8 b/sys/contrib/openzfs/man/man8/zpool-events.8 index e5887a1d820d..3a6ff8840168 100644 --- a/sys/contrib/openzfs/man/man8/zpool-events.8 +++ b/sys/contrib/openzfs/man/man8/zpool-events.8 @@ -31,16 +31,16 @@ .Dt ZPOOL-EVENTS 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm events +.Nm zpool-events .Nd Lists all recent events generated by the ZFS kernel modules .Sh SYNOPSIS -.Nm +.Nm zpool .Cm events .Op Fl vHf Oo Ar pool Oc | Fl c .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm events .Op Fl vHf Oo Ar pool Oc | Fl c .Xc diff --git a/sys/contrib/openzfs/man/man8/zpool-export.8 b/sys/contrib/openzfs/man/man8/zpool-export.8 index afe185c98c58..1b8077ba19fa 100644 --- a/sys/contrib/openzfs/man/man8/zpool-export.8 +++ b/sys/contrib/openzfs/man/man8/zpool-export.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-EXPORT 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm export +.Nm zpool-export .Nd Exports the given ZFS storage pools from the system .Sh SYNOPSIS -.Nm +.Nm zpool .Cm export .Op Fl a .Op Fl f @@ -42,7 +42,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm export .Op Fl a .Op Fl f @@ -60,7 +60,7 @@ A pool can not be exported if it has a shared spare that is currently being used. .Pp For pools to be portable, you must give the -.Nm +.Nm zpool command whole disks, not just partitions, so that ZFS can label the disks with portable EFI labels. 
Otherwise, disk drivers on platforms of different endianness will not recognize diff --git a/sys/contrib/openzfs/man/man8/zpool-get.8 b/sys/contrib/openzfs/man/man8/zpool-get.8 index fbe341a8cddd..c514bb0c5e8c 100644 --- a/sys/contrib/openzfs/man/man8/zpool-get.8 +++ b/sys/contrib/openzfs/man/man8/zpool-get.8 @@ -31,23 +31,23 @@ .Dt ZPOOL-GET 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm get +.Nm zpool-get .Nd Retrieves properties for the specified ZFS storage pool(s) .Sh SYNOPSIS -.Nm +.Nm zpool .Cm get .Op Fl Hp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... .Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ... .Oo Ar pool Oc Ns ... -.Nm +.Nm zpool .Cm set .Ar property Ns = Ns Ar value .Ar pool .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm get .Op Fl Hp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... @@ -85,7 +85,7 @@ is the default value. Display numbers in parsable (exact) values. .El .It Xo -.Nm +.Nm zpool .Cm set .Ar property Ns = Ns Ar value .Ar pool diff --git a/sys/contrib/openzfs/man/man8/zpool-history.8 b/sys/contrib/openzfs/man/man8/zpool-history.8 index 73fb27449d8c..5b0a102f3825 100644 --- a/sys/contrib/openzfs/man/man8/zpool-history.8 +++ b/sys/contrib/openzfs/man/man8/zpool-history.8 @@ -31,17 +31,17 @@ .Dt ZPOOL-HISTORY 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm history +.Nm zpool-history .Nd Displays the command history of the specified ZFS storage pool(s) .Sh SYNOPSIS -.Nm +.Nm zpool .Cm history .Op Fl il .Oo Ar pool Oc Ns ... .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm history .Op Fl il .Oo Ar pool Oc Ns ... 
diff --git a/sys/contrib/openzfs/man/man8/zpool-import.8 b/sys/contrib/openzfs/man/man8/zpool-import.8 index e63db9363fd0..ac349574309e 100644 --- a/sys/contrib/openzfs/man/man8/zpool-import.8 +++ b/sys/contrib/openzfs/man/man8/zpool-import.8 @@ -31,14 +31,14 @@ .Dt ZPOOL-IMPORT 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm import +.Nm zpool-import .Nd Lists ZFS storage pools available to import or import the specified pools .Sh SYNOPSIS -.Nm +.Nm zpool .Cm import .Op Fl D .Op Fl d Ar dir Ns | Ns device -.Nm +.Nm zpool .Cm import .Fl a .Op Fl DflmN @@ -48,7 +48,7 @@ .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... .Op Fl R Ar root -.Nm +.Nm zpool .Cm import .Op Fl Dflm .Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc @@ -63,7 +63,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm import .Op Fl D .Op Fl d Ar dir Ns | Ns device @@ -110,7 +110,7 @@ option can be specified multiple times. Lists destroyed pools only. .El .It Xo -.Nm +.Nm zpool .Cm import .Fl a .Op Fl DflmN @@ -244,7 +244,7 @@ option, above. WARNING: This option can be extremely hazardous to the health of your pool and should only be used as a last resort. 
.El .It Xo -.Nm +.Nm zpool .Cm import .Op Fl Dflm .Op Fl F Oo Fl n Oc Oo Fl t Oc Oo Fl T Oc Oo Fl X Oc diff --git a/sys/contrib/openzfs/man/man8/zpool-initialize.8 b/sys/contrib/openzfs/man/man8/zpool-initialize.8 index e8bf656e4b97..2734c1b340b8 100644 --- a/sys/contrib/openzfs/man/man8/zpool-initialize.8 +++ b/sys/contrib/openzfs/man/man8/zpool-initialize.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-INITIALIZE 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm initialize +.Nm zpool-initialize .Nd Write to all unallocated regions of eligible devices in a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm initialize .Op Fl c | Fl s .Op Fl w @@ -43,7 +43,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm initialize .Op Fl c | Fl s .Op Fl w diff --git a/sys/contrib/openzfs/man/man8/zpool-iostat.8 b/sys/contrib/openzfs/man/man8/zpool-iostat.8 index f91e55c3b01a..e457eb142ab8 100644 --- a/sys/contrib/openzfs/man/man8/zpool-iostat.8 +++ b/sys/contrib/openzfs/man/man8/zpool-iostat.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-IOSTAT 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm iostat +.Nm zpool-iostat .Nd Display logical I/O statistics for the given ZFS storage pools/vdevs .Sh SYNOPSIS -.Nm +.Nm zpool .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d @@ -44,7 +44,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d diff --git a/sys/contrib/openzfs/man/man8/zpool-labelclear.8 b/sys/contrib/openzfs/man/man8/zpool-labelclear.8 index 52c80056142c..576eee21c508 100644 --- a/sys/contrib/openzfs/man/man8/zpool-labelclear.8 +++ b/sys/contrib/openzfs/man/man8/zpool-labelclear.8 @@ -31,17 +31,17 @@ .Dt ZPOOL-LABELCLEAR 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm labelclear +.Nm zpool-labelclear .Nd Removes ZFS label information from the specified physical device .Sh SYNOPSIS -.Nm +.Nm zpool .Cm labelclear .Op Fl f .Ar device .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm 
+.Nm zpool .Cm labelclear .Op Fl f .Ar device diff --git a/sys/contrib/openzfs/man/man8/zpool-list.8 b/sys/contrib/openzfs/man/man8/zpool-list.8 index 5e8c6fe5c9ea..068a68893463 100644 --- a/sys/contrib/openzfs/man/man8/zpool-list.8 +++ b/sys/contrib/openzfs/man/man8/zpool-list.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-LIST 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm list +.Nm zpool-list .Nd Lists ZFS storage pools along with a health status and space usage .Sh SYNOPSIS -.Nm +.Nm zpool .Cm list .Op Fl HgLpPv .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... @@ -44,7 +44,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm list .Op Fl HgLpPv .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... @@ -111,7 +111,7 @@ See .It Fl v Verbose statistics. Reports usage statistics for individual vdevs within the pool, in addition to -the pool-wise statistics. +the pool-wide statistics. .El .El .Sh SEE ALSO diff --git a/sys/contrib/openzfs/man/man8/zpool-offline.8 b/sys/contrib/openzfs/man/man8/zpool-offline.8 index cdcda141f5eb..3bf3bae72541 100644 --- a/sys/contrib/openzfs/man/man8/zpool-offline.8 +++ b/sys/contrib/openzfs/man/man8/zpool-offline.8 @@ -31,22 +31,22 @@ .Dt ZPOOL-OFFLINE 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm offline +.Nm zpool-offline .Nd Take a physical device in a ZFS storage pool offline .Sh SYNOPSIS -.Nm +.Nm zpool .Cm offline .Op Fl f .Op Fl t .Ar pool Ar device Ns ... -.Nm +.Nm zpool .Cm online .Op Fl e .Ar pool Ar device Ns ... .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm offline .Op Fl f .Op Fl t @@ -68,7 +68,7 @@ Temporary. Upon reboot, the specified physical device reverts to its previous state. .El .It Xo -.Nm +.Nm zpool .Cm online .Op Fl e .Ar pool Ar device Ns ... 
diff --git a/sys/contrib/openzfs/man/man8/zpool-reguid.8 b/sys/contrib/openzfs/man/man8/zpool-reguid.8 index e73f421ade78..f5c4a33f694a 100644 --- a/sys/contrib/openzfs/man/man8/zpool-reguid.8 +++ b/sys/contrib/openzfs/man/man8/zpool-reguid.8 @@ -31,16 +31,16 @@ .Dt ZPOOL-REGUID 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm reguid +.Nm zpool-reguid .Nd Generate a new unique identifier for a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm reguid .Ar pool .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm reguid .Ar pool .Xc diff --git a/sys/contrib/openzfs/man/man8/zpool-remove.8 b/sys/contrib/openzfs/man/man8/zpool-remove.8 index cd394f107e8d..f491cd40ac5c 100644 --- a/sys/contrib/openzfs/man/man8/zpool-remove.8 +++ b/sys/contrib/openzfs/man/man8/zpool-remove.8 @@ -31,21 +31,21 @@ .Dt ZPOOL-REMOVE 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm remove +.Nm zpool-remove .Nd Remove a device from a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm remove .Op Fl npw .Ar pool Ar device Ns ... -.Nm +.Nm zpool .Cm remove .Fl s .Ar pool .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm remove .Op Fl npw .Ar pool Ar device Ns ... @@ -95,7 +95,7 @@ flag, displays numbers as parsable (exact) values. Waits until the removal has completed before returning. 
.El .It Xo -.Nm +.Nm zpool .Cm remove .Fl s .Ar pool diff --git a/sys/contrib/openzfs/man/man8/zpool-reopen.8 b/sys/contrib/openzfs/man/man8/zpool-reopen.8 index 206fedcc57d9..6f804cc7e5f2 100644 --- a/sys/contrib/openzfs/man/man8/zpool-reopen.8 +++ b/sys/contrib/openzfs/man/man8/zpool-reopen.8 @@ -31,17 +31,17 @@ .Dt ZPOOL-REOPEN 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm reopen +.Nm zpool-reopen .Nd Reopen all virtual devices (vdevs) associated with a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm reopen .Op Fl n .Ar pool .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm reopen .Op Fl n .Ar pool diff --git a/sys/contrib/openzfs/man/man8/zpool-replace.8 b/sys/contrib/openzfs/man/man8/zpool-replace.8 index 32eaf77c0c2c..b8527a3862c5 100644 --- a/sys/contrib/openzfs/man/man8/zpool-replace.8 +++ b/sys/contrib/openzfs/man/man8/zpool-replace.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-REPLACE 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm replace +.Nm zpool-replace .Nd Replace one device with another in a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm replace .Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc @@ -42,7 +42,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm replace .Op Fl fsw .Op Fl o Ar property Ns = Ns Ar value diff --git a/sys/contrib/openzfs/man/man8/zpool-resilver.8 b/sys/contrib/openzfs/man/man8/zpool-resilver.8 index a804bea4a3b8..602e296fea11 100644 --- a/sys/contrib/openzfs/man/man8/zpool-resilver.8 +++ b/sys/contrib/openzfs/man/man8/zpool-resilver.8 @@ -31,16 +31,16 @@ .Dt ZPOOL-RESILVER 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm resilver +.Nm zpool-resilver .Nd Start a resilver of a device in a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm resilver .Ar pool Ns ... .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm resilver .Ar pool Ns ... 
.Xc diff --git a/sys/contrib/openzfs/man/man8/zpool-scrub.8 b/sys/contrib/openzfs/man/man8/zpool-scrub.8 index a5624795544c..6ff2eb261017 100644 --- a/sys/contrib/openzfs/man/man8/zpool-scrub.8 +++ b/sys/contrib/openzfs/man/man8/zpool-scrub.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-SCRUB 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm scrub +.Nm zpool-scrub .Nd Begin a scrub or resume a paused scrub of a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm scrub .Op Fl s | Fl p .Op Fl w @@ -42,7 +42,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm scrub .Op Fl s | Fl p .Op Fl w @@ -52,7 +52,7 @@ Begins a scrub or resumes a paused scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated -.Pq mirror or raidz +.Pq mirror, raidz, or draid devices, ZFS automatically repairs any damage discovered during the scrub. The .Nm zpool Cm status diff --git a/sys/contrib/openzfs/man/man8/zpool-split.8 b/sys/contrib/openzfs/man/man8/zpool-split.8 index d2943e6e374b..609cbe6bad29 100644 --- a/sys/contrib/openzfs/man/man8/zpool-split.8 +++ b/sys/contrib/openzfs/man/man8/zpool-split.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-SPLIT 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm split +.Nm zpool-split .Nd Split devices off a ZFS storage pool creating a new pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm split .Op Fl gLlnP .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... @@ -44,7 +44,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm split .Op Fl gLlnP .Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... 
diff --git a/sys/contrib/openzfs/man/man8/zpool-status.8 b/sys/contrib/openzfs/man/man8/zpool-status.8 index c2fe76c03812..54f0987b80c6 100644 --- a/sys/contrib/openzfs/man/man8/zpool-status.8 +++ b/sys/contrib/openzfs/man/man8/zpool-status.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-STATUS 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm status +.Nm zpool-status .Nd Display detailed health status for the given ZFS storage pools .Sh SYNOPSIS -.Nm +.Nm zpool .Cm status .Oo Fl c Ar SCRIPT Oc .Op Fl DigLpPstvx @@ -44,7 +44,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm status .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... .Op Fl DigLpPstvx diff --git a/sys/contrib/openzfs/man/man8/zpool-sync.8 b/sys/contrib/openzfs/man/man8/zpool-sync.8 index 098fdde9e5c9..027d129d181c 100644 --- a/sys/contrib/openzfs/man/man8/zpool-sync.8 +++ b/sys/contrib/openzfs/man/man8/zpool-sync.8 @@ -31,16 +31,16 @@ .Dt ZPOOL-SYNC 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm sync +.Nm zpool-sync .Nd Force data to be written to primary storage of a ZFS storage pool and update reporting data .Sh SYNOPSIS -.Nm +.Nm zpool .Cm sync .Oo Ar pool Oc Ns ... .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm sync .Op Ar pool ... 
.Xc diff --git a/sys/contrib/openzfs/man/man8/zpool-trim.8 b/sys/contrib/openzfs/man/man8/zpool-trim.8 index eef58aae5da6..1d8bc6e44a24 100644 --- a/sys/contrib/openzfs/man/man8/zpool-trim.8 +++ b/sys/contrib/openzfs/man/man8/zpool-trim.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-TRIM 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm trim +.Nm zpool-trim .Nd Initiate immediate TRIM operations for all free space in a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm trim .Op Fl dw .Op Fl r Ar rate @@ -44,7 +44,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm trim .Op Fl dw .Op Fl c | Fl s diff --git a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 b/sys/contrib/openzfs/man/man8/zpool-upgrade.8 index d23d0e0e6d0c..b9d023b22d3f 100644 --- a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 +++ b/sys/contrib/openzfs/man/man8/zpool-upgrade.8 @@ -31,22 +31,22 @@ .Dt ZPOOL-UPGRADE 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm upgrade +.Nm zpool-upgrade .Nd Manage version and feature flags of ZFS storage pools .Sh SYNOPSIS -.Nm +.Nm zpool .Cm upgrade -.Nm +.Nm zpool .Cm upgrade .Fl v -.Nm +.Nm zpool .Cm upgrade .Op Fl V Ar version .Fl a Ns | Ns Ar pool Ns ... .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm upgrade .Xc Displays pools which do not have all supported features enabled and pools @@ -56,7 +56,7 @@ Use .Nm zpool Cm upgrade Fl a to enable all features on all pools. .It Xo -.Nm +.Nm zpool .Cm upgrade .Fl v .Xc @@ -65,7 +65,7 @@ See .Xr zpool-features 5 for a description of feature flags features supported by the current software. .It Xo -.Nm +.Nm zpool .Cm upgrade .Op Fl V Ar version .Fl a Ns | Ns Ar pool Ns ... 
diff --git a/sys/contrib/openzfs/man/man8/zpool-wait.8 b/sys/contrib/openzfs/man/man8/zpool-wait.8 index a3bdba669e53..ff6d992243b8 100644 --- a/sys/contrib/openzfs/man/man8/zpool-wait.8 +++ b/sys/contrib/openzfs/man/man8/zpool-wait.8 @@ -31,10 +31,10 @@ .Dt ZPOOL-WAIT 8 .Os .Sh NAME -.Nm zpool Ns Pf - Cm wait +.Nm zpool-wait .Nd Wait for background activity to stop in a ZFS storage pool .Sh SYNOPSIS -.Nm +.Nm zpool .Cm wait .Op Fl Hp .Op Fl T Sy u Ns | Ns Sy d @@ -44,7 +44,7 @@ .Sh DESCRIPTION .Bl -tag -width Ds .It Xo -.Nm +.Nm zpool .Cm wait .Op Fl Hp .Op Fl T Sy u Ns | Ns Sy d diff --git a/sys/contrib/openzfs/man/man8/zpool_influxdb.8 b/sys/contrib/openzfs/man/man8/zpool_influxdb.8 new file mode 100644 index 000000000000..bd899dbe90a0 --- /dev/null +++ b/sys/contrib/openzfs/man/man8/zpool_influxdb.8 @@ -0,0 +1,93 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at +.\" https://opensource.org/licenses/CDDL-1.0 +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright 2020 Richard Elling +.\" .Dd June 14, 2020 +.TH zpool_influxdb 8 +.SH NAME +zpool_influxdb \- collect zpool statistics in influxdb line protocol format +.SH SYNOPSIS +.LP +.nf +\fBzpool_influxdb\fR [--execd] [--no-histogram] [--sum-histogram-buckets] +[--tags key=value] [pool] +\fBzpool_influxdb\fR --help +.fi +.SH DESCRIPTION +The \fBzpool_influxdb\fR command produces influxdb line protocol compatible +metrics from zpools. Like the \fBzpool\fR command, \fBzpool_influxdb\fR +reads the current pool status and statistics. Unlike the \fBzpool\fR +command which is intended for humans, \fBzpool_influxdb\fR formats the +output in influxdb line protocol. The expected use is as a plugin to a +metrics collector or aggregator, such as telegraf. + +By default, \fBzpool_influxdb\fR prints pool metrics and status in the +influxdb line protocol format. All pools are printed, similar to +the \fBzpool status\fR command. Providing a pool name restricts the +output to the named pool. + +Like the \fBzpool\fR command, \fBzpool_influxdb\fR uses internal data +structures that can change over time as new ZFS releases are made. +Therefore, the \fBzpool_influxdb\fR command must be compiled against the +ZFS source. It is expected that later releases of ZFS include compatible +\fBzpool_influxdb\fR and \fBzpool\fR commands. + +.SH OPTIONS +.TP +\fB\--execd\fR, \fB-e\fR +Run in daemon mode compatible with telegraf's \fBexecd\fR plugin. +In this mode, the pools are sampled every time there is a [return] on stdin. +Once a sample is printed, \fBzpool_influxdb\fR waits for another [return]. +When run on a terminal, use [ctrl+C] to exit. +.TP +\fB\--no-histogram\fR, \fB-n\fR +Do not print latency and I/O size histograms.
This can reduce the total +amount of data, but one should consider the value brought by the insights +that latency and I/O size distributions provide. The resulting values +are suitable for graphing with grafana's heatmap plugin. +.TP +\fB--sum-histogram-buckets\fR, \fB-s\fR +Accumulates bucket values. By default, the values are not accumulated and +the raw data appears as shown by \fBzpool iostat\fR. This works well for +grafana's heatmap plugin. Summing the buckets produces output similar to +prometheus histograms. +.TP +\fB--tags\fR, \fB-t\fR +Adds specified tags to the tag set. Tags are key=value pairs and multiple +tags are separated by commas. No sanity checking is performed. +See the InfluxDB Line Protocol format documentation for details on escaping +special characters used in tags. +.TP +\fB\--help\fR, \fB\-h\fR +Print a usage summary. + +.SH SEE ALSO +.LP +\fBzpool-status\fR(8) +\fBzpool-iostat\fR(8) +.PP +Influxdb https://github.com/influxdata/influxdb +.PP +Telegraf https://github.com/influxdata/telegraf +.PP +Grafana https://grafana.com +.PP +Prometheus https://prometheus.io diff --git a/sys/contrib/openzfs/man/man8/zpoolconcepts.8 b/sys/contrib/openzfs/man/man8/zpoolconcepts.8 index f9c262f4be42..d999b03547a9 100644 --- a/sys/contrib/openzfs/man/man8/zpoolconcepts.8 +++ b/sys/contrib/openzfs/man/man8/zpoolconcepts.8 @@ -64,7 +64,7 @@ A file must be specified by a full path. A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with N disks of size X can hold X bytes and can withstand (N-1) devices -failing before data integrity is compromised. +failing without losing data. .It Sy raidz , raidz1 , raidz2 , raidz3 A variation on RAID-5 that allows for better distribution of parity and eliminates the RAID-5 @@ -88,11 +88,75 @@ vdev type is an alias for .Sy raidz1 . 
.Pp A raidz group with N disks of size X with P parity disks can hold approximately -(N-P)*X bytes and can withstand P device(s) failing before data integrity is -compromised. +(N-P)*X bytes and can withstand P device(s) failing without losing data. The minimum number of devices in a raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. +.It Sy draid , draid1 , draid2 , draid3 +A variant of raidz that provides integrated distributed hot spares which +allows for faster resilvering while retaining the benefits of raidz. +A dRAID vdev is constructed from multiple internal raidz groups, each with D +data devices and P parity devices. +These groups are distributed over all of the children in order to fully +utilize the available disk performance. +.Pp +Unlike raidz, dRAID uses a fixed stripe width (padding as necessary with +zeros) to allow fully sequential resilvering. +This fixed stripe width significantly affects both usable capacity and IOPS. +For example, with the default D=8 and 4k disk sectors the minimum allocation +size is 32k. +If using compression, this relatively large allocation size can reduce the +effective compression ratio. +When using ZFS volumes and dRAID the default volblocksize property is increased +to account for the allocation size. +If a dRAID pool will hold a significant amount of small blocks, it is +recommended to also add a mirrored +.Sy special +vdev to store those blocks. +.Pp +With regard to IOPS, performance is similar to raidz since for any read all D +data disks must be accessed. +Delivered random IOPS can be reasonably approximated as +floor((N-S)/(D+P))*(single-drive IOPS). +.Pp +Like raidz a dRAID can have single-, double-, or triple-parity. The +.Sy draid1 , +.Sy draid2 , +and +.Sy draid3 +types can be used to specify the parity level. +The +.Sy draid +vdev type is an alias for +.Sy draid1 .
+.Pp +A dRAID with N disks of size X, D data disks per redundancy group, P parity +level, and S distributed hot spares can hold approximately (N-S)*(D/(D+P))*X +bytes and can withstand P device(s) failing without losing data. +.It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc +A non-default dRAID configuration can be specified by appending one or more +of the following optional arguments to the +.Sy draid +keyword. +.Pp +.Em parity +- The parity level (1-3). +.Pp +.Em data +- The number of data devices per redundancy group. +In general a smaller value of D will increase IOPS, improve the compression ratio, and speed up resilvering at the expense of total usable capacity. +Defaults to 8, unless N-P-S is less than 8. +.Pp +.Em children +- The expected number of children. +Useful as a cross-check when listing a large number of devices. +An error is returned when the provided number of children differs. +.Pp +.Em spares +- The number of distributed hot spares. +Defaults to zero. +.Pp +.Pp .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the @@ -273,6 +337,14 @@ If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools. .Pp +The +.Sy draid +vdev type provides distributed hot spares. +These hot spares are named after the dRAID vdev they're a part of ( +.Qq draid1-2-3 specifies spare 3 of vdev 2, which is a single parity dRAID +) and may only be used by that dRAID vdev. +Otherwise, they behave the same as normal hot spares. +.Pp Spares cannot replace log devices.
.Ss Intent Log The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous diff --git a/sys/contrib/openzfs/man/man8/zpoolprops.8 b/sys/contrib/openzfs/man/man8/zpoolprops.8 index 3437e48864ce..f78acaa46505 100644 --- a/sys/contrib/openzfs/man/man8/zpoolprops.8 +++ b/sys/contrib/openzfs/man/man8/zpoolprops.8 @@ -54,12 +54,15 @@ This property can also be referred to by its shortened column name, .It Sy expandsize Amount of uninitialized space within the pool or device that can be used to increase the total capacity of the pool. -Uninitialized space consists of any space on an EFI labeled vdev which has not -been brought online -.Po e.g, using -.Nm zpool Cm online Fl e -.Pc . -This space occurs when a LUN is dynamically expanded. +On whole-disk vdevs, this is the space beyond the end of the GPT – +typically occurring when a LUN is dynamically expanded +or a disk replaced with a larger one. +On partition vdevs, this is the space appended to the partition after it was +added to the pool – most likely by resizing it in-place. +The space can be claimed for the pool by bringing it online with +.Sy autoexpand=on +or using +.Nm zpool Cm online Fl e . .It Sy fragmentation The amount of fragmentation in the pool. 
As the amount of space .Sy allocated diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd index 4feb9e1eaf0c..e7cddcc5bb5e 100644 --- a/sys/contrib/openzfs/module/Makefile.bsd +++ b/sys/contrib/openzfs/module/Makefile.bsd @@ -24,7 +24,6 @@ KMOD= openzfs CFLAGS+= -I${.OBJDIR:H}/include CFLAGS+= -I${INCDIR} -CFLAGS+= -I${INCDIR}/spl CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl CFLAGS+= -I${INCDIR}/os/freebsd/zfs @@ -40,7 +39,13 @@ CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSS .endif .if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" -CFLAGS+= -DINVARIANTS -DWITNESS -g -O0 -DZFS_DEBUG -DOPENSOLARIS_WITNESS +CFLAGS+= -DZFS_DEBUG -g +.if defined(WITH_INVARIANTS) && ${WITH_INVARIANTS} == "true" + CFLAGS+= -DINVARIANTS -DWITNESS -DOPENSOLARIS_WITNESS +.endif +.if defined(WITH_O0) && ${WITH_O0} == "true" + CFLAGS+= -O0 +.endif .else CFLAGS += -DNDEBUG .endif @@ -102,9 +107,10 @@ SRCS+= nvpair.c \ #os/freebsd/spl SRCS+= acl_common.c \ - btree.c \ callb.c \ list.c \ + sha256c.c \ + sha512c.c \ spl_acl.c \ spl_cmn_err.c \ spl_dtrace.c \ @@ -112,6 +118,7 @@ SRCS+= acl_common.c \ spl_kstat.c \ spl_misc.c \ spl_policy.c \ + spl_procfs_list.c \ spl_string.c \ spl_sunddi.c \ spl_sysevent.c \ @@ -119,11 +126,8 @@ SRCS+= acl_common.c \ spl_uio.c \ spl_vfs.c \ spl_vm.c \ - spl_zone.c \ - sha256c.c \ - sha512c.c \ - spl_procfs_list.c \ - spl_zlib.c + spl_zlib.c \ + spl_zone.c .if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ @@ -133,6 +137,7 @@ SRCS+= spl_atomic.c #os/freebsd/zfs SRCS+= abd_os.c \ + arc_os.c \ crypto_os.c \ dmu_os.c \ hkdf.c \ @@ -140,17 +145,16 @@ SRCS+= abd_os.c \ spa_os.c \ sysctl_os.c \ vdev_file.c \ - vdev_label_os.c \ vdev_geom.c \ + vdev_label_os.c \ zfs_acl.c \ zfs_ctldir.c \ + zfs_debug.c \ zfs_dir.c \ zfs_ioctl_compat.c \ zfs_ioctl_os.c \ - zfs_log.c \ - zfs_replay.c \ zfs_vfsops.c \ - zfs_vnops.c \ + zfs_vnops_os.c \ zfs_znode.c \ zio_crypt.c 
\ zvol_os.c @@ -178,10 +182,10 @@ SRCS+= zfeature_common.c \ SRCS+= abd.c \ aggsum.c \ arc.c \ - arc_os.c \ blkptr.c \ bplist.c \ bpobj.c \ + btree.c \ cityhash.c \ dbuf.c \ dbuf_stats.c \ @@ -245,6 +249,8 @@ SRCS+= abd.c \ unique.c \ vdev.c \ vdev_cache.c \ + vdev_draid.c \ + vdev_draid_rand.c \ vdev_indirect.c \ vdev_indirect_births.c \ vdev_indirect_mapping.c \ @@ -276,16 +282,18 @@ SRCS+= abd.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ - zfs_debug.c \ zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ zfs_ioctl.c \ + zfs_log.c \ zfs_onexit.c \ zfs_quota.c \ zfs_ratelimit.c \ + zfs_replay.c \ zfs_rlock.c \ zfs_sa.c \ + zfs_vnops.c \ zil.c \ zio.c \ zio_checksum.c \ @@ -323,7 +331,7 @@ CFLAGS.spl_vm.c= -Wno-cast-qual CFLAGS.spl_zlib.c= -Wno-cast-qual CFLAGS.abd.c= -Wno-cast-qual CFLAGS.zfs_log.c= -Wno-cast-qual -CFLAGS.zfs_vnops.c= -Wno-pointer-arith +CFLAGS.zfs_vnops_os.c= -Wno-pointer-arith CFLAGS.u8_textprep.c= -Wno-cast-qual CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith @@ -341,6 +349,7 @@ CFLAGS.lz4.c= -Wno-cast-qual CFLAGS.spa.c= -Wno-cast-qual CFLAGS.spa_misc.c= -Wno-cast-qual CFLAGS.sysctl_os.c= -include ../zfs_config.h +CFLAGS.vdev_draid.c= -Wno-cast-qual CFLAGS.vdev_raidz.c= -Wno-cast-qual CFLAGS.vdev_raidz_math.c= -Wno-cast-qual CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c index 5553c55e11cd..23686c59e8ce 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c @@ -59,10 +59,12 @@ boolean_t gcm_avx_can_use_movbe = B_FALSE; static boolean_t gcm_use_avx = B_FALSE; #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) +extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); + static inline boolean_t gcm_avx_will_work(void); static inline void gcm_set_avx(boolean_t); static inline boolean_t 
gcm_toggle_avx(void); -extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); +static inline size_t gcm_simd_get_htab_size(boolean_t); static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, crypto_data_t *, size_t); @@ -629,6 +631,21 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, (volatile boolean_t *)&gcm_avx_can_use_movbe); } } + /* Allocate Htab memory as needed. */ + if (gcm_ctx->gcm_use_avx == B_TRUE) { + size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); + + if (htab_len == 0) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + gcm_ctx->gcm_htab_len = htab_len; + gcm_ctx->gcm_Htable = + (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag); + + if (gcm_ctx->gcm_Htable == NULL) { + return (CRYPTO_HOST_MEMORY); + } + } /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ @@ -689,6 +706,22 @@ gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, if (ks->ops->needs_byteswap == B_TRUE) { gcm_ctx->gcm_use_avx = B_FALSE; } + /* Allocate Htab memory as needed. */ + if (gcm_ctx->gcm_use_avx == B_TRUE) { + size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx); + + if (htab_len == 0) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + gcm_ctx->gcm_htab_len = htab_len; + gcm_ctx->gcm_Htable = + (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag); + + if (gcm_ctx->gcm_Htable == NULL) { + return (CRYPTO_HOST_MEMORY); + } + } + /* Avx and non avx context initialization differs from here on. */ if (gcm_ctx->gcm_use_avx == B_FALSE) { #endif /* ifdef CAN_USE_GCM_ASM */ @@ -1018,7 +1051,7 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); /* Clear the FPU registers since they hold sensitive internal state. 
*/ #define clear_fpu_regs() clear_fpu_regs_avx() #define GHASH_AVX(ctx, in, len) \ - gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t (*)[2])(ctx)->gcm_Htable, \ + gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \ in, len) #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) @@ -1036,8 +1069,8 @@ extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); extern void aes_encrypt_intel(const uint32_t rk[], int nr, const uint32_t pt[4], uint32_t ct[4]); -extern void gcm_init_htab_avx(uint64_t Htable[16][2], const uint64_t H[2]); -extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t Htable[16][2], +extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]); +extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable, const uint8_t *in, size_t len); extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, @@ -1073,6 +1106,18 @@ gcm_toggle_avx(void) } } +static inline size_t +gcm_simd_get_htab_size(boolean_t simd_mode) +{ + switch (simd_mode) { + case B_TRUE: + return (2 * 6 * 2 * sizeof (uint64_t)); + + default: + return (0); + } +} + /* * Clear sensitive data in the context. 
* @@ -1088,7 +1133,6 @@ gcm_clear_ctx(gcm_ctx_t *ctx) { bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder)); bzero(ctx->gcm_H, sizeof (ctx->gcm_H)); - bzero(ctx->gcm_Htable, sizeof (ctx->gcm_Htable)); bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0)); bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp)); } diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c index f07876a478e2..faae9722bd04 100644 --- a/sys/contrib/openzfs/module/icp/algs/modes/modes.c +++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c @@ -152,6 +152,14 @@ crypto_free_mode_ctx(void *ctx) vmem_free(((gcm_ctx_t *)ctx)->gcm_pt_buf, ((gcm_ctx_t *)ctx)->gcm_pt_buf_len); +#ifdef CAN_USE_GCM_ASM + if (((gcm_ctx_t *)ctx)->gcm_Htable != NULL) { + gcm_ctx_t *gcm_ctx = (gcm_ctx_t *)ctx; + bzero(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len); + kmem_free(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len); + } +#endif + kmem_free(ctx, sizeof (gcm_ctx_t)); } } diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S index ed9f660fce5b..dc71ae2c1c89 100644 --- a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S +++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -55,6 +55,7 @@ .type _aesni_ctr32_ghash_6x,@function .align 32 _aesni_ctr32_ghash_6x: +.cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 @@ -363,12 +364,14 @@ _aesni_ctr32_ghash_6x: vpxor %xmm4,%xmm8,%xmm8 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x #endif /* ifdef HAVE_MOVBE */ .type _aesni_ctr32_ghash_no_movbe_6x,@function .align 32 _aesni_ctr32_ghash_no_movbe_6x: +.cfi_startproc vmovdqu 32(%r11),%xmm2 subq $6,%rdx vpxor %xmm4,%xmm4,%xmm4 @@ -689,6 +692,7 @@ _aesni_ctr32_ghash_no_movbe_6x: vpxor %xmm4,%xmm8,%xmm8 .byte 0xf3,0xc3 +.cfi_endproc .size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x .globl 
aesni_gcm_decrypt @@ -714,6 +718,8 @@ aesni_gcm_decrypt: .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 + pushq %r9 +.cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 @@ -726,7 +732,8 @@ aesni_gcm_decrypt: andq $-128,%rsp vmovdqu (%r11),%xmm0 leaq 128(%rcx),%rcx - leaq 32+32(%r9),%r9 + movq 32(%r9),%r9 + leaq 32(%r9),%r9 movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. vpshufb %xmm0,%xmm8,%xmm8 @@ -782,7 +789,9 @@ aesni_gcm_decrypt: vmovups %xmm14,-16(%rsi) vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) + movq -56(%rax),%r9 +.cfi_restore %r9 + vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 @@ -807,6 +816,7 @@ aesni_gcm_decrypt: .type _aesni_ctr32_6x,@function .align 32 _aesni_ctr32_6x: +.cfi_startproc vmovdqu 0-128(%rcx),%xmm4 vmovdqu 32(%r11),%xmm2 leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. @@ -893,6 +903,7 @@ _aesni_ctr32_6x: vpshufb %xmm0,%xmm1,%xmm1 vpxor %xmm4,%xmm14,%xmm14 jmp .Loop_ctr32 +.cfi_endproc .size _aesni_ctr32_6x,.-_aesni_ctr32_6x .globl aesni_gcm_encrypt @@ -918,6 +929,8 @@ aesni_gcm_encrypt: .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 + pushq %r9 +.cfi_offset %r9,-64 vzeroupper vmovdqu (%r8),%xmm1 @@ -960,7 +973,8 @@ aesni_gcm_encrypt: call _aesni_ctr32_6x vmovdqu (%r9),%xmm8 - leaq 32+32(%r9),%r9 + movq 32(%r9),%r9 + leaq 32(%r9),%r9 subq $12,%rdx movq $192,%r10 vpshufb %xmm0,%xmm8,%xmm8 @@ -1151,7 +1165,9 @@ aesni_gcm_encrypt: vpxor %xmm7,%xmm2,%xmm2 vpxor %xmm2,%xmm8,%xmm8 vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) + movq -56(%rax),%r9 +.cfi_restore %r9 + vmovdqu %xmm8,(%r9) vzeroupper movq -48(%rax),%r15 diff --git a/sys/contrib/openzfs/module/icp/core/kcf_sched.c b/sys/contrib/openzfs/module/icp/core/kcf_sched.c index 40d50553d67e..81fd15f8ea26 100644 --- a/sys/contrib/openzfs/module/icp/core/kcf_sched.c +++ b/sys/contrib/openzfs/module/icp/core/kcf_sched.c @@ -1308,9 +1308,7 @@ kcf_reqid_insert(kcf_areq_node_t *areq) kcf_areq_node_t *headp; kcf_reqid_table_t *rt; - 
kpreempt_disable(); - rt = kcf_reqid_table[CPU_SEQID & REQID_TABLE_MASK]; - kpreempt_enable(); + rt = kcf_reqid_table[CPU_SEQID_UNSTABLE & REQID_TABLE_MASK]; mutex_enter(&rt->rt_lock); diff --git a/sys/contrib/openzfs/module/icp/include/modes/modes.h b/sys/contrib/openzfs/module/icp/include/modes/modes.h index 57a211ccf1bf..ab71197542eb 100644 --- a/sys/contrib/openzfs/module/icp/include/modes/modes.h +++ b/sys/contrib/openzfs/module/icp/include/modes/modes.h @@ -219,14 +219,14 @@ typedef struct gcm_ctx { size_t gcm_pt_buf_len; uint32_t gcm_tmp[4]; /* - * The relative positions of gcm_ghash, gcm_H and pre-computed - * gcm_Htable are hard coded in aesni-gcm-x86_64.S and ghash-x86_64.S, - * so please don't change (or adjust accordingly). + * The offset of gcm_Htable relative to gcm_ghash, (32), is hard coded + * in aesni-gcm-x86_64.S, so please don't change (or adjust there). */ uint64_t gcm_ghash[2]; uint64_t gcm_H[2]; #ifdef CAN_USE_GCM_ASM - uint64_t gcm_Htable[12][2]; + uint64_t *gcm_Htable; + size_t gcm_htab_len; #endif uint64_t gcm_J0[2]; uint64_t gcm_len_a_len_c[2]; diff --git a/sys/contrib/openzfs/module/icp/io/aes.c b/sys/contrib/openzfs/module/icp/io/aes.c index 96fb6bb1af30..e540af4473f7 100644 --- a/sys/contrib/openzfs/module/icp/io/aes.c +++ b/sys/contrib/openzfs/module/icp/io/aes.c @@ -1051,6 +1051,16 @@ aes_encrypt_atomic(crypto_provider_handle_t provider, bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); } +#ifdef CAN_USE_GCM_ASM + if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE) && + ((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) { + + gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx; + + bzero(ctx->gcm_Htable, ctx->gcm_htab_len); + kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); + } +#endif return (ret); } @@ -1209,6 +1219,14 @@ aes_decrypt_atomic(crypto_provider_handle_t provider, vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf, ((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len); } +#ifdef CAN_USE_GCM_ASM + if 
(((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) { + gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx; + + bzero(ctx->gcm_Htable, ctx->gcm_htab_len); + kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len); + } +#endif } return (ret); diff --git a/sys/contrib/openzfs/module/lua/lapi.c b/sys/contrib/openzfs/module/lua/lapi.c index 8f072531fde5..6a845c461052 100644 --- a/sys/contrib/openzfs/module/lua/lapi.c +++ b/sys/contrib/openzfs/module/lua/lapi.c @@ -1300,7 +1300,7 @@ module_exit(lua_fini); ZFS_MODULE_DESCRIPTION("Lua Interpreter for ZFS"); ZFS_MODULE_AUTHOR("Lua.org"); -ZFS_MODULE_LICENSE("MIT"); +ZFS_MODULE_LICENSE("Dual MIT/GPL"); ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); EXPORT_SYMBOL(lua_absindex); diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c index 5cd5c69efa71..5ecd3d310361 100644 --- a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c +++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c @@ -37,6 +37,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include int @@ -312,11 +313,11 @@ secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid) } int -secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr, +secpolicy_vnode_setid_retain(znode_t *zp, cred_t *cr, boolean_t issuidroot __unused) { - if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + if (secpolicy_fs_owner(ZTOV(zp)->v_mount, cr) == 0) return (0); return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c index a7bda509bf54..0a323e8856a3 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c @@ -106,13 +106,13 @@ abd_free_chunk(void *c) kmem_cache_free(abd_chunk_cache, c); } -static size_t +static uint_t abd_chunkcnt_for_bytes(size_t size) { return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); } -static inline 
size_t +static inline uint_t abd_scatter_chunkcnt(abd_t *abd) { ASSERT(!abd_is_linear(abd)); @@ -129,7 +129,7 @@ abd_size_alloc_linear(size_t size) void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { - size_t n = abd_scatter_chunkcnt(abd); + uint_t n = abd_scatter_chunkcnt(abd); ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); int waste = n * zfs_abd_chunk_size - abd->abd_size; if (op == ABDSTAT_INCR) { @@ -161,25 +161,28 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) void abd_verify_scatter(abd_t *abd) { + uint_t i, n; + /* * There is no scatter linear pages in FreeBSD so there is an * if an error if the ABD has been marked as a linear page. */ - VERIFY(!abd_is_linear_page(abd)); + ASSERT(!abd_is_linear_page(abd)); ASSERT3U(ABD_SCATTER(abd).abd_offset, <, zfs_abd_chunk_size); - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - ASSERT3P( - ABD_SCATTER(abd).abd_chunks[i], !=, NULL); + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { + ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL); } } void abd_alloc_chunks(abd_t *abd, size_t size) { - size_t n = abd_chunkcnt_for_bytes(size); - for (int i = 0; i < n; i++) { + uint_t i, n; + + n = abd_chunkcnt_for_bytes(size); + for (i = 0; i < n; i++) { void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); ASSERT3P(c, !=, NULL); ABD_SCATTER(abd).abd_chunks[i] = c; @@ -190,8 +193,10 @@ abd_alloc_chunks(abd_t *abd, size_t size) void abd_free_chunks(abd_t *abd) { - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { + uint_t i, n; + + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); } } @@ -199,7 +204,7 @@ abd_free_chunks(abd_t *abd) abd_t * abd_alloc_struct(size_t size) { - size_t chunkcnt = abd_chunkcnt_for_bytes(size); + uint_t chunkcnt = abd_chunkcnt_for_bytes(size); /* * In the event we are allocating a gang ABD, the size passed in * will be 0. 
We must make sure to set abd_size to the size of an @@ -221,9 +226,9 @@ abd_alloc_struct(size_t size) void abd_free_struct(abd_t *abd) { - size_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 : + uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 : abd_scatter_chunkcnt(abd); - int size = MAX(sizeof (abd_t), + ssize_t size = MAX(sizeof (abd_t), offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); mutex_destroy(&abd->abd_mtx); ASSERT(!list_link_active(&abd->abd_gang_link)); @@ -238,7 +243,9 @@ abd_free_struct(abd_t *abd) static void abd_alloc_zero_scatter(void) { - size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + uint_t i, n; + + n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP); abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); @@ -251,7 +258,7 @@ abd_alloc_zero_scatter(void) ABD_SCATTER(abd_zero_scatter).abd_chunk_size = zfs_abd_chunk_size; - for (int i = 0; i < n; i++) { + for (i = 0; i < n; i++) { ABD_SCATTER(abd_zero_scatter).abd_chunks[i] = abd_zero_buf; } @@ -356,7 +363,7 @@ abd_get_offset_scatter(abd_t *sabd, size_t off) ASSERT3U(off, <=, sabd->abd_size); size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; - size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + uint_t chunkcnt = abd_scatter_chunkcnt(sabd) - (new_offset / zfs_abd_chunk_size); abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c index 94df750035a4..4fc7468bfa47 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c @@ -243,3 +243,13 @@ arc_lowmem_fini(void) if (arc_event_lowmem != NULL) EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem); } + +void +arc_register_hotplug(void) +{ +} + +void +arc_unregister_hotplug(void) +{ +} diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c 
b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index 1b37ce0d7f6b..647c1463ba14 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -114,6 +114,7 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); @@ -228,15 +229,14 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, static int sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { - uint32_t val; - int err; + int err, val; val = arc_no_grow_shift; - err = sysctl_handle_32(oidp, &val, 0, req); + err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); - if (val >= arc_shrink_shift) + if (val < 0 || val >= arc_shrink_shift) return (EINVAL); arc_no_grow_shift = val; @@ -244,8 +244,8 @@ sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) } SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, - CTLTYPE_U32 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, sizeof (uint32_t), - sysctl_vfs_zfs_arc_no_grow_shift, "U", + CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, sizeof (int), + sysctl_vfs_zfs_arc_no_grow_shift, "I", "log2(fraction of ARC which must be free to allow growing)"); int diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c index cf762c5fd61c..825bd706e0c0 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c @@ -292,19 +292,28 @@ 
vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; /* @@ -313,19 +322,28 @@ vdev_ops_t vdev_file_ops = { #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + 
.vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c index f042eff7cd2e..c9e8e21982cf 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c @@ -1141,7 +1141,6 @@ vdev_geom_io_start(zio_t *zio) break; case ZIO_TYPE_IOCTL: bp->bio_cmd = BIO_FLUSH; - bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = cp->provider->mediasize; bp->bio_length = 0; @@ -1190,17 +1189,26 @@ vdev_geom_rele(vdev_t *vd) } vdev_ops_t vdev_disk_ops = { - vdev_geom_open, - vdev_geom_close, - vdev_default_asize, - vdev_geom_io_start, - vdev_geom_io_done, - NULL, - NULL, - vdev_geom_hold, - vdev_geom_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_init = NULL, + .vdev_op_fini = NULL, + .vdev_op_open = vdev_geom_open, + .vdev_op_close = vdev_geom_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, + .vdev_op_io_start = vdev_geom_io_start, + .vdev_op_io_done = vdev_geom_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_geom_hold, + .vdev_op_rele = vdev_geom_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c index 
d7786d5136a2..8fb259f4ba76 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c @@ -158,7 +158,8 @@ zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp, rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td); if (rc) return (SET_ERROR(rc)); - *resid = auio.uio_resid; + if (resid) + *resid = auio.uio_resid; *offp += count - auio.uio_resid; return (SET_ERROR(0)); } @@ -296,7 +297,8 @@ zfs_file_unlink(const char *fnamep) rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0); #else #ifdef AT_BENEATH - rc = kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0, 0); + rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), + seg, 0, 0); #else rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), seg, 0); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c index 8b22f2fdc3b3..e69de29bb2d1 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_onexit_os.c @@ -1,70 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include -#include - -static int -zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) -{ - *zo = zfsdev_get_state(minor, ZST_ONEXIT); - if (*zo == NULL) - return (SET_ERROR(EBADF)); - - return (0); -} - -int -zfs_onexit_fd_hold(int fd, minor_t *minorp) -{ - file_t *fp, *tmpfp; - zfs_onexit_t *zo; - void *data; - int error; - - if ((error = zfs_file_get(fd, &fp))) - return (error); - - tmpfp = curthread->td_fpop; - curthread->td_fpop = fp; - error = devfs_get_cdevpriv(&data); - if (error == 0) - *minorp = (minor_t)(uintptr_t)data; - curthread->td_fpop = tmpfp; - if (error != 0) - return (SET_ERROR(EBADF)); - return (zfs_onexit_minor_to_state(*minorp, &zo)); -} - -void -zfs_onexit_fd_rele(int fd) -{ - zfs_file_put(fd); -} diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c index 54ebfa7532dd..7bc6b83d0272 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -433,7 +434,7 @@ zfs_sync(vfs_t *vfsp, int waitfor) } else { /* * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the + * run sync(8). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. 
*/ spa_sync_allpools(); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c index 3c3285f93389..2e8eadb5e16e 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c @@ -29,6 +29,7 @@ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ + #include #include #include @@ -270,69 +271,13 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) return (0); } -/* - * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and - * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. - */ -static int -zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off) -{ - znode_t *zp = VTOZ(vp); - uint64_t noff = (uint64_t)*off; /* new offset */ - uint64_t file_sz; - int error; - boolean_t hole; - - file_sz = zp->z_size; - if (noff >= file_sz) { - return (SET_ERROR(ENXIO)); - } - - if (cmd == _FIO_SEEK_HOLE) - hole = B_TRUE; - else - hole = B_FALSE; - - error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); - - if (error == ESRCH) - return (SET_ERROR(ENXIO)); - - /* file was dirty, so fall back to using generic logic */ - if (error == EBUSY) { - if (hole) - *off = file_sz; - - return (0); - } - - /* - * We could find a hole that begins after the logical end-of-file, - * because dmu_offset_next() only works on whole blocks. If the - * EOF falls mid-block, then indicate that the "virtual hole" - * at the end of the file begins at the logical EOF, rather than - * at the end of the last block. 
- */ - if (noff > file_sz) { - ASSERT(hole); - noff = file_sz; - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, int *rvalp) { - offset_t off; + loff_t off; int error; - zfsvfs_t *zfsvfs; - znode_t *zp; switch (com) { case _FIOFFS: @@ -350,18 +295,12 @@ zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, return (0); } - case _FIO_SEEK_DATA: - case _FIO_SEEK_HOLE: + case F_SEEK_DATA: + case F_SEEK_HOLE: { off = *(offset_t *)data; - zp = VTOZ(vp); - zfsvfs = zp->z_zfsvfs; - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - /* offset parameter is in/out */ - error = zfs_holey(vp, com, &off); - ZFS_EXIT(zfsvfs); + error = zfs_holey(VTOZ(vp), com, &off); if (error) return (error); *(offset_t *)data = off; @@ -525,16 +464,15 @@ page_unhold(vm_page_t pp) * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ -static void -update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, - int segflg, dmu_tx_t *tx) +void +update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { vm_object_t obj; struct sf_buf *sf; + vnode_t *vp = ZTOV(zp); caddr_t va; int off; - ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); @@ -552,8 +490,8 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); - (void) dmu_read(os, oid, start+off, nbytes, - va+off, DMU_READ_PREFETCH); + (void) dmu_read(os, zp->z_id, start + off, nbytes, + va + off, DMU_READ_PREFETCH); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); @@ -579,10 +517,10 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, * map them into contiguous KVA region and populate them * in one single dmu_read() call. 
*/ -static int -mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) +int +mappedread_sf(znode_t *zp, int nbytes, uio_t *uio) { - znode_t *zp = VTOZ(vp); + vnode_t *vp = ZTOV(zp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; @@ -664,10 +602,10 @@ mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ -static int -mappedread(vnode_t *vp, int nbytes, uio_t *uio) +int +mappedread(znode_t *zp, int nbytes, uio_t *uio) { - znode_t *zp = VTOZ(vp); + vnode_t *vp = ZTOV(zp); vm_object_t obj; int64_t start; int len = nbytes; @@ -710,523 +648,6 @@ mappedread(vnode_t *vp, int nbytes, uio_t *uio) return (error); } -offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ - -/* - * Read bytes from specified file into supplied buffer. - * - * IN: vp - vnode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - SYNC flags; used to provide FRSYNC semantics. - * cr - credentials of caller. - * ct - caller context - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 on success, error code on failure. - * - * Side Effects: - * vp - atime updated if byte count > 0 - */ -/* ARGSUSED */ -static int -zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - ssize_t n, nbytes, start_resid; - int error = 0; - int64_t nread; - zfs_locked_range_t *lr; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - /* We don't copy out anything useful for directories. 
*/ - if (vp->v_type == VDIR) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EISDIR)); - } - - if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* - * If we're in FRSYNC mode, sync out this znode before reading it. - */ - if (zfsvfs->z_log && - (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) - zil_commit(zfsvfs->z_log, zp->z_id); - - /* - * Lock the range against changes. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset, - uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. - */ - if (uio->uio_loffset >= zp->z_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_size); - n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); - start_resid = n; - - while (n > 0) { - nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - - if (uio->uio_segflg == UIO_NOCOPY) - error = mappedread_sf(vp, nbytes, uio); - else if (vn_has_cached_data(vp)) { - error = mappedread(vp, nbytes, uio); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes); - } - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - - n -= nbytes; - } - - nread = start_resid - n; - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); - -out: - zfs_rangelock_exit(lr); - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Write the bytes to a file. - * - * IN: vp - vnode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - FAPPEND, FSYNC, and/or FDSYNC. 
FAPPEND is - * set if in append mode. - * cr - credentials of caller. - * ct - caller context (NFS/CIFS fem monitor only) - * - * OUT: uio - updated offset and range. - * - * RETURN: 0 on success, error code on failure. - * - * Timestamps: - * vp - ctime|mtime updated if byte count > 0 - */ - -/* ARGSUSED */ -static int -zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) -{ - znode_t *zp = VTOZ(vp); - rlim64_t limit = MAXOFFSET_T; - ssize_t start_resid = uio->uio_resid; - ssize_t tx_bytes; - uint64_t end_size; - dmu_buf_impl_t *db; - dmu_tx_t *tx; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog; - offset_t woff; - ssize_t n, nbytes; - zfs_locked_range_t *lr; - int max_blksz = zfsvfs->z_max_blksz; - int error = 0; - arc_buf_t *abuf; - iovec_t *aiov = NULL; - xuio_t *xuio = NULL; - int i_iov = 0; - int iovcnt __unused = uio->uio_iovcnt; - iovec_t *iovp = uio->uio_iov; - int write_eof; - int count = 0; - sa_bulk_attr_t bulk[4]; - uint64_t mtime[2], ctime[2]; - uint64_t uid, gid, projid; - int64_t nwritten; - - /* - * Fasttrack empty write - */ - n = start_resid; - if (n == 0) - return (0); - - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * If immutable or not appending then return EPERM. - * Intentionally allow ZFS_READONLY through here. 
- * See zfs_zaccess_common() - */ - if ((zp->z_pflags & ZFS_IMMUTABLE) || - ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && - (uio->uio_loffset < zp->z_size))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - zilog = zfsvfs->z_log; - - /* - * Validate file offset - */ - woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * If in append mode, set the io offset pointer to eof. - */ - if (ioflag & FAPPEND) { - /* - * Obtain an appending range lock to guarantee file append - * semantics. We reset the write offset once we have the lock. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); - woff = lr->lr_offset; - if (lr->lr_length == UINT64_MAX) { - /* - * We overlocked the file because this write will cause - * the file block size to increase. - * Note that zp_size cannot change with this lock held. - */ - woff = zp->z_size; - } - uio->uio_loffset = woff; - } else { - /* - * Note that if the file block size will change as a result of - * this write, then this range lock will lock the entire file - * so that we can re-write the block safely. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); - } - - if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (EFBIG); - } - - if (woff >= limit) { - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFBIG)); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* Will this write extend the file length? */ - write_eof = (woff + n > zp->z_size); - - end_size = MAX(zp->z_size, woff + n); - - uid = zp->z_uid; - gid = zp->z_gid; - projid = zp->z_projid; - - /* - * Write the file in reasonable size chunks. Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. 
- */ - while (n > 0) { - woff = uio->uio_loffset; - - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || - (projid != ZFS_DEFAULT_PROJID && - zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - projid))) { - error = SET_ERROR(EDQUOT); - break; - } - - abuf = NULL; - if (xuio) { - ASSERT(i_iov < iovcnt); - aiov = &iovp[i_iov]; - abuf = dmu_xuio_arcbuf(xuio, i_iov); - dmu_xuio_clear(xuio, i_iov); - DTRACE_PROBE3(zfs_cp_write, int, i_iov, - iovec_t *, aiov, arc_buf_t *, abuf); - ASSERT((aiov->iov_base == abuf->b_data) || - ((char *)aiov->iov_base - (char *)abuf->b_data + - aiov->iov_len == arc_buf_size(abuf))); - i_iov++; - } else if (n >= max_blksz && - woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { - /* - * This write covers a full block. "Borrow" a buffer - * from the dmu so that we can fill it before we enter - * a transaction. This avoids the possibility of - * holding up the transaction if the data copy hangs - * up on a pagefault (e.g., from an NFS server mapping). - */ - size_t cbytes; - - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); - ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { - dmu_return_arcbuf(abuf); - break; - } - ASSERT(cbytes == max_blksz); - } - - /* - * Start a transaction. - */ - tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); - DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); - DB_DNODE_EXIT(db); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - if (abuf != NULL) - dmu_return_arcbuf(abuf); - break; - } - - /* - * If zfs_range_lock() over-locked we grow the blocksize - * and then reduce the lock range. 
This will only happen - * on the first iteration since zfs_range_reduce() will - * shrink down r_len to the appropriate size. - */ - if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - zfs_rangelock_reduce(lr, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - - if (woff + nbytes > zp->z_size) - vnode_pager_setsize(vp, woff + nbytes); - - if (abuf == NULL) { - tx_bytes = uio->uio_resid; - error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); - tx_bytes -= uio->uio_resid; - } else { - tx_bytes = nbytes; - ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); - /* - * If this is not a full block write, but we are - * extending the file past EOF and this data starts - * block-aligned, use assign_arcbuf(). Otherwise, - * write via dmu_write(). - */ - if (tx_bytes < max_blksz && (!write_eof || - aiov->iov_base != abuf->b_data)) { - ASSERT(xuio); - dmu_write(zfsvfs->z_os, zp->z_id, woff, - aiov->iov_len, aiov->iov_base, tx); - dmu_return_arcbuf(abuf); - xuio_stat_wbuf_copied(); - } else { - ASSERT(xuio || tx_bytes == max_blksz); - dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, - abuf, tx); - } - ASSERT(tx_bytes <= uio->uio_resid); - uioskip(uio, tx_bytes); - } - if (tx_bytes && vn_has_cached_data(vp)) { - update_pages(vp, woff, tx_bytes, zfsvfs->z_os, - zp->z_id, uio->uio_segflg, tx); - } - - /* - * If we made no progress, we're done. If we made even - * partial progress, update the znode and ZIL accordingly. 
- */ - if (tx_bytes == 0) { - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - (void *)&zp->z_size, sizeof (uint64_t), tx); - dmu_tx_commit(tx); - ASSERT(error != 0); - break; - } - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the execute bits is set. - * - * It would be nice to to this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. - */ - mutex_enter(&zp->z_acl_lock); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(vp, cr, - (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); - - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); - - /* - * Update the file size (zp_size) if it has changed; - * account for possible concurrent updates. - */ - while ((end_size = zp->z_size) < uio->uio_loffset) { - (void) atomic_cas_64(&zp->z_size, end_size, - uio->uio_loffset); - ASSERT(error == 0 || error == EFAULT); - } - /* - * If we are replaying and eof is non zero then force - * the file size to the specified eof. Note, there's no - * concurrency during replay. 
- */ - if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) - zp->z_size = zfsvfs->z_replay_eof; - - if (error == 0) - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - else - (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, - ioflag, NULL, NULL); - dmu_tx_commit(tx); - - if (error != 0) - break; - ASSERT(tx_bytes == nbytes); - n -= nbytes; - - } - - zfs_rangelock_exit(lr); - - /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. - */ - if (zfsvfs->z_replay || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - - /* - * EFAULT means that at least one page of the source buffer was not - * available. VFS will re-try remaining I/O upon this error. - */ - if (error == EFAULT) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (ioflag & (FSYNC | FDSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, zp->z_id); - - nwritten = start_resid - uio->uio_resid; - dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); - - ZFS_EXIT(zfsvfs); - return (0); -} - int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *presid) @@ -1249,184 +670,13 @@ zfs_write_simple(znode_t *zp, const void *data, size_t len, return (error); } -static void -zfs_get_done(zgd_t *zgd, int error) +void +zfs_zrele_async(znode_t *zp) { - znode_t *zp = zgd->zgd_private; - objset_t *os = zp->z_zfsvfs->z_os; + vnode_t *vp = ZTOV(zp); + objset_t *os = ITOZSB(vp)->z_os; - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - zfs_rangelock_exit(zgd->zgd_lr); - - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. 
- */ - VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os))); - - kmem_free(zgd, sizeof (zgd_t)); -} - -#ifdef ZFS_DEBUG -static int zil_fault_io = 0; -#endif - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t object = lr->lr_foid; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; - dmu_buf_t *db; - zgd_t *zgd; - int error = 0; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, object, &zp) != 0) - return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - VN_RELE_ASYNC(ZTOV(zp), - dsl_pool_zrele_taskq(dmu_objset_pool(os))); - return (SET_ERROR(ENOENT)); - } - - zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - zgd->zgd_private = zp; - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. - */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, - size, RL_READER); - /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_size) { - error = SET_ERROR(ENOENT); - } else { - error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); - } - ASSERT(error == 0 || error == ENOENT); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's - * written out and its checksum is being calculated - * that no one can change the data. 
We need to re-check - * blocksize after we get the lock in case it's changed! - */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - zfs_rangelock_exit(zgd->zgd_lr); - } - /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_size) - error = SET_ERROR(ENOENT); -#ifdef ZFS_DEBUG - if (zil_fault_io) { - error = SET_ERROR(EIO); - zil_fault_io = 0; - } -#endif - if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); - - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. We zero out the BP because - * it is the old, currently-on-disk BP, - * so there's no need to zio_flush() its - * vdevs (flushing would needlesly hurt - * performance, and doesn't work on - * indirect vdevs). 
- */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } - } - } - - zfs_get_done(zgd, error); - - return (error); -} - -/*ARGSUSED*/ -static int -zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); + VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os))); } static int @@ -2708,27 +1958,6 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, return (error); } -ulong_t zfs_fsync_sync_cnt = 4; - -static int -zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - } - tsd_set(zfs_fsyncer_key, NULL); - return (0); -} - - /* * Get the requested file attributes and place them in the provided * vattr structure. @@ -3905,7 +3134,7 @@ zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) return (error); } -#if __FreeBSD_version < 1300110 +#if __FreeBSD_version < 1300124 static void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) @@ -4793,45 +4022,6 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, } } -/*ARGSUSED*/ -static int -zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, - caller_context_t *ct) -{ - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) @@ -5225,7 +4415,7 @@ static int zfs_freebsd_read(struct vop_read_args *ap) { - return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + return (zfs_read(VTOZ(ap->a_vp), ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred)); } @@ -5242,7 +4432,7 @@ static int zfs_freebsd_write(struct vop_write_args *ap) { - return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + return (zfs_write(VTOZ(ap->a_vp), ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred)); } @@ -5301,7 +4491,7 @@ zfs_freebsd_access(struct vop_access_args *ap) */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) - error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); + error = zfs_access(zp, accmode, 0, ap->a_cred); /* * VADMIN has to be handled by vaccess(). 
@@ -5512,7 +4702,7 @@ zfs_freebsd_fsync(struct vop_fsync_args *ap) { vop_stdfsync(ap); - return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); + return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred)); } #ifndef _SYS_SYSPROTO_H_ @@ -5825,7 +5015,11 @@ zfs_freebsd_inactive(struct vop_inactive_args *ap) { vnode_t *vp = ap->a_vp; +#if __FreeBSD_version >= 1300123 zfs_inactive(vp, curthread->td_ucred, NULL); +#else + zfs_inactive(vp, ap->a_td->td_ucred, NULL); +#endif return (0); } @@ -6377,7 +5571,8 @@ zfs_freebsd_getacl(struct vop_getacl_args *ap) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; - if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))) + if ((error = zfs_getsecattr(VTOZ(ap->a_vp), + &vsecattr, 0, ap->a_cred))) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, @@ -6510,7 +5705,13 @@ zfs_vptocnp(struct vop_vptocnp_args *ap) error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); #endif if (error == 0) { - error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, ap->a_buflen); +#if __FreeBSD_version >= 1300123 + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, + ap->a_buflen); +#else + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, + ap->a_buf, ap->a_buflen); +#endif vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c index 40baa0b80928..6a21623c5f67 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c @@ -149,7 +149,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_vnode = NULL; - zp->z_moved = 0; return (0); } @@ -278,7 +277,6 @@ zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) sharezp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); - sharezp->z_moved = 0; sharezp->z_unlinked = 0; sharezp->z_atime_dirty = 0; 
sharezp->z_zfsvfs = zfsvfs; @@ -437,7 +435,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, vp->v_data = zp; ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); - zp->z_moved = 0; zp->z_sa_hdl = NULL; zp->z_unlinked = 0; @@ -1692,7 +1689,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) rootzp = zfs_znode_alloc_kmem(KM_SLEEP); ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); - rootzp->z_moved = 0; rootzp->z_unlinked = 0; rootzp->z_atime_dirty = 0; rootzp->z_is_sa = USE_SA(version, os); @@ -2015,6 +2011,20 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } + +void +zfs_inode_update(znode_t *zp) +{ + vm_object_t object; + + if ((object = ZTOV(zp)->v_object) == NULL || + zp->z_size == object->un_pager.vnp.vnp_size) + return; + + vnode_pager_setsize(ZTOV(zp), zp->z_size); +} + + #ifdef _KERNEL int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index fb88bc325d3c..fd2beee7bdd2 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -1070,6 +1070,16 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + /* + * This is necessary here as we check next whether + * OBJSET_FLAG_USERACCOUNTING_COMPLETE or + * OBJSET_FLAG_USEROBJACCOUNTING are set in order to + * decide if the local_mac should be zeroed out. + */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. 
@@ -1081,7 +1091,10 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || - (datalen <= OBJSET_PHYS_SIZE_V1)) { + (datalen <= OBJSET_PHYS_SIZE_V1) || + (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 || + (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) && + key->zk_version > 0)) { bzero(local_mac, ZIO_OBJSET_MAC_LEN); return (0); } diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c index 092eb34eaa47..6c44e3681709 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c @@ -116,7 +116,6 @@ enum zvol_geom_state { }; struct zvol_state_os { - int zso_volmode; #define zso_dev _zso_state._zso_dev #define zso_geom _zso_state._zso_geom union { @@ -134,6 +133,7 @@ struct zvol_state_os { enum zvol_geom_state zsg_state; } _zso_geom; } _zso_state; + int zso_dying; }; static uint32_t zvol_minors; @@ -209,7 +209,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; int err = 0; - boolean_t drop_suspend = B_TRUE; + boolean_t drop_suspend = B_FALSE; boolean_t drop_namespace = B_FALSE; if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { @@ -228,16 +228,15 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; if (zv == NULL) { - if (drop_namespace) - mutex_exit(&spa_namespace_lock); rw_exit(&zvol_state_lock); - return (SET_ERROR(ENXIO)); + err = SET_ERROR(ENXIO); + goto out_locked; } if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { /* * We need to guarantee that the namespace lock is held - * to avoid spurious failures in zvol_first_open + * to avoid spurious failures in zvol_first_open. 
*/ drop_namespace = B_TRUE; if (!mutex_tryenter(&spa_namespace_lock)) { @@ -247,8 +246,12 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) } } mutex_enter(&zv->zv_state_lock); - - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + if (zv->zv_zso->zso_dying) { + rw_exit(&zvol_state_lock); + err = SET_ERROR(ENXIO); + goto out_zv_locked; + } + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* * make sure zvol is not suspended during first open @@ -256,6 +259,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { + drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); @@ -266,8 +270,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) drop_suspend = B_FALSE; } } - } else { - drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); @@ -277,7 +279,7 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); err = zvol_first_open(zv, !(flag & FWRITE)); if (err) - goto out_mutex; + goto out_zv_locked; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; @@ -289,41 +291,37 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) */ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || dmu_objset_incompatible_encryption_version(zv->zv_objset))) { - err = EROFS; - goto out_open_count; + err = SET_ERROR(EROFS); + goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { - err = EBUSY; - goto out_open_count; + err = SET_ERROR(EBUSY); + goto out_opened; } #ifdef FEXCL if (flag & FEXCL) { if (zv->zv_open_count != 0) { - err = EBUSY; - goto out_open_count; + err = SET_ERROR(EBUSY); + goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } #endif zv->zv_open_count += count; - if (drop_namespace) - mutex_exit(&spa_namespace_lock); - mutex_exit(&zv->zv_state_lock); - if 
(drop_suspend) - rw_exit(&zv->zv_suspend_lock); - return (0); - -out_open_count: - if (zv->zv_open_count == 0) +out_opened: + if (zv->zv_open_count == 0) { zvol_last_close(zv); -out_mutex: + wakeup(zv); + } +out_zv_locked: + mutex_exit(&zv->zv_state_lock); +out_locked: if (drop_namespace) mutex_exit(&spa_namespace_lock); - mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - return (SET_ERROR(err)); + return (err); } /*ARGSUSED*/ @@ -332,6 +330,7 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; + int new_open_count; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; @@ -342,30 +341,32 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { - ASSERT(zv->zv_open_count == 1); + ASSERT3U(zv->zv_open_count, ==, 1); zv->zv_flags &= ~ZVOL_EXCL; } - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. 
*/ - ASSERT(zv->zv_open_count > 0); + ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ - if ((zv->zv_open_count - count) == 0) { + new_open_count = zv->zv_open_count - count; + if (new_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ - if (zv->zv_open_count != 1) { + new_open_count = zv->zv_open_count - count; + if (new_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } @@ -380,11 +381,11 @@ zvol_geom_close(struct g_provider *pp, int flag, int count) /* * You may get multiple opens, but only one close. */ - zv->zv_open_count -= count; - + zv->zv_open_count = new_open_count; if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); + wakeup(zv); } mutex_exit(&zv->zv_state_lock); @@ -400,7 +401,7 @@ zvol_geom_run(zvol_state_t *zv) struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); g_error_provider(pp, 0); @@ -414,7 +415,7 @@ zvol_geom_destroy(zvol_state_t *zv) struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); g_topology_assert(); @@ -422,10 +423,25 @@ zvol_geom_destroy(zvol_state_t *zv) VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING); mutex_exit(&zv->zv_state_lock); zsg->zsg_provider = NULL; - pp->private = NULL; g_wither_geom(pp->geom, ENXIO); } +void +zvol_wait_close(zvol_state_t *zv) +{ + + if (zv->zv_volmode != ZFS_VOLMODE_GEOM) + return; + 
mutex_enter(&zv->zv_state_lock); + zv->zv_zso->zso_dying = B_TRUE; + + if (zv->zv_open_count) + msleep(zv, &zv->zv_state_lock, + PRIBIO, "zvol:dying", 10*hz); + mutex_exit(&zv->zv_state_lock); +} + + static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) { @@ -483,7 +499,7 @@ zvol_geom_worker(void *arg) struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct bio *bp; - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); thread_lock(curthread); sched_prio(curthread, PRIBIO); @@ -512,9 +528,13 @@ static void zvol_geom_bio_start(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct zvol_state_geom *zsg; boolean_t first; + if (zv == NULL) { + g_io_deliver(bp, ENXIO); + return; + } if (bp->bio_cmd == BIO_GETATTR) { if (zvol_geom_bio_getattr(bp)) g_io_deliver(bp, EOPNOTSUPP); @@ -522,6 +542,7 @@ zvol_geom_bio_start(struct bio *bp) } if (!THREAD_CAN_SLEEP()) { + zsg = &zv->zv_zso->zso_geom; mtx_lock(&zsg->zsg_queue_mtx); first = (bioq_first(&zsg->zsg_queue) == NULL); bioq_insert_tail(&zsg->zsg_queue, bp); @@ -540,7 +561,7 @@ zvol_geom_bio_getattr(struct bio *bp) zvol_state_t *zv; zv = bp->bio_to->private; - ASSERT(zv != NULL); + ASSERT3P(zv, !=, NULL); spa_t *spa = dmu_objset_spa(zv->zv_objset); uint64_t refd, avail, usedobjs, availobjs; @@ -613,7 +634,7 @@ zvol_geom_bio_strategy(struct bio *bp) goto sync; break; default: - error = EOPNOTSUPP; + error = SET_ERROR(EOPNOTSUPP); goto resume; } @@ -621,7 +642,7 @@ zvol_geom_bio_strategy(struct bio *bp) volsize = zv->zv_volsize; os = zv->zv_objset; - ASSERT(os != NULL); + ASSERT3P(os, !=, NULL); addr = bp->bio_data; resid = bp->bio_length; @@ -688,7 +709,7 @@ zvol_geom_bio_strategy(struct bio *bp) bp->bio_completed = bp->bio_length - resid; if (bp->bio_completed < bp->bio_length && off > volsize) - error = EINVAL; + error = SET_ERROR(EINVAL); switch (bp->bio_cmd) { case BIO_FLUSH: @@ 
-825,18 +846,33 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) zvol_state_t *zv; struct zvol_state_dev *zsd; int err = 0; - boolean_t drop_suspend = B_TRUE; + boolean_t drop_suspend = B_FALSE; + boolean_t drop_namespace = B_FALSE; +retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); - return (SET_ERROR(ENXIO)); + err = SET_ERROR(ENXIO); + goto out_locked; } + if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + /* + * We need to guarantee that the namespace lock is held + * to avoid spurious failures in zvol_first_open. + */ + drop_namespace = B_TRUE; + if (!mutex_tryenter(&spa_namespace_lock)) { + rw_exit(&zvol_state_lock); + mutex_enter(&spa_namespace_lock); + goto retry; + } + } mutex_enter(&zv->zv_state_lock); - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* * make sure zvol is not suspended during first open @@ -844,6 +880,7 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { + drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); @@ -854,8 +891,6 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) drop_suspend = B_FALSE; } } - } else { - drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); @@ -865,21 +900,21 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); err = zvol_first_open(zv, !(flags & FWRITE)); if (err) - goto out_locked; + goto out_zv_locked; } if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - err = EROFS; + err = SET_ERROR(EROFS); goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { - err = EBUSY; + err = SET_ERROR(EBUSY); goto out_opened; } #ifdef FEXCL if (flags & FEXCL) { if 
(zv->zv_open_count != 0) { - err = EBUSY; + err = SET_ERROR(EBUSY); goto out_opened; } zv->zv_flags |= ZVOL_EXCL; @@ -894,20 +929,19 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) (zv->zv_flags & ZVOL_WRITTEN_TO) != 0) zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); } - - mutex_exit(&zv->zv_state_lock); - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - return (0); - out_opened: - if (zv->zv_open_count == 0) + if (zv->zv_open_count == 0) { zvol_last_close(zv); -out_locked: + wakeup(zv); + } +out_zv_locked: mutex_exit(&zv->zv_state_lock); +out_locked: + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - return (SET_ERROR(err)); + return (err); } static int @@ -926,17 +960,17 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { - ASSERT(zv->zv_open_count == 1); + ASSERT3U(zv->zv_open_count, ==, 1); zv->zv_flags &= ~ZVOL_EXCL; } - ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. 
*/ - ASSERT(zv->zv_open_count > 0); + ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition @@ -972,6 +1006,7 @@ zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); + wakeup(zv); } mutex_exit(&zv->zv_state_lock); @@ -1022,7 +1057,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); - error = EINVAL; + error = SET_ERROR(EINVAL); break; } rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); @@ -1076,7 +1111,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, refd = metaslab_class_get_alloc(spa_normal_class(spa)); arg->value.off = refd / DEV_BSIZE; } else - error = ENOIOCTL; + error = SET_ERROR(ENOIOCTL); break; } case FIOSEEKHOLE: @@ -1092,7 +1127,7 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, break; } default: - error = ENOIOCTL; + error = SET_ERROR(ENOIOCTL); } return (error); @@ -1144,14 +1179,14 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; struct g_geom *gp; g_topology_lock(); gp = pp->geom; - ASSERT(gp != NULL); + ASSERT3P(gp, !=, NULL); zsg->zsg_provider = NULL; g_wither_provider(pp, ENXIO); @@ -1164,7 +1199,7 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname) zsg->zsg_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); - } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; @@ 
-1206,26 +1241,30 @@ zvol_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count == 0); + ASSERT0(zv->zv_open_count); ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp __maybe_unused = zsg->zsg_provider; + + ASSERT3P(pp->private, ==, NULL); g_topology_lock(); zvol_geom_destroy(zv); g_topology_unlock(); mtx_destroy(&zsg->zsg_queue_mtx); - } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; - if (dev != NULL) - destroy_dev(dev); + ASSERT3P(dev->si_drv2, ==, NULL); + + destroy_dev(dev); } mutex_destroy(&zv->zv_state_lock); @@ -1249,7 +1288,6 @@ zvol_create_minor_impl(const char *name) int error; ZFS_LOG(1, "Creating ZVOL %s...", name); - hash = zvol_name_hash(name); if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); @@ -1258,10 +1296,11 @@ zvol_create_minor_impl(const char *name) } DROP_GIANT(); - /* lie and say we're read-only */ - error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + /* lie and say we're read-only */ + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; @@ -1275,8 +1314,10 @@ zvol_create_minor_impl(const char *name) error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); - if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT) + if (error || volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; + error = 0; + /* * zvol_alloc equivalent ... 
*/ @@ -1284,8 +1325,8 @@ zvol_create_minor_impl(const char *name) zv->zv_hash = hash; mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); - zv->zv_zso->zso_volmode = volmode; - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + zv->zv_volmode = volmode; + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp; struct g_geom *gp; @@ -1298,7 +1339,6 @@ zvol_create_minor_impl(const char *name) gp->start = zvol_geom_bio_start; gp->access = zvol_geom_access; pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); - /* TODO: NULL check? */ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = 0; @@ -1306,7 +1346,7 @@ zvol_create_minor_impl(const char *name) zsg->zsg_provider = pp; bioq_init(&zsg->zsg_queue); - } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; @@ -1320,12 +1360,12 @@ zvol_create_minor_impl(const char *name) args.mda_mode = 0640; args.mda_si_drv2 = zv; error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); - if (error != 0) { - mutex_destroy(&zv->zv_state_lock); + if (error) { kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + mutex_destroy(&zv->zv_state_lock); kmem_free(zv, sizeof (*zv)); dmu_objset_disown(os, B_TRUE, FTAG); - goto out_giant; + goto out_doi; } dev->si_iosize_max = maxphys; zsd->zsd_cdev = dev; @@ -1350,15 +1390,14 @@ zvol_create_minor_impl(const char *name) ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); - /* XXX do prefetch */ + /* TODO: prefetch for geom tasting */ zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { - if (error == 0) - zvol_geom_run(zv); + if (error == 0 
&& volmode == ZFS_VOLMODE_GEOM) { + zvol_geom_run(zv); g_topology_unlock(); } out_doi: @@ -1368,9 +1407,8 @@ zvol_create_minor_impl(const char *name) zvol_insert(zv); zvol_minors++; rw_exit(&zvol_state_lock); + ZFS_LOG(1, "ZVOL %s created.", name); } - ZFS_LOG(1, "ZVOL %s created.", name); -out_giant: PICKUP_GIANT(); return (error); } @@ -1379,11 +1417,11 @@ static void zvol_clear_private(zvol_state_t *zv) { ASSERT(RW_LOCK_HELD(&zvol_state_lock)); - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; - if (pp == NULL) /* XXX when? */ + if (pp->private == NULL) /* already cleared */ return; mtx_lock(&zsg->zsg_queue_mtx); @@ -1391,11 +1429,15 @@ zvol_clear_private(zvol_state_t *zv) pp->private = NULL; wakeup_one(&zsg->zsg_queue); while (zsg->zsg_state != ZVOL_GEOM_RUNNING) - msleep(&zsg->zsg_state, - &zsg->zsg_queue_mtx, + msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx, 0, "zvol:w", 0); mtx_unlock(&zsg->zsg_queue_mtx); ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev = zsd->zsd_cdev; + + dev->si_drv2 = NULL; } } @@ -1403,15 +1445,17 @@ static int zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) { zv->zv_volsize = volsize; - if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; - if (pp == NULL) /* XXX when? */ - return (0); - g_topology_lock(); + if (pp->private == NULL) { + g_topology_unlock(); + return (SET_ERROR(ENXIO)); + } + /* * Do not invoke resize event when initial size was zero. 
* ZVOL initializes the size on first open, this is not diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c index fafadffc751c..e8d89bfeabe5 100644 --- a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c +++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c @@ -28,6 +28,9 @@ #include #include #include +#ifdef HAVE_CPU_HOTPLUG +#include +#endif int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); @@ -35,7 +38,7 @@ MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); int spl_taskq_thread_dynamic = 1; -module_param(spl_taskq_thread_dynamic, int, 0644); +module_param(spl_taskq_thread_dynamic, int, 0444); MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); int spl_taskq_thread_priority = 1; @@ -59,6 +62,11 @@ EXPORT_SYMBOL(system_delay_taskq); static taskq_t *dynamic_taskq; static taskq_thread_t *taskq_thread_create(taskq_t *); +#ifdef HAVE_CPU_HOTPLUG +/* Multi-callback id for cpu hotplugging. 
*/ +static int spl_taskq_cpuhp_state; +#endif + /* List of all taskqs */ LIST_HEAD(tq_list); struct rw_semaphore tq_list_sem; @@ -1024,13 +1032,14 @@ taskq_thread_create(taskq_t *tq) } taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, +taskq_create(const char *name, int threads_arg, pri_t pri, int minalloc, int maxalloc, uint_t flags) { taskq_t *tq; taskq_thread_t *tqt; int count = 0, rc = 0, i; unsigned long irqflags; + int nthreads = threads_arg; ASSERT(name != NULL); ASSERT(minalloc >= 0); @@ -1041,15 +1050,27 @@ taskq_create(const char *name, int nthreads, pri_t pri, if (flags & TASKQ_THREADS_CPU_PCT) { ASSERT(nthreads <= 100); ASSERT(nthreads >= 0); - nthreads = MIN(nthreads, 100); + nthreads = MIN(threads_arg, 100); nthreads = MAX(nthreads, 0); - nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + nthreads = MAX((num_online_cpus() * nthreads) /100, 1); } tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); if (tq == NULL) return (NULL); + tq->tq_hp_support = B_FALSE; +#ifdef HAVE_CPU_HOTPLUG + if (flags & TASKQ_THREADS_CPU_PCT) { + tq->tq_hp_support = B_TRUE; + if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state, + &tq->tq_hp_cb_node) != 0) { + kmem_free(tq, sizeof (*tq)); + return (NULL); + } + } +#endif + spin_lock_init(&tq->tq_lock); INIT_LIST_HEAD(&tq->tq_thread_list); INIT_LIST_HEAD(&tq->tq_active_list); @@ -1058,6 +1079,7 @@ taskq_create(const char *name, int nthreads, pri_t pri, tq->tq_nthreads = 0; tq->tq_nspawn = 0; tq->tq_maxthreads = nthreads; + tq->tq_cpu_pct = threads_arg; tq->tq_pri = pri; tq->tq_minalloc = minalloc; tq->tq_maxalloc = maxalloc; @@ -1131,6 +1153,12 @@ taskq_destroy(taskq_t *tq) tq->tq_flags &= ~TASKQ_ACTIVE; spin_unlock_irqrestore(&tq->tq_lock, flags); +#ifdef HAVE_CPU_HOTPLUG + if (tq->tq_hp_support) { + VERIFY0(cpuhp_state_remove_instance_nocalls( + spl_taskq_cpuhp_state, &tq->tq_hp_cb_node)); + } +#endif /* * When TASKQ_ACTIVE is clear new tasks may not be added nor may * new worker threads be spawned for 
dynamic taskq. @@ -1198,7 +1226,6 @@ taskq_destroy(taskq_t *tq) } EXPORT_SYMBOL(taskq_destroy); - static unsigned int spl_taskq_kick = 0; /* @@ -1255,12 +1282,96 @@ module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, MODULE_PARM_DESC(spl_taskq_kick, "Write nonzero to kick stuck taskqs to spawn more threads"); +#ifdef HAVE_CPU_HOTPLUG +/* + * This callback will be called exactly once for each core that comes online, + * for each dynamic taskq. We attempt to expand taskqs that have + * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every + * time, to correctly determine whether or not to add a thread. + */ +static int +spl_taskq_expand(unsigned int cpu, struct hlist_node *node) +{ + taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); + unsigned long flags; + int err = 0; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + int nthreads = MIN(tq->tq_cpu_pct, 100); + nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1); + tq->tq_maxthreads = nthreads; + + if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && + tq->tq_maxthreads > tq->tq_nthreads) { + ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads + 1); + taskq_thread_t *tqt = taskq_thread_create(tq); + if (tqt == NULL) + err = -1; + } + +out: + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (err); +} + +/* + * While we don't support offlining CPUs, it is possible that CPUs will fail + * to online successfully. We do need to be able to handle this case + * gracefully. 
+ */ +static int +spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node) +{ + taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node); + unsigned long flags; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT); + int nthreads = MIN(tq->tq_cpu_pct, 100); + nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1); + tq->tq_maxthreads = nthreads; + + if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) && + tq->tq_maxthreads < tq->tq_nthreads) { + ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1); + taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next, + taskq_thread_t, tqt_thread_list); + struct task_struct *thread = tqt->tqt_thread; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kthread_stop(thread); + + return (0); + } + +out: + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); +} +#endif + int spl_taskq_init(void) { init_rwsem(&tq_list_sem); tsd_create(&taskq_tsd, NULL); +#ifdef HAVE_CPU_HOTPLUG + spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down); +#endif + system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_taskq == NULL) @@ -1269,6 +1380,9 @@ spl_taskq_init(void) system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); if (system_delay_taskq == NULL) { +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); +#endif taskq_destroy(system_taskq); return (1); } @@ -1276,6 +1390,9 @@ spl_taskq_init(void) dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); if (dynamic_taskq == NULL) { +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); 
+#endif taskq_destroy(system_taskq); taskq_destroy(system_delay_taskq); return (1); @@ -1304,4 +1421,9 @@ spl_taskq_fini(void) system_taskq = NULL; tsd_destroy(&taskq_tsd); + +#ifdef HAVE_CPU_HOTPLUG + cpuhp_remove_multi_state(spl_taskq_cpuhp_state); + spl_taskq_cpuhp_state = 0; +#endif } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in index 87414d6eacc5..75bec52c94e2 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in +++ b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in @@ -23,8 +23,9 @@ $(MODULE)-objs += ../os/linux/zfs/zfs_dir.o $(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o $(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o $(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o +$(MODULE)-objs += ../os/linux/zfs/zfs_uio.o $(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o -$(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o +$(MODULE)-objs += ../os/linux/zfs/zfs_vnops_os.o $(MODULE)-objs += ../os/linux/zfs/zfs_znode.o $(MODULE)-objs += ../os/linux/zfs/zio_crypt.o $(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c index c2281449ed12..0abac228447f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c @@ -178,7 +178,7 @@ static struct page *abd_zero_page = NULL; static kmem_cache_t *abd_cache = NULL; static kstat_t *abd_ksp; -static size_t +static uint_t abd_chunkcnt_for_bytes(size_t size) { return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c index 792c75d46ffe..83d4a3d8496c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #endif #include #include @@ -73,6 +75,9 @@ */ int 
zfs_arc_shrinker_limit = 10000; +#ifdef CONFIG_MEMORY_HOTPLUG +static struct notifier_block arc_hotplug_callback_mem_nb; +#endif /* * Return a default max arc size based on the amount of physical memory. @@ -278,18 +283,9 @@ arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) return (0); } -void -arc_lowmem_init(void) +static void +arc_set_sys_free(uint64_t allmem) { - uint64_t allmem = arc_all_memory(); - - /* - * Register a shrinker to support synchronous (direct) memory - * reclaim from the arc. This is done to prevent kswapd from - * swapping out pages when it is preferable to shrink the arc. - */ - spl_register_shrinker(&arc_shrinker); - /* * The ARC tries to keep at least this much memory available for the * system. This gives the ARC time to shrink in response to memory @@ -342,6 +338,20 @@ arc_lowmem_init(void) arc_sys_free = wmark * 3 + allmem / 32; } +void +arc_lowmem_init(void) +{ + uint64_t allmem = arc_all_memory(); + + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. 
+ */ + spl_register_shrinker(&arc_shrinker); + arc_set_sys_free(allmem); +} + void arc_lowmem_fini(void) { @@ -375,6 +385,52 @@ param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) return (0); } + +#ifdef CONFIG_MEMORY_HOTPLUG +/* ARGSUSED */ +static int +arc_hotplug_callback(struct notifier_block *self, unsigned long action, + void *arg) +{ + uint64_t allmem = arc_all_memory(); + if (action != MEM_ONLINE) + return (NOTIFY_OK); + + arc_set_limits(allmem); + +#ifdef __LP64__ + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#else + if (zfs_dirty_data_max_max == 0) + zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, + allmem * zfs_dirty_data_max_max_percent / 100); +#endif + + arc_set_sys_free(allmem); + return (NOTIFY_OK); +} +#endif + +void +arc_register_hotplug(void) +{ +#ifdef CONFIG_MEMORY_HOTPLUG + arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback; + /* There is no significance to the value 100 */ + arc_hotplug_callback_mem_nb.priority = 100; + register_memory_notifier(&arc_hotplug_callback_mem_nb); +#endif +} + +void +arc_unregister_hotplug(void) +{ +#ifdef CONFIG_MEMORY_HOTPLUG + unregister_memory_notifier(&arc_hotplug_callback_mem_nb); +#endif +} #else /* _KERNEL */ int64_t arc_available_memory(void) @@ -405,6 +461,16 @@ arc_free_memory(void) { return (spa_get_random(arc_all_memory() * 20 / 100)); } + +void +arc_register_hotplug(void) +{ +} + +void +arc_unregister_hotplug(void) +{ +} #endif /* _KERNEL */ /* diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c index 5267d67eea82..8780d7f6c70a 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/policy.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c @@ -204,7 +204,8 @@ secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) * Enforced in the Linux VFS. 
*/ int -secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) +secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr, + boolean_t issuidroot) { return (priv_policy_user(cr, CAP_FSETID, EPERM)); } @@ -271,7 +272,7 @@ void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, + secpolicy_vnode_setid_retain(NULL, cr, (vap->va_mode & S_ISUID) != 0 && (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { vap->va_mask |= AT_MODE; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c index a54961c76870..4bd27d1b516f 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c @@ -94,6 +94,14 @@ bdev_capacity(struct block_device *bdev) return (i_size_read(bdev->bd_inode)); } +#if !defined(HAVE_BDEV_WHOLE) +static inline struct block_device * +bdev_whole(struct block_device *bdev) +{ + return (bdev->bd_contains); +} +#endif + /* * Returns the maximum expansion capacity of the block device (in bytes). * @@ -118,7 +126,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) uint64_t psize; int64_t available; - if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) { + if (wholedisk && bdev != bdev_whole(bdev)) { /* * When reporting maximum expansion capacity for a wholedisk * deduct any capacity which is expected to be lost due to @@ -132,7 +140,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) * "reserved" EFI partition: in such cases return the device * usable capacity. 
*/ - available = i_size_read(bdev->bd_contains->bd_inode) - + available = i_size_read(bdev_whole(bdev)->bd_inode) - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS); psize = MAX(available, bdev_capacity(bdev)); @@ -192,8 +200,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, vd->vd_bdev = NULL; if (bdev) { - if (v->vdev_expanding && bdev != bdev->bd_contains) { - bdevname(bdev->bd_contains, disk_name + 5); + if (v->vdev_expanding && bdev != bdev_whole(bdev)) { + bdevname(bdev_whole(bdev), disk_name + 5); /* * If userland has BLKPG_RESIZE_PARTITION, * then it should have updated the partition @@ -468,7 +476,11 @@ vdev_blkg_tryget(struct blkcg_gq *blkg) this_cpu_inc(*count); rc = true; } else { +#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA + rc = atomic_long_inc_not_zero(&ref->data->count); +#else rc = atomic_long_inc_not_zero(&ref->count); +#endif } rcu_read_unlock_sched(); @@ -787,7 +799,7 @@ vdev_disk_io_done(zio_t *zio) vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - if (check_disk_change(vd->vd_bdev)) { + if (zfs_check_media_change(vd->vd_bdev)) { invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); @@ -822,9 +834,13 @@ vdev_disk_rele(vdev_t *vd) } vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, .vdev_op_state_change = NULL, @@ -833,6 +849,11 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_rele = vdev_disk_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type 
*/ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c index 423ce858144c..bf8a13ae6154 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c @@ -305,9 +305,13 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, @@ -316,6 +320,11 @@ vdev_ops_t vdev_file_ops = { .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; @@ -341,9 +350,13 @@ vdev_file_fini(void) #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_file_open, .vdev_op_close = vdev_file_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_file_io_start, .vdev_op_io_done = vdev_file_io_done, .vdev_op_state_change = NULL, @@ -352,6 +365,11 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_rele = vdev_file_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git 
a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c index c13a9771235d..a1668e46e4f9 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c @@ -467,7 +467,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_zn_prefetch = B_FALSE; - zp->z_moved = B_FALSE; zp->z_is_sa = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_TRUE; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c index 36bbd5d0829b..165c1218ae79 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c @@ -294,7 +294,7 @@ zfs_sync(struct super_block *sb, int wait, cred_t *cr) } else { /* * Sync all ZFS filesystems. This is what happens when you - * run sync(1M). Unlike other filesystems, ZFS honors the + * run sync(1). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); @@ -1451,7 +1451,7 @@ int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) { const char *osname = zm->mnt_osname; - struct inode *root_inode; + struct inode *root_inode = NULL; uint64_t recordsize; int error = 0; zfsvfs_t *zfsvfs = NULL; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c index b668c7dff013..3be387a30e5c 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops.c @@ -240,78 +240,6 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) return (0); } -#if defined(SEEK_HOLE) && defined(SEEK_DATA) -/* - * Lseek support for finding holes (cmd == SEEK_HOLE) and - * data (cmd == SEEK_DATA). "off" is an in/out parameter. 
- */ -static int -zfs_holey_common(struct inode *ip, int cmd, loff_t *off) -{ - znode_t *zp = ITOZ(ip); - uint64_t noff = (uint64_t)*off; /* new offset */ - uint64_t file_sz; - int error; - boolean_t hole; - - file_sz = zp->z_size; - if (noff >= file_sz) { - return (SET_ERROR(ENXIO)); - } - - if (cmd == SEEK_HOLE) - hole = B_TRUE; - else - hole = B_FALSE; - - error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); - - if (error == ESRCH) - return (SET_ERROR(ENXIO)); - - /* file was dirty, so fall back to using generic logic */ - if (error == EBUSY) { - if (hole) - *off = file_sz; - - return (0); - } - - /* - * We could find a hole that begins after the logical end-of-file, - * because dmu_offset_next() only works on whole blocks. If the - * EOF falls mid-block, then indicate that the "virtual hole" - * at the end of the file begins at the logical EOF, rather than - * at the end of the last block. - */ - if (noff > file_sz) { - ASSERT(hole); - noff = file_sz; - } - - if (noff < *off) - return (error); - *off = noff; - return (error); -} - -int -zfs_holey(struct inode *ip, int cmd, loff_t *off) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_holey_common(ip, cmd, off); - - ZFS_EXIT(zfsvfs); - return (error); -} -#endif /* SEEK_HOLE && SEEK_DATA */ - #if defined(_KERNEL) /* * When a file is memory mapped, we must keep the IO data synchronized @@ -320,10 +248,10 @@ zfs_holey(struct inode *ip, int cmd, loff_t *off) * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. 
*/ -static void -update_pages(struct inode *ip, int64_t start, int len, - objset_t *os, uint64_t oid) +void +update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { + struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; uint64_t nbytes; @@ -340,8 +268,8 @@ update_pages(struct inode *ip, int64_t start, int len, flush_dcache_page(pp); pb = kmap(pp); - (void) dmu_read(os, oid, start+off, nbytes, pb+off, - DMU_READ_PREFETCH); + (void) dmu_read(os, zp->z_id, start + off, nbytes, + pb + off, DMU_READ_PREFETCH); kunmap(pp); if (mapping_writably_mapped(mp)) @@ -369,12 +297,12 @@ update_pages(struct inode *ip, int64_t start, int len, * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ -static int -mappedread(struct inode *ip, int nbytes, uio_t *uio) +int +mappedread(znode_t *zp, int nbytes, uio_t *uio) { + struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; - znode_t *zp = ITOZ(ip); int64_t start, off; uint64_t bytes; int len = nbytes; @@ -414,574 +342,8 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio) } #endif /* _KERNEL */ -unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; -/* - * Read bytes from specified file into supplied buffer. - * - * IN: ip - inode of file to be read from. - * uio - structure supplying read location, range info, - * and return buffer. - * ioflag - O_SYNC flags; used to provide FRSYNC semantics. - * O_DIRECT flag; used to bypass page cache. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range, buffer filled. - * - * RETURN: 0 on success, error code on failure. 
- * - * Side Effects: - * inode - atime updated if byte count > 0 - */ -/* ARGSUSED */ -int -zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) -{ - int error = 0; - boolean_t frsync = B_FALSE; - - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (zp->z_pflags & ZFS_AV_QUARANTINED) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EACCES)); - } - - /* - * Validate file offset - */ - if (uio->uio_loffset < (offset_t)0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - /* - * Fasttrack empty reads - */ - if (uio->uio_resid == 0) { - ZFS_EXIT(zfsvfs); - return (0); - } - -#ifdef FRSYNC - /* - * If we're in FRSYNC mode, sync out this znode before reading it. - * Only do this for non-snapshots. - * - * Some platforms do not support FRSYNC and instead map it - * to O_SYNC, which results in unnecessary calls to zil_commit. We - * only honor FRSYNC requests on platforms which support it. - */ - frsync = !!(ioflag & FRSYNC); -#endif - if (zfsvfs->z_log && - (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) - zil_commit(zfsvfs->z_log, zp->z_id); - - /* - * Lock the range against changes. - */ - zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, - uio->uio_loffset, uio->uio_resid, RL_READER); - - /* - * If we are reading past end-of-file we can skip - * to the end; but we might still need to set atime. 
- */ - if (uio->uio_loffset >= zp->z_size) { - error = 0; - goto out; - } - - ASSERT(uio->uio_loffset < zp->z_size); - ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); - ssize_t start_resid = n; - -#ifdef HAVE_UIO_ZEROCOPY - xuio_t *xuio = NULL; - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { - int nblk; - int blksz = zp->z_blksz; - uint64_t offset = uio->uio_loffset; - - xuio = (xuio_t *)uio; - if ((ISP2(blksz))) { - nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, - blksz)) / blksz; - } else { - ASSERT(offset + n <= blksz); - nblk = 1; - } - (void) dmu_xuio_init(xuio, nblk); - - if (vn_has_cached_data(ip)) { - /* - * For simplicity, we always allocate a full buffer - * even if we only expect to read a portion of a block. - */ - while (--nblk >= 0) { - (void) dmu_xuio_add(xuio, - dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz), 0, blksz); - } - } - } -#endif /* HAVE_UIO_ZEROCOPY */ - - while (n > 0) { - ssize_t nbytes = MIN(n, zfs_read_chunk_size - - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); - - if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { - error = mappedread(ip, nbytes, uio); - } else { - error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes); - } - - if (error) { - /* convert checksum errors into IO errors */ - if (error == ECKSUM) - error = SET_ERROR(EIO); - break; - } - - n -= nbytes; - } - - int64_t nread = start_resid - n; - dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); - task_io_account_read(nread); -out: - zfs_rangelock_exit(lr); - - ZFS_EXIT(zfsvfs); - return (error); -} - -/* - * Write the bytes to a file. - * - * IN: ip - inode of file to be written to. - * uio - structure supplying write location, range info, - * and data buffer. - * ioflag - O_APPEND flag set if in append mode. - * O_DIRECT flag; used to bypass page cache. - * cr - credentials of caller. - * - * OUT: uio - updated offset and range. 
- * - * RETURN: 0 if success - * error code if failure - * - * Timestamps: - * ip - ctime|mtime updated if byte count > 0 - */ - -/* ARGSUSED */ -int -zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) -{ - int error = 0; - ssize_t start_resid = uio->uio_resid; - - /* - * Fasttrack empty write - */ - ssize_t n = start_resid; - if (n == 0) - return (0); - - rlim64_t limit = uio->uio_limit; - if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) - limit = MAXOFFSET_T; - - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ZTOZSB(zp); - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - sa_bulk_attr_t bulk[4]; - int count = 0; - uint64_t mtime[2], ctime[2]; - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, - &zp->z_size, 8); - SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, - &zp->z_pflags, 8); - - /* - * Callers might not be able to detect properly that we are read-only, - * so check it explicitly here. - */ - if (zfs_is_readonly(zfsvfs)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EROFS)); - } - - /* - * If immutable or not appending then return EPERM - */ - if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || - ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && - (uio->uio_loffset < zp->z_size))) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EPERM)); - } - - /* - * Validate file offset - */ - offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset; - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - int max_blksz = zfsvfs->z_max_blksz; - xuio_t *xuio = NULL; - - /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. 
- */ -#ifdef HAVE_UIO_ZEROCOPY - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else -#endif - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFAULT)); - } - - /* - * If in append mode, set the io offset pointer to eof. - */ - zfs_locked_range_t *lr; - if (ioflag & O_APPEND) { - /* - * Obtain an appending range lock to guarantee file append - * semantics. We reset the write offset once we have the lock. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); - woff = lr->lr_offset; - if (lr->lr_length == UINT64_MAX) { - /* - * We overlocked the file because this write will cause - * the file block size to increase. - * Note that zp_size cannot change with this lock held. - */ - woff = zp->z_size; - } - uio->uio_loffset = woff; - } else { - /* - * Note that if the file block size will change as a result of - * this write, then this range lock will lock the entire file - * so that we can re-write the block safely. - */ - lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); - } - - if (woff >= limit) { - zfs_rangelock_exit(lr); - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EFBIG)); - } - - if ((woff + n) > limit || woff > (limit - n)) - n = limit - woff; - - /* Will this write extend the file length? */ - int write_eof = (woff + n > zp->z_size); - - uint64_t end_size = MAX(zp->z_size, woff + n); - zilog_t *zilog = zfsvfs->z_log; -#ifdef HAVE_UIO_ZEROCOPY - int i_iov = 0; - const iovec_t *iovp = uio->uio_iov; - int iovcnt __maybe_unused = uio->uio_iovcnt; -#endif - - - /* - * Write the file in reasonable size chunks. Each chunk is written - * in a separate transaction; this keeps the intent log records small - * and allows us to do more fine-grained space accounting. 
- */ - while (n > 0) { - woff = uio->uio_loffset; - - if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, - KUID_TO_SUID(ip->i_uid)) || - zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, - KGID_TO_SGID(ip->i_gid)) || - (zp->z_projid != ZFS_DEFAULT_PROJID && - zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, - zp->z_projid))) { - error = SET_ERROR(EDQUOT); - break; - } - - arc_buf_t *abuf = NULL; - const iovec_t *aiov = NULL; - if (xuio) { -#ifdef HAVE_UIO_ZEROCOPY - ASSERT(i_iov < iovcnt); - ASSERT3U(uio->uio_segflg, !=, UIO_BVEC); - aiov = &iovp[i_iov]; - abuf = dmu_xuio_arcbuf(xuio, i_iov); - dmu_xuio_clear(xuio, i_iov); - ASSERT((aiov->iov_base == abuf->b_data) || - ((char *)aiov->iov_base - (char *)abuf->b_data + - aiov->iov_len == arc_buf_size(abuf))); - i_iov++; -#endif - } else if (n >= max_blksz && woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { - /* - * This write covers a full block. "Borrow" a buffer - * from the dmu so that we can fill it before we enter - * a transaction. This avoids the possibility of - * holding up the transaction if the data copy hangs - * up on a pagefault (e.g., from an NFS server mapping). - */ - size_t cbytes; - - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); - ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { - dmu_return_arcbuf(abuf); - break; - } - ASSERT(cbytes == max_blksz); - } - - /* - * Start a transaction. 
- */ - dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); - dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); - DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); - DB_DNODE_EXIT(db); - zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - if (abuf != NULL) - dmu_return_arcbuf(abuf); - break; - } - - /* - * If rangelock_enter() over-locked we grow the blocksize - * and then reduce the lock range. This will only happen - * on the first iteration since rangelock_reduce() will - * shrink down lr_length to the appropriate size. - */ - if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); - zfs_rangelock_reduce(lr, woff, n); - } - - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - - ssize_t tx_bytes; - if (abuf == NULL) { - tx_bytes = uio->uio_resid; - uio->uio_fault_disable = B_TRUE; - error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); - uio->uio_fault_disable = B_FALSE; - if (error == EFAULT) { - dmu_tx_commit(tx); - /* - * Account for partial writes before - * continuing the loop. - * Update needs to occur before the next - * uio_prefaultpages, or prefaultpages may - * error, and we may break the loop early. 
- */ - if (tx_bytes != uio->uio_resid) - n -= tx_bytes - uio->uio_resid; - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - break; - } - continue; - } else if (error != 0) { - dmu_tx_commit(tx); - break; - } - tx_bytes -= uio->uio_resid; - } else { - tx_bytes = nbytes; - ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); - /* - * If this is not a full block write, but we are - * extending the file past EOF and this data starts - * block-aligned, use assign_arcbuf(). Otherwise, - * write via dmu_write(). - */ - if (tx_bytes < max_blksz && (!write_eof || - aiov->iov_base != abuf->b_data)) { - ASSERT(xuio); - dmu_write(zfsvfs->z_os, zp->z_id, woff, - /* cppcheck-suppress nullPointer */ - aiov->iov_len, aiov->iov_base, tx); - dmu_return_arcbuf(abuf); - xuio_stat_wbuf_copied(); - } else { - ASSERT(xuio || tx_bytes == max_blksz); - error = dmu_assign_arcbuf_by_dbuf( - sa_get_db(zp->z_sa_hdl), woff, abuf, tx); - if (error != 0) { - dmu_return_arcbuf(abuf); - dmu_tx_commit(tx); - break; - } - } - ASSERT(tx_bytes <= uio->uio_resid); - uioskip(uio, tx_bytes); - } - if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { - update_pages(ip, woff, - tx_bytes, zfsvfs->z_os, zp->z_id); - } - - /* - * If we made no progress, we're done. If we made even - * partial progress, update the znode and ZIL accordingly. - */ - if (tx_bytes == 0) { - (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), - (void *)&zp->z_size, sizeof (uint64_t), tx); - dmu_tx_commit(tx); - ASSERT(error != 0); - break; - } - - /* - * Clear Set-UID/Set-GID bits on successful write if not - * privileged and at least one of the execute bits is set. - * - * It would be nice to do this after all writes have - * been done, but that would still expose the ISUID/ISGID - * to another app after the partial write is committed. - * - * Note: we don't call zfs_fuid_map_id() here because - * user 0 is not an ephemeral uid. 
- */ - mutex_enter(&zp->z_acl_lock); - uint32_t uid = KUID_TO_SUID(ip->i_uid); - if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | - (S_IXUSR >> 6))) != 0 && - (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && - secpolicy_vnode_setid_retain(cr, - ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { - uint64_t newmode; - zp->z_mode &= ~(S_ISUID | S_ISGID); - ip->i_mode = newmode = zp->z_mode; - (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), - (void *)&newmode, sizeof (uint64_t), tx); - } - mutex_exit(&zp->z_acl_lock); - - zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); - - /* - * Update the file size (zp_size) if it has changed; - * account for possible concurrent updates. - */ - while ((end_size = zp->z_size) < uio->uio_loffset) { - (void) atomic_cas_64(&zp->z_size, end_size, - uio->uio_loffset); - ASSERT(error == 0); - } - /* - * If we are replaying and eof is non zero then force - * the file size to the specified eof. Note, there's no - * concurrency during replay. - */ - if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) - zp->z_size = zfsvfs->z_replay_eof; - - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); - - zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, - NULL, NULL); - dmu_tx_commit(tx); - - if (error != 0) - break; - ASSERT(tx_bytes == nbytes); - n -= nbytes; - - if (!xuio && n > 0) { - if (uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = EFAULT; - break; - } - } - } - - zfs_inode_update(zp); - zfs_rangelock_exit(lr); - - /* - * If we're in replay mode, or we made no progress, return error. - * Otherwise, it's at least a partial write, so it's successful. 
- */ - if (zfsvfs->z_replay || uio->uio_resid == start_resid) { - ZFS_EXIT(zfsvfs); - return (error); - } - - if (ioflag & (O_SYNC | O_DSYNC) || - zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, zp->z_id); - - int64_t nwritten = start_resid - uio->uio_resid; - dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); - task_io_account_write(nwritten); - - ZFS_EXIT(zfsvfs); - return (0); -} - /* * Write the bytes to a file. * @@ -993,37 +355,40 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * OUT: resid - remaining bytes to write * * RETURN: 0 if success - * positive error code if failure + * positive error code if failure. EIO is returned + * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, - loff_t pos, size_t *resid) + loff_t pos, size_t *residp) { - ssize_t written; - int error = 0; + fstrans_cookie_t cookie; + int error; - written = zpl_write_common(ZTOI(zp), data, len, &pos, - UIO_SYSSPACE, 0, kcred); - if (written < 0) { - error = -written; - } else if (resid == NULL) { - if (written < len) - error = SET_ERROR(EIO); /* short write */ - } else { - *resid = len - written; + struct iovec iov; + iov.iov_base = (void *)data; + iov.iov_len = len; + + uio_t uio; + uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); + + cookie = spl_fstrans_mark(); + error = zfs_write(zp, &uio, 0, kcred); + spl_fstrans_unmark(cookie); + + if (error == 0) { + if (residp != NULL) + *residp = uio_resid(&uio); + else if (uio_resid(&uio) != 0) + error = SET_ERROR(EIO); } + return (error); } -/* - * Drop a reference on the passed inode asynchronously. This ensures - * that the caller will never drop the last reference on an inode in - * the current context. Doing so while holding open a tx could result - * in a deadlock if iput_final() re-enters the filesystem code. 
- */ void zfs_zrele_async(znode_t *zp) { @@ -1040,179 +405,6 @@ zfs_zrele_async(znode_t *zp) zrele(zp); } -/* ARGSUSED */ -static void -zfs_get_done(zgd_t *zgd, int error) -{ - znode_t *zp = zgd->zgd_private; - - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); - - zfs_rangelock_exit(zgd->zgd_lr); - - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - zfs_zrele_async(zp); - - kmem_free(zgd, sizeof (zgd_t)); -} - -#ifdef ZFS_DEBUG -static int zil_fault_io = 0; -#endif - -/* - * Get data to generate a TX_WRITE intent log record. - */ -int -zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) -{ - zfsvfs_t *zfsvfs = arg; - objset_t *os = zfsvfs->z_os; - znode_t *zp; - uint64_t object = lr->lr_foid; - uint64_t offset = lr->lr_offset; - uint64_t size = lr->lr_length; - dmu_buf_t *db; - zgd_t *zgd; - int error = 0; - - ASSERT3P(lwb, !=, NULL); - ASSERT3P(zio, !=, NULL); - ASSERT3U(size, !=, 0); - - /* - * Nothing to do if the file has been removed - */ - if (zfs_zget(zfsvfs, object, &zp) != 0) - return (SET_ERROR(ENOENT)); - if (zp->z_unlinked) { - /* - * Release the vnode asynchronously as we currently have the - * txg stopped from syncing. - */ - zfs_zrele_async(zp); - return (SET_ERROR(ENOENT)); - } - - zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_lwb = lwb; - zgd->zgd_private = zp; - - /* - * Write records come in two flavors: immediate and indirect. - * For small writes it's cheaper to store the data with the - * log record (immediate); for large writes it's cheaper to - * sync the data and get a pointer to it (indirect) so that - * we don't have to write the data twice. 
- */ - if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - /* test for truncation needs to be done while range locked */ - if (offset >= zp->z_size) { - error = SET_ERROR(ENOENT); - } else { - error = dmu_read(os, object, offset, size, buf, - DMU_READ_NO_PREFETCH); - } - ASSERT(error == 0 || error == ENOENT); - } else { /* indirect write */ - /* - * Have to lock the whole block to ensure when it's - * written out and its checksum is being calculated - * that no one can change the data. We need to re-check - * blocksize after we get the lock in case it's changed! - */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - zfs_rangelock_exit(zgd->zgd_lr); - } - /* test for truncation needs to be done while range locked */ - if (lr->lr_offset >= zp->z_size) - error = SET_ERROR(ENOENT); -#ifdef ZFS_DEBUG - if (zil_fault_io) { - error = SET_ERROR(EIO); - zil_fault_io = 0; - } -#endif - if (error == 0) - error = dmu_buf_hold(os, object, offset, zgd, &db, - DMU_READ_NO_PREFETCH); - - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; - - zgd->zgd_db = db; - zgd->zgd_bp = bp; - - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); - - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); - - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. 
We zero out the BP because - * it is the old, currently-on-disk BP. - */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } - } - } - - zfs_get_done(zgd, error); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - if (flag & V_ACE_MASK) - error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); - else - error = zfs_zaccess_rwx(zp, mode, flag, cr); - - ZFS_EXIT(zfsvfs); - return (error); -} /* * Lookup an entry in a directory, or an extended attribute directory. @@ -2440,26 +1632,6 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) return (error); } -ulong_t zfs_fsync_sync_cnt = 4; - -int -zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - - (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); - - if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); - } - tsd_set(zfs_fsyncer_key, NULL); - - return (0); -} - /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source @@ -4796,207 +3968,9 @@ zfs_fid(struct inode *ip, fid_t *fidp) return (0); } -/*ARGSUSED*/ -int -zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - error = zfs_getacl(zp, vsecp, skipaclchk, cr); - ZFS_EXIT(zfsvfs); - - return (error); -} - -/*ARGSUSED*/ -int -zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) -{ - zfsvfs_t *zfsvfs = ZTOZSB(zp); - int error; - boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - - error = zfs_setacl(zp, vsecp, skipaclchk, cr); - - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); - - ZFS_EXIT(zfsvfs); - return (error); -} - -#ifdef HAVE_UIO_ZEROCOPY -/* - * The smallest read we may consider to loan out an arcbuf. - * This must be a power of 2. - */ -int zcr_blksz_min = (1 << 10); /* 1K */ -/* - * If set to less than the file block size, allow loaning out of an - * arcbuf for a partial block read. This must be a power of 2. - */ -int zcr_blksz_max = (1 << 17); /* 128K */ - -/*ARGSUSED*/ -static int -zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) -{ - znode_t *zp = ITOZ(ip); - zfsvfs_t *zfsvfs = ITOZSB(ip); - int max_blksz = zfsvfs->z_max_blksz; - uio_t *uio = &xuio->xu_uio; - ssize_t size = uio->uio_resid; - offset_t offset = uio->uio_loffset; - int blksz; - int fullblk, i; - arc_buf_t *abuf; - ssize_t maxsize; - int preamble, postamble; - - if (xuio->xu_type != UIOTYPE_ZEROCOPY) - return (SET_ERROR(EINVAL)); - - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); - switch (ioflag) { - case UIO_WRITE: - /* - * Loan out an arc_buf for write if write size is bigger than - * max_blksz, and the file's block size is also max_blksz. - */ - blksz = max_blksz; - if (size < blksz || zp->z_blksz != blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - /* - * Caller requests buffers for write before knowing where the - * write offset might be (e.g. NFS TCP write). - */ - if (offset == -1) { - preamble = 0; - } else { - preamble = P2PHASE(offset, blksz); - if (preamble) { - preamble = blksz - preamble; - size -= preamble; - } - } - - postamble = P2PHASE(size, blksz); - size -= postamble; - - fullblk = size / blksz; - (void) dmu_xuio_init(xuio, - (preamble != 0) + fullblk + (postamble != 0)); - - /* - * Have to fix iov base/len for partial buffers. They - * currently represent full arc_buf's. 
- */ - if (preamble) { - /* data begins in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, - blksz - preamble, preamble); - } - - for (i = 0; i < fullblk; i++) { - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, blksz); - } - - if (postamble) { - /* data ends in the middle of the arc_buf */ - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - blksz); - ASSERT(abuf); - (void) dmu_xuio_add(xuio, abuf, 0, postamble); - } - break; - case UIO_READ: - /* - * Loan out an arc_buf for read if the read size is larger than - * the current file block size. Block alignment is not - * considered. Partial arc_buf will be loaned out for read. - */ - blksz = zp->z_blksz; - if (blksz < zcr_blksz_min) - blksz = zcr_blksz_min; - if (blksz > zcr_blksz_max) - blksz = zcr_blksz_max; - /* avoid potential complexity of dealing with it */ - if (blksz > max_blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - maxsize = zp->z_size - uio->uio_loffset; - if (size > maxsize) - size = maxsize; - - if (size < blksz) { - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - break; - default: - ZFS_EXIT(zfsvfs); - return (SET_ERROR(EINVAL)); - } - - uio->uio_extflg = UIO_XUIO; - XUIO_XUZC_RW(xuio) = ioflag; - ZFS_EXIT(zfsvfs); - return (0); -} - -/*ARGSUSED*/ -static int -zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) -{ - int i; - arc_buf_t *abuf; - int ioflag = XUIO_XUZC_RW(xuio); - - ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); - - i = dmu_xuio_cnt(xuio); - while (i-- > 0) { - abuf = dmu_xuio_arcbuf(xuio, i); - /* - * if abuf == NULL, it must be a write buffer - * that has been returned in zfs_write(). 
- */ - if (abuf) - dmu_return_arcbuf(abuf); - ASSERT(abuf || ioflag == UIO_WRITE); - } - - dmu_xuio_fini(xuio); - return (0); -} -#endif /* HAVE_UIO_ZEROCOPY */ - #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); -EXPORT_SYMBOL(zfs_read); -EXPORT_SYMBOL(zfs_write); -EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); @@ -5004,7 +3978,6 @@ EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); -EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); @@ -5014,8 +3987,6 @@ EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); -EXPORT_SYMBOL(zfs_getsecattr); -EXPORT_SYMBOL(zfs_setsecattr); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); @@ -5024,8 +3995,6 @@ EXPORT_SYMBOL(zfs_map); /* BEGIN CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); -module_param(zfs_read_chunk_size, ulong, 0644); -MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); /* END CSTYLED */ #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c index a542c662cb15..b33594488ee0 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c @@ -134,7 +134,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; - zp->z_moved = B_FALSE; return (0); } @@ -505,6 +504,7 @@ zfs_inode_update(znode_t *zp) dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); + ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); @@ -546,7 +546,6 @@ 
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; - zp->z_moved = B_FALSE; zp->z_is_mapped = B_FALSE; zp->z_is_ctldir = B_FALSE; zp->z_is_stale = B_FALSE; @@ -619,7 +618,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); zfsvfs->z_nr_znodes++; - membar_producer(); mutex_exit(&zfsvfs->z_znodes_lock); unlock_new_inode(ip); @@ -1901,7 +1899,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = B_FALSE; rootzp->z_atime_dirty = B_FALSE; - rootzp->z_moved = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c index 96dabe55a138..8106359e1c77 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c @@ -1197,6 +1197,16 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + /* + * This is necessary here as we check next whether + * OBJSET_FLAG_USERACCOUNTING_COMPLETE or + * OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE are set in order to + * decide if the local_mac should be zeroed out. + */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out.
@@ -1208,7 +1218,10 @@ zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || - (datalen <= OBJSET_PHYS_SIZE_V1)) { + (datalen <= OBJSET_PHYS_SIZE_V1) || + (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 || + (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) && + key->zk_version > 0)) { bzero(local_mac, ZIO_OBJSET_MAC_LEN); return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index fa4500f6f8d1..e6420f19ed87 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -55,7 +55,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (!zpl_dir_emit_dots(filp, ctx)) goto out; @@ -76,7 +76,7 @@ zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) ctx->pos++; } out: - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (error); } @@ -242,13 +242,14 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) uint64_t id, pos; int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); cookie = spl_fstrans_mark(); if (!zpl_dir_emit_dots(filp, ctx)) goto out; - pos = ctx->pos; + /* Start the position at 0 if it already emitted . and .. */ + pos = (ctx->pos == 2 ? 
0 : ctx->pos); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, @@ -265,7 +266,7 @@ zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) } out: spl_fstrans_unmark(cookie); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); if (error == -ENOENT) return (0); @@ -368,13 +369,13 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); generic_fillattr(ip, stat); stat->nlink = stat->size = 2; stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (0); } @@ -452,7 +453,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) znode_t *dzp; int error = 0; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { @@ -471,7 +472,7 @@ zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); ASSERT3S(error, <=, 0); return (error); @@ -502,13 +503,13 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, znode_t *dzp; int error; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { generic_fillattr(path->dentry->d_inode, stat); stat->nlink = stat->size = 2; stat->atime = current_time(ip); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); return (0); } @@ -518,7 +519,7 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, iput(ZTOI(dzp)); } - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); ASSERT3S(error, <=, 0); return (error); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c index 51e189a87272..9e08c94e2147 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c +++ 
b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c @@ -212,244 +212,221 @@ zfs_io_flags(struct kiocb *kiocb) return (flags); } -static ssize_t -zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, - unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr, size_t skip) +/* + * If relatime is enabled, call file_accessed() if zfs_relatime_need_update() + * is true. This is needed since datasets with inherited "relatime" property + * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after + * `zfs set relatime=...`), which is what relatime test in VFS by + * relatime_need_update() is based on. + */ +static inline void +zpl_file_accessed(struct file *filp) { - ssize_t read; - uio_t uio = { { 0 }, 0 }; - int error; - fstrans_cookie_t cookie; - - uio.uio_iov = iovp; - uio.uio_iovcnt = nr_segs; - uio.uio_loffset = *ppos; - uio.uio_segflg = segment; - uio.uio_limit = MAXOFFSET_T; - uio.uio_resid = count; - uio.uio_skip = skip; - - cookie = spl_fstrans_mark(); - error = -zfs_read(ip, &uio, flags, cr); - spl_fstrans_unmark(cookie); - if (error < 0) - return (error); - - read = count - uio.uio_resid; - *ppos += read; - - return (read); -} - -inline ssize_t -zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, - uio_seg_t segment, int flags, cred_t *cr) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - - return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment, - flags, cr, 0)); -} - -static ssize_t -zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; - zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip)); - ssize_t read; - unsigned int f_flags = filp->f_flags; - f_flags |= zfs_io_flags(kiocb); - crhold(cr); - read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count, - nr_segs, 
&kiocb->ki_pos, seg, f_flags, cr, skip); - crfree(cr); - - /* - * If relatime is enabled, call file_accessed() only if - * zfs_relatime_need_update() is true. This is needed since datasets - * with inherited "relatime" property aren't necessarily mounted with - * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what - * relatime test in VFS by relatime_need_update() is based on. - */ - if (!IS_NOATIME(ip) && zfsvfs->z_relatime) { + if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) { if (zfs_relatime_need_update(ip)) file_accessed(filp); } else { file_accessed(filp); } +} + +#if defined(HAVE_VFS_RW_ITERATE) + +/* + * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports + * iovecs, kvecs, bvecs and pipes, plus all the required interfaces to + * manipulate the iov_iter are available. In that case the full iov_iter + * can be attached to the uio and correctly handled in the lower layers. + * Otherwise, for older kernels extract the iovec and pass it instead. + */ +static void +zpl_uio_init(uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, + loff_t pos, ssize_t count, size_t skip) +{ +#if defined(HAVE_VFS_IOV_ITER) + uio_iov_iter_init(uio, to, pos, count, skip); +#else + uio_iovec_init(uio, to->iov, to->nr_segs, pos, + to->type & ITER_KVEC ?
UIO_SYSSPACE : UIO_USERSPACE, + count, skip); +#endif +} + +static ssize_t +zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + ssize_t count = iov_iter_count(to); + uio_t uio; + + zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); return (read); } -#if defined(HAVE_VFS_RW_ITERATE) -static ssize_t -zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +static inline ssize_t +zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, + size_t *countp) { - ssize_t ret; - uio_seg_t seg = UIO_USERSPACE; - if (to->type & ITER_KVEC) - seg = UIO_SYSSPACE; - if (to->type & ITER_BVEC) - seg = UIO_BVEC; - ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs, - iov_iter_count(to), seg, to->iov_offset); - if (ret > 0) - iov_iter_advance(to, ret); - return (ret); -} -#else -static ssize_t -zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, loff_t pos) -{ - ssize_t ret; - size_t count; - - ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE); - if (ret) +#ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB + ssize_t ret = generic_write_checks(kiocb, from); + if (ret <= 0) return (ret); - return (zpl_iter_read_common(kiocb, iovp, nr_segs, count, - UIO_USERSPACE, 0)); -} -#endif /* HAVE_VFS_RW_ITERATE */ + *countp = ret; +#else + struct file *file = kiocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *ip = mapping->host; + int isblk = S_ISBLK(ip->i_mode); -static ssize_t -zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, - unsigned long nr_segs, 
loff_t *ppos, uio_seg_t segment, int flags, - cred_t *cr, size_t skip) -{ - ssize_t wrote; - uio_t uio = { { 0 }, 0 }; - int error; - fstrans_cookie_t cookie; + *countp = iov_iter_count(from); + ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk); + if (ret) + return (ret); +#endif - if (flags & O_APPEND) - *ppos = i_size_read(ip); - - uio.uio_iov = iovp; - uio.uio_iovcnt = nr_segs; - uio.uio_loffset = *ppos; - uio.uio_segflg = segment; - uio.uio_limit = MAXOFFSET_T; - uio.uio_resid = count; - uio.uio_skip = skip; - - cookie = spl_fstrans_mark(); - error = -zfs_write(ip, &uio, flags, cr); - spl_fstrans_unmark(cookie); - if (error < 0) - return (error); - - wrote = count - uio.uio_resid; - *ppos += wrote; - - return (wrote); + return (0); } -inline ssize_t -zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, - uio_seg_t segment, int flags, cred_t *cr) -{ - struct iovec iov; - - iov.iov_base = (void *)buf; - iov.iov_len = len; - - return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment, - flags, cr, 0)); -} - -static ssize_t -zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp, - unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) -{ - cred_t *cr = CRED(); - struct file *filp = kiocb->ki_filp; - ssize_t wrote; - unsigned int f_flags = filp->f_flags; - - f_flags |= zfs_io_flags(kiocb); - crhold(cr); - wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count, - nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); - crfree(cr); - - return (wrote); -} - -#if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { - size_t count; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + uio_t uio; + size_t count = 0; ssize_t ret; - uio_seg_t seg = UIO_USERSPACE; -#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB - struct file *file = kiocb->ki_filp; - struct address_space 
*mapping = file->f_mapping; - struct inode *ip = mapping->host; - int isblk = S_ISBLK(ip->i_mode); - - count = iov_iter_count(from); - ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk); + ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) return (ret); -#else - /* - * XXX - ideally this check should be in the same lock region with - * write operations, so that there's no TOCTTOU race when doing - * append and someone else grow the file. - */ - ret = generic_write_checks(kiocb, from); - if (ret <= 0) - return (ret); - count = ret; -#endif - if (from->type & ITER_KVEC) - seg = UIO_SYSSPACE; - if (from->type & ITER_BVEC) - seg = UIO_BVEC; + zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, - count, seg, from->iov_offset); - if (ret > 0) - iov_iter_advance(from, ret); + crhold(cr); + cookie = spl_fstrans_mark(); - return (ret); + int error = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + if (wrote > 0) + iov_iter_advance(from, wrote); + + return (wrote); } -#else + +#else /* !HAVE_VFS_RW_ITERATE */ + static ssize_t -zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp, +zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct file *file = kiocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *ip = mapping->host; - int isblk = S_ISBLK(ip->i_mode); + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; size_t count; ssize_t ret; - ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ); + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); - ret = generic_write_checks(file, &pos, &count, isblk); + uio_t uio; + 
uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); + + return (read); +} + +static ssize_t +zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + size_t count; + ssize_t ret; + + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (ret) return (ret); - return (zpl_iter_write_common(kiocb, iovp, nr_segs, count, - UIO_USERSPACE, 0)); + ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); + if (ret) + return (ret); + + uio_t uio; + uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_write(ITOZ(ip), &uio, + filp->f_flags | zfs_io_flags(kiocb), cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); } #endif /* HAVE_VFS_RW_ITERATE */ @@ -486,13 +463,26 @@ zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) #error "Unknown direct IO interface" #endif -#else +#else /* HAVE_VFS_RW_ITERATE */ #if defined(HAVE_VFS_DIRECT_IO_IOVEC) static ssize_t -zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp, +zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { + if (rw == WRITE) + return (zpl_aio_write(kiocb, iov, nr_segs, pos)); + else + return (zpl_aio_read(kiocb, iov, nr_segs, pos)); +} +#elif 
defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) +static ssize_t +zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) +{ + const struct iovec *iovp = iov_iter_iovec(iter); + unsigned long nr_segs = iter->nr_segs; + + ASSERT3S(pos, ==, kiocb->ki_pos); if (rw == WRITE) return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); else @@ -517,7 +507,7 @@ zpl_llseek(struct file *filp, loff_t offset, int whence) spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); - error = -zfs_holey(ip, whence, &offset); + error = -zfs_holey(ITOZ(ip), whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); @@ -603,10 +593,6 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). - * - * Current this function relies on zpl_read_common() and the O_DIRECT - * flag to read in a page. This works but the more correct way is to - * update zfs_fillpage() to be Linux friendly and use that interface. 
*/ static int zpl_readpage(struct file *filp, struct page *pp) @@ -675,10 +661,10 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) enum writeback_sync_modes sync_mode; int result; - ZFS_ENTER(zfsvfs); + ZPL_ENTER(zfsvfs); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); sync_mode = wbc->sync_mode; /* @@ -691,11 +677,11 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) wbc->sync_mode = WB_SYNC_NONE; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); if (sync_mode != wbc->sync_mode) { - ZFS_ENTER(zfsvfs); - ZFS_VERIFY_ZP(zp); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); - ZFS_EXIT(zfsvfs); + ZPL_EXIT(zfsvfs); /* * We need to call write_cache_pages() again (we can't just @@ -1037,6 +1023,10 @@ const struct file_operations zpl_file_operations = { #endif .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, +#ifdef HAVE_VFS_IOV_ITER + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, +#endif #else .read = do_sync_read, .write = do_sync_write, diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c index f3b97a22074c..f336fbb1272b 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c @@ -490,19 +490,17 @@ zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) { fstrans_cookie_t cookie; cred_t *cr = CRED(); - struct iovec iov; - uio_t uio = { { 0 }, 0 }; int error; crhold(cr); *link = NULL; + + struct iovec iov; iov.iov_len = MAXPATHLEN; iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = (MAXPATHLEN - 1); + uio_t uio; + uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0); cookie = 
spl_fstrans_mark(); error = -zfs_readlink(ip, &uio, cr); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c index 9db8bda4cc66..c2fd3fee1401 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c @@ -185,14 +185,27 @@ zpl_remount_fs(struct super_block *sb, int *flags, char *data) static int __zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs) { - char *fsname; + ZPL_ENTER(zfsvfs); - ZFS_ENTER(zfsvfs); - fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); dmu_objset_name(zfsvfs->z_os, fsname); - seq_puts(seq, fsname); + + for (int i = 0; fsname[i] != 0; i++) { + /* + * Spaces in the dataset name must be converted to their + * octal escape sequence for getmntent(3) to correctly + * parse the fsname portion of /proc/self/mounts. + */ + if (fsname[i] == ' ') { + seq_puts(seq, "\\040"); + } else { + seq_putc(seq, fsname[i]); + } + } + kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); - ZFS_EXIT(zfsvfs); + + ZPL_EXIT(zfsvfs); return (0); } diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c index 9b5fd0fd397b..1ec3dae2bb81 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c @@ -274,10 +274,10 @@ static int zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, size_t size, cred_t *cr) { + fstrans_cookie_t cookie; struct inode *xip = NULL; znode_t *dxzp = NULL; znode_t *xzp = NULL; - loff_t pos = 0; int error; /* Lookup the xattr directory */ @@ -302,7 +302,19 @@ zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, goto out; } - error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr); + struct iovec iov; + iov.iov_base = (void *)value; + iov.iov_len = size; + + uio_t uio; + uio_iovec_init(&uio, 
&iov, 1, 0, UIO_SYSSPACE, size, 0); + + cookie = spl_fstrans_mark(); + error = -zfs_read(ITOZ(xip), &uio, 0, cr); + spl_fstrans_unmark(cookie); + + if (error == 0) + error = size - uio_resid(&uio); out: if (xzp) zrele(xzp); @@ -441,7 +453,6 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, znode_t *dxzp = NULL; znode_t *xzp = NULL; vattr_t *vap = NULL; - ssize_t wrote; int lookup_flags, error; const int xattr_mode = S_IFREG | 0644; loff_t pos = 0; @@ -496,13 +507,8 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, if (error) goto out; - wrote = zpl_write_common(ZTOI(xzp), value, size, &pos, - UIO_SYSSPACE, 0, cr); - if (wrote < 0) - error = wrote; - + error = -zfs_write_simple(xzp, value, size, pos, NULL); out: - if (error == 0) { ip->i_ctime = current_time(ip); zfs_mark_inode_dirty(ip); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c index 218e1101edf8..cdc2076702af 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c @@ -66,49 +66,33 @@ typedef struct zv_request { * Given a path, return TRUE if path is a ZVOL. 
*/ static boolean_t -zvol_is_zvol_impl(const char *device) +zvol_is_zvol_impl(const char *path) { - struct block_device *bdev; - unsigned int major; + dev_t dev = 0; - bdev = vdev_lookup_bdev(device); - if (IS_ERR(bdev)) + if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); - major = MAJOR(bdev->bd_dev); - bdput(bdev); - - if (major == zvol_major) + if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } -static void -uio_from_bio(uio_t *uio, struct bio *bio) -{ - uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; - uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); - uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; - uio->uio_segflg = UIO_BVEC; - uio->uio_limit = MAXOFFSET_T; - uio->uio_resid = BIO_BI_SIZE(bio); - uio->uio_skip = BIO_BI_SKIP(bio); -} - static void zvol_write(void *arg) { - int error = 0; - zv_request_t *zvr = arg; struct bio *bio = zvr->bio; - uio_t uio = { { 0 }, 0 }; - uio_from_bio(&uio, bio); + int error = 0; + uio_t uio; + + uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; - ASSERT(zv && zv->zv_open_count > 0); - ASSERT(zv->zv_zilog != NULL); + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + ASSERT3P(zv->zv_zilog, !=, NULL); /* bio marked as FLUSH need to flush before write */ if (bio_is_flush(bio)) @@ -122,10 +106,14 @@ zvol_write(void *arg) return; } + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; - unsigned long start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE, - bio_sectors(bio), &zv->zv_zso->zvo_disk->part0); + unsigned long start_time; + + boolean_t acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); boolean_t sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; @@ -169,8 +157,10 @@ zvol_write(void *arg) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_zso->zvo_queue, - WRITE, 
&zv->zv_zso->zvo_disk->part0, start_jif); + + if (acct) + blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } @@ -187,14 +177,18 @@ zvol_discard(void *arg) boolean_t sync; int error = 0; dmu_tx_t *tx; - unsigned long start_jif; - ASSERT(zv && zv->zv_open_count > 0); - ASSERT(zv->zv_zilog != NULL); + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + ASSERT3P(zv->zv_zilog, !=, NULL); - start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE, - bio_sectors(bio), &zv->zv_zso->zvo_disk->part0); + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; + unsigned long start_time; + + boolean_t acct = blk_queue_io_stat(q); + if (acct) + start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; @@ -239,8 +233,10 @@ zvol_discard(void *arg) unlock: rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE, - &zv->zv_zso->zvo_disk->part0, start_jif); + + if (acct) + blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); + BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } @@ -248,20 +244,25 @@ zvol_discard(void *arg) static void zvol_read(void *arg) { - int error = 0; - zv_request_t *zvr = arg; struct bio *bio = zvr->bio; - uio_t uio = { { 0 }, 0 }; - uio_from_bio(&uio, bio); + int error = 0; + uio_t uio; + + uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; - ASSERT(zv && zv->zv_open_count > 0); + ASSERT3P(zv, !=, NULL); + ASSERT3U(zv->zv_open_count, >, 0); + struct request_queue *q = zv->zv_zso->zvo_queue; + struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; - unsigned long start_jif = jiffies; - blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio), - &zv->zv_zso->zvo_disk->part0); + unsigned long start_time; + + boolean_t acct = blk_queue_io_stat(q); + if 
(acct) + start_time = blk_generic_start_io_acct(q, disk, READ, bio); zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); @@ -289,8 +290,10 @@ zvol_read(void *arg) task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); - blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ, - &zv->zv_zso->zvo_disk->part0, start_jif); + + if (acct) + blk_generic_end_io_acct(q, disk, READ, bio, start_time); + BIO_END_IO(bio, -error); kmem_free(zvr, sizeof (zv_request_t)); } @@ -482,9 +485,9 @@ zvol_open(struct block_device *bdev, fmode_t flag) rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock)); if (zv->zv_open_count == 0) { + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); if (error) goto out_mutex; @@ -501,7 +504,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - check_disk_change(bdev); + zfs_check_media_change(bdev); return (0); @@ -530,7 +533,7 @@ zvol_release(struct gendisk *disk, fmode_t mode) zv = disk->private_data; mutex_enter(&zv->zv_state_lock); - ASSERT(zv->zv_open_count > 0); + ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition @@ -553,11 +556,12 @@ zvol_release(struct gendisk *disk, fmode_t mode) rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock)); zv->zv_open_count--; - if (zv->zv_open_count == 0) + if (zv->zv_open_count == 0) { + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); + } mutex_exit(&zv->zv_state_lock); @@ -652,8 +656,15 @@ zvol_revalidate_disk(struct gendisk *disk) static int zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) { + struct gendisk *disk = zv->zv_zso->zvo_disk; - 
revalidate_disk(zv->zv_zso->zvo_disk); +#if defined(HAVE_REVALIDATE_DISK_SIZE) + revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); +#elif defined(HAVE_REVALIDATE_DISK) + revalidate_disk(disk); +#else + zvol_revalidate_disk(disk); +#endif return (0); } @@ -697,46 +708,6 @@ zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) return (0); } -/* - * Find a zvol_state_t given the full major+minor dev_t. If found, - * return with zv_state_lock taken, otherwise, return (NULL) without - * taking zv_state_lock. - */ -static zvol_state_t * -zvol_find_by_dev(dev_t dev) -{ - zvol_state_t *zv; - - rw_enter(&zvol_state_lock, RW_READER); - for (zv = list_head(&zvol_state_list); zv != NULL; - zv = list_next(&zvol_state_list, zv)) { - mutex_enter(&zv->zv_state_lock); - if (zv->zv_zso->zvo_dev == dev) { - rw_exit(&zvol_state_lock); - return (zv); - } - mutex_exit(&zv->zv_state_lock); - } - rw_exit(&zvol_state_lock); - - return (NULL); -} - -static struct kobject * -zvol_probe(dev_t dev, int *part, void *arg) -{ - zvol_state_t *zv; - struct kobject *kobj; - - zv = zvol_find_by_dev(dev); - kobj = zv ? 
get_disk_and_module(zv->zv_zso->zvo_disk) : NULL; - ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock)); - if (zv) - mutex_exit(&zv->zv_state_lock); - - return (kobj); -} - static struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, @@ -774,6 +745,7 @@ zvol_alloc(dev_t dev, const char *name) zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; + zv->zv_volmode = volmode; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); @@ -859,8 +831,8 @@ zvol_free(zvol_state_t *zv) ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); - ASSERT(zv->zv_open_count == 0); - ASSERT(zv->zv_zso->zvo_disk->private_data == NULL); + ASSERT0(zv->zv_open_count); + ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); @@ -879,6 +851,11 @@ zvol_free(zvol_state_t *zv) kmem_free(zv, sizeof (zvol_state_t)); } +void +zvol_wait_close(zvol_state_t *zv) +{ +} + /* * Create a block device minor node and setup the linkage between it * and the specified volume. 
Once this function returns the block @@ -1083,9 +1060,6 @@ zvol_init(void) return (-ENOMEM); } zvol_init_impl(); - blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, - THIS_MODULE, zvol_probe, NULL, NULL); - ida_init(&zvol_ida); zvol_register_ops(&zvol_linux_ops); return (0); @@ -1095,7 +1069,6 @@ void zvol_fini(void) { zvol_fini_impl(); - blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); unregister_blkdev(zvol_major, ZVOL_DRIVER); taskq_destroy(zvol_taskq); ida_destroy(&zvol_ida); diff --git a/sys/contrib/openzfs/module/zcommon/Makefile.in b/sys/contrib/openzfs/module/zcommon/Makefile.in index b5cdf4c0c9fe..ebc538440445 100644 --- a/sys/contrib/openzfs/module/zcommon/Makefile.in +++ b/sys/contrib/openzfs/module/zcommon/Makefile.in @@ -19,7 +19,6 @@ $(MODULE)-objs += zfs_fletcher_superscalar.o $(MODULE)-objs += zfs_fletcher_superscalar4.o $(MODULE)-objs += zfs_namecheck.o $(MODULE)-objs += zfs_prop.o -$(MODULE)-objs += zfs_uio.o $(MODULE)-objs += zpool_prop.o $(MODULE)-objs += zprop_common.o diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c index 97ddacbab9e0..34ebabcf3b3c 100644 --- a/sys/contrib/openzfs/module/zcommon/zfeature_common.c +++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c @@ -576,7 +576,7 @@ zpool_feature_init(void) zfeature_register(SPA_FEATURE_DEVICE_REBUILD, "org.openzfs:device_rebuild", "device_rebuild", - "Support for sequential device rebuilds", + "Support for sequential mirror/dRAID device rebuilds", ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); { @@ -589,6 +589,10 @@ zpool_feature_init(void) "zstd compression algorithm support.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps); } + + zfeature_register(SPA_FEATURE_DRAID, + "org.openzfs:draid", "draid", "Support for distributed spare RAID", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); } #if defined(_KERNEL) diff --git 
a/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c index 3e0632a32864..7a9de4a4309d 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c @@ -660,7 +660,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ } -#define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ +#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */ typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, zio_cksum_t *); @@ -885,23 +885,26 @@ zio_abd_checksum_func_t fletcher_4_abd_ops = { .acf_iter = abd_fletcher_4_iter }; +#if defined(_KERNEL) -#if defined(_KERNEL) && defined(__linux__) +#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") + +#if defined(__linux__) static int fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) { const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); char *fmt; - int i, cnt = 0; + int cnt = 0; /* list fastest */ - fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s "; + fmt = IMPL_FMT(impl, IMPL_FASTEST); cnt += sprintf(buffer + cnt, fmt, "fastest"); /* list all supported implementations */ - for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { - fmt = (i == impl) ? 
"[%s] " : "%s "; + for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); cnt += sprintf(buffer + cnt, fmt, fletcher_4_supp_impls[i]->name); } @@ -915,14 +918,62 @@ fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) return (fletcher_4_impl_set(val)); } +#else + +#include + +static int +fletcher_4_param(ZFS_MODULE_PARAM_ARGS) +{ + int err; + + if (req->newptr == NULL) { + const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + const int init_buflen = 64; + const char *fmt; + struct sbuf *s; + + s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); + + /* list fastest */ + fmt = IMPL_FMT(impl, IMPL_FASTEST); + (void) sbuf_printf(s, fmt, "fastest"); + + /* list all supported implementations */ + for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) { + fmt = IMPL_FMT(impl, i); + (void) sbuf_printf(s, fmt, + fletcher_4_supp_impls[i]->name); + } + + err = sbuf_finish(s); + sbuf_delete(s); + + return (err); + } + + char buf[16]; + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (-fletcher_4_impl_set(buf)); +} + +#endif + +#undef IMPL_FMT + /* * Choose a fletcher 4 implementation in ZFS. * Users can choose "cycle" to exercise all implementations, but this is * for testing purpose therefore it can only be set in user space. 
*/ -module_param_call(zfs_fletcher_4_impl, - fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); -MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); +/* BEGIN CSTYLED */ +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl, + fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW, + "Select fletcher 4 implementation."); +/* END CSTYLED */ EXPORT_SYMBOL(fletcher_init); EXPORT_SYMBOL(fletcher_2_incremental_native); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c index f8625042a74c..0011a971cacb 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c @@ -442,7 +442,9 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (-1); } - if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (strcmp(pool, "mirror") == 0 || + strcmp(pool, "raidz") == 0 || + strcmp(pool, "draid") == 0) { if (why) *why = NAME_ERR_RESERVED; return (-1); diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c index 0352b13aa240..b78331187e13 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_prop.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c @@ -551,14 +551,14 @@ zfs_prop_init(void) PROP_INHERIT, ZFS_TYPE_FILESYSTEM, " | legacy | none", "MOUNTPOINT"); zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", - PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | NFS share options", "SHARENFS"); zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "filesystem | volume | snapshot | bookmark", "TYPE"); zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, - "on | off | sharemgr(1M) options", "SHARESMB"); + "on | off | SMB share options", "SHARESMB"); 
zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); @@ -1016,7 +1016,7 @@ zcommon_fini(void) kfpu_fini(); } -module_init(zcommon_init); +module_init_early(zcommon_init); module_exit(zcommon_fini); #endif diff --git a/sys/contrib/openzfs/module/zcommon/zfs_uio.c b/sys/contrib/openzfs/module/zcommon/zfs_uio.c index d586e0a1220a..e435e1a9f78a 100644 --- a/sys/contrib/openzfs/module/zcommon/zfs_uio.c +++ b/sys/contrib/openzfs/module/zcommon/zfs_uio.c @@ -39,12 +39,6 @@ * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ -/* - * The uio support from OpenSolaris has been added as a short term - * work around. The hope is to adopt native Linux type and drop the - * use of uio's entirely. Under Linux they only add overhead and - * when possible we want to use native APIs for the ZPL layer. - */ #ifdef _KERNEL #include @@ -71,7 +65,6 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) cnt = MIN(iov->iov_len - skip, n); switch (uio->uio_segflg) { case UIO_USERSPACE: - case UIO_USERISPACE: /* * p = kernel data pointer * iov->iov_base = user data pointer @@ -165,81 +158,82 @@ uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio) return (0); } +#if defined(HAVE_VFS_IOV_ITER) +static int +uiomove_iter(void *p, size_t n, enum uio_rw rw, struct uio *uio, + boolean_t revert) +{ + size_t cnt = MIN(n, uio->uio_resid); + + if (uio->uio_skip) + iov_iter_advance(uio->uio_iter, uio->uio_skip); + + if (rw == UIO_READ) + cnt = copy_to_iter(p, cnt, uio->uio_iter); + else + cnt = copy_from_iter(p, cnt, uio->uio_iter); + + /* + * When operating on a full pipe no bytes are processed. + * In which case return EFAULT which is converted to EAGAIN + * by the kernel's generic_file_splice_read() function. + */ + if (cnt == 0) + return (EFAULT); + + /* + * Revert advancing the uio_iter. This is set by uiocopy() + * to avoid consuming the uio and its iov_iter structure. 
+ */ + if (revert) + iov_iter_revert(uio->uio_iter, cnt); + + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + + return (0); +} +#endif + int uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) { - if (uio->uio_segflg != UIO_BVEC) - return (uiomove_iov(p, n, rw, uio)); - else + if (uio->uio_segflg == UIO_BVEC) return (uiomove_bvec(p, n, rw, uio)); +#if defined(HAVE_VFS_IOV_ITER) + else if (uio->uio_segflg == UIO_ITER) + return (uiomove_iter(p, n, rw, uio, B_FALSE)); +#endif + else + return (uiomove_iov(p, n, rw, uio)); } EXPORT_SYMBOL(uiomove); -#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) - -/* - * Fault in the pages of the first n bytes specified by the uio structure. - * 1 byte in each page is touched and the uio struct is unmodified. Any - * error will terminate the process as this is only a best attempt to get - * the pages resident. - */ int uio_prefaultpages(ssize_t n, struct uio *uio) { - const struct iovec *iov; - ulong_t cnt, incr; - caddr_t p; - uint8_t tmp; - int iovcnt; - size_t skip; + struct iov_iter iter, *iterp = NULL; - /* no need to fault in kernel pages */ - switch (uio->uio_segflg) { - case UIO_SYSSPACE: - case UIO_BVEC: - return (0); - case UIO_USERSPACE: - case UIO_USERISPACE: - break; - default: - ASSERT(0); - } - - iov = uio->uio_iov; - iovcnt = uio->uio_iovcnt; - skip = uio->uio_skip; - - for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { - cnt = MIN(iov->iov_len - skip, n); - /* empty iov */ - if (cnt == 0) - continue; - n -= cnt; - /* - * touch each page in this segment. - */ - p = iov->iov_base + skip; - while (cnt) { - if (fuword8((uint8_t *)p, &tmp)) - return (EFAULT); - incr = MIN(cnt, PAGESIZE); - p += incr; - cnt -= incr; - } - /* - * touch the last byte in case it straddles a page. 
- */ - p--; - if (fuword8((uint8_t *)p, &tmp)) - return (EFAULT); +#if defined(HAVE_IOV_ITER_FAULT_IN_READABLE) + if (uio->uio_segflg == UIO_USERSPACE) { + iterp = &iter; + iov_iter_init_compat(iterp, READ, uio->uio_iov, + uio->uio_iovcnt, uio->uio_resid); +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + iterp = uio->uio_iter; +#endif } + if (iterp && iov_iter_fault_in_readable(iterp, n)) + return (EFAULT); +#endif return (0); } EXPORT_SYMBOL(uio_prefaultpages); /* - * same as uiomove() but doesn't modify uio structure. + * The same as uiomove() but doesn't modify uio structure. * return in cbytes how many bytes were copied. */ int @@ -249,39 +243,54 @@ uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) int ret; bcopy(uio, &uio_copy, sizeof (struct uio)); - ret = uiomove(p, n, rw, &uio_copy); + + if (uio->uio_segflg == UIO_BVEC) + ret = uiomove_bvec(p, n, rw, &uio_copy); +#if defined(HAVE_VFS_IOV_ITER) + else if (uio->uio_segflg == UIO_ITER) + ret = uiomove_iter(p, n, rw, &uio_copy, B_TRUE); +#endif + else + ret = uiomove_iov(p, n, rw, &uio_copy); + *cbytes = uio->uio_resid - uio_copy.uio_resid; + return (ret); } EXPORT_SYMBOL(uiocopy); /* - * Drop the next n chars out of *uiop. + * Drop the next n chars out of *uio. 
*/ void -uioskip(uio_t *uiop, size_t n) +uioskip(uio_t *uio, size_t n) { - if (n > uiop->uio_resid) + if (n > uio->uio_resid) return; - uiop->uio_skip += n; - if (uiop->uio_segflg != UIO_BVEC) { - while (uiop->uio_iovcnt && - uiop->uio_skip >= uiop->uio_iov->iov_len) { - uiop->uio_skip -= uiop->uio_iov->iov_len; - uiop->uio_iov++; - uiop->uio_iovcnt--; + if (uio->uio_segflg == UIO_BVEC) { + uio->uio_skip += n; + while (uio->uio_iovcnt && + uio->uio_skip >= uio->uio_bvec->bv_len) { + uio->uio_skip -= uio->uio_bvec->bv_len; + uio->uio_bvec++; + uio->uio_iovcnt--; } +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + iov_iter_advance(uio->uio_iter, n); +#endif } else { - while (uiop->uio_iovcnt && - uiop->uio_skip >= uiop->uio_bvec->bv_len) { - uiop->uio_skip -= uiop->uio_bvec->bv_len; - uiop->uio_bvec++; - uiop->uio_iovcnt--; + uio->uio_skip += n; + while (uio->uio_iovcnt && + uio->uio_skip >= uio->uio_iov->iov_len) { + uio->uio_skip -= uio->uio_iov->iov_len; + uio->uio_iov++; + uio->uio_iovcnt--; } } - uiop->uio_loffset += n; - uiop->uio_resid -= n; + uio->uio_loffset += n; + uio->uio_resid -= n; } EXPORT_SYMBOL(uioskip); #endif /* _KERNEL */ diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in index 259ac4dc926c..653ea0da9bcc 100644 --- a/sys/contrib/openzfs/module/zfs/Makefile.in +++ b/sys/contrib/openzfs/module/zfs/Makefile.in @@ -84,6 +84,8 @@ $(MODULE)-objs += uberblock.o $(MODULE)-objs += unique.o $(MODULE)-objs += vdev.o $(MODULE)-objs += vdev_cache.o +$(MODULE)-objs += vdev_draid.o +$(MODULE)-objs += vdev_draid_rand.o $(MODULE)-objs += vdev_indirect.o $(MODULE)-objs += vdev_indirect_births.o $(MODULE)-objs += vdev_indirect_mapping.o @@ -120,6 +122,7 @@ $(MODULE)-objs += zfs_ratelimit.o $(MODULE)-objs += zfs_replay.o $(MODULE)-objs += zfs_rlock.o $(MODULE)-objs += zfs_sa.o +$(MODULE)-objs += zfs_vnops.o $(MODULE)-objs += zil.o $(MODULE)-objs += zio.o $(MODULE)-objs += zio_checksum.o 
diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c index 6018a42ca0d8..68d4aa5f5cb4 100644 --- a/sys/contrib/openzfs/module/zfs/abd.c +++ b/sys/contrib/openzfs/module/zfs/abd.c @@ -781,16 +781,17 @@ int abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_iter_func_t *func, void *private) { - int ret = 0; struct abd_iter aiter; - boolean_t abd_multi; - abd_t *c_abd; + int ret = 0; + + if (size == 0) + return (0); abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - abd_multi = abd_is_gang(abd); - c_abd = abd_init_abd_iter(abd, &aiter, off); + boolean_t abd_multi = abd_is_gang(abd); + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { /* If we are at the end of the gang ABD we are done */ @@ -920,6 +921,9 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; + if (size == 0) + return (0); + abd_verify(dabd); abd_verify(sabd); diff --git a/sys/contrib/openzfs/module/zfs/aggsum.c b/sys/contrib/openzfs/module/zfs/aggsum.c index a2fec27744e1..e46da95f676c 100644 --- a/sys/contrib/openzfs/module/zfs/aggsum.c +++ b/sys/contrib/openzfs/module/zfs/aggsum.c @@ -70,6 +70,11 @@ * zeroing out the borrowed value (forcing that thread to borrow on its next * request, which will also be expensive). This is what makes aggsums well * suited for write-many read-rarely operations. + * + * Note that the aggsums do not expand if more CPUs are hot-added. In that + * case, we will have less fanout than boot_ncpus, but we don't want to always + * reserve the RAM necessary to create the extra slots for additional CPUs up + * front, and dynamically adding them is a complex task. 
*/ /* @@ -167,9 +172,7 @@ aggsum_add(aggsum_t *as, int64_t delta) struct aggsum_bucket *asb; int64_t borrow; - kpreempt_disable(); - asb = &as->as_buckets[CPU_SEQID % as->as_numbuckets]; - kpreempt_enable(); + asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets]; /* Try fast path if we already borrowed enough before. */ mutex_enter(&asb->asc_lock); diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index 68508cf152a8..c21ae27b9af8 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -492,6 +492,8 @@ arc_stats_t arc_stats = { { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, @@ -533,6 +535,11 @@ arc_stats_t arc_stats = { { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, + { "l2_mru_asize", KSTAT_DATA_UINT64 }, + { "l2_mfu_asize", KSTAT_DATA_UINT64 }, + { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, + { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, @@ -894,6 +901,17 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); +static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, + boolean_t state_only); + +#define l2arc_hdr_arcstats_increment(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) +#define l2arc_hdr_arcstats_decrement(hdr) \ + 
l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) +#define l2arc_hdr_arcstats_increment_state(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) +#define l2arc_hdr_arcstats_decrement_state(hdr) \ + l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU @@ -951,7 +969,7 @@ static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, - const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); @@ -1727,7 +1745,7 @@ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, - boolean_t prefetch) + boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; @@ -1751,6 +1769,7 @@ arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; + hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } @@ -2312,7 +2331,11 @@ add_reference(arc_buf_hdr_t *hdr, void *tag) arc_evictable_space_decrement(hdr, state); } /* remove the prefetch flag if we get a reference */ + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } } @@ -2595,9 +2618,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, } } - if (HDR_HAS_L1HDR(hdr)) + if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; + if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { + l2arc_hdr_arcstats_decrement_state(hdr); + hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; + l2arc_hdr_arcstats_increment_state(hdr); + } 
+ } + /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. @@ -3684,6 +3714,76 @@ arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, return (buf); } +static void +l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, + boolean_t state_only) +{ + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + l2arc_dev_t *dev = l2hdr->b_dev; + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + arc_buf_contents_t type = hdr->b_type; + int64_t lsize_s; + int64_t psize_s; + int64_t asize_s; + + if (incr) { + lsize_s = lsize; + psize_s = psize; + asize_s = asize; + } else { + lsize_s = -lsize; + psize_s = -psize; + asize_s = -asize; + } + + /* If the buffer is a prefetch, count it as such. */ + if (HDR_PREFETCH(hdr)) { + ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); + } else { + /* + * We use the value stored in the L2 header upon initial + * caching in L2ARC. This value will be updated in case + * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC + * metadata (log entry) cannot currently be updated. Having + * the ARC state in the L2 header solves the problem of a + * possibly absent L1 header (apparent in buffers restored + * from persistent L2ARC). 
+ */ + switch (hdr->b_l2hdr.b_arcs_state) { + case ARC_STATE_MRU_GHOST: + case ARC_STATE_MRU: + ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); + break; + case ARC_STATE_MFU_GHOST: + case ARC_STATE_MFU: + ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); + break; + default: + break; + } + } + + if (state_only) + return; + + ARCSTAT_INCR(arcstat_l2_psize, psize_s); + ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); + + switch (type) { + case ARC_BUFC_DATA: + ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); + break; + case ARC_BUFC_METADATA: + ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); + break; + default: + break; + } +} + + static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { @@ -3697,9 +3797,7 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) list_remove(&dev->l2ad_buflist, hdr); - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); - + l2arc_hdr_arcstats_decrement(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), @@ -3903,6 +4001,21 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); + + switch (state->arcs_state) { + case ARC_STATE_MRU: + ARCSTAT_INCR( + arcstat_evict_l2_eligible_mru, + HDR_GET_LSIZE(hdr)); + break; + case ARC_STATE_MFU: + ARCSTAT_INCR( + arcstat_evict_l2_eligible_mfu, + HDR_GET_LSIZE(hdr)); + break; + default: + break; + } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); @@ -4769,14 +4882,7 @@ arc_kmem_reap_soon(void) static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { - /* - * This is necessary so that any changes which may have been made to - * many of the zfs_arc_* module parameters will be propagated to - * their actual internal variable counterparts. Without this, - * changing those module params at runtime would have no effect. 
- */ - arc_tuning_update(B_FALSE); - +#ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the @@ -4784,12 +4890,11 @@ arc_evict_cb_check(void *arg, zthr_t *zthr) * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the - * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even - * with this change, the data might be up to 1 second - * out of date(the arc_evict_zthr has a maximum sleep - * time of 1 second); but that should suffice. The - * arc_state_t structures can be queried directly if more - * accurate information is needed. + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this call, the data might be out of date if the + * evict thread hasn't been woken recently; but that should + * suffice. The arc_state_t structures can be queried + * directly if more accurate information is needed. */ #ifndef __FreeBSD__ if (arc_ksp != NULL) @@ -5347,11 +5452,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } hdr->b_l1hdr.b_arc_access = now; return; @@ -5382,13 +5491,16 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * was evicted from the cache. Move it to the * MFU state. 
*/ - if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { @@ -5641,7 +5753,7 @@ arc_read_done(zio_t *zio) */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { - if (!acb->acb_done) + if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; @@ -5806,6 +5918,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); + boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; int rc = 0; ASSERT(!embedded_bp || @@ -5890,6 +6003,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; + acb->acb_nobuf = no_buf; acb->acb_zb = *zb; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, @@ -5899,8 +6013,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; - mutex_exit(hash_lock); - goto out; } mutex_exit(hash_lock); goto out; @@ -5909,7 +6021,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu); - if (done) { + if (done && !no_buf) { if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { /* * This is a demand read which does not have to @@ -5963,8 +6075,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + 
zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); @@ -6108,8 +6224,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, } if (*arc_flags & ARC_FLAG_PREFETCH && - zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_decrement_state(hdr); arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (HDR_HAS_L2HDR(hdr)) + l2arc_hdr_arcstats_increment_state(hdr); + } if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) @@ -6178,7 +6299,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, metadata, misses); } - if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { + /* Check if the spa even has l2 configured */ + const boolean_t spa_has_l2 = l2arc_ndev != 0 && + spa->spa_l2cache.sav_count > 0; + + if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. @@ -6186,7 +6311,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. - * 5. This isn't prefetch and l2arc_noprefetch is set. + * 5. This isn't prefetch or l2arc_noprefetch is 0. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && @@ -6279,15 +6404,24 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); + /* - * Skip ARC stat bump for block pointers with - * embedded data. The data are read from the blkptr - * itself via decode_embedded_bp_compressed(). 
+ * Only a spa with l2 should contribute to l2 + * miss stats. (Including the case of having a + * faulted cache device - that's also a miss.) */ - if (l2arc_ndev != 0 && !embedded_bp) { - DTRACE_PROBE1(l2arc__miss, - arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_misses); + if (spa_has_l2) { + /* + * Skip ARC stat bump for block pointers with + * embedded data. The data are read from the + * blkptr itself via + * decode_embedded_bp_compressed(). + */ + if (!embedded_bp) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } } } @@ -7072,9 +7206,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); - - if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && - anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && + uint64_t rarc_c = arc_warm ? arc_c : arc_c_max; + if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && + anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( @@ -7082,9 +7216,9 @@ arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " - "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", arc_tempreserve >> 10, meta_esize >> 10, - data_esize >> 10, reserve >> 10, arc_c >> 10); + data_esize >> 10, reserve >> 10, rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); @@ -7451,6 +7585,15 @@ arc_target_bytes(void) return (arc_c); } +void +arc_set_limits(uint64_t allmem) +{ + /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. 
*/ + arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); + + /* How to set default max varies by platform. */ + arc_c_max = arc_default_max(arc_c_min, allmem); +} void arc_init(void) { @@ -7466,11 +7609,7 @@ arc_init(void) arc_lowmem_init(); #endif - /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ - arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); - - /* How to set default max varies by platform. */ - arc_c_max = arc_default_max(arc_c_min, allmem); + arc_set_limits(allmem); #ifndef _KERNEL /* @@ -7507,6 +7646,8 @@ arc_init(void) if (arc_c < arc_c_min) arc_c = arc_c_min; + arc_register_hotplug(); + arc_state_init(); buf_init(); @@ -7515,8 +7656,9 @@ arc_init(void) offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri, - boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri, + boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); @@ -7527,8 +7669,8 @@ arc_init(void) kstat_install(arc_ksp); } - arc_evict_zthr = zthr_create_timer("arc_evict", - arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1)); + arc_evict_zthr = zthr_create("arc_evict", + arc_evict_cb_check, arc_evict_cb, NULL); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1)); @@ -7613,6 +7755,8 @@ arc_fini(void) buf_fini(); arc_state_fini(); + arc_unregister_hotplug(); + /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any @@ -8068,9 +8212,6 @@ l2arc_write_done(zio_t *zio) DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); - if (zio->io_error != 0) - ARCSTAT_BUMP(arcstat_l2_writes_error); - /* * All writes completed, or an 
error was hit. */ @@ -8134,8 +8275,7 @@ l2arc_write_done(zio_t *zio) arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); - ARCSTAT_INCR(arcstat_l2_psize, -psize); - ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); + l2arc_hdr_arcstats_decrement(hdr); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); @@ -8183,6 +8323,8 @@ l2arc_write_done(zio_t *zio) list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_writes_error); + /* * Restore the lbps array in the header to its previous state. * If the list of log block pointers is empty, zero out the @@ -8748,9 +8890,16 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) goto top; } - ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); - if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + if (!all) { + /* + * In case of cache device removal (all) the following + * assertions may be violated without functional consequences + * as the device is about to be removed. 
+ */ + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + } } /* @@ -9089,6 +9238,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + hdr->b_l2hdr.b_arcs_state = + hdr->b_l1hdr.b_state->arcs_state; arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); mutex_enter(&dev->l2ad_mtx); @@ -9111,6 +9262,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; + l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_exit(hash_lock); @@ -9153,8 +9305,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); - ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); - ARCSTAT_INCR(arcstat_l2_psize, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); @@ -9379,8 +9529,6 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) l2arc_dev_hdr_phys_t *l2dhdr; uint64_t l2dhdr_asize; spa_t *spa; - int err; - boolean_t l2dhdr_valid = B_TRUE; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); @@ -9409,10 +9557,7 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) /* * Read the device header, if an error is returned do not rebuild L2ARC. */ - if ((err = l2arc_dev_hdr_read(dev)) != 0) - l2dhdr_valid = B_FALSE; - - if (l2dhdr_valid && dev->l2ad_log_entries > 0) { + if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, @@ -9712,7 +9857,7 @@ l2arc_rebuild(l2arc_dev_t *dev) * L2BLK_GET_PSIZE returns aligned size for log blocks. 
*/ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); - l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); + l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of @@ -9759,6 +9904,7 @@ l2arc_rebuild(l2arc_dev_t *dev) !dev->l2ad_first) goto out; + cond_resched(); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { @@ -9792,7 +9938,7 @@ l2arc_rebuild(l2arc_dev_t *dev) PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; - } + } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); @@ -9859,7 +10005,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, - ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); @@ -10030,7 +10176,7 @@ l2arc_log_blk_read(l2arc_dev_t *dev, */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, - uint64_t lb_asize, uint64_t lb_daddr) + uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; @@ -10104,19 +10250,18 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) L2BLK_GET_PSIZE((le)->le_prop), le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), - L2BLK_GET_PREFETCH((le)->le_prop)); + L2BLK_GET_PREFETCH((le)->le_prop), + L2BLK_GET_STATE((le)->le_prop)); asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to - * avoid underflow since the latter also calls the former. + * avoid underflow since the latter also calls vdev_space_update(). 
*/ + l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr)); - ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr)); - mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); @@ -10136,14 +10281,15 @@ l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; + exists->b_l2hdr.b_arcs_state = + L2BLK_GET_STATE((le)->le_prop); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); + l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); - ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists)); - ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists)); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } @@ -10439,6 +10585,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); + L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); @@ -10607,5 +10754,8 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, - "When full, ARC allocation waits for eviction of this % of alloc size"); + "When full, ARC allocation waits for eviction of this % of alloc size"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW, + "The number of headers to evict per sublist before moving to the next"); /* END CSTYLED */ diff 
--git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c index 7d817320aae4..93445a80294b 100644 --- a/sys/contrib/openzfs/module/zfs/dbuf.c +++ b/sys/contrib/openzfs/module/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2019, Klara Inc. @@ -1973,6 +1973,74 @@ dbuf_redirty(dbuf_dirty_record_t *dr) } } +dbuf_dirty_record_t * +dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid); + dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE); + ASSERT(dn->dn_maxblkid >= blkid); + + dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP); + list_link_init(&dr->dr_dirty_node); + list_link_init(&dr->dr_dbuf_node); + dr->dr_dnode = dn; + dr->dr_txg = tx->tx_txg; + dr->dt.dll.dr_blkid = blkid; + dr->dr_accounted = dn->dn_datablksz; + + /* + * There should not be any dbuf for the block that we're dirtying. + * Otherwise the buffer contents could be inconsistent between the + * dbuf and the lightweight dirty record. 
+ */ + ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)); + + mutex_enter(&dn->dn_mtx); + int txgoff = tx->tx_txg & TXG_MASK; + if (dn->dn_free_ranges[txgoff] != NULL) { + range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1); + } + + if (dn->dn_nlevels == 1) { + ASSERT3U(blkid, <, dn->dn_nblkptr); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); + dnode_setdirty(dn, tx); + } else { + mutex_exit(&dn->dn_mtx); + + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent_db = dbuf_hold_level(dn, + 1, blkid >> epbs, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (parent_db == NULL) { + kmem_free(dr, sizeof (*dr)); + return (NULL); + } + int err = dbuf_read(parent_db, NULL, + (DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err != 0) { + dbuf_rele(parent_db, FTAG); + kmem_free(dr, sizeof (*dr)); + return (NULL); + } + + dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx); + dbuf_rele(parent_db, FTAG); + mutex_enter(&parent_dr->dt.di.dr_mtx); + ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg); + list_insert_tail(&parent_dr->dt.di.dr_children, dr); + mutex_exit(&parent_dr->dt.di.dr_mtx); + dr->dr_parent = parent_dr; + } + + dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx); + + return (dr); +} + dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { @@ -2090,6 +2158,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); list_link_init(&dr->dr_dirty_node); list_link_init(&dr->dr_dbuf_node); + dr->dr_dnode = dn; if (db->db_level == 0) { void *data_old = db->db_buf; @@ -2255,7 +2324,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) dmu_buf_impl_t *db = dr->dr_dbuf; if (dr->dt.dl.dr_data != db->db.db_data) { - struct dnode *dn = DB_DNODE(db); + struct dnode *dn = dr->dr_dnode; int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); kmem_free(dr->dt.dl.dr_data, max_bonuslen); @@ -2280,9 +2349,7 @@ 
dbuf_undirty_bonus(dbuf_dirty_record_t *dr) static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - dnode_t *dn; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr; ASSERT(txg != 0); @@ -2302,13 +2369,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - dr = dbuf_find_dirty_eq(db, txg); + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); + dnode_t *dn = dr->dr_dnode; dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -2336,7 +2402,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } - DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); @@ -2627,11 +2692,9 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); arc_buf_destroy(buf, db); - xuio_stat_wbuf_copied(); return; } - xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); @@ -3003,8 +3066,29 @@ typedef struct dbuf_prefetch_arg { zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. 
*/ + dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */ + void *dpa_arg; /* prefetch completion arg */ } dbuf_prefetch_arg_t; +static void +dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done) +{ + if (dpa->dpa_cb != NULL) + dpa->dpa_cb(dpa->dpa_arg, io_done); + kmem_free(dpa, sizeof (*dpa)); +} + +static void +dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + + dbuf_prefetch_fini(dpa, B_TRUE); + if (abuf != NULL) + arc_buf_destroy(abuf, private); +} + /* * Actually issue the prefetch read for the block given. */ @@ -3017,11 +3101,12 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) - return; + return (dbuf_prefetch_fini(dpa, B_FALSE)); int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = - dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_NO_BUF; /* dnodes are always read as raw and then converted later */ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && @@ -3031,7 +3116,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); - (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, + dbuf_issue_final_prefetch_done, dpa, dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } @@ -3051,8 +3137,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, if (abuf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); - kmem_free(dpa, sizeof (*dpa)); - return; + return (dbuf_prefetch_fini(dpa, B_TRUE)); } ASSERT(zio == NULL || zio->io_error == 0); @@ -3084,11 +3169,9 @@ 
dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, dpa->dpa_curlevel, curblkid, FTAG); if (db == NULL) { - kmem_free(dpa, sizeof (*dpa)); arc_buf_destroy(abuf, private); - return; + return (dbuf_prefetch_fini(dpa, B_TRUE)); } - (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); dbuf_rele(db, FTAG); @@ -3105,11 +3188,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dpa->dpa_dnode->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { - kmem_free(dpa, sizeof (*dpa)); + dbuf_prefetch_fini(dpa, B_TRUE); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); dbuf_issue_final_prefetch(dpa, bp); - kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; @@ -3139,9 +3221,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, * complete. Note that the prefetch might fail if the dataset is encrypted and * the encryption key is unmapped before the IO completes. 
*/ -void -dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, - arc_flags_t aflags) +int +dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb, + void *arg) { blkptr_t bp; int epbs, nlevels, curlevel; @@ -3151,10 +3234,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (blkid > dn->dn_maxblkid) - return; + goto no_issue; if (level == 0 && dnode_block_freed(dn, blkid)) - return; + goto no_issue; /* * This dnode hasn't been written to disk yet, so there's nothing to @@ -3162,11 +3245,11 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, */ nlevels = dn->dn_phys->dn_nlevels; if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) - return; + goto no_issue; epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) - return; + goto no_issue; dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); @@ -3176,7 +3259,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, * This dbuf already exists. It is either CACHED, or * (we assume) about to be read or filled. 
*/ - return; + goto no_issue; } /* @@ -3212,7 +3295,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, SPA_FEATURE_REDACTED_DATASETS)); if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) - return; + goto no_issue; ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); @@ -3230,6 +3313,8 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, dpa->dpa_dnode = dn; dpa->dpa_epbs = epbs; dpa->dpa_zio = pio; + dpa->dpa_cb = cb; + dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) @@ -3245,7 +3330,6 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, if (curlevel == level) { ASSERT3U(curblkid, ==, blkid); dbuf_issue_final_prefetch(dpa, &bp); - kmem_free(dpa, sizeof (*dpa)); } else { arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; zbookmark_phys_t zb; @@ -3266,6 +3350,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, * dpa may have already been freed. 
*/ zio_nowait(pio); + return (1); +no_issue: + if (cb != NULL) + cb(arg, B_FALSE); + return (0); +} + +int +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) +{ + + return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL)); } /* @@ -3803,15 +3900,13 @@ dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(DB_DNODE_HELD(db)); ASSERT(db->db_blkid == DMU_BONUS_BLKID); ASSERT(data != NULL); - dnode_t *dn = DB_DNODE(db); + dnode_t *dn = dr->dr_dnode; ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys)); - DB_DNODE_EXIT(db); dbuf_sync_leaf_verify_bonus_dnode(dr); @@ -3870,8 +3965,7 @@ noinline static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; - zio_t *zio; + dnode_t *dn = dr->dr_dnode; ASSERT(dmu_tx_is_syncing(tx)); @@ -3891,12 +3985,9 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. 
*/ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); - DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; @@ -3905,7 +3996,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_write(dr, db->db_buf, tx); - zio = dr->dr_zio; + zio_t *zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -3930,7 +4021,7 @@ static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) { #ifdef ZFS_DEBUG - dnode_t *dn = DB_DNODE(dr->dr_dbuf); + dnode_t *dn = dr->dr_dnode; /* * Encrypted bonus buffers can have data past their bonuslen. @@ -3953,6 +4044,153 @@ dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr) #endif } +static blkptr_t * +dbuf_lightweight_bp(dbuf_dirty_record_t *dr) +{ + /* This must be a lightweight dirty record. */ + ASSERT3P(dr->dr_dbuf, ==, NULL); + dnode_t *dn = dr->dr_dnode; + + if (dn->dn_phys->dn_nlevels == 1) { + VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr); + return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]); + } else { + dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + VERIFY3U(parent_db->db_level, ==, 1); + VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn); + VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid); + blkptr_t *bp = parent_db->db.db_data; + return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]); + } +} + +static void +dbuf_lightweight_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + blkptr_t *bp = zio->io_bp; + + if (zio->io_error != 0) + return; + + dnode_t *dn = dr->dr_dnode; + + blkptr_t *bp_orig = dbuf_lightweight_bp(dr); + spa_t *spa = dmu_objset_spa(dn->dn_objset); + int64_t delta = bp_get_dsize_sync(spa, bp) - + bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta); + + uint64_t blkid =
dr->dt.dll.dr_blkid; + mutex_enter(&dn->dn_mtx); + if (blkid > dn->dn_phys->dn_maxblkid) { + ASSERT0(dn->dn_objset->os_raw_receive); + dn->dn_phys->dn_maxblkid = blkid; + } + mutex_exit(&dn->dn_mtx); + + if (!BP_IS_EMBEDDED(bp)) { + uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1; + BP_SET_FILL(bp, fill); + } + + dmu_buf_impl_t *parent_db; + EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1); + if (dr->dr_parent == NULL) { + parent_db = dn->dn_dbuf; + } else { + parent_db = dr->dr_parent->dr_dbuf; + } + rw_enter(&parent_db->db_rwlock, RW_WRITER); + *bp_orig = *bp; + rw_exit(&parent_db->db_rwlock); +} + +static void +dbuf_lightweight_physdone(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dsl_pool_t *dp = spa_get_dsl(zio->io_spa); + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dbuf_lightweight_done(). + */ + int delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); +} + +static void +dbuf_lightweight_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + + VERIFY0(zio->io_error); + + objset_t *os = dr->dr_dnode->dn_objset; + dmu_tx_t *tx = os->os_synctx; + + if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { + ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, zio->io_bp, tx); + } + + /* + * See comment in dbuf_write_done(). 
+ */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } + + abd_free(dr->dt.dll.dr_abd); + kmem_free(dr, sizeof (*dr)); +} + +noinline static void +dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dnode_t *dn = dr->dr_dnode; + zio_t *pio; + if (dn->dn_phys->dn_nlevels == 1) { + pio = dn->dn_zio; + } else { + pio = dr->dr_parent->dr_zio; + } + + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(dn->dn_objset), + .zb_object = dn->dn_object, + .zb_level = 0, + .zb_blkid = dr->dt.dll.dr_blkid, + }; + + /* + * See comment in dbuf_write(). This is so that zio->io_bp_orig + * will have the old BP in dbuf_lightweight_done(). + */ + dr->dr_bp_copy = *dbuf_lightweight_bp(dr); + + dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset), + dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, + dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), + &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, + dbuf_lightweight_physdone, dbuf_lightweight_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); + + zio_nowait(dr->dr_zio); +} + /* * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is * critical the we not allow the compiler to inline this function in to @@ -3963,7 +4201,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; + dnode_t *dn = dr->dr_dnode; objset_t *os; uint64_t txg = tx->tx_txg; @@ -3987,9 +4225,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } DBUF_VERIFY(db); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); - if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { @@ -4079,16 +4314,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) 
ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr); - DB_DNODE_EXIT(db); } else { - /* - * Although zio_nowait() does not "wait for an IO", it does - * initiate the IO. If this is an empty write it seems plausible - * that the IO could actually be completed before the nowait - * returns. We need to DB_DNODE_EXIT() first in case - * zio_nowait() invalidates the dbuf. - */ - DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } @@ -4111,15 +4337,19 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) DMU_META_DNODE_OBJECT); break; } - if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - VERIFY3U(dr->dr_dbuf->db_level, ==, level); - } list_remove(list, dr); - if (dr->dr_dbuf->db_level > 0) - dbuf_sync_indirect(dr, tx); - else - dbuf_sync_leaf(dr, tx); + if (dr->dr_dbuf == NULL) { + dbuf_sync_lightweight(dr, tx); + } else { + if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + VERIFY3U(dr->dr_dbuf->db_level, ==, level); + } + if (dr->dr_dbuf->db_level > 0) + dbuf_sync_indirect(dr, tx); + else + dbuf_sync_leaf(dr, tx); + } } } @@ -4299,7 +4529,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; - dbuf_dirty_record_t *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); @@ -4320,7 +4549,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) DBUF_VERIFY(db); - dr = db->db_data_pending; + dbuf_dirty_record_t *dr = db->db_data_pending; + dnode_t *dn = dr->dr_dnode; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(list_next(&db->db_dirty_records, dr) == NULL); @@ -4328,14 +4558,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); 
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys)); - DB_DNODE_EXIT(db); } #endif @@ -4347,10 +4572,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { - dnode_t *dn; - - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { @@ -4361,7 +4582,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); } - DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } @@ -4554,7 +4774,7 @@ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn; + dnode_t *dn = dr->dr_dnode; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; @@ -4565,8 +4785,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); - DB_DNODE_ENTER(db); - dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { @@ -4622,7 +4840,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) wp_flag |= (db->db_state == DB_NOFILL) ? 
WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); - DB_DNODE_EXIT(db); /* * We copy the blkptr now (rather than when we instantiate the dirty diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index 2c96645214f8..a02f43df13fd 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -499,7 +499,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; - zio_t *zio; + zio_t *zio = NULL; ASSERT(length <= DMU_MAX_ACCESS); @@ -531,14 +531,17 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); - zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + if (read) + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, + ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); - zio_nowait(zio); + if (read) + zio_nowait(zio); return (SET_ERROR(EIO)); } @@ -555,15 +558,15 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, } rw_exit(&dn->dn_struct_rwlock); - /* wait for async i/o */ - err = zio_wait(zio); - if (err) { - dmu_buf_rele_array(dbp, nblks, tag); - return (err); - } - - /* wait for other io to complete */ if (read) { + /* wait for async read i/o */ + err = zio_wait(zio); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); @@ -1165,165 +1168,12 @@ dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } -/* - * DMU support for xuio - */ -kstat_t *xuio_ksp = NULL; - -typedef struct 
xuio_stats { - /* loaned yet not returned arc_buf */ - kstat_named_t xuiostat_onloan_rbuf; - kstat_named_t xuiostat_onloan_wbuf; - /* whether a copy is made when loaning out a read buffer */ - kstat_named_t xuiostat_rbuf_copied; - kstat_named_t xuiostat_rbuf_nocopy; - /* whether a copy is made when assigning a write buffer */ - kstat_named_t xuiostat_wbuf_copied; - kstat_named_t xuiostat_wbuf_nocopy; -} xuio_stats_t; - -static xuio_stats_t xuio_stats = { - { "onloan_read_buf", KSTAT_DATA_UINT64 }, - { "onloan_write_buf", KSTAT_DATA_UINT64 }, - { "read_buf_copied", KSTAT_DATA_UINT64 }, - { "read_buf_nocopy", KSTAT_DATA_UINT64 }, - { "write_buf_copied", KSTAT_DATA_UINT64 }, - { "write_buf_nocopy", KSTAT_DATA_UINT64 } -}; - -#define XUIOSTAT_INCR(stat, val) \ - atomic_add_64(&xuio_stats.stat.value.ui64, (val)) -#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) - -#ifdef HAVE_UIO_ZEROCOPY -int -dmu_xuio_init(xuio_t *xuio, int nblk) -{ - dmu_xuio_t *priv; - uio_t *uio = &xuio->xu_uio; - - uio->uio_iovcnt = nblk; - uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); - - priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); - priv->cnt = nblk; - priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); - priv->iovp = (iovec_t *)uio->uio_iov; - XUIO_XUZC_PRIV(xuio) = priv; - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); - - return (0); -} - -void -dmu_xuio_fini(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int nblk = priv->cnt; - - kmem_free(priv->iovp, nblk * sizeof (iovec_t)); - kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); - kmem_free(priv, sizeof (dmu_xuio_t)); - - if (XUIO_XUZC_RW(xuio) == UIO_READ) - XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); - else - XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); -} - -/* - * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } - * and increase priv->next by 1. 
- */ -int -dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) -{ - struct iovec *iov; - uio_t *uio = &xuio->xu_uio; - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - int i = priv->next++; - - ASSERT(i < priv->cnt); - ASSERT(off + n <= arc_buf_lsize(abuf)); - iov = (iovec_t *)uio->uio_iov + i; - iov->iov_base = (char *)abuf->b_data + off; - iov->iov_len = n; - priv->bufs[i] = abuf; - return (0); -} - -int -dmu_xuio_cnt(xuio_t *xuio) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - return (priv->cnt); -} - -arc_buf_t * -dmu_xuio_arcbuf(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - return (priv->bufs[i]); -} - -void -dmu_xuio_clear(xuio_t *xuio, int i) -{ - dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); - - ASSERT(i < priv->cnt); - priv->bufs[i] = NULL; -} -#endif /* HAVE_UIO_ZEROCOPY */ - -static void -xuio_stat_init(void) -{ - xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (xuio_ksp != NULL) { - xuio_ksp->ks_data = &xuio_stats; - kstat_install(xuio_ksp); - } -} - -static void -xuio_stat_fini(void) -{ - if (xuio_ksp != NULL) { - kstat_delete(xuio_ksp); - xuio_ksp = NULL; - } -} - -void -xuio_stat_wbuf_copied(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_copied); -} - -void -xuio_stat_wbuf_nocopy(void) -{ - XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); -} - #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; -#ifdef HAVE_UIO_ZEROCOPY - xuio_t *xuio = NULL; -#endif /* * NB: we could do this block-at-a-time, but it's nice @@ -1344,21 +1194,6 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) bufoff = uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); -#ifdef HAVE_UIO_ZEROCOPY - if (xuio) { - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - arc_buf_t *dbuf_abuf = dbi->db_buf; - arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); - err = 
dmu_xuio_add(xuio, abuf, bufoff, tocpy); - if (!err) - uio_advance(uio, tocpy); - - if (abuf == dbuf_abuf) - XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); - else - XUIOSTAT_BUMP(xuiostat_rbuf_copied); - } else -#endif #ifdef __FreeBSD__ err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); @@ -1560,6 +1395,32 @@ dmu_return_arcbuf(arc_buf_t *buf) arc_buf_destroy(buf, FTAG); } +/* + * A "lightweight" write is faster than a regular write (e.g. + * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the + * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the + * data can not be read or overwritten until the transaction's txg has been + * synced. This makes it appropriate for workloads that are known to be + * (temporarily) write-only, like "zfs receive". + * + * A single block is written, starting at the specified offset in bytes. If + * the call is successful, it returns 0 and the provided abd has been + * consumed (the caller should not free it). + */ +int +dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, + const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr = + dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); + if (dr == NULL) + return (SET_ERROR(EIO)); + dr->dt.dll.dr_abd = abd; + dr->dt.dll.dr_props = *zp; + dr->dt.dll.dr_flags = flags; + return (0); +} + /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via @@ -1583,8 +1444,8 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, rw_exit(&dn->dn_struct_rwlock); /* - * We can only assign if the offset is aligned, the arc buf is the - * same size as the dbuf, and the dbuf is not metadata. + * We can only assign if the offset is aligned and the arc buf is the + * same size as the dbuf. 
*/ if (offset == db->db.db_offset && blksz == db->db.db_size) { dbuf_assign_arcbuf(db, buf, tx); @@ -1597,7 +1458,6 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); - XUIOSTAT_BUMP(xuiostat_wbuf_copied); } return (0); @@ -2409,7 +2269,6 @@ dmu_init(void) abd_init(); zfs_dbgmsg_init(); sa_cache_init(); - xuio_stat_init(); dmu_objset_init(); dnode_init(); zfetch_init(); @@ -2429,7 +2288,6 @@ dmu_fini(void) dbuf_fini(); dnode_fini(); dmu_objset_fini(); - xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c index 453a2842ce6e..12cdbd68b104 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_object.c +++ b/sys/contrib/openzfs/module/zfs/dmu_object.c @@ -58,10 +58,8 @@ dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; int error; - kpreempt_disable(); - cpuobj = &os->os_obj_next_percpu[CPU_SEQID % + cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE % os->os_obj_next_percpu_len]; - kpreempt_enable(); if (dn_slots == 0) { dn_slots = DNODE_MIN_SLOTS; diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c index af5935e2374d..66a8f20092e0 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_objset.c +++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
@@ -682,8 +682,9 @@ dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); @@ -755,8 +756,9 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, dsl_pool_t *dp; dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); @@ -778,11 +780,15 @@ dmu_objset_own(const char *name, dmu_objset_type_t type, * speed up pool import times and to keep this txg reserved * completely for recovery work. */ - if ((dmu_objset_userobjspace_upgradable(*osp) || - dmu_objset_projectquota_upgradable(*osp)) && - !readonly && !dp->dp_spa->spa_claiming && - (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) - dmu_objset_id_quota_upgrade(*osp); + if (!readonly && !dp->dp_spa->spa_claiming && + (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) { + if (dmu_objset_userobjspace_upgradable(*osp) || + dmu_objset_projectquota_upgradable(*osp)) { + dmu_objset_id_quota_upgrade(*osp); + } else if (dmu_objset_userused_enabled(*osp)) { + dmu_objset_userspace_upgrade(*osp); + } + } dsl_pool_rele(dp, FTAG); return (0); @@ -794,8 +800,9 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, { dsl_dataset_t *ds; int err; - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + ds_hold_flags_t flags; + flags = (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds); if (err != 0) return (err); @@ -812,9 +819,10 @@ dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, void dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) { - ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; - + ds_hold_flags_t flags; dsl_pool_t *dp = dmu_objset_pool(os); + + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag); dsl_pool_rele(dp, tag); } @@ -842,7 +850,9 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; + ds_hold_flags_t flags; + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); @@ -850,21 +860,22 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, dsl_dataset_name(ds, name); dp = ds->ds_dir->dd_pool; dsl_pool_config_enter(dp, FTAG); - dsl_dataset_disown(ds, decrypt, tag); - VERIFY0(dsl_dataset_own(dp, name, - (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, newds)); + dsl_dataset_disown(ds, flags, tag); + VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds)); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag) { + ds_hold_flags_t flags; + + flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; /* * Stop upgrading thread */ dmu_objset_upgrade_stop(os); - dsl_dataset_disown(os->os_dsl_dataset, - (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : 0, tag); + dsl_dataset_disown(os->os_dsl_dataset, flags, tag); } void @@ -1231,7 +1242,7 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) } VERIFY0(zio_wait(rzio)); - dmu_objset_do_userquota_updates(os, tx); + dmu_objset_sync_done(os, tx); taskq_wait(dp->dp_sync_taskq); if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { ASSERT3P(ds->ds_key_mapping, !=, NULL); @@ -1424,10 +1435,15 @@ dmu_objset_upgrade_task_cb(void *data) mutex_enter(&os->os_upgrade_lock); os->os_upgrade_status = EINTR; if (!os->os_upgrade_exit) { + int status; + mutex_exit(&os->os_upgrade_lock); - os->os_upgrade_status = os->os_upgrade_cb(os); + status = os->os_upgrade_cb(os); + mutex_enter(&os->os_upgrade_lock); + + os->os_upgrade_status = status; } os->os_upgrade_exit = B_TRUE; os->os_upgrade_id = 0; @@ -1455,6 +1471,8 @@ dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb) dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); os->os_upgrade_status = ENOMEM; } + } else { + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); } mutex_exit(&os->os_upgrade_lock); } @@ -1498,23 +1516,13 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) multilist_sublist_remove(list, dn); /* - * If we are not doing useraccounting (os_synced_dnodes == NULL) - * we are done with this dnode for this txg. Unset dn_dirty_txg - * if later txgs aren't dirtying it so that future holders do - * not get a stale value. Otherwise, we will do this in - * userquota_updates_task() when processing has completely - * finished for this txg. + * See the comment above dnode_rele_task() for an explanation + * of why this dnode hold is always needed (even when not + * doing user accounting). 
*/ multilist_t *newlist = dn->dn_objset->os_synced_dnodes; - if (newlist != NULL) { - (void) dnode_add_ref(dn, newlist); - multilist_insert(newlist, dn); - } else { - mutex_enter(&dn->dn_mtx); - if (dn->dn_dirty_txg == tx->tx_txg) - dn->dn_dirty_txg = 0; - mutex_exit(&dn->dn_mtx); - } + (void) dnode_add_ref(dn, newlist); + multilist_insert(newlist, dn); dnode_sync(dn, tx); } @@ -1676,22 +1684,19 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) txgoff = tx->tx_txg & TXG_MASK; - if (dmu_objset_userused_enabled(os) && - (!os->os_encrypted || !dmu_objset_is_receiving(os))) { - /* - * We must create the list here because it uses the - * dn_dirty_link[] of this txg. But it may already - * exist because we call dsl_dataset_sync() twice per txg. - */ - if (os->os_synced_dnodes == NULL) { - os->os_synced_dnodes = - multilist_create(sizeof (dnode_t), - offsetof(dnode_t, dn_dirty_link[txgoff]), - dnode_multilist_index_func); - } else { - ASSERT3U(os->os_synced_dnodes->ml_offset, ==, - offsetof(dnode_t, dn_dirty_link[txgoff])); - } + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. But it may already + * exist because we call dsl_dataset_sync() twice per txg. 
+ */ + if (os->os_synced_dnodes == NULL) { + os->os_synced_dnodes = + multilist_create(sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff]), + dnode_multilist_index_func); + } else { + ASSERT3U(os->os_synced_dnodes->ml_offset, ==, + offsetof(dnode_t, dn_dirty_link[txgoff])); } ml = os->os_dirty_dnodes[txgoff]; @@ -1998,8 +2003,6 @@ userquota_updates_task(void *arg) dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); - if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) - dn->dn_dirty_txg = 0; mutex_exit(&dn->dn_mtx); multilist_sublist_remove(list, dn); @@ -2010,13 +2013,44 @@ userquota_updates_task(void *arg) kmem_free(uua, sizeof (*uua)); } -void -dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +/* + * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being + * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be + * evicted because the block containing the dnode can't be evicted until it is + * written out. However, this hold is necessary to prevent the dnode_t from + * being moved (via dnode_move()) while it's still referenced by + * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for + * dirty_lightweight_leaf-type dirty records. + * + * If we are doing user-object accounting, the dnode_rele() happens from + * userquota_updates_task() instead. + */ +static void +dnode_rele_task(void *arg) { - int num_sublists; + userquota_updates_arg_t *uua = arg; + objset_t *os = uua->uua_os; + multilist_sublist_t *list = + multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); + + dnode_t *dn; + while ((dn = multilist_sublist_head(list)) != NULL) { + multilist_sublist_remove(list, dn); + dnode_rele(dn, os->os_synced_dnodes); + } + multilist_sublist_unlock(list); + kmem_free(uua, sizeof (*uua)); +} + +/* + * Return TRUE if userquota updates are needed. 
+ */ +static boolean_t +dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx) +{ if (!dmu_objset_userused_enabled(os)) - return; + return (B_FALSE); /* * If this is a raw receive just return and handle accounting @@ -2026,10 +2060,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) * used for recovery. */ if (os->os_encrypted && dmu_objset_is_receiving(os)) - return; + return (B_FALSE); if (tx->tx_txg <= os->os_spa->spa_claim_max_txg) - return; + return (B_FALSE); /* Allocate the user/group/project used objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { @@ -2046,23 +2080,39 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } + return (B_TRUE); +} - num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); +/* + * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and + * also release the holds on the dnodes from dmu_objset_sync_dnodes(). + * The caller must taskq_wait(dp_sync_taskq). + */ +void +dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx) +{ + boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx); + + int num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); for (int i = 0; i < num_sublists; i++) { - if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i)) - continue; userquota_updates_arg_t *uua = kmem_alloc(sizeof (*uua), KM_SLEEP); uua->uua_os = os; uua->uua_sublist_idx = i; uua->uua_tx = tx; - /* note: caller does taskq_wait() */ + + /* + * If we don't need to update userquotas, use + * dnode_rele_task() to call dnode_rele() + */ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, - userquota_updates_task, uua, 0); + need_userquota ? 
userquota_updates_task : dnode_rele_task, + uua, 0); /* callback frees uua */ } } + /* * Returns a pointer to data to find uid/gid from * @@ -2084,18 +2134,11 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dr == NULL) { data = NULL; } else { - dnode_t *dn; - - DB_DNODE_ENTER(dr->dr_dbuf); - dn = DB_DNODE(dr->dr_dbuf); - - if (dn->dn_bonuslen == 0 && + if (dr->dr_dnode->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; - - DB_DNODE_EXIT(dr->dr_dbuf); } return (data); @@ -2285,8 +2328,8 @@ dmu_objset_space_upgrade(objset_t *os) return (0); } -int -dmu_objset_userspace_upgrade(objset_t *os) +static int +dmu_objset_userspace_upgrade_cb(objset_t *os) { int err = 0; @@ -2306,6 +2349,12 @@ dmu_objset_userspace_upgrade(objset_t *os) return (0); } +void +dmu_objset_userspace_upgrade(objset_t *os) +{ + dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb); +} + static int dmu_objset_id_quota_upgrade_cb(objset_t *os) { @@ -2316,14 +2365,15 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os) return (0); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); - if (!dmu_objset_userobjused_enabled(os)) + if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); if (!dmu_objset_projectquota_enabled(os) && dmu_objset_userobjspace_present(os)) return (SET_ERROR(ENOTSUP)); - dmu_objset_ds(os)->ds_feature_activation[ - SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; + if (dmu_objset_userobjused_enabled(os)) + dmu_objset_ds(os)->ds_feature_activation[ + SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; if (dmu_objset_projectquota_enabled(os)) dmu_objset_ds(os)->ds_feature_activation[ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; @@ -2332,7 +2382,9 @@ dmu_objset_id_quota_upgrade_cb(objset_t *os) if (err) return (err); - os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + if (dmu_objset_userobjused_enabled(os)) + 
os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; if (dmu_objset_projectquota_enabled(os)) os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE; @@ -2977,7 +3029,7 @@ EXPORT_SYMBOL(dmu_objset_create_impl); EXPORT_SYMBOL(dmu_objset_open_impl); EXPORT_SYMBOL(dmu_objset_evict); EXPORT_SYMBOL(dmu_objset_register_type); -EXPORT_SYMBOL(dmu_objset_do_userquota_updates); +EXPORT_SYMBOL(dmu_objset_sync_done); EXPORT_SYMBOL(dmu_objset_userquota_get_ids); EXPORT_SYMBOL(dmu_objset_userused_enabled); EXPORT_SYMBOL(dmu_objset_userspace_upgrade); diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c index 2eee19a28e34..a0fd157ebc5f 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_recv.c +++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c @@ -79,10 +79,10 @@ struct receive_record_arg { dmu_replay_record_t header; void *payload; /* Pointer to a buffer containing the payload */ /* - * If the record is a write, pointer to the arc_buf_t containing the + * If the record is a WRITE or SPILL, pointer to the abd containing the * payload. */ - arc_buf_t *arc_buf; + abd_t *abd; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ @@ -95,8 +95,8 @@ struct receive_writer_arg { bqueue_t q; /* - * These three args are used to signal to the main thread that we're - * done. + * These three members are used to signal to the main thread when + * we're done. */ kmutex_t mutex; kcondvar_t cv; @@ -175,18 +175,6 @@ byteswap_record(dmu_replay_record_t *drr) DO64(drr_write.drr_key.ddk_prop); DO64(drr_write.drr_compressed_size); break; - case DRR_WRITE_BYREF: - DO64(drr_write_byref.drr_object); - DO64(drr_write_byref.drr_offset); - DO64(drr_write_byref.drr_length); - DO64(drr_write_byref.drr_toguid); - DO64(drr_write_byref.drr_refguid); - DO64(drr_write_byref.drr_refobject); - DO64(drr_write_byref.drr_refoffset); - ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 
- drr_key.ddk_cksum); - DO64(drr_write_byref.drr_key.ddk_prop); - break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); @@ -572,7 +560,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; @@ -784,7 +772,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) dsl_dataset_t *ds, *newds; objset_t *os; uint64_t dsobj; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; int error; uint64_t crflags = 0; dsl_crypto_params_t dummy_dcp = { 0 }; @@ -958,7 +946,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drc->drc_drrb; int error; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; dsl_dataset_t *ds; const char *tofs = drc->drc_tofs; @@ -1106,7 +1094,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) const char *tofs = drba->drba_cookie->drc_tofs; uint64_t featureflags = drba->drba_cookie->drc_featureflags; dsl_dataset_t *ds; - ds_hold_flags_t dsflags = 0; + ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; @@ -1903,58 +1891,106 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) struct receive_record_arg *rrd; while ((rrd = list_head(&rwa->write_batch)) != NULL) { struct drr_write *drrw = &rrd->header.drr_u.drr_write; - arc_buf_t *abuf = rrd->arc_buf; + abd_t *abd = rrd->abd; ASSERT3U(drrw->drr_object, ==, rwa->last_object); - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, 
- DRR_WRITE_PAYLOAD_SIZE(drrw)); - } - - /* - * If we are receiving an incremental large-block stream into - * a dataset that previously did a non-large-block receive, - * the WRITE record may be larger than the object's block - * size. dmu_assign_arcbuf_by_dnode() handles this as long - * as the arcbuf is not compressed, so decompress it here if - * necessary. - */ - if (drrw->drr_logical_size != dn->dn_datablksz && - arc_get_compression(abuf) != ZIO_COMPRESS_OFF) { + if (drrw->drr_logical_size != dn->dn_datablksz) { + /* + * The WRITE record is larger than the object's block + * size. We must be receiving an incremental + * large-block stream into a dataset that previously did + * a non-large-block receive. Lightweight writes must + * be exactly one block, so we need to decompress the + * data (if compressed) and do a normal dmu_write(). + */ ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); - zbookmark_phys_t zb = { - .zb_objset = dmu_objset_id(rwa->os), - .zb_object = rwa->last_object, - .zb_level = 0, - .zb_blkid = - drrw->drr_offset >> dn->dn_datablkshift, - }; + if (DRR_WRITE_COMPRESSED(drrw)) { + abd_t *decomp_abd = + abd_alloc_linear(drrw->drr_logical_size, + B_FALSE); + + err = zio_decompress_data( + drrw->drr_compressiontype, + abd, abd_to_buf(decomp_abd), + abd_get_size(abd), + abd_get_size(decomp_abd), NULL); + + if (err == 0) { + dmu_write_by_dnode(dn, + drrw->drr_offset, + drrw->drr_logical_size, + abd_to_buf(decomp_abd), tx); + } + abd_free(decomp_abd); + } else { + dmu_write_by_dnode(dn, + drrw->drr_offset, + drrw->drr_logical_size, + abd_to_buf(abd), tx); + } + if (err == 0) + abd_free(abd); + } else { + zio_prop_t zp; + dmu_write_policy(rwa->os, dn, 0, 0, &zp); + + enum zio_flag zio_flags = 0; + + if (rwa->raw) { + zp.zp_encrypt = B_TRUE; + zp.zp_compress = drrw->drr_compressiontype; + zp.zp_byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ + rwa->byteswap; + bcopy(drrw->drr_salt, zp.zp_salt, + 
ZIO_DATA_SALT_LEN); + bcopy(drrw->drr_iv, zp.zp_iv, + ZIO_DATA_IV_LEN); + bcopy(drrw->drr_mac, zp.zp_mac, + ZIO_DATA_MAC_LEN); + if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) { + zp.zp_nopwrite = B_FALSE; + zp.zp_copies = MIN(zp.zp_copies, + SPA_DVAS_PER_BP - 1); + } + zio_flags |= ZIO_FLAG_RAW; + } else if (DRR_WRITE_COMPRESSED(drrw)) { + ASSERT3U(drrw->drr_compressed_size, >, 0); + ASSERT3U(drrw->drr_logical_size, >=, + drrw->drr_compressed_size); + zp.zp_compress = drrw->drr_compressiontype; + zio_flags |= ZIO_FLAG_RAW_COMPRESS; + } else if (rwa->byteswap) { + /* + * Note: compressed blocks never need to be + * byteswapped, because WRITE records for + * metadata blocks are never compressed. The + * exception is raw streams, which are written + * in the original byteorder, and the byteorder + * bit is preserved in the BP by setting + * zp_byteorder above. + */ + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func( + abd_to_buf(abd), + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } /* - * The size of loaned arc bufs is counted in - * arc_loaned_bytes. When we untransform - * (decompress) the buf, its size increases. To - * ensure that arc_loaned_bytes remains accurate, we - * need to return (un-loan) the buf (with its - * compressed size) and then re-loan it (with its - * new, uncompressed size). + * Since this data can't be read until the receive + * completes, we can do a "lightweight" write for + * improved performance. */ - arc_return_buf(abuf, FTAG); - VERIFY0(arc_untransform(abuf, dmu_objset_spa(rwa->os), - &zb, B_FALSE)); - arc_loan_inuse_buf(abuf, FTAG); + err = dmu_lightweight_write_by_dnode(dn, + drrw->drr_offset, abd, &zp, zio_flags, tx); } - err = dmu_assign_arcbuf_by_dnode(dn, - drrw->drr_offset, abuf, tx); if (err != 0) { /* * This rrd is left on the list, so the caller will - * free it (and the arc_buf). + * free it (and the abd). 
*/ break; } @@ -1987,7 +2023,7 @@ flush_write_batch(struct receive_writer_arg *rwa) if (err != 0) { struct receive_record_arg *rrd; while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) { - dmu_return_arcbuf(rrd->arc_buf); + abd_free(rrd->abd); kmem_free(rrd, sizeof (*rrd)); } } @@ -2090,9 +2126,8 @@ receive_write_embedded(struct receive_writer_arg *rwa, static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, - arc_buf_t *abuf) + abd_t *abd) { - dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; @@ -2107,7 +2142,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * the DRR_FLAG_SPILL_BLOCK flag. */ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { - dmu_return_arcbuf(abuf); + abd_free(abd); return (0); } @@ -2131,7 +2166,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, return (err); } - tx = dmu_tx_create(rwa->os); + dmu_tx_t *tx = dmu_tx_create(rwa->os); dmu_tx_hold_spill(tx, db->db_object); @@ -2150,18 +2185,35 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, */ if (db_spill->db_size != drrs->drr_length) { dmu_buf_will_fill(db_spill, tx); - VERIFY(0 == dbuf_spill_set_blksz(db_spill, + VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrs->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_SPILL_PAYLOAD_SIZE(drrs)); + arc_buf_t *abuf; + if (rwa->raw) { + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ + rwa->byteswap; + + abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os), + drrs->drr_object, byteorder, drrs->drr_salt, + drrs->drr_iv, drrs->drr_mac, drrs->drr_type, + drrs->drr_compressed_size, drrs->drr_length, + drrs->drr_compressiontype, 0); + } else { + abuf = arc_loan_buf(dmu_objset_spa(rwa->os), + 
DMU_OT_IS_METADATA(drrs->drr_type), + drrs->drr_length); + if (rwa->byteswap) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrs->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd), + DRR_SPILL_PAYLOAD_SIZE(drrs)); + } } + bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs)); + abd_free(abd); dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dmu_buf_rele(db, FTAG); @@ -2263,8 +2315,9 @@ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { dsl_dataset_t *ds = drc->drc_ds; - ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; + dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; /* * Wait for the txg sync before cleaning up the receive. For * resumable receives, this ensures that our resume state has @@ -2451,53 +2504,19 @@ receive_read_record(dmu_recv_cookie_t *drc) case DRR_WRITE: { struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write; - arc_buf_t *abuf; - boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); - - if (drc->drc_raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ - drc->drc_byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os), - drrw->drr_object, byteorder, drrw->drr_salt, - drrw->drr_iv, drrw->drr_mac, drrw->drr_type, - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype, 0); - } else if (DRR_WRITE_COMPRESSED(drrw)) { - ASSERT3U(drrw->drr_compressed_size, >, 0); - ASSERT3U(drrw->drr_logical_size, >=, - drrw->drr_compressed_size); - ASSERT(!is_meta); - abuf = arc_loan_compressed_buf( - dmu_objset_spa(drc->drc_os), - drrw->drr_compressed_size, drrw->drr_logical_size, - drrw->drr_compressiontype, 0); - } else { - abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os), - is_meta, drrw->drr_logical_size); - } - - err = receive_read_payload_and_next_header(drc, - DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); + int size = DRR_WRITE_PAYLOAD_SIZE(drrw); + abd_t 
*abd = abd_alloc_linear(size, B_FALSE); + err = receive_read_payload_and_next_header(drc, size, + abd_to_buf(abd)); if (err != 0) { - dmu_return_arcbuf(abuf); + abd_free(abd); return (err); } - drc->drc_rrd->arc_buf = abuf; + drc->drc_rrd->abd = abd; receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); } - case DRR_WRITE_BYREF: - { - struct drr_write_byref *drrwb = - &drc->drc_rrd->header.drr_u.drr_write_byref; - err = receive_read_payload_and_next_header(drc, 0, NULL); - receive_read_prefetch(drc, drrwb->drr_object, drrwb->drr_offset, - drrwb->drr_length); - return (err); - } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded *drrwe = @@ -2536,29 +2555,14 @@ receive_read_record(dmu_recv_cookie_t *drc) case DRR_SPILL: { struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill; - arc_buf_t *abuf; - /* DRR_SPILL records are either raw or uncompressed */ - if (drc->drc_raw) { - boolean_t byteorder = ZFS_HOST_BYTEORDER ^ - !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ - drc->drc_byteswap; - - abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os), - drrs->drr_object, byteorder, drrs->drr_salt, - drrs->drr_iv, drrs->drr_mac, drrs->drr_type, - drrs->drr_compressed_size, drrs->drr_length, - drrs->drr_compressiontype, 0); - } else { - abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os), - DMU_OT_IS_METADATA(drrs->drr_type), - drrs->drr_length); - } - err = receive_read_payload_and_next_header(drc, - DRR_SPILL_PAYLOAD_SIZE(drrs), abuf->b_data); + int size = DRR_SPILL_PAYLOAD_SIZE(drrs); + abd_t *abd = abd_alloc_linear(size, B_FALSE); + err = receive_read_payload_and_next_header(drc, size, + abd_to_buf(abd)); if (err != 0) - dmu_return_arcbuf(abuf); + abd_free(abd); else - drc->drc_rrd->arc_buf = abuf; + drc->drc_rrd->abd = abd; return (err); } case DRR_OBJECT_RANGE: @@ -2687,9 +2691,9 @@ receive_process_record(struct receive_writer_arg *rwa, if (rrd->header.drr_type != DRR_WRITE) { err = flush_write_batch(rwa); if (err != 0) { 
- if (rrd->arc_buf != NULL) { - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + if (rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); @@ -2726,8 +2730,8 @@ receive_process_record(struct receive_writer_arg *rwa, * the rrd or arc_buf. */ ASSERT(err != 0); - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + abd_free(rrd->abd); + rrd->abd = NULL; } break; } @@ -2749,10 +2753,10 @@ receive_process_record(struct receive_writer_arg *rwa, case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; - err = receive_spill(rwa, drrs, rrd->arc_buf); + err = receive_spill(rwa, drrs, rrd->abd); if (err != 0) - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + abd_free(rrd->abd); + rrd->abd = NULL; rrd->payload = NULL; break; } @@ -2800,9 +2804,9 @@ receive_writer_thread(void *arg) int err = 0; if (rwa->err == 0) { err = receive_process_record(rwa, rrd); - } else if (rrd->arc_buf != NULL) { - dmu_return_arcbuf(rrd->arc_buf); - rrd->arc_buf = NULL; + } else if (rrd->abd != NULL) { + abd_free(rrd->abd); + rrd->abd = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c index 225ec40537ec..62c7d01d4bd2 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_redact.c +++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c @@ -858,7 +858,7 @@ hold_next_object(objset_t *os, struct redact_record *rec, void *tag, { int err = 0; if (*dn != NULL) - dnode_rele(*dn, FTAG); + dnode_rele(*dn, tag); *dn = NULL; if (*object < rec->start_object) { *object = rec->start_object - 1; diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c index 9480c8b75497..d654382237c0 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_send.c +++ b/sys/contrib/openzfs/module/zfs/dmu_send.c 
@@ -2626,7 +2626,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, { int err; dsl_dataset_t *fromds; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; struct dmu_send_params dspp = {0}; dspp.embedok = embedok; dspp.large_block_ok = large_block_ok; @@ -2638,6 +2638,7 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, dspp.rawok = rawok; dspp.savedok = savedok; + dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; err = dsl_pool_hold(pool, FTAG, &dspp.dp); if (err != 0) return (err); @@ -2711,12 +2712,13 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, dmu_send_outparams_t *dsop) { int err = 0; - ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + ds_hold_flags_t dsflags; boolean_t owned = B_FALSE; dsl_dataset_t *fromds = NULL; zfs_bookmark_phys_t book = {0}; struct dmu_send_params dspp = {0}; + dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT; dspp.tosnap = tosnap; dspp.embedok = embedok; dspp.large_block_ok = large_block_ok; diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c index 09ef2be94944..0ebed4e6fbdf 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_tx.c +++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c @@ -230,9 +230,6 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); - if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) - err = SET_ERROR(EFBIG); - if (dn == NULL) return; diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 4d86863f30ea..5d061fe3813e 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -59,16 +59,29 @@ typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; + kstat_named_t 
zfetchstat_max_completion_us; + kstat_named_t zfetchstat_last_completion_us; + kstat_named_t zfetchstat_io_issued; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, + { "max_completion_us", KSTAT_DATA_UINT64 }, + { "last_completion_us", KSTAT_DATA_UINT64 }, + { "io_issued", KSTAT_DATA_UINT64 }, }; #define ZFETCHSTAT_BUMP(stat) \ - atomic_inc_64(&zfetch_stats.stat.value.ui64); + atomic_inc_64(&zfetch_stats.stat.value.ui64) +#define ZFETCHSTAT_ADD(stat, val) \ + atomic_add_64(&zfetch_stats.stat.value.ui64, val) +#define ZFETCHSTAT_SET(stat, val) \ + zfetch_stats.stat.value.ui64 = val +#define ZFETCHSTAT_GET(stat) \ + zfetch_stats.stat.value.ui64 + kstat_t *zfetch_ksp; @@ -104,8 +117,8 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { if (zf == NULL) return; - zf->zf_dnode = dno; + zf->zf_numstreams = 0; list_create(&zf->zf_stream, sizeof (zstream_t), offsetof(zstream_t, zs_node)); @@ -113,13 +126,29 @@ dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); } +static void +dmu_zfetch_stream_fini(zstream_t *zs) +{ + mutex_destroy(&zs->zs_lock); + kmem_free(zs, sizeof (*zs)); +} + static void dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { ASSERT(MUTEX_HELD(&zf->zf_lock)); list_remove(&zf->zf_stream, zs); - mutex_destroy(&zs->zs_lock); - kmem_free(zs, sizeof (*zs)); + dmu_zfetch_stream_fini(zs); + zf->zf_numstreams--; +} + +static void +dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(MUTEX_HELD(&zf->zf_lock)); + list_remove(&zf->zf_stream, zs); + zs->zs_fetch = NULL; + zf->zf_numstreams--; } /* @@ -132,8 +161,12 @@ dmu_zfetch_fini(zfetch_t *zf) zstream_t *zs; mutex_enter(&zf->zf_lock); - while ((zs = list_head(&zf->zf_stream)) != NULL) - dmu_zfetch_stream_remove(zf, zs); + while ((zs = list_head(&zf->zf_stream)) != NULL) { + if (zfs_refcount_count(&zs->zs_blocks) != 0) + 
dmu_zfetch_stream_orphan(zf, zs); + else + dmu_zfetch_stream_remove(zf, zs); + } mutex_exit(&zf->zf_lock); list_destroy(&zf->zf_stream); mutex_destroy(&zf->zf_lock); @@ -151,7 +184,7 @@ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs_next; - int numstreams = 0; + hrtime_t now = gethrtime(); ASSERT(MUTEX_HELD(&zf->zf_lock)); @@ -161,11 +194,14 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) for (zstream_t *zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); - if (((gethrtime() - zs->zs_atime) / NANOSEC) > + /* + * Skip gethrtime() call if there are still references + */ + if (zfs_refcount_count(&zs->zs_blocks) != 0) + continue; + if (((now - zs->zs_atime) / NANOSEC) > zfetch_min_sec_reap) dmu_zfetch_stream_remove(zf, zs); - else - numstreams++; } /* @@ -179,7 +215,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / zfetch_max_distance)); - if (numstreams >= max_streams) { + if (zf->zf_numstreams >= max_streams) { ZFETCHSTAT_BUMP(zfetchstat_max_streams); return; } @@ -188,12 +224,39 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) zs->zs_blkid = blkid; zs->zs_pf_blkid = blkid; zs->zs_ipf_blkid = blkid; - zs->zs_atime = gethrtime(); + zs->zs_atime = now; + zs->zs_fetch = zf; + zfs_refcount_create(&zs->zs_blocks); mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); - + zf->zf_numstreams++; list_insert_head(&zf->zf_stream, zs); } +static void +dmu_zfetch_stream_done(void *arg, boolean_t io_issued) +{ + zstream_t *zs = arg; + + if (zs->zs_start_time && io_issued) { + hrtime_t now = gethrtime(); + hrtime_t delta = NSEC2USEC(now - zs->zs_start_time); + + zs->zs_start_time = 0; + ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta); + if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us)) + ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta); + } + + 
if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0) + return; + + /* + * The parent fetch structure has gone away + */ + if (zs->zs_fetch == NULL) + dmu_zfetch_stream_fini(zs); +} + /* * This is the predictive prefetch entry point. It associates dnode access * specified with blkid and nblks arguments with prefetch stream, predicts @@ -209,7 +272,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, zstream_t *zs; int64_t pf_start, ipf_start, ipf_istart, ipf_iend; int64_t pf_ahead_blks, max_blks; - int epbs, max_dist_blks, pf_nblks, ipf_nblks; + int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued; uint64_t end_of_access_blkid; end_of_access_blkid = blkid + nblks; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; @@ -230,11 +293,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, * As a fast path for small (single-block) files, ignore access * to the first block. */ - if (blkid == 0) + if (!have_lock && blkid == 0) return; if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); + + /* + * A fast path for small files for which no prefetch will + * happen. 
+ */ + if (zf->zf_dnode->dn_maxblkid < 2) { + if (!have_lock) + rw_exit(&zf->zf_dnode->dn_struct_rwlock); + return; + } mutex_enter(&zf->zf_lock); /* @@ -343,9 +416,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs; zs->zs_atime = gethrtime(); + /* no prior reads in progress */ + if (zfs_refcount_count(&zs->zs_blocks) == 0) + zs->zs_start_time = zs->zs_atime; zs->zs_blkid = end_of_access_blkid; + zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart, + NULL); mutex_exit(&zs->zs_lock); mutex_exit(&zf->zf_lock); + issued = 0; /* * dbuf_prefetch() is asynchronous (even when it needs to read @@ -354,16 +433,21 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, */ for (int i = 0; i < pf_nblks; i++) { - dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, + dmu_zfetch_stream_done, zs); } for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) { - dbuf_prefetch(zf->zf_dnode, 1, iblk, - ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); + issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, + ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH, + dmu_zfetch_stream_done, zs); } if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); ZFETCHSTAT_BUMP(zfetchstat_hits); + + if (issued) + ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); } /* BEGIN CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c index 23364dbae897..eaba9c0c0e7f 100644 --- a/sys/contrib/openzfs/module/zfs/dnode.c +++ b/sys/contrib/openzfs/module/zfs/dnode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -609,7 +609,6 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); - ASSERT0(dn->dn_dirty_txg); ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); @@ -649,6 +648,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dn->dn_free_txg = 0; dn->dn_dirtyctx_firstset = NULL; + dn->dn_dirty_txg = 0; dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; @@ -1812,6 +1812,7 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(new_nlevels, >, dn->dn_nlevels); dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); @@ -1829,10 +1830,12 @@ dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && + + IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1); + if (dr->dr_dbuf == NULL || + (dr->dr_dbuf->db_level == old_nlevels - 1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) { list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c index ae44cb69765c..66e48a1e17d4 100644 --- a/sys/contrib/openzfs/module/zfs/dnode_sync.c +++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its 
affiliates. All rights reserved. - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2020 Oxide Computer Company */ @@ -851,6 +851,8 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet - * initiated the IO for the dnode's dbuf. + * initiated the IO for the dnode's dbuf. Additionally, the caller + * has already added a reference to the dnode because it's on the + * os_synced_dnodes list. */ } diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c index 16bf2c4414a8..2faf1af52991 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c +++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c @@ -1561,33 +1561,6 @@ dsl_bookmark_latest_txg(dsl_dataset_t *ds) return (dbn->dbn_phys.zbm_creation_txg); } -static inline unsigned int -redact_block_buf_num_entries(unsigned int size) -{ - return (size / sizeof (redact_block_phys_t)); -} - -/* - * This function calculates the offset of the last entry in the array of - * redact_block_phys_t. If we're reading the redaction list into buffers of - * size bufsize, then for all but the last buffer, the last valid entry in the - * array will be the last entry in the array. However, for the last buffer, any - * amount of it may be filled. Thus, we check to see if we're looking at the - * last buffer in the redaction list, and if so, we return the total number of - * entries modulo the number of entries per buffer. Otherwise, we return the - * number of entries per buffer minus one. 
- */ -static inline unsigned int -last_entry(redaction_list_t *rl, unsigned int bufsize, uint64_t bufid) -{ - if (bufid == (rl->rl_phys->rlp_num_entries - 1) / - redact_block_buf_num_entries(bufsize)) { - return ((rl->rl_phys->rlp_num_entries - 1) % - redact_block_buf_num_entries(bufsize)); - } - return (redact_block_buf_num_entries(bufsize) - 1); -} - /* * Compare the redact_block_phys_t to the bookmark. If the last block in the * redact_block_phys_t is before the bookmark, return -1. If the first block in @@ -1633,8 +1606,6 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, rl_traverse_callback_t cb, void *arg) { objset_t *mos = rl->rl_mos; - redact_block_phys_t *buf; - unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; int err = 0; if (rl->rl_phys->rlp_last_object != UINT64_MAX || @@ -1651,42 +1622,48 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, } /* - * Binary search for the point to resume from. The goal is to minimize - * the number of disk reads we have to perform. + * This allows us to skip the binary search and resume checking logic + * below, if we're not resuming a redacted send. */ - buf = zio_data_buf_alloc(bufsize); - uint64_t maxbufid = (rl->rl_phys->rlp_num_entries - 1) / - redact_block_buf_num_entries(bufsize); - uint64_t minbufid = 0; - while (resume != NULL && maxbufid - minbufid >= 1) { - ASSERT3U(maxbufid, >, minbufid); - uint64_t midbufid = minbufid + ((maxbufid - minbufid) / 2); - err = dmu_read(mos, rl->rl_object, midbufid * bufsize, bufsize, - buf, DMU_READ_NO_PREFETCH); + if (ZB_IS_ZERO(resume)) + resume = NULL; + + /* + * Binary search for the point to resume from. 
+ */ + uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1; + uint64_t minidx = 0; + while (resume != NULL && maxidx > minidx) { + redact_block_phys_t rbp = { 0 }; + ASSERT3U(maxidx, >, minidx); + uint64_t mididx = minidx + ((maxidx - minidx) / 2); + err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp), + sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH); if (err != 0) break; - int cmp0 = redact_block_zb_compare(&buf[0], resume); - int cmpn = redact_block_zb_compare( - &buf[last_entry(rl, bufsize, maxbufid)], resume); + int cmp = redact_block_zb_compare(&rbp, resume); - /* - * If the first block is before or equal to the resume point, - * and the last one is equal or after, then the resume point is - * in this buf, and we should start here. - */ - if (cmp0 <= 0 && cmpn >= 0) + if (cmp == 0) { + minidx = mididx; break; - - if (cmp0 > 0) - maxbufid = midbufid - 1; - else if (cmpn < 0) - minbufid = midbufid + 1; - else - panic("No progress in binary search for resume point"); + } else if (cmp > 0) { + maxidx = + (mididx == minidx ? minidx : mididx - 1); + } else { + minidx = mididx + 1; + } } - for (uint64_t curidx = minbufid * redact_block_buf_num_entries(bufsize); + unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; + redact_block_phys_t *buf = zio_data_buf_alloc(bufsize); + + unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t); + uint64_t start_block = minidx / entries_per_buf; + err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf, + DMU_READ_PREFETCH); + + for (uint64_t curidx = minidx; err == 0 && curidx < rl->rl_phys->rlp_num_entries; curidx++) { /* @@ -1696,22 +1673,35 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, * prefetching, and this code shouldn't be the bottleneck, so we * don't need to do manual prefetching. 
*/ - if (curidx % redact_block_buf_num_entries(bufsize) == 0) { + if (curidx % entries_per_buf == 0) { err = dmu_read(mos, rl->rl_object, curidx * sizeof (*buf), bufsize, buf, DMU_READ_PREFETCH); if (err != 0) break; } - redact_block_phys_t *rb = &buf[curidx % - redact_block_buf_num_entries(bufsize)]; + redact_block_phys_t *rb = &buf[curidx % entries_per_buf]; /* * If resume is non-null, we should either not send the data, or * null out resume so we don't have to keep doing these * comparisons. */ if (resume != NULL) { + /* + * It is possible that after the binary search we got + * a record before the resume point. There are two cases + * where this can occur. If the record is the last + * redaction record, and the resume point is after the + * end of the redacted data, curidx will be the last + * redaction record. In that case, the loop will end + * after this iteration. The second case is if the + * resume point is between two redaction records, the + * binary search can return either the record before + * or after the resume point. In that case, the next + * iteration's record will be after the resume point. + */ if (redact_block_zb_compare(rb, resume) < 0) { + ASSERT3U(curidx, ==, minidx); continue; } else { /* @@ -1733,8 +1723,10 @@ dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, } } - if (cb(rb, arg) != 0) + if (cb(rb, arg) != 0) { + err = EINTR; break; + } } zio_data_buf_free(buf, bufsize); diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c index 26d4c2fe7e33..e38ec0cae827 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_crypt.c +++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c @@ -2007,14 +2007,6 @@ dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds, if (ret != 0) return (ret); - /* - * Useraccounting is not portable and must be done with the keys loaded. - * Therefore, whenever we do any kind of receive the useraccounting - * must not be present.
- */ - ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); - ASSERT0(os->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE); - mdn = DMU_META_DNODE(os); /* @@ -2105,6 +2097,9 @@ dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, */ arc_release(os->os_phys_buf, &os->os_phys_buf); bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN); + os->os_phys->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_phys->os_flags &= ~OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; + os->os_flags = os->os_phys->os_flags; bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN); os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c index 1fcd83db7988..de60c33589e3 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_dataset.c +++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c @@ -2267,10 +2267,8 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_bookmark_sync_done(ds, tx); - if (os->os_synced_dnodes != NULL) { - multilist_destroy(os->os_synced_dnodes); - os->os_synced_dnodes = NULL; - } + multilist_destroy(os->os_synced_dnodes); + os->os_synced_dnodes = NULL; if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE; diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c index 3a2028625e8b..c770eafa75d8 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_pool.c +++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
@@ -220,11 +220,12 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); - dp->dp_zrele_taskq = taskq_create("z_zrele", boot_ncpus, defclsyspri, - boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, + boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain", - boot_ncpus, defclsyspri, boot_ncpus, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + 100, defclsyspri, boot_ncpus, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); return (dp); } @@ -565,6 +566,11 @@ dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); + dmu_objset_sync_done(dp->dp_meta_objset, tx); + taskq_wait(dp->dp_sync_taskq); + multilist_destroy(dp->dp_meta_objset->os_synced_dnodes); + dp->dp_meta_objset->os_synced_dnodes = NULL; + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } @@ -676,7 +682,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { - dmu_objset_do_userquota_updates(ds->ds_objset, tx); + dmu_objset_sync_done(ds->ds_objset, tx); } taskq_wait(dp->dp_sync_taskq); @@ -1264,8 +1270,16 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, * (e.g. it could be destroyed). Therefore you shouldn't do anything to the * dataset except release it. * - * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only - * or modifying operations. 
+ * Operations generally fall somewhere into the following taxonomy: + * + * Read-Only Modifying + * + * Dataset Layer / MOS zfs get zfs destroy + * + * Individual Dataset read() write() + * + * + * Dataset Layer Operations * * Modifying operations should generally use dsl_sync_task(). The synctask * infrastructure enforces proper locking strategy with respect to the @@ -1275,6 +1289,25 @@ dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, * information from the dataset, then release the pool and dataset. * dmu_objset_{hold,rele}() are convenience routines that also do the pool * hold/rele. + * + * + * Operations On Individual Datasets + * + * Objects _within_ an objset should only be modified by the current 'owner' + * of the objset to prevent incorrect concurrent modification. Thus, use + * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner, + * and fail with EBUSY if there is already an owner. The owner can then + * implement its own locking strategy, independent of the dataset layer's + * locking infrastructure. + * (E.g., the ZPL has its own set of locks to control concurrency. A regular + * vnop will not reach into the dataset layer). + * + * Ideally, objects would also only be read by the objset's owner, so that we + * don't observe state mid-modification. + * (E.g. the ZPL is creating a new object and linking it into a directory; if + * you don't coordinate with the ZPL to hold ZPL-level locks, you could see an + * intermediate state. The ioctl level violates this but in pretty benign + * ways, e.g. reading the zpl props object.)
*/ int diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c index 4704781bfa45..40adfbcee4e1 100644 --- a/sys/contrib/openzfs/module/zfs/dsl_scan.c +++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c @@ -713,7 +713,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) return (0); } -static void +void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; @@ -3327,20 +3327,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, return (B_TRUE); } - /* - * Check if the txg falls within the range which must be - * resilvered. DVAs outside this range can always be skipped. - */ - if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) - return (B_FALSE); - /* * Check if the top-level vdev must resilver this offset. * When the offset does not intersect with a dirty leaf DTL * then it may be possible to skip the resilver IO. The psize * is provided instead of asize to simplify the check for RAIDZ. */ - if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) return (B_FALSE); /* @@ -3987,7 +3980,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, /* * Keep track of how much data we've examined so that - * zpool(1M) status can make useful progress reports. + * zpool(8) status can make useful progress reports. */ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c index 133005b227e5..bed6bf64c928 100644 --- a/sys/contrib/openzfs/module/zfs/metaslab.c +++ b/sys/contrib/openzfs/module/zfs/metaslab.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -263,9 +264,7 @@ int zfs_metaslab_switch_threshold = 2; * Internal switch to enable/disable the metaslab allocation tracing * facility. 
*/ -#ifdef _METASLAB_TRACING -boolean_t metaslab_trace_enabled = B_TRUE; -#endif +boolean_t metaslab_trace_enabled = B_FALSE; /* * Maximum entries that the metaslab allocation tracing facility will keep @@ -275,9 +274,7 @@ boolean_t metaslab_trace_enabled = B_TRUE; * to every exceed this value. In debug mode, the system will panic if this * limit is ever reached allowing for further investigation. */ -#ifdef _METASLAB_TRACING uint64_t metaslab_trace_max_entries = 5000; -#endif /* * Maximum number of metaslabs per group that can be disabled @@ -313,6 +310,35 @@ boolean_t zfs_metaslab_force_large_segs = B_FALSE; */ uint32_t metaslab_by_size_min_shift = 14; +/* + * If not set, we will first try normal allocation. If that fails then + * we will do a gang allocation. If that fails then we will do a "try hard" + * gang allocation. If that fails then we will have a multi-layer gang + * block. + * + * If set, we will first try normal allocation. If that fails then + * we will do a "try hard" allocation. If that fails we will do a gang + * allocation. If that fails we will do a "try hard" gang allocation. If + * that fails then we will have a multi-layer gang block. + */ +int zfs_metaslab_try_hard_before_gang = B_FALSE; + +/* + * When not trying hard, we only consider the best zfs_metaslab_find_max_tries + * metaslabs. This improves performance, especially when there are many + * metaslabs per vdev and the allocation can't actually be satisfied (so we + * would otherwise iterate all the metaslabs). If there is a metaslab with a + * worse weight but it can actually satisfy the allocation, we won't find it + * until trying hard. This may happen if the worse metaslab is not loaded + * (and the true weight is better than we have calculated), or due to weight + * bucketization. E.g. 
we are looking for a 60K segment, and the best + * metaslabs all have free segments in the 32-63K bucket, but the best + * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a + * subsequent metaslab has ms_max_size >60KB (but fewer segments in this + * bucket, and therefore a lower weight). + */ +int zfs_metaslab_find_max_tries = 100; + static uint64_t metaslab_weight(metaslab_t *, boolean_t); static void metaslab_set_fragmentation(metaslab_t *, boolean_t); static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t); @@ -324,19 +350,20 @@ static void metaslab_flush_update(metaslab_t *, dmu_tx_t *); static unsigned int metaslab_idx_func(multilist_t *, void *); static void metaslab_evict(metaslab_t *, uint64_t); static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg); -#ifdef _METASLAB_TRACING kmem_cache_t *metaslab_alloc_trace_cache; typedef struct metaslab_stats { kstat_named_t metaslabstat_trace_over_limit; - kstat_named_t metaslabstat_df_find_under_floor; kstat_named_t metaslabstat_reload_tree; + kstat_named_t metaslabstat_too_many_tries; + kstat_named_t metaslabstat_try_hard; } metaslab_stats_t; static metaslab_stats_t metaslab_stats = { { "trace_over_limit", KSTAT_DATA_UINT64 }, - { "df_find_under_floor", KSTAT_DATA_UINT64 }, { "reload_tree", KSTAT_DATA_UINT64 }, + { "too_many_tries", KSTAT_DATA_UINT64 }, + { "try_hard", KSTAT_DATA_UINT64 }, }; #define METASLABSTAT_BUMP(stat) \ @@ -372,18 +399,6 @@ metaslab_stat_fini(void) kmem_cache_destroy(metaslab_alloc_trace_cache); metaslab_alloc_trace_cache = NULL; } -#else - -void -metaslab_stat_init(void) -{ -} - -void -metaslab_stat_fini(void) -{ -} -#endif /* * ========================================================================== @@ -395,20 +410,19 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) { metaslab_class_t *mc; - mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + mc = kmem_zalloc(offsetof(metaslab_class_t, + mc_allocator[spa->spa_alloc_count]), 
KM_SLEEP); mc->mc_spa = spa; - mc->mc_rotor = NULL; mc->mc_ops = ops; mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t), offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func); - mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (zfs_refcount_t), KM_SLEEP); - mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count * - sizeof (uint64_t), KM_SLEEP); - for (int i = 0; i < spa->spa_alloc_count; i++) - zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]); + for (int i = 0; i < spa->spa_alloc_count; i++) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + mca->mca_rotor = NULL; + zfs_refcount_create_tracked(&mca->mca_alloc_slots); + } return (mc); } @@ -416,21 +430,22 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops) void metaslab_class_destroy(metaslab_class_t *mc) { - ASSERT(mc->mc_rotor == NULL); + spa_t *spa = mc->mc_spa; + ASSERT(mc->mc_alloc == 0); ASSERT(mc->mc_deferred == 0); ASSERT(mc->mc_space == 0); ASSERT(mc->mc_dspace == 0); - for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++) - zfs_refcount_destroy(&mc->mc_alloc_slots[i]); - kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count * - sizeof (zfs_refcount_t)); - kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count * - sizeof (uint64_t)); + for (int i = 0; i < spa->spa_alloc_count; i++) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; + ASSERT(mca->mca_rotor == NULL); + zfs_refcount_destroy(&mca->mca_alloc_slots); + } mutex_destroy(&mc->mc_lock); multilist_destroy(mc->mc_metaslab_txg_list); - kmem_free(mc, sizeof (metaslab_class_t)); + kmem_free(mc, offsetof(metaslab_class_t, + mc_allocator[spa->spa_alloc_count])); } int @@ -445,7 +460,7 @@ metaslab_class_validate(metaslab_class_t *mc) ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mg = mc->mc_rotor) == NULL) + if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) 
return (0); do { @@ -454,7 +469,7 @@ metaslab_class_validate(metaslab_class_t *mc) ASSERT3P(vd->vdev_top, ==, vd); ASSERT3P(mg->mg_class, ==, mc); ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); - } while ((mg = mg->mg_next) != mc->mc_rotor); + } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); return (0); } @@ -811,7 +826,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) { metaslab_group_t *mg; - mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mg = kmem_zalloc(offsetof(metaslab_group_t, + mg_allocator[allocators]), KM_SLEEP); mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); @@ -824,8 +840,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) mg->mg_no_free_space = B_TRUE; mg->mg_allocators = allocators; - mg->mg_allocator = kmem_zalloc(allocators * - sizeof (metaslab_group_allocator_t), KM_SLEEP); for (int i = 0; i < allocators; i++) { metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); @@ -859,21 +873,19 @@ metaslab_group_destroy(metaslab_group_t *mg) metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; zfs_refcount_destroy(&mga->mga_alloc_queue_depth); } - kmem_free(mg->mg_allocator, mg->mg_allocators * - sizeof (metaslab_group_allocator_t)); - - kmem_free(mg, sizeof (metaslab_group_t)); + kmem_free(mg, offsetof(metaslab_group_t, + mg_allocator[mg->mg_allocators])); } void metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; + spa_t *spa = mc->mc_spa; metaslab_group_t *mgprev, *mgnext; - ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); + ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0); - ASSERT(mc->mc_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count <= 0); @@ -884,7 +896,7 @@ 
metaslab_group_activate(metaslab_group_t *mg) mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); metaslab_group_alloc_update(mg); - if ((mgprev = mc->mc_rotor) == NULL) { + if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { mg->mg_prev = mg; mg->mg_next = mg; } else { @@ -894,7 +906,10 @@ metaslab_group_activate(metaslab_group_t *mg) mgprev->mg_next = mg; mgnext->mg_prev = mg; } - mc->mc_rotor = mg; + for (int i = 0; i < spa->spa_alloc_count; i++) { + mc->mc_allocator[i].mca_rotor = mg; + mg = mg->mg_next; + } } /* @@ -915,7 +930,8 @@ metaslab_group_passivate(metaslab_group_t *mg) (SCL_ALLOC | SCL_ZIO)); if (--mg->mg_activation_count != 0) { - ASSERT(mc->mc_rotor != mg); + for (int i = 0; i < spa->spa_alloc_count; i++) + ASSERT(mc->mc_allocator[i].mca_rotor != mg); ASSERT(mg->mg_prev == NULL); ASSERT(mg->mg_next == NULL); ASSERT(mg->mg_activation_count < 0); @@ -962,12 +978,15 @@ metaslab_group_passivate(metaslab_group_t *mg) mgnext = mg->mg_next; if (mg == mgnext) { - mc->mc_rotor = NULL; + mgnext = NULL; } else { - mc->mc_rotor = mgnext; mgprev->mg_next = mgnext; mgnext->mg_prev = mgprev; } + for (int i = 0; i < spa->spa_alloc_count; i++) { + if (mc->mc_allocator[i].mca_rotor == mg) + mc->mc_allocator[i].mca_rotor = mgnext; + } mg->mg_prev = NULL; mg->mg_next = NULL; @@ -1201,7 +1220,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * in metaslab_group_alloc_update() for more information) and * the allocation throttle is disabled then allow allocations to this * device. However, if the allocation throttle is enabled then - * check if we have reached our allocation limit (mg_alloc_queue_depth) + * check if we have reached our allocation limit (mga_alloc_queue_depth) * to determine if we should allow allocations to this metaslab group. 
* If all metaslab groups are no longer considered allocatable * (mc_alloc_groups == 0) or we're trying to allocate the smallest @@ -1350,9 +1369,7 @@ static void metaslab_size_tree_full_load(range_tree_t *rt) { metaslab_rt_arg_t *mrap = rt->rt_arg; -#ifdef _METASLAB_TRACING METASLABSTAT_BUMP(metaslabstat_reload_tree); -#endif ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); mrap->mra_floor_shift = 0; struct mssa_arg arg = {0}; @@ -1563,6 +1580,7 @@ metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, #if defined(WITH_DF_BLOCK_ALLOCATOR) || \ defined(WITH_CF_BLOCK_ALLOCATOR) + /* * This is a helper function that can be used by the allocator to find a * suitable block to allocate. This will search the specified B-tree looking @@ -1654,19 +1672,13 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) range_seg_t *rs; if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) metaslab_size_tree_full_load(msp->ms_allocatable); + if (metaslab_df_use_largest_segment) { /* use largest free segment */ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); } else { zfs_btree_index_t where; /* use segment of this size, or next largest */ -#ifdef _METASLAB_TRACING - metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg; - if (size < (1 << mrap->mra_floor_shift)) { - METASLABSTAT_BUMP( - metaslabstat_df_find_under_floor); - } -#endif rs = metaslab_block_find(&msp->ms_allocatable_by_size, rt, msp->ms_start, size, &where); } @@ -2616,6 +2628,10 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, ms->ms_allocator = -1; ms->ms_new = B_TRUE; + vdev_ops_t *ops = vd->vdev_ops; + if (ops->vdev_op_metaslab_init != NULL) + ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); + /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. 
@@ -4393,7 +4409,6 @@ metaslab_is_unique(metaslab_t *msp, dva_t *dva) * Metaslab allocation tracing facility * ========================================================================== */ -#ifdef _METASLAB_TRACING /* * Add an allocation trace element to the allocation tracing list. @@ -4468,21 +4483,6 @@ metaslab_trace_fini(zio_alloc_list_t *zal) list_destroy(&zal->zal_list); zal->zal_size = 0; } -#else - -#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) - -void -metaslab_trace_init(zio_alloc_list_t *zal) -{ -} - -void -metaslab_trace_fini(zio_alloc_list_t *zal) -{ -} - -#endif /* _METASLAB_TRACING */ /* * ========================================================================== @@ -4510,13 +4510,14 @@ static void metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) { metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + metaslab_class_allocator_t *mca = + &mg->mg_class->mc_allocator[allocator]; uint64_t max = mg->mg_max_alloc_queue_depth; uint64_t cur = mga->mga_cur_max_alloc_queue_depth; while (cur < max) { if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, cur, cur + 1) == cur) { - atomic_inc_64( - &mg->mg_class->mc_alloc_max_slots[allocator]); + atomic_inc_64(&mca->mca_alloc_max_slots); return; } cur = mga->mga_cur_max_alloc_queue_depth; @@ -4622,8 +4623,16 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, if (msp == NULL) msp = avl_nearest(t, idx, AVL_AFTER); + int tries = 0; for (; msp != NULL; msp = AVL_NEXT(t, msp)) { int i; + + if (!try_hard && tries > zfs_metaslab_find_max_tries) { + METASLABSTAT_BUMP(metaslabstat_too_many_tries); + return (NULL); + } + tries++; + if (!metaslab_should_allocate(msp, asize, try_hard)) { metaslab_trace_add(zal, mg, msp, asize, d, TRACE_TOO_SMALL, allocator); @@ -5052,6 +5061,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, zio_alloc_list_t *zal, int allocator) { + 
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_group_t *mg, *fast_mg, *rotor; vdev_t *vd; boolean_t try_hard = B_FALSE; @@ -5073,7 +5083,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_aliquot because + * Note that there's no locking on mca_rotor or mca_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * @@ -5109,23 +5119,23 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, mg->mg_next != NULL) mg = mg->mg_next; } else { - mg = mc->mc_rotor; + mg = mca->mca_rotor; } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; } else if (flags & METASLAB_FASTWRITE) { - mg = fast_mg = mc->mc_rotor; + mg = fast_mg = mca->mca_rotor; do { if (fast_mg->mg_vd->vdev_pending_fastwrite < mg->mg_vd->vdev_pending_fastwrite) mg = fast_mg; - } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); + } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor); } else { - ASSERT(mc->mc_rotor != NULL); - mg = mc->mc_rotor; + ASSERT(mca->mca_rotor != NULL); + mg = mca->mca_rotor; } /* @@ -5133,7 +5143,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * metaslab group that has been passivated, just follow the rotor. */ if (mg->mg_class != mc || mg->mg_activation_count <= 0) - mg = mc->mc_rotor; + mg = mca->mca_rotor; rotor = mg; top: @@ -5211,7 +5221,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * Bias is also used to compensate for unequally * sized vdevs so that space is allocated fairly. 
*/ - if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { + if (mca->mca_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vs_free = vs->vs_space - vs->vs_alloc; int64_t mc_free = mc->mc_space - mc->mc_alloc; @@ -5249,10 +5259,10 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } if ((flags & METASLAB_FASTWRITE) || - atomic_add_64_nv(&mc->mc_aliquot, asize) >= + atomic_add_64_nv(&mca->mca_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; + mca->mca_rotor = mg->mg_next; + mca->mca_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -5269,14 +5279,17 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, return (0); } next: - mc->mc_rotor = mg->mg_next; - mc->mc_aliquot = 0; + mca->mca_rotor = mg->mg_next; + mca->mca_aliquot = 0; } while ((mg = mg->mg_next) != rotor); /* - * If we haven't tried hard, do so now. + * If we haven't tried hard, perhaps do so now. */ - if (!try_hard) { + if (!try_hard && (zfs_metaslab_try_hard_before_gang || + GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 || + psize <= 1 << spa->spa_min_ashift)) { + METASLABSTAT_BUMP(metaslabstat_try_hard); try_hard = B_TRUE; goto top; } @@ -5588,15 +5601,15 @@ boolean_t metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio, int flags) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; uint64_t available_slots = 0; boolean_t slot_reserved = B_FALSE; - uint64_t max = mc->mc_alloc_max_slots[allocator]; + uint64_t max = mca->mca_alloc_max_slots; ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); - uint64_t reserved_slots = - zfs_refcount_count(&mc->mc_alloc_slots[allocator]); + uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots); if (reserved_slots < max) available_slots = max - reserved_slots; @@ -5606,11 +5619,8 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int 
allocator, * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ - for (int d = 0; d < slots; d++) { - reserved_slots = - zfs_refcount_add(&mc->mc_alloc_slots[allocator], - zio); - } + for (int d = 0; d < slots; d++) + zfs_refcount_add(&mca->mca_alloc_slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; slot_reserved = B_TRUE; } @@ -5623,12 +5633,12 @@ void metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, int allocator, zio_t *zio) { + metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; + ASSERT(mc->mc_alloc_throttle_enabled); mutex_enter(&mc->mc_lock); - for (int d = 0; d < slots; d++) { - (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], - zio); - } + for (int d = 0; d < slots; d++) + zfs_refcount_remove(&mca->mca_alloc_slots, zio); mutex_exit(&mc->mc_lock); } @@ -5674,7 +5684,7 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, range_tree_remove(msp->ms_allocatable, offset, size); range_tree_clear(msp->ms_trim, offset, size); - if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */ metaslab_class_t *mc = msp->ms_group->mg_class; multilist_sublist_t *mls = multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); @@ -5721,7 +5731,7 @@ metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) metaslab_claim_cb_arg_t arg; /* - * Only zdb(1M) can claim on indirect vdevs. This is used + * Only zdb(8) can claim on indirect vdevs. This is used * to detect leaks of mapped space (that are not accounted * for in the obsolete counts, spacemap, or bpobj). 
*/ @@ -5782,7 +5792,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); - if (mc->mc_rotor == NULL) { /* no vdevs in this class */ + if (mc->mc_allocator[allocator].mca_rotor == NULL) { + /* no vdevs in this class */ spa_config_exit(spa, SCL_ALLOC, FTAG); return (SET_ERROR(ENOSPC)); } @@ -5813,7 +5824,6 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, metaslab_group_alloc_increment(spa, DVA_GET_VDEV(&dva[d]), zio, flags, allocator); } - } ASSERT(error == 0); ASSERT(BP_GET_NDVAS(bp) == ndvas); @@ -6235,3 +6245,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, "Percentage of memory that can be used to store metaslab range trees"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT, + ZMOD_RW, "Try hard to allocate before ganging"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW, + "Normally only consider this many of the best metaslabs in each vdev"); diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c index 99852521b6d1..d05c9db24c20 100644 --- a/sys/contrib/openzfs/module/zfs/mmp.c +++ b/sys/contrib/openzfs/module/zfs/mmp.c @@ -307,8 +307,17 @@ mmp_next_leaf(spa_t *spa) if (leaf == NULL) leaf = list_head(&spa->spa_leaf_list); - if (!vdev_writeable(leaf)) { + /* + * We skip unwritable, offline, detached, and dRAID spare + * devices as they are either not legal targets or the write + * may fail or not be seen by other hosts. Skipped dRAID + * spares can never be written so the fail mask is not set. 
+ */ + if (!vdev_writeable(leaf) || leaf->vdev_offline || + leaf->vdev_detached) { fail_mask |= MMP_FAIL_NOT_WRITABLE; + } else if (leaf->vdev_ops == &vdev_draid_spare_ops) { + continue; } else if (leaf->vdev_mmp_pending != 0) { fail_mask |= MMP_FAIL_WRITE_PENDING; } else { diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c index a3adfd317af6..36c0d33bf1f6 100644 --- a/sys/contrib/openzfs/module/zfs/multilist.c +++ b/sys/contrib/openzfs/module/zfs/multilist.c @@ -96,9 +96,12 @@ multilist_create_impl(size_t size, size_t offset, } /* - * Allocate a new multilist, using the default number of sublists - * (the number of CPUs, or at least 4, or the tunable - * zfs_multilist_num_sublists). + * Allocate a new multilist, using the default number of sublists (the number + * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note + * that the multilists do not expand if more CPUs are hot-added. In that case, + * we will have less fanout than boot_ncpus, but we don't want to always + * reserve the RAM necessary to create the extra slots for additional CPUs up + * front, and dynamically adding them is a complex task. */ multilist_t * multilist_create(size_t size, size_t offset, diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c index 9d1d4e0cca64..53ffbc31c186 100644 --- a/sys/contrib/openzfs/module/zfs/spa.c +++ b/sys/contrib/openzfs/module/zfs/spa.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -1280,15 +1281,15 @@ spa_activate(spa_t *spa, spa_mode_t mode) * pool traverse code from monopolizing the global (and limited) * system_taskq by inappropriately scheduling long running tasks on it. 
*/ - spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, - defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); /* * The taskq to upgrade datasets in this pool. Currently used by * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. */ - spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, - defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); } /* @@ -2110,9 +2111,6 @@ spa_passivate_log(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - if (!spa_has_slogs(spa)) - return (B_FALSE); - for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; @@ -3681,7 +3679,14 @@ spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, /* * Build a new vdev tree from the trusted config */ - VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); + if (error != 0) { + nvlist_free(mos_config); + spa_config_exit(spa, SCL_ALL, FTAG); + spa_load_failed(spa, "spa_config_parse failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } /* * Vdev paths in the MOS may be obsolete. 
If the untrusted config was @@ -5631,7 +5636,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; - uint64_t version, obj; + uint64_t version, obj, ndraid = 0; boolean_t has_features; boolean_t has_encryption; boolean_t has_allocclass; @@ -5753,8 +5758,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_aux(spa, nvroot, txg, - VDEV_ALLOC_ADD)) == 0) { + (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { /* * instantiate the metaslab groups (this will dirty the vdevs) * we can no longer error exit past this point @@ -5895,6 +5900,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_sync_props(props, tx); } + for (int i = 0; i < ndraid; i++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); + dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; @@ -6403,13 +6411,26 @@ spa_reset(const char *pool) * ========================================================================== */ +/* + * This is called as a synctask to increment the draid feature flag + */ +static void +spa_draid_feature_incr(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int draid = (int)(uintptr_t)arg; + + for (int c = 0; c < draid; c++) + spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); +} + /* * Add a device to a storage pool. 
*/ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg; + uint64_t txg, ndraid = 0; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; @@ -6438,8 +6459,23 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && - (error = vdev_create(vd, txg, B_FALSE)) != 0) + (error = vdev_create(vd, txg, B_FALSE)) != 0) { return (spa_vdev_exit(spa, vd, txg, error)); + } + + /* + * The virtual dRAID spares must be added after vdev tree is created + * and the vdev guids are generated. The guid of their associated + * dRAID is stored in the config and used when opening the spare. + */ + if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, + rvd->vdev_children)) == 0) { + if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) + nspares = 0; + } else { + return (spa_vdev_exit(spa, vd, txg, error)); + } /* * We must validate the spares and l2cache devices after checking the @@ -6452,7 +6488,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * If we are in the middle of a device removal, we can only add * devices which match the existing devices in the pool. * If we are in the middle of a removal, or have some indirect - * vdevs, we can not add raidz toplevels. + * vdevs, we can not add raidz or dRAID top levels.
*/ if (spa->spa_vdev_removal != NULL || spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { @@ -6462,10 +6498,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_ashift != spa->spa_max_ashift) { return (spa_vdev_exit(spa, vd, txg, EINVAL)); } - /* Fail if top level vdev is raidz */ - if (tvd->vdev_ops == &vdev_raidz_ops) { + /* Fail if top level vdev is raidz or a dRAID */ + if (vdev_get_nparity(tvd) != 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); - } + /* * Need the top level mirror to be * a mirror of leaf vdevs only @@ -6505,6 +6541,19 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) spa->spa_l2cache.sav_sync = B_TRUE; } + /* + * We can't increment a feature while holding spa_vdev so we + * have to do it in a synctask. + */ + if (ndraid != 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, + (void *)(uintptr_t)ndraid, tx); + dmu_tx_commit(tx); + } + /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we @@ -6615,14 +6664,27 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + /* + * A dRAID spare can only replace a child of its parent dRAID vdev. + */ + if (newvd->vdev_ops == &vdev_draid_spare_ops && + oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + if (rebuild) { /* - * For rebuilds, the parent vdev must support reconstruction + * For rebuilds, the top vdev must support reconstruction * using only space maps. This means the only allowable - * parents are the root vdev or a mirror vdev. + * vdev types are the root vdev, a mirror, or dRAID.
*/ - if (pvd->vdev_ops != &vdev_mirror_ops && - pvd->vdev_ops != &vdev_root_ops) { + tvd = pvd; + if (pvd->vdev_top != NULL) + tvd = pvd->vdev_top; + + if (tvd->vdev_ops != &vdev_mirror_ops && + tvd->vdev_ops != &vdev_root_ops && + tvd->vdev_ops != &vdev_draid_ops) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } } @@ -6915,14 +6977,20 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) } /* - * If we are detaching the original disk from a spare, then it implies - * that the spare should become a real disk, and be removed from the - * active spare list for the pool. + * If we are detaching the original disk from a normal spare, then it + * implies that the spare should become a real disk, and be removed + * from the active spare list for the pool. dRAID spares on the + * other hand are coupled to the pool and thus should never be removed + * from the spares list. */ - if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0 && - pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) - unspare = B_TRUE; + if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { + vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; + + if (last_cvd->vdev_isspare && + last_cvd->vdev_ops != &vdev_draid_spare_ops) { + unspare = B_TRUE; + } + } /* * Erase the disk labels so the disk can be used for other things. @@ -7903,6 +7971,9 @@ spa_async_remove(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); + + /* Tell userspace that the vdev is gone. */ + zfs_post_remove(spa, vd); } for (int c = 0; c < vd->vdev_children; c++) @@ -8013,18 +8084,9 @@ spa_async_thread(void *arg) /* * If any devices are done replacing, detach them. */ - if (tasks & SPA_ASYNC_RESILVER_DONE) + if (tasks & SPA_ASYNC_RESILVER_DONE || + tasks & SPA_ASYNC_REBUILD_DONE) { spa_vdev_resilver_done(spa); - - /* - * If any devices are done replacing, detach them. 
Then if no - * top-level vdevs are rebuilding attempt to kick off a scrub. - */ - if (tasks & SPA_ASYNC_REBUILD_DONE) { - spa_vdev_resilver_done(spa); - - if (!vdev_rebuild_active(spa->spa_root_vdev)) - (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); } /* @@ -8818,12 +8880,18 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) } for (int i = 0; i < spa->spa_alloc_count; i++) { - ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); - ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); - normal->mc_alloc_max_slots[i] = slots_per_allocator; - special->mc_alloc_max_slots[i] = slots_per_allocator; - dedup->mc_alloc_max_slots[i] = slots_per_allocator; + ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. + mca_alloc_slots)); + ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. + mca_alloc_slots)); + ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. + mca_alloc_slots)); + normal->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; + special->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; + dedup->mc_allocator[i].mca_alloc_max_slots = + slots_per_allocator; } normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c index 2ab58815400a..2939c0366504 100644 --- a/sys/contrib/openzfs/module/zfs/spa_history.c +++ b/sys/contrib/openzfs/module/zfs/spa_history.c @@ -321,7 +321,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) * posted as a result of the ZPOOL_HIST_CMD key being present * it would result in only one sysevent being posted with the * full command line arguments, requiring the consumer to know - * how to parse and understand zfs(1M) command invocations. + * how to parse and understand zfs(8) command invocations. 
*/ spa_history_log_notify(spa, nvl); } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c index 04210472886c..f49be8eec01a 100644 --- a/sys/contrib/openzfs/module/zfs/spa_misc.c +++ b/sys/contrib/openzfs/module/zfs/spa_misc.c @@ -741,6 +741,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; + spa->spa_min_alloc = INT_MAX; /* Reset cached value */ spa->spa_dedup_dspace = ~0ULL; @@ -1366,7 +1367,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) /* * If anything changed, wait for it to sync. This ensures that, - * from the system administrator's perspective, zpool(1M) commands + * from the system administrator's perspective, zpool(8) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ @@ -1807,10 +1808,11 @@ spa_update_dspace(spa_t *spa) ddt_get_dedup_dspace(spa); if (spa->spa_vdev_removal != NULL) { /* - * We can't allocate from the removing device, so - * subtract its size. This prevents the DMU/DSL from - * filling up the (now smaller) pool while we are in the - * middle of removing the device. + * We can't allocate from the removing device, so subtract + * its size if it was included in dspace (i.e. if this is a + * normal-class vdev, not special/dedup). This prevents the + * DMU/DSL from filling up the (now smaller) pool while we + * are in the middle of removing the device. * * Note that the DMU/DSL doesn't actually know or care * how much space is allocated (it does its own tracking @@ -1822,8 +1824,10 @@ spa_update_dspace(spa_t *spa) spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); - spa->spa_dspace -= spa_deflate(spa) ? 
- vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + if (vd->vdev_mg->mg_class == spa_normal_class(spa)) { + spa->spa_dspace -= spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + } spa_config_exit(spa, SCL_VDEV, FTAG); } } @@ -2435,7 +2439,7 @@ spa_fini(void) boolean_t spa_has_slogs(spa_t *spa) { - return (spa->spa_log_class->mc_rotor != NULL); + return (spa->spa_log_class->mc_groups != 0); } spa_log_state_t diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c index 65375b579da6..3efd26155014 100644 --- a/sys/contrib/openzfs/module/zfs/txg.c +++ b/sys/contrib/openzfs/module/zfs/txg.c @@ -305,9 +305,7 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) * significance to the chosen tx_cpu. Because.. Why not use * the current cpu to index into the array? */ - kpreempt_disable(); - tc = &tx->tx_cpu[CPU_SEQID]; - kpreempt_enable(); + tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE]; mutex_enter(&tc->tc_open_lock); txg = tx->tx_open_txg; @@ -448,8 +446,9 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) * Commit callback taskq hasn't been created yet. 
*/ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", - boot_ncpus, defclsyspri, boot_ncpus, boot_ncpus * 2, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + 100, defclsyspri, boot_ncpus, boot_ncpus * 2, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC | + TASKQ_THREADS_CPU_PCT); } cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c index 6af61cdcd9bf..7ffe924212da 100644 --- a/sys/contrib/openzfs/module/zfs/vdev.c +++ b/sys/contrib/openzfs/module/zfs/vdev.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -193,6 +195,8 @@ vdev_dbgmsg_print_tree(vdev_t *vd, int indent) static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, + &vdev_draid_ops, + &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, @@ -221,15 +225,16 @@ vdev_getops(const char *type) /* ARGSUSED */ void -vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res) +vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { - res->rs_start = in->rs_start; - res->rs_end = in->rs_end; + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. - * String origin is either the per-vdev zap or zpool(1M). + * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) @@ -264,6 +269,12 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) return (asize); } +uint64_t +vdev_default_min_asize(vdev_t *vd) +{ + return (vd->vdev_min_asize); +} + /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. 
This allows us to @@ -289,15 +300,7 @@ vdev_get_min_asize(vdev_t *vd) if (vd == vd->vdev_top) return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); - /* - * The allocatable space for a raidz vdev is N * sizeof(smallest child), - * so each child must provide at least 1/Nth of its asize. - */ - if (pvd->vdev_ops == &vdev_raidz_ops) - return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / - pvd->vdev_children); - - return (pvd->vdev_min_asize); + return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void @@ -309,6 +312,48 @@ vdev_set_min_asize(vdev_t *vd) vdev_set_min_asize(vd->vdev_child[c]); } +/* + * Get the minimal allocation size for the top-level vdev. + */ +uint64_t +vdev_get_min_alloc(vdev_t *vd) +{ + uint64_t min_alloc = 1ULL << vd->vdev_ashift; + + if (vd->vdev_ops->vdev_op_min_alloc != NULL) + min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); + + return (min_alloc); +} + +/* + * Get the parity level for a top-level vdev. + */ +uint64_t +vdev_get_nparity(vdev_t *vd) +{ + uint64_t nparity = 0; + + if (vd->vdev_ops->vdev_op_nparity != NULL) + nparity = vd->vdev_ops->vdev_op_nparity(vd); + + return (nparity); +} + +/* + * Get the number of data disks for a top-level vdev. 
+ */ +uint64_t +vdev_get_ndisks(vdev_t *vd) +{ + uint64_t ndisks = 1; + + if (vd->vdev_ops->vdev_op_ndisks != NULL) + ndisks = vd->vdev_ops->vdev_op_ndisks(vd); + + return (ndisks); +} + vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { @@ -551,6 +596,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); @@ -569,9 +615,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); - cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, @@ -600,7 +644,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, { vdev_ops_t *ops; char *type; - uint64_t guid = 0, islog, nparity; + uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; char *tmp = NULL; @@ -657,48 +701,13 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); - /* - * Set the nparity property for RAID-Z vdevs. - */ - nparity = -1ULL; - if (ops == &vdev_raidz_ops) { - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, - &nparity) == 0) { - if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) - return (SET_ERROR(EINVAL)); - /* - * Previous versions could only support 1 or 2 parity - * device. 
- */ - if (nparity > 1 && - spa_version(spa) < SPA_VERSION_RAIDZ2) - return (SET_ERROR(ENOTSUP)); - if (nparity > 2 && - spa_version(spa) < SPA_VERSION_RAIDZ3) - return (SET_ERROR(ENOTSUP)); - } else { - /* - * We require the parity to be specified for SPAs that - * support multiple parity levels. - */ - if (spa_version(spa) >= SPA_VERSION_RAIDZ2) - return (SET_ERROR(EINVAL)); - /* - * Otherwise, we default to 1 parity device for RAID-Z. - */ - nparity = 1; - } - } else { - nparity = 0; - } - ASSERT(nparity != -1ULL); - - /* - * If creating a top-level vdev, check for allocation classes input - */ if (top_level && alloctype == VDEV_ALLOC_ADD) { char *bias; + /* + * If creating a top-level vdev, check for allocation + * classes input. + */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); @@ -710,13 +719,32 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); } } + + /* spa_vdev_add() expects feature to be enabled */ + if (ops == &vdev_draid_ops && + spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { + return (SET_ERROR(ENOTSUP)); + } + } + + /* + * Initialize the vdev specific data. This is done before calling + * vdev_alloc_common() since it may fail and this simplifies the + * error reporting and cleanup code paths. 
+ */ + void *tsd = NULL; + if (ops->vdev_op_init != NULL) { + rc = ops->vdev_op_init(spa, nv, &tsd); + if (rc != 0) { + return (rc); + } } vd = vdev_alloc_common(spa, id, guid, ops); - vic = &vd->vdev_indirect_config; - + vd->vdev_tsd = tsd; vd->vdev_islog = islog; - vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; @@ -756,6 +784,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; + vic = &vd->vdev_indirect_config; + ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); @@ -937,6 +967,9 @@ vdev_free(vdev_t *vd) ASSERT(vd->vdev_child == NULL); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + if (vd->vdev_ops->vdev_op_fini != NULL) + vd->vdev_ops->vdev_op_fini(vd); + /* * Discard allocation state. */ @@ -1028,9 +1061,7 @@ vdev_free(vdev_t *vd) cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); - mutex_destroy(&vd->vdev_rebuild_io_lock); cv_destroy(&vd->vdev_rebuild_cv); - cv_destroy(&vd->vdev_rebuild_io_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); @@ -1161,7 +1192,8 @@ vdev_top_update(vdev_t *tvd, vdev_t *vd) } /* - * Add a mirror/replacing vdev above an existing vdev. + * Add a mirror/replacing vdev above an existing vdev. There is no need to + * call .vdev_op_init() since mirror/replacing vdevs do not have private state. 
*/ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) @@ -1296,6 +1328,10 @@ vdev_metaslab_group_create(vdev_t *vd) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; + + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; } } } @@ -1622,39 +1658,67 @@ vdev_uses_zvols(vdev_t *vd) return (B_FALSE); } +/* + * Returns B_TRUE if the passed child should be opened. + */ +static boolean_t +vdev_default_open_children_func(vdev_t *vd) +{ + return (B_TRUE); +} + +/* + * Open the requested child vdevs. If any of the leaf vdevs are using + * a ZFS volume then do the opens in a single thread. This avoids a + * deadlock when the current thread is holding the spa_namespace_lock. + */ +static void +vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) +{ + int children = vd->vdev_children; + + taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + vd->vdev_nonrot = B_TRUE; + + for (int c = 0; c < children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (open_func(cvd) == B_FALSE) + continue; + + if (tq == NULL || vdev_uses_zvols(vd)) { + cvd->vdev_open_error = vdev_open(cvd); + } else { + VERIFY(taskq_dispatch(tq, vdev_open_child, + cvd, TQ_SLEEP) != TASKQID_INVALID); + } + + vd->vdev_nonrot &= cvd->vdev_nonrot; + } + + if (tq != NULL) { + taskq_wait(tq); + taskq_destroy(tq); + } +} + +/* + * Open all child vdevs. 
+ */ void vdev_open_children(vdev_t *vd) { - taskq_t *tq; - int children = vd->vdev_children; + vdev_open_children_impl(vd, vdev_default_open_children_func); +} - /* - * in order to handle pools on top of zvols, do the opens - * in a single thread so that the same thread holds the - * spa_namespace_lock - */ - if (vdev_uses_zvols(vd)) { -retry_sync: - for (int c = 0; c < children; c++) - vd->vdev_child[c]->vdev_open_error = - vdev_open(vd->vdev_child[c]); - } else { - tq = taskq_create("vdev_open", children, minclsyspri, - children, children, TASKQ_PREPOPULATE); - if (tq == NULL) - goto retry_sync; - - for (int c = 0; c < children; c++) - VERIFY(taskq_dispatch(tq, vdev_open_child, - vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); - - taskq_destroy(tq); - } - - vd->vdev_nonrot = B_TRUE; - - for (int c = 0; c < children; c++) - vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; +/* + * Conditionally open a subset of child vdevs. + */ +void +vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) +{ + vdev_open_children_impl(vd, open_func); } /* @@ -1952,6 +2016,16 @@ vdev_open(vdev_t *vd) return (error); } + /* + * Track the minimum allocation size. + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + vd->vdev_islog == 0 && vd->vdev_aux == NULL) { + uint64_t min_alloc = vdev_get_min_alloc(vd); + if (min_alloc < spa->spa_min_alloc) + spa->spa_min_alloc = min_alloc; + } + /* * If this is a leaf vdev, assess whether a resilver is needed.
* But don't do this if we are doing a reopen for a scrub, since @@ -2278,7 +2352,9 @@ vdev_close(vdev_t *vd) vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; - ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd != NULL); + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are @@ -2575,15 +2651,12 @@ vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) /* * While we are loading the pool, the DTLs have not been loaded yet. - * Ignore the DTLs and try all devices. This avoids a recursive - * mutex enter on the vdev_dtl_lock, and also makes us try hard - * when loading the pool (relying on the checksum to ensure that - * we get the right data -- note that we while loading, we are - * only reading the MOS, which is always checksummed). + * This isn't a problem but it can result in devices being tried + * which are known to not have the data. In which case, the import + * is relying on the checksum to ensure that we get the right data. + * Note that while importing we are only reading the MOS, which is + * always checksummed. */ - if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE) - return (B_FALSE); - mutex_enter(&vd->vdev_dtl_lock); if (!range_tree_is_empty(rt)) dirty = range_tree_contains(rt, txg, size); @@ -2606,10 +2679,26 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) } /* - * Returns B_TRUE if vdev determines offset needs to be resilvered. + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. */ boolean_t -vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + /* Set by sequential resilver. 
*/ + if (phys_birth == TXG_UNKNOWN) + return (B_TRUE); + + return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); +} + +/* + * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. + */ +boolean_t +vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); @@ -2617,7 +2706,8 @@ vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) vd->vdev_ops->vdev_op_leaf) return (B_TRUE); - return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); + return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, + phys_birth)); } /* @@ -2862,8 +2952,8 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, continue; /* leaf vdevs only */ if (t == DTL_PARTIAL) minref = 1; /* i.e. non-zero */ - else if (vd->vdev_nparity != 0) - minref = vd->vdev_nparity + 1; /* RAID-Z */ + else if (vdev_get_nparity(vd) != 0) + minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */ else minref = vd->vdev_children; /* any kind of mirror */ space_reftree_create(&reftree); @@ -2884,6 +2974,7 @@ vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; + range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { @@ -2895,10 +2986,17 @@ vdev_dtl_load(vdev_t *vd) return (error); ASSERT(vd->vdev_dtl_sm != NULL); - mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(vd->vdev_dtl_sm, - vd->vdev_dtl[DTL_MISSING], SM_ALLOC); - mutex_exit(&vd->vdev_dtl_lock); + rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); + if (error == 0) { + mutex_enter(&vd->vdev_dtl_lock); + range_tree_walk(rt, range_tree_add, + vd->vdev_dtl[DTL_MISSING]); + mutex_exit(&vd->vdev_dtl_lock); + } + + range_tree_vacate(rt, NULL, NULL); + range_tree_destroy(rt); return (error); } @@ -3727,6 +3825,9 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) if 
(!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; @@ -3971,6 +4072,13 @@ vdev_accessible(vdev_t *vd, zio_t *zio) static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { + /* + * Exclude the dRAID spare when aggregating to avoid double counting + * the ops and bytes. These IOs are counted by the physical leaves. + */ + if (cvd->vdev_ops == &vdev_draid_spare_ops) + return; + for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; @@ -4063,7 +4171,6 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); - } } else { /* @@ -4248,7 +4355,9 @@ vdev_stat_update(zio_t *zio, uint64_t psize) /* * Repair is the result of a rebuild issued by the - * rebuild thread (vdev_rebuild_thread). + * rebuild thread (vdev_rebuild_thread). To avoid + * double counting repaired bytes the virtual dRAID + * spare vdev is excluded from the processed bytes. 
*/ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; @@ -4256,8 +4365,10 @@ vdev_stat_update(zio_t *zio, uint64_t psize) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; - if (vd->vdev_ops->vdev_op_leaf) + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); + } vs->vs_rebuild_processed += psize; } @@ -4353,8 +4464,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize) if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; - if (spa->spa_load_state == SPA_LOAD_NONE && - type == ZIO_TYPE_WRITE && txg != 0 && + if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { @@ -4981,31 +5091,42 @@ vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) vdev_resilver_needed(vd, NULL, NULL)); } +boolean_t +vdev_xlate_is_empty(range_seg64_t *rs) +{ + return (rs->rs_start == rs->rs_end); +} + /* - * Translate a logical range to the physical range for the specified vdev_t. - * This function is initially called with a leaf vdev and will walk each - * parent vdev until it reaches a top-level vdev. Once the top-level is - * reached the physical range is initialized and the recursive function - * begins to unwind. As it unwinds it calls the parent's vdev specific - * translation function to do the real conversion. + * Translate a logical range to the first contiguous physical range for the + * specified vdev_t. This function is initially called with a leaf vdev and + * will walk each parent vdev until it reaches a top-level vdev. Once the + * top-level is reached the physical range is initialized and the recursive + * function begins to unwind. As it unwinds it calls the parent's vdev + * specific translation function to do the real conversion. 
*/ void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, - range_seg64_t *physical_rs) + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { - vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, + remain_rs); } else { /* - * We've reached the top-level vdev, initialize the - * physical range to the logical range and start to - * unwind. + * We've reached the top-level vdev, initialize the physical + * range to the logical range and set an empty remaining + * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; + + remain_rs->rs_start = logical_rs->rs_start; + remain_rs->rs_end = logical_rs->rs_start; + return; } @@ -5015,16 +5136,40 @@ vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, /* * As this recursive function unwinds, translate the logical - * range into its physical components by calling the - * vdev specific translate function. + * range into its physical and any remaining components by calling + * the vdev specific translate function. */ range_seg64_t intermediate = { 0 }; - pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } +void +vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs, + vdev_xlate_func_t *func, void *arg) +{ + range_seg64_t iter_rs = *logical_rs; + range_seg64_t physical_rs; + range_seg64_t remain_rs; + + while (!vdev_xlate_is_empty(&iter_rs)) { + + vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); + + /* + * With raidz and dRAID, it's possible that the logical range + * does not live on this leaf vdev. Only when there is a non- + * zero physical size call the provided function. 
+ */ + if (!vdev_xlate_is_empty(&physical_rs)) + func(arg, &physical_rs); + + iter_rs = remain_rs; + } +} + /* * Look at the vdev tree and determine whether any devices are currently being * replaced. diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c new file mode 100644 index 000000000000..6b7ad7021a50 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c @@ -0,0 +1,2984 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef ZFS_DEBUG +#include /* For vdev_xlate() in vdev_draid_io_verify() */ +#endif + +/* + * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is + * comprised of multiple raidz redundancy groups which are spread over the + * dRAID children. To ensure an even distribution, and avoid hot spots, a + * permutation mapping is applied to the order of the dRAID children. 
+ * This mixing effectively distributes the parity columns evenly over all + * of the disks in the dRAID. + * + * This is beneficial because it means when resilvering all of the disks + * can participate thereby increasing the available IOPs and bandwidth. + * Furthermore, by reserving a small fraction of each child's total capacity + * virtual distributed spare disks can be created. These spares similarly + * benefit from the performance gains of spanning all of the children. The + * consequence of which is that resilvering to a distributed spare can + * substantially reduce the time required to restore full parity to a pool + * with failed disks. + * + * === dRAID group layout === + * + * First, let's define a "row" in the configuration to be a 16M chunk from + * each physical drive at the same offset. This is the minimum allowable + * size since it must be possible to store a full 16M block when there is + * only a single data column. Next, we define a "group" to be a set of + * sequential disks containing both the parity and data columns. We allow + * groups to span multiple rows in order to align any group size to any + * number of physical drives. Finally, a "slice" is comprised of the rows + * which contain the target number of groups. The permutation mappings + * are applied in a round robin fashion to each slice. + * + * Given D+P drives in a group (including parity drives) and C-S physical + * drives (not including the spare drives), we can distribute the groups + * across R rows without remainder by making a slice LCM(D+P, C-S) columns + * in total; i.e. ngroups = LCM(D+P, C-S) / (D+P), R = LCM(D+P, C-S) / (C-S). + * + * In the example below, there are C=14 physical drives in the configuration + * with S=2 drives worth of spare capacity. Each group has a width of 9 + * which includes D=8 data and P=1 parity drive. There are 4 groups and + * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice + * size is 576M (144M * 4).
When allocating from a dRAID each group is + * filled before moving on to the next as shown in slice0 below. + * + * data disks (8 data + 1 parity) spares (2) + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0 + * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | | group 0 | group 1..| | + * | +-----------------------------------+-----------+-------| + * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r + * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o + * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w + * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0 + * s +-----------------------+-----------------------+-------+ + * l | ..group 1 | group 2.. | | + * i +-----------------------+-----------------------+-------+ + * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r + * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o + * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w + * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1 + * | +-----------+-----------+-----------------------+-------+ + * | |..group 2 | group 3 | | + * | +-----------+-----------+-----------------------+-------+ + * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r + * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o + * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w + * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 4 | group 5..| | row 3 + * i +-----------------------+-----------+-----------+-------| + * c | ..group 5 | group 6..
| | row 4 + * e +-----------+-----------+-----------------------+-------+ + * 1 |..group 6 | group 7 | | row 5 + * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2 + * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+ + * l | group 8 | group 9..| | row 6 + * i +-----------------------------------------------+-------| + * c | ..group 9 | group 10.. | | row 7 + * e +-----------------------+-----------------------+-------+ + * 2 |..group 10 | group 11 | | row 8 + * +-----------+-----------------------------------+-------+ + * + * This layout has several advantages over requiring that each row contain + * a whole number of groups. + * + * 1. The group count is not a relevant parameter when defining a dRAID + * layout. Only the group width is needed, and *all* groups will have + * the desired size. + * + * 2. All possible group widths (<= physical disk count) can be supported. + * + * 3. The logic within vdev_draid.c is simplified when the group width is + * the same for all groups (although some of the logic around computing + * permutation numbers and drive offsets is more complicated). + * + * N.B. The following array describes all valid dRAID permutation maps. + * Each row is used to generate a permutation map for a different number + * of children from a unique seed. The seeds were generated and carefully + * evaluated by the 'draid' utility in order to provide balanced mappings. + * In addition to the seed a checksum of the in-memory mapping is stored + * for verification. + * + * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed, + * with a given permutation map) is the ratio of the amounts of I/O that will + * be sent to the least and most busy disks when resilvering. The average + * imbalance ratio (of a given number of disks and permutation map) is the + * average of the ratios of all possible single and double disk failures. 
+ * + * In order to achieve a low imbalance ratio the number of permutations in + * the mapping must be significantly larger than the number of children. + * For dRAID the number of permutations has been limited to 512 to minimize + * the map size. This does result in a gradually increasing imbalance ratio + * as seen in the table below. Increasing the number of permutations for + * larger child counts would reduce the imbalance ratio. However, in practice + * when there are a large number of children each child is responsible for + * fewer total IOs so it's less of a concern. + * + * Note these values are hard coded and must never be changed. Existing + * pools depend on the same mapping always being generated in order to + * read and write from the correct locations. Any change would make + * existing pools completely inaccessible. + */ +static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = { + { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */ + { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */ + { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */ + { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */ + { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */ + { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */ + { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */ + { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */ + { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */ + { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */ + { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */ + { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */ + { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */ + { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */ + { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */ + { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */ + { 18, 256, 
0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */ + { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */ + { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */ + { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */ + { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */ + { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */ + { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */ + { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */ + { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */ + { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */ + { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */ + { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */ + { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */ + { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */ + { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */ + { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */ + { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */ + { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */ + { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */ + { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */ + { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */ + { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */ + { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */ + { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */ + { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */ + { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */ + { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */ + { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */ + { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */ + { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */ + 
{ 48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */ + { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */ + { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */ + { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */ + { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */ + { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */ + { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */ + { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */ + { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */ + { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */ + { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */ + { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */ + { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */ + { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */ + { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */ + { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */ + { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */ + { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */ + { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */ + { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */ + { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */ + { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */ + { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */ + { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */ + { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */ + { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */ + { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */ + { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */ + { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */ + { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 
1.527 */ + { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */ + { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */ + { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */ + { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */ + { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */ + { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */ + { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */ + { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */ + { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */ + { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */ + { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */ + { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */ + { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */ + { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */ + { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */ + { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */ + { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */ + { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */ + { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */ + { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */ + { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */ + { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */ + { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */ + { 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */ + { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */ + { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */ + { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */ + { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */ + { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */ + { 107, 512, 0xca09c50ccee2d054, 
0x00000b1c359c047d }, /* 1.788 */ + { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */ + { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */ + { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */ + { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */ + { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */ + { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */ + { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */ + { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */ + { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */ + { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */ + { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */ + { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */ + { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */ + { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */ + { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */ + { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */ + { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */ + { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */ + { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */ + { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */ + { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */ + { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */ + { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */ + { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */ + { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */ + { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */ + { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */ + { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */ + { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 
2.005 */ + { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */ + { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */ + { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */ + { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */ + { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */ + { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */ + { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */ + { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */ + { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */ + { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */ + { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */ + { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */ + { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */ + { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */ + { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */ + { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */ + { 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */ + { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */ + { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */ + { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */ + { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */ + { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */ + { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */ + { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */ + { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */ + { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */ + { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */ + { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */ + { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */ + { 166, 512, 
0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */ + { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */ + { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */ + { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */ + { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */ + { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */ + { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */ + { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */ + { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */ + { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */ + { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */ + { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */ + { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */ + { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */ + { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */ + { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */ + { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */ + { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */ + { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */ + { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */ + { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */ + { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */ + { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */ + { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */ + { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */ + { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */ + { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */ + { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */ + { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */ + { 195, 512, 0x08d903b4daedc2e0, 
0x0000250d1e15886c }, /* 2.363 */ + { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */ + { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */ + { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */ + { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */ + { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */ + { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */ + { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */ + { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */ + { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */ + { 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */ + { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */ + { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */ + { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */ + { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */ + { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */ + { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */ + { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */ + { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */ + { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */ + { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */ + { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */ + { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */ + { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */ + { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */ + { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */ + { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */ + { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */ + { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */ + { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 
3.259 */ + { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */ + { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */ + { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */ + { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */ + { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */ + { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */ + { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */ + { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */ + { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */ + { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */ + { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */ + { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */ + { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */ + { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */ + { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */ + { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */ + { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */ + { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */ + { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */ + { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */ + { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */ + { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */ + { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */ + { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */ + { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */ + { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */ + { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */ + { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */ + { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */ + { 254, 512, 
0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */ + { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */ +}; + +/* + * Verify the map is valid. Each device index must appear exactly + * once in every row, and the permutation array checksum must match. + * + * counts[v] records how many rows have contained value v so far. While + * scanning row i every value must be below children and still have a + * count of exactly i, so a duplicate or out-of-range value fails the + * check. The checksum comparison is skipped when checksum is 0; + * otherwise it is compared against the first word of the fletcher-4 + * checksum computed over the whole perms array. + * + * Returns 0 on success, EINVAL for a malformed row, or ECKSUM when the + * checksum does not match. + */ +static int +verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms, + uint64_t checksum) +{ + int countssz = sizeof (uint16_t) * children; + uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP); + + for (int i = 0; i < nperms; i++) { + for (int j = 0; j < children; j++) { + uint8_t val = perms[(i * children) + j]; + + if (val >= children || counts[val] != i) { + kmem_free(counts, countssz); + return (EINVAL); + } + + counts[val]++; + } + } + + if (checksum != 0) { + int permssz = sizeof (uint8_t) * children * nperms; + zio_cksum_t cksum; + + fletcher_4_native_varsize(perms, permssz, &cksum); + + if (checksum != cksum.zc_word[0]) { + kmem_free(counts, countssz); + return (ECKSUM); + } + } + + kmem_free(counts, countssz); + + return (0); +} + +/* + * Generate the permutation array for the draid_map_t. These maps control + * the placement of all data in a dRAID. Therefore it's critical that the + * seed always generates the same mapping. We provide our own pseudo-random + * number generator for this purpose. + * + * On success 0 is returned and *permsp is set to a vmem_alloc()'d array of + * dm_nperms rows of dm_children bytes each. If verify_perms() rejects the + * generated array it is freed and that error is returned instead. + */ +int +vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) +{ + VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN); + VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN); + VERIFY3U(map->dm_seed, !=, 0); + VERIFY3U(map->dm_nperms, !=, 0); + VERIFY3P(map->dm_perms, ==, NULL); + +#ifdef _KERNEL + /* + * The kernel code always provides both a map_seed and checksum. + * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide + * a zero checksum when generating new candidate maps.
+ */ + VERIFY3U(map->dm_checksum, !=, 0); +#endif + uint64_t children = map->dm_children; + uint64_t nperms = map->dm_nperms; + int rowsz = sizeof (uint8_t) * children; + int permssz = rowsz * nperms; + uint8_t *perms; + + /* Allocate the permutation array */ + perms = vmem_alloc(permssz, KM_SLEEP); + + /* Setup an initial row with a known pattern */ + uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP); + for (int i = 0; i < children; i++) + initial_row[i] = i; + + uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed }; + uint8_t *current_row, *previous_row = initial_row; + + /* + * Perform a Fisher-Yates shuffle of each row using the previous + * row as the starting point. An initial_row with known pattern + * is used as the input for the first row. + */ + for (int i = 0; i < nperms; i++) { + current_row = &perms[i * children]; + memcpy(current_row, previous_row, rowsz); + + for (int j = children - 1; j > 0; j--) { + uint64_t k = vdev_draid_rand(draid_seed) % (j + 1); + uint8_t val = current_row[j]; + current_row[j] = current_row[k]; + current_row[k] = val; + } + + previous_row = current_row; + } + + kmem_free(initial_row, rowsz); + + int error = verify_perms(perms, children, nperms, map->dm_checksum); + if (error) { + vmem_free(perms, permssz); + return (error); + } + + *permsp = perms; + + return (0); +} + +/* + * Lookup the fixed draid_map_t for the requested number of children. + */ +int +vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) +{ + for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + if (draid_maps[i].dm_children == children) { + *mapp = &draid_maps[i]; + return (0); + } + } + + return (ENOENT); +} + +/* + * Lookup the permutation array and iteration id for the provided offset. 
+ */ +static void +vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex, + uint8_t **base, uint64_t *iter) +{ + uint64_t ncols = vdc->vdc_children; + uint64_t poff = pindex % (vdc->vdc_nperms * ncols); + + *base = vdc->vdc_perms + (poff / ncols) * ncols; /* selected row */ + *iter = poff % ncols; /* remainder within that row */ +} + +static inline uint64_t +vdev_draid_permute_id(vdev_draid_config_t *vdc, + uint8_t *base, uint64_t iter, uint64_t index) +{ + return ((base[index] + iter) % vdc->vdc_children); +} + +/* + * Return the asize which is the psize rounded up to a full group width. + * i.e. vdev_draid_psize_to_asize(). + * + * The row count is psize divided by the data bytes held by one row + * (vdc_ndata << ashift), rounded up. Each row then consumes + * vdc_groupwidth units of (1 << ashift) bytes. + * + * NOTE(review): (psize - 1) wraps when psize is 0; presumably callers + * never pass a zero psize - confirm. + */ +static uint64_t +vdev_draid_asize(vdev_t *vd, uint64_t psize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + uint64_t ashift = vd->vdev_ashift; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1; + uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift; + + ASSERT3U(asize, !=, 0); + ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0); + + return (asize); +} + +/* + * Deflate the asize to the psize, this includes stripping parity. + * + * asize must be a multiple of vdc_groupwidth (asserted). The result is + * asize scaled by vdc_ndata / vdc_groupwidth. + */ +uint64_t +vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT0(asize % vdc->vdc_groupwidth); + + return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata); +} + +/* + * Convert a logical offset to the corresponding group number. + * + * The group number is the offset divided by vdc_groupsz, rounded down. + * vd must be a dRAID vdev (asserted). + */ +static uint64_t +vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops); + + return (offset / vdc->vdc_groupsz); +} + +/* + * Convert a group number to the logical starting offset for that group.
+ */
+static uint64_t
+vdev_draid_group_to_offset(vdev_t *vd, uint64_t group)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+	/* Inverse of vdev_draid_offset_to_group() for group aligned offsets. */
+	return (group * vdc->vdc_groupsz);
+}
+
+/*
+ * Release callback for the raidz map attached to the zio (installed as
+ * .vsd_free in vdev_draid_vsd_ops).  The map is marked as freed, but it is
+ * only passed to vdev_raidz_map_free() here when no checksum report still
+ * references it.  Otherwise the final vdev_draid_cksum_free() call, which
+ * checks rm_freed, performs the free.
+ */
+static void
+vdev_draid_map_free_vsd(zio_t *zio)
+{
+	raidz_map_t *rm = zio->io_vsd;
+
+	ASSERT0(rm->rm_freed);
+	rm->rm_freed = B_TRUE;
+
+	if (rm->rm_reports == 0) {
+		vdev_raidz_map_free(rm);
+	}
+}
+
+/*ARGSUSED*/
+static void
+vdev_draid_cksum_free(void *arg, size_t ignored)
+{
+	raidz_map_t *rm = arg;
+	/*
+	 * Drop one report reference.  The map is released once the count
+	 * reaches zero, provided vdev_draid_map_free_vsd() has already
+	 * marked it as freed.
+	 */
+	ASSERT3U(rm->rm_reports, >, 0);
+
+	if (--rm->rm_reports == 0 && rm->rm_freed)
+		vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
+{
+	raidz_map_t *rm = zcr->zcr_cbdata;
+	const size_t c = zcr->zcr_cbinfo;	/* column being reported */
+	uint64_t skip_size = zcr->zcr_sector;
+	uint64_t parity_size;
+	size_t x, offset, size;
+	/*
+	 * Finish the checksum report for column 'c' of the map stored in
+	 * zcr_cbdata.  The buffer that was read for the column ('bad') is
+	 * handed to zfs_ereport_finish_checksum() together with the matching
+	 * 'good' buffer: for a parity column that is parity regenerated from
+	 * good_data, for a data column it is the column's slice of good_data.
+	 * Without good_data the report is finished with no buffers at all.
+	 */
+	if (good_data == NULL) {
+		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+		return;
+	}
+
+	/*
+	 * Detailed cksum reporting is currently only supported for single
+	 * row draid mappings, this covers the vast majority of zios.  Only
+	 * a dRAID zio which spans groups will have multiple rows.
+	 */
+	if (rm->rm_nrows != 1) {
+		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+		return;
+	}
+
+	raidz_row_t *rr = rm->rm_row[0];
+	const abd_t *good = NULL;
+	const abd_t *bad = rr->rr_col[c].rc_abd;
+
+	if (c < rr->rr_firstdatacol) {
+		/*
+		 * The reported column is a parity column.
+		 *
+		 * The first time through, calculate the parity blocks for
+		 * the good data (this relies on the fact that the good
+		 * data never changes for a given logical zio).  The result
+		 * is kept in rc_gdata, so a non-NULL rc_gdata on column 0
+		 * means the work below has already been done.
+		 */
+		if (rr->rr_col[0].rc_gdata == NULL) {
+			abd_t *bad_parity[VDEV_DRAID_MAXPARITY];
+
+			/*
+			 * Set up the rr_col[]s to generate the parity for
+			 * good_data, first saving the parity bufs and
+			 * replacing them with buffers to hold the result.
+			 */
+			for (x = 0; x < rr->rr_firstdatacol; x++) {
+				bad_parity[x] = rr->rr_col[x].rc_abd;
+				rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
+				    abd_alloc_sametype(rr->rr_col[x].rc_abd,
+				    rr->rr_col[x].rc_size);
+			}
+
+			/*
+			 * Fill in the data columns from good_data being
+			 * careful to pad short columns and empty columns
+			 * with a skip sector.  'x' continues from the first
+			 * data column and 'offset' tracks how much of
+			 * good_data has been consumed so far.
+			 */
+			uint64_t good_size = abd_get_size((abd_t *)good_data);
+
+			offset = 0;
+			for (; x < rr->rr_cols; x++) {
+				abd_put(rr->rr_col[x].rc_abd);
+
+				if (offset == good_size) {
+					/* empty data column (small write) */
+					rr->rr_col[x].rc_abd =
+					    abd_get_zeros(skip_size);
+				} else if (x < rr->rr_bigcols) {
+					/* this is a "big column" */
+					size = rr->rr_col[x].rc_size;
+					rr->rr_col[x].rc_abd =
+					    abd_get_offset_size(
+					    (abd_t *)good_data, offset, size);
+					offset += size;
+				} else {
+					/* short data column, add skip sector */
+					size = rr->rr_col[x].rc_size -skip_size;
+					rr->rr_col[x].rc_abd = abd_alloc(
+					    rr->rr_col[x].rc_size, B_TRUE);
+					abd_copy_off(rr->rr_col[x].rc_abd,
+					    (abd_t *)good_data, 0, offset,
+					    size);
+					abd_zero_off(rr->rr_col[x].rc_abd,
+					    size, skip_size);
+					offset += size;
+				}
+			}
+
+			/*
+			 * Construct the parity from the good data.
+			 */
+			vdev_raidz_generate_parity_row(rm, rr);
+
+			/*
+			 * Restore everything back to its original state: the
+			 * parity columns get their saved buffers back and the
+			 * data columns are pointed at rr_abd_copy again.  The
+			 * temporary data buffers are released with abd_put()
+			 * or abd_free() according to the test below.
+			 */
+			for (x = 0; x < rr->rr_firstdatacol; x++)
+				rr->rr_col[x].rc_abd = bad_parity[x];
+
+			offset = 0;
+			for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
+				if (offset == good_size || x < rr->rr_bigcols)
+					abd_put(rr->rr_col[x].rc_abd);
+				else
+					abd_free(rr->rr_col[x].rc_abd);
+
+				rr->rr_col[x].rc_abd = abd_get_offset_size(
+				    rr->rr_abd_copy, offset,
+				    rr->rr_col[x].rc_size);
+				offset += rr->rr_col[x].rc_size;
+			}
+		}
+
+		ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
+		good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
+		    rr->rr_col[c].rc_size);
+	} else {
+		/*
+		 * The reported column is a data column, adjust good_data to
+		 * point at the start of our column.  Columns at or beyond
+		 * rr_bigcols are one skip sector shorter than column 0.
+		 */
+		parity_size = size = rr->rr_col[0].rc_size;
+		if (c >= rr->rr_bigcols) {
+			size -= skip_size;
+			zcr->zcr_length = size;
+		}
+
+		/* empty column */
+		if (size == 0) {
+			zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE);
+			return;
+		}
+
+		/* sum the data held by every data column preceding ours */
+		offset = 0;
+		for (x = rr->rr_firstdatacol; x < c; x++) {
+			if (x < rr->rr_bigcols) {
+				offset += parity_size;
+			} else {
+				offset += parity_size - skip_size;
+			}
+		}
+
+		good = abd_get_offset_size((abd_t *)good_data, offset, size);
+	}
+
+	/* we drop the ereport if it ends up that the data was good */
+	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+	abd_put((abd_t *)good);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely.  The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_draid_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+	raidz_map_t *rm = zio->io_vsd;
+	size_t col = (size_t)(uintptr_t)arg;
+
+	/* Hook the map up to the report and take a reference on it. */
+	zcr->zcr_cbdata = rm;
+	zcr->zcr_cbinfo = col;
+	zcr->zcr_finish = vdev_draid_cksum_finish;
+	zcr->zcr_free = vdev_draid_cksum_free;
+
+	rm->rm_reports++;
+	ASSERT3U(rm->rm_reports, >, 0);
+
+	/* An earlier report already set the data aside, nothing more to do. */
+	if (rm->rm_row[0]->rr_abd_copy != NULL)
+		return;
+
+	/*
+	 * This is the first report for this raidz_map_t.  The zio's buffer
+	 * may be re-used for something else, so the data columns of every
+	 * row are copied into a private rr_abd_copy buffer and re-pointed
+	 * at it.
+	 *
+	 * The parity columns already live in separate buffers and are left
+	 * alone.  All data columns are expected to have been expanded to the
+	 * parity size by vdev_draid_map_alloc_empty() when reconstruction
+	 * was attempted, which is what the size assertion below checks.
+	 */
+	for (int r = 0; r < rm->rm_nrows; r++) {
+		raidz_row_t *rr = rm->rm_row[r];
+		size_t total = 0;
+		size_t copied = 0;
+
+		for (col = rr->rr_firstdatacol; col < rr->rr_cols; col++) {
+			ASSERT3U(rr->rr_col[col].rc_size, ==,
+			    rr->rr_col[0].rc_size);
+			total += rr->rr_col[col].rc_size;
+		}
+
+		rr->rr_abd_copy = abd_alloc_for_io(total, B_FALSE);
+
+		for (col = rr->rr_firstdatacol; col < rr->rr_cols; col++) {
+			raidz_col_t *rc = &rr->rr_col[col];
+			abd_t *saved = abd_get_offset_size(rr->rr_abd_copy,
+			    copied, rc->rc_size);
+
+			abd_copy(saved, rc->rc_abd, rc->rc_size);
+
+			/* gang ABDs are freed, everything else is put */
+			if (!abd_is_gang(rc->rc_abd))
+				abd_put(rc->rc_abd);
+			else
+				abd_free(rc->rc_abd);
+
+			rc->rc_abd = saved;
+			copied += rc->rc_size;
+		}
+		ASSERT3U(copied, ==, total);
+	}
+}
+
+/* Callbacks attached to a dRAID zio through io_vsd_ops. */
+const zio_vsd_ops_t vdev_draid_vsd_ops = {
+	.vsd_cksum_report = vdev_draid_cksum_report,
+	.vsd_free = vdev_draid_map_free_vsd
+};
+
+/*
+ * Full stripe writes.  When writing, all columns (D+P) are required.  Parity
+ * is calculated over all the columns, including empty zero filled sectors,
+ * and each is written to disk.
While only the data columns are needed for
+ * a normal read, all of the columns are required for reconstruction when
+ * performing a sequential resilver.
+ *
+ * A "big column" simply maps the matching range of the zio ABD.  A partial
+ * column is built as a gang ABD so a zeroed sector can follow the data, and
+ * an empty column maps a zeroed sector only.  Whatever the case, every data
+ * ABD ends up the same size as the parity ABDs (rc->rc_size == parity size).
+ */
+static void
+vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+	const uint64_t sector = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+	const uint64_t full = rr->rr_col[0].rc_size;
+	uint64_t cursor = abd_offset;
+
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+	ASSERT3U(full, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		const uint64_t dsize = rc->rc_size;
+
+		if (dsize == 0) {
+			/* no data for this column, map a zeroed skip sector */
+			ASSERT3U(sector, ==, full);
+			rc->rc_abd = abd_get_zeros(sector);
+		} else if (dsize == full) {
+			/* "big column", reference the zio data directly */
+			rc->rc_abd = abd_get_offset_size(zio->io_abd,
+			    cursor, dsize);
+		} else {
+			/* short column, zio data followed by a zeroed sector */
+			abd_t *data, *pad;
+
+			ASSERT3U(dsize + sector, ==, full);
+			rc->rc_abd = abd_alloc_gang_abd();
+			data = abd_get_offset_size(zio->io_abd, cursor, dsize);
+			abd_gang_add(rc->rc_abd, data, B_TRUE);
+			pad = abd_get_zeros(sector);
+			abd_gang_add(rc->rc_abd, pad, B_TRUE);
+		}
+
+		ASSERT3U(abd_get_size(rc->rc_abd), ==, full);
+
+		/* step past the real data before widening rc_size */
+		cursor += dsize;
+		rc->rc_size = full;
+	}
+
+	IMPLY(abd_offset != 0, cursor == zio->io_size);
+}
+
+/*
+ * Scrub/resilver reads.  In order to store the contents of the skip sectors
+ * an additional ABD is allocated.
The columns are handled in the same way
+ * as a full stripe write, except that the skip sectors are backed by the
+ * newly allocated skip ABD (rr_abd_empty) rather than by the zero ABD.  In
+ * all cases the data ABD must be the same size as the parity ABDs.
+ */
+static void
+vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+	const uint64_t sector = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+	const uint64_t full = rr->rr_col[0].rc_size;
+	uint64_t cursor = abd_offset;
+	uint64_t pad_used = 0;
+
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+	ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+	/* a single linear buffer backs every skip sector of this row */
+	if (rr->rr_nempty > 0) {
+		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * sector,
+		    B_FALSE);
+	}
+
+	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		const uint64_t dsize = rc->rc_size;
+
+		if (dsize == 0) {
+			/* empty column, map the next skip sector only */
+			ASSERT3U(sector, ==, full);
+			ASSERT3U(rr->rr_nempty, !=, 0);
+			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+			    pad_used, sector);
+			pad_used += sector;
+		} else if (dsize == full) {
+			/* "big column", reference the zio data directly */
+			rc->rc_abd = abd_get_offset_size(zio->io_abd,
+			    cursor, dsize);
+		} else {
+			/* short column, zio data followed by a skip sector */
+			abd_t *data, *pad;
+
+			ASSERT3U(dsize + sector, ==, full);
+			ASSERT3U(rr->rr_nempty, !=, 0);
+			rc->rc_abd = abd_alloc_gang_abd();
+			data = abd_get_offset_size(zio->io_abd, cursor, dsize);
+			abd_gang_add(rc->rc_abd, data, B_TRUE);
+			pad = abd_get_offset_size(rr->rr_abd_empty,
+			    pad_used, sector);
+			abd_gang_add(rc->rc_abd, pad, B_TRUE);
+			pad_used += sector;
+		}
+
+		uint64_t mapped = abd_get_size(rc->rc_abd);
+		ASSERT3U(mapped, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+		/*
+		 * Advance by the real data size, then grow rc_size so the
+		 * skip ABD takes part in subsequent parity calculations.
+		 */
+		cursor += dsize;
+		rc->rc_size = mapped;
+	}
+
+	IMPLY(abd_offset != 0, cursor == zio->io_size);
+	ASSERT3U(pad_used, ==, rr->rr_nempty * sector);
+}
+
+/*
+ * Normal reads.  In this common case only the columns containing data
+ * are mapped on to the zio ABD.  Neither the parity columns nor the empty
+ * skip sectors are read unless the checksum fails verification, in which
+ * case vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to
+ * expand the raid map so reconstruction can use the parity data and skip
+ * sectors.
+ */
+static void
+vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+	uint64_t cursor = abd_offset;
+
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+
+		/* columns without data are left unmapped */
+		if (!(rc->rc_size > 0))
+			continue;
+
+		rc->rc_abd = abd_get_offset_size(zio->io_abd, cursor,
+		    rc->rc_size);
+		cursor += rc->rc_size;
+	}
+
+	IMPLY(abd_offset != 0, cursor == zio->io_size);
+}
+
+/*
+ * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t.  The key
+ * difference is that an ABD is allocated to back skip sectors so they may
+ * be read in to memory, verified, and repaired if needed.
+ */
+void
+vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
+{
+	const uint64_t sector = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+	const uint64_t full = rr->rr_col[0].rc_size;
+	uint64_t pad_used = 0;
+
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+	ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+	/* a single linear buffer backs every skip sector of this row */
+	if (rr->rr_nempty > 0) {
+		rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * sector,
+		    B_FALSE);
+	}
+
+	for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+
+		if (rc->rc_size == 0) {
+			/* empty column, it was never mapped by the read */
+			ASSERT3U(sector, ==, full);
+			ASSERT3U(rr->rr_nempty, !=, 0);
+			ASSERT3P(rc->rc_abd, ==, NULL);
+			rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+			    pad_used, sector);
+			pad_used += sector;
+		} else if (rc->rc_size == full) {
+			/* "big column", already fully mapped */
+			ASSERT3P(rc->rc_abd, !=, NULL);
+		} else {
+			/*
+			 * Short column: wrap the ABD that was read in a gang
+			 * ABD and append one skip sector behind it.
+			 */
+			abd_t *gang, *pad;
+
+			ASSERT3U(rc->rc_size + sector, ==, full);
+			ASSERT3U(rr->rr_nempty, !=, 0);
+			ASSERT3P(rc->rc_abd, !=, NULL);
+			ASSERT(!abd_is_gang(rc->rc_abd));
+
+			gang = abd_alloc_gang_abd();
+			abd_gang_add(gang, rc->rc_abd, B_TRUE);
+			pad = abd_get_offset_size(rr->rr_abd_empty,
+			    pad_used, sector);
+			abd_gang_add(gang, pad, B_TRUE);
+			rc->rc_abd = gang;
+			pad_used += sector;
+		}
+
+		/*
+		 * Grow rc_size so the empty ABD takes part in subsequent
+		 * parity calculations.
+		 */
+		rc->rc_size = full;
+	}
+
+	ASSERT3U(pad_used, ==, rr->rr_nempty * sector);
+}
+
+/*
+ * Given a logical address within a dRAID configuration, return the physical
+ * address on the first drive in the group that this address maps to
+ * (at position 'start' in permutation number 'perm').
+ */
+static uint64_t
+vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset,
+    uint64_t *perm, uint64_t *start)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	/* b_offset is the dRAID (parent) offset expressed in sectors. */
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
+	uint64_t b_offset = logical_offset >> ashift;
+
+	/*
+	 * The height of a row in units of the vdev's minimum sector size.
+	 * This is the amount of data written to each disk of each group
+	 * in a given permutation.
+	 */
+	uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift;
+
+	/*
+	 * We cycle through a disk permutation every groupsz * ngroups chunk
+	 * of address space. Note that ngroups * groupsz must be a multiple
+	 * of the number of data drives (ndisks) in order to guarantee
+	 * alignment. So, for example, if our row height is 16MB, our group
+	 * size is 10, and there are 13 data drives in the draid, then ngroups
+	 * will be 13, we will change permutation every 2.08GB and each
+	 * disk will have 160MB of data per chunk.
+	 */
+	uint64_t groupwidth = vdc->vdc_groupwidth;
+	uint64_t ngroups = vdc->vdc_ngroups;
+	uint64_t ndisks = vdc->vdc_ndisks;
+
+	/*
+	 * groupstart is where the group this IO will land in "starts" in
+	 * the permutation array.  It is returned to the caller via 'start'.
+	 */
+	uint64_t group = logical_offset / vdc->vdc_groupsz;
+	uint64_t groupstart = (group * groupwidth) % ndisks;
+	ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart);
+	*start = groupstart;
+
+	/* b_offset is the sector offset within a group chunk */
+	b_offset = b_offset % (rowheight_sectors * groupwidth);
+	ASSERT0(b_offset % groupwidth);
+
+	/*
+	 * Find the starting byte offset on each child vdev:
+	 * - within a permutation there are ngroups groups spread over the
+	 *   rows, where each row covers a slice portion of the disk
+	 * - each permutation has (groupwidth * ngroups) / ndisks rows
+	 * - so each permutation covers rows * slice portion of the disk
+	 * - so we need to find the row where this IO group target begins
+	 *
+	 * The permutation number is returned to the caller via 'perm'.
+	 */
+	*perm = group / ngroups;
+	uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) +
+	    (((group % ngroups) * groupwidth) / ndisks);
+
+	return (((rowheight_sectors * row) +
+	    (b_offset / groupwidth)) << ashift);
+}
+
+static uint64_t
+vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
+    uint64_t abd_offset, uint64_t abd_size)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+	uint64_t ashift = vd->vdev_top->vdev_ashift;
+	uint64_t io_size = abd_size;
+	uint64_t io_asize = vdev_draid_asize(vd, io_size);
+	uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
+	uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
+
+	/*
+	 * Build one raidz_row_t for the part of the zio which starts at
+	 * io_offset and is backed by the zio ABD from abd_offset onward.
+	 * The new row is handed back through rrp and the number of ABD bytes
+	 * it covers (io_size) is the return value.
+	 *
+	 * Limit the io_size to the space remaining in the group.  A second
+	 * row in the raidz_map_t is created for the remainder.
+	 */
+	if (io_offset + io_asize > start_offset) {
+		io_size = vdev_draid_asize_to_psize(vd,
+		    start_offset - io_offset);
+	}
+
+	/*
+	 * At most a block may span the logical end of one group and the start
+	 * of the next group. Therefore, at the end of a group the io_size must
+	 * span the group width evenly and the remainder must be aligned to the
+	 * start of the next group.
+	 */
+	IMPLY(abd_offset == 0 && io_size < zio->io_size,
+	    (io_asize >> ashift) % vdc->vdc_groupwidth == 0);
+	IMPLY(abd_offset != 0,
+	    vdev_draid_group_to_offset(vd, group) == io_offset);
+
+	/* Lookup starting byte offset on each child vdev */
+	uint64_t groupstart, perm;
+	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+	    io_offset, &perm, &groupstart);
+
+	/*
+	 * If there is less than groupwidth drives available after the group
+	 * start, the group is going to wrap onto the next row.  'wrap' is the
+	 * group disk number that starts on the next row.  When the group
+	 * does not wrap it stays at groupwidth and is never reached below.
+	 */
+	uint64_t ndisks = vdc->vdc_ndisks;
+	uint64_t groupwidth = vdc->vdc_groupwidth;
+	uint64_t wrap = groupwidth;
+
+	if (groupstart + groupwidth > ndisks)
+		wrap = ndisks - groupstart;
+
+	/* The io size in units of the vdev's minimum sector size. */
+	const uint64_t psize = io_size >> ashift;
+
+	/*
+	 * "Quotient": The number of data sectors for this stripe on all but
+	 * the "big column" child vdevs that also contain "remainder" data.
+	 */
+	uint64_t q = psize / vdc->vdc_ndata;
+
+	/*
+	 * "Remainder": The number of partial stripe data sectors in this I/O.
+	 * This will add a sector to some, but not all, child vdevs.
+	 */
+	uint64_t r = psize - q * vdc->vdc_ndata;
+
+	/* The number of "big columns" - those which contain remainder data. */
+	uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity);
+	ASSERT3U(bc, <, groupwidth);
+
+	/* The total number of data and parity sectors for this I/O. */
+	uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));
+
+	raidz_row_t *rr;
+	rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP);
+	rr->rr_cols = groupwidth;
+	rr->rr_scols = groupwidth;
+	rr->rr_bigcols = bc;
+	rr->rr_missingdata = 0;
+	rr->rr_missingparity = 0;
+	rr->rr_firstdatacol = vdc->vdc_nparity;
+	rr->rr_abd_copy = NULL;
+	rr->rr_abd_empty = NULL;
+#ifdef ZFS_DEBUG
+	rr->rr_offset = io_offset;
+	rr->rr_size = io_size;
+#endif
+	*rrp = rr;
+
+	uint8_t *base;
+	uint64_t iter, asize = 0;
+	vdev_draid_get_perm(vdc, perm, &base, &iter);
+	for (uint64_t i = 0; i < groupwidth; i++) {
+		raidz_col_t *rc = &rr->rr_col[i];
+		uint64_t c = (groupstart + i) % ndisks;
+
+		/* increment the offset if we wrap to the next row */
+		if (i == wrap)
+			physical_offset += VDEV_DRAID_ROWHEIGHT;
+
+		rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
+		rc->rc_offset = physical_offset;
+		rc->rc_abd = NULL;
+		rc->rc_gdata = NULL;
+		rc->rc_orig_data = NULL;
+		rc->rc_error = 0;
+		rc->rc_tried = 0;
+		rc->rc_skipped = 0;
+		rc->rc_repair = 0;
+		rc->rc_need_orig_restore = B_FALSE;
+		/*
+		 * The first bc columns carry q + 1 sectors and the rest carry
+		 * q sectors, which is no data at all when q is zero.
+		 */
+		if (q == 0 && i >= bc)
+			rc->rc_size = 0;
+		else if (i < bc)
+			rc->rc_size = (q + 1) << ashift;
+		else
+			rc->rc_size = q << ashift;
+
+		asize += rc->rc_size;
+	}
+
+	ASSERT3U(asize, ==, tot << ashift);
+	rr->rr_nempty = roundup(tot, groupwidth) - tot;
+	IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);
+
+	/* Allocate buffers for the parity columns */
+	for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
+	}
+
+	/*
+	 * Map buffers for data columns and allocate/map buffers for skip
+	 * sectors.  There are three distinct cases for dRAID which are
+	 * required to support sequential rebuild: writes, scrub/resilver
+	 * reads of a row which has empty sectors, and all other reads.
+	 */
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		vdev_draid_map_alloc_write(zio, abd_offset, rr);
+	} else if ((rr->rr_nempty > 0) &&
+	    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+		vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
+	} else {
+		ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+		vdev_draid_map_alloc_read(zio, abd_offset, rr);
+	}
+
+	return (io_size);
+}
+
+/*
+ * Allocate the raidz mapping to be applied to the dRAID I/O.  The parity
+ * calculations for dRAID are identical to raidz however there are a few
+ * differences in the layout.
+ *
+ * - dRAID always allocates a full stripe width. Any extra sectors due to
+ *   this padding are zero filled and written to disk. They will be read
+ *   back during a scrub or repair operation since they are included in
+ *   the parity calculation. This property enables sequential resilvering.
+ *
+ * - When the block at the logical offset spans redundancy groups then two
+ *   rows are allocated in the raidz_map_t.  One row resides at the end of
+ *   the first group and the other at the start of the following group.
+ */
+static raidz_map_t *
+vdev_draid_map_alloc(zio_t *zio)
+{
+	raidz_row_t *rows[2];
+	uint64_t remaining = zio->io_size;
+	uint64_t offset = zio->io_offset;
+	uint64_t consumed;
+	int nrows = 1;
+
+	/* The first row starts at the beginning of the zio ABD. */
+	consumed = vdev_draid_map_alloc_row(zio, &rows[0], offset,
+	    0, remaining);
+
+	if (consumed < remaining) {
+		/* The block spills into the next group, add a second row. */
+		vdev_t *vd = zio->io_vd;
+		uint64_t abd_off = consumed;
+
+		offset += vdev_draid_asize(vd, consumed);
+		remaining -= consumed;
+		nrows = 2;
+
+		ASSERT3U(offset, ==, vdev_draid_group_to_offset(
+		    vd, vdev_draid_offset_to_group(vd, offset)));
+		ASSERT3U(abd_off, <, zio->io_size);
+		ASSERT3U(remaining, !=, 0);
+
+		consumed = vdev_draid_map_alloc_row(zio, &rows[1],
+		    offset, abd_off, remaining);
+		VERIFY3U(consumed, ==, remaining);
+	}
+
+	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]),
+	    KM_SLEEP);
+	rm->rm_ops = vdev_raidz_math_get_ops();
+	rm->rm_nrows = nrows;
+	for (int i = 0; i < nrows; i++)
+		rm->rm_row[i] = rows[i];
+
+	/* Hang the map off the zio together with its callbacks. */
+	zio->io_vsd = rm;
+	zio->io_vsd_ops = &vdev_draid_vsd_ops;
+
+	return (rm);
+}
+
+/*
+ * Round an offset into a dRAID up to the next multiple of the group width
+ * (in bytes); the result can be used to start an allocation.
+ */
+static uint64_t
+vdev_draid_get_astart(vdev_t *vd, const uint64_t start)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+	uint64_t align;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	align = vdc->vdc_groupwidth << vd->vdev_ashift;
+	return (roundup(start, align));
+}
+
+/*
+ * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child)
+ * rounded down to the last full slice.  Each child must therefore provide at
+ * least 1 / (children - nspares) of the vdev's minimum asize, rounded up.
+ */
+static uint64_t
+vdev_draid_min_asize(vdev_t *vd)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+	uint64_t ndisks;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	ndisks = vdc->vdc_ndisks;
+	return ((vd->vdev_min_asize + ndisks - 1) / ndisks);
+}
+
+/*
+ * When using dRAID the minimum allocation size is determined by the number
+ * of data disks in the redundancy group.
Full stripes are always used.
+ */
+static uint64_t
+vdev_draid_min_alloc(vdev_t *vd)
+{
+	vdev_draid_config_t *vdc;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	vdc = vd->vdev_tsd;
+	return (vdc->vdc_ndata << vd->vdev_ashift);
+}
+
+/*
+ * Returns true if the txg range does not exist on any leaf vdev.
+ *
+ * A dRAID spare does not fit into the DTL model.  While it has child vdevs
+ * there is no redundancy among them, and the effective child vdev is
+ * determined by offset.  Essentially a vdev_dtl_reassess() is done on the
+ * fly by replacing a dRAID spare with the child vdev under the offset.
+ * This is recursive because that child can be another dRAID spare, and
+ * so on.
+ */
+boolean_t
+vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+    uint64_t size)
+{
+	vdev_t *child;
+
+	if (vd->vdev_ops == &vdev_draid_spare_ops) {
+		/*
+		 * A sequential resilver has no proper txg range, so every
+		 * txg is presumed missing until it completes.  Otherwise
+		 * DTL_MISSING, which is set for all prior txgs when a
+		 * resilver is started in spa_vdev_attach(), decides.
+		 */
+		if (vd->vdev_rebuild_txg != 0 ||
+		    vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+			return (B_TRUE);
+
+		/*
+		 * Consult the vdev backing this offset.  It may be a leaf
+		 * or a spare/replacing vdev, so recurse on whatever comes
+		 * back.
+		 */
+		child = vdev_draid_spare_get_child(vd, physical_offset);
+		if (child == NULL)
+			return (B_TRUE);
+
+		return (vdev_draid_missing(child, physical_offset,
+		    txg, size));
+	}
+
+	/* Any other vdev type is answered directly by its own DTL. */
+	if (vd->vdev_ops != &vdev_spare_ops &&
+	    vd->vdev_ops != &vdev_replacing_ops)
+		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+
+	/*
+	 * Spare or replacing vdev: the data is not missing as soon as one
+	 * readable child contains the txg range.
+	 */
+	for (int c = 0; c < vd->vdev_children; c++) {
+		child = vd->vdev_child[c];
+
+		if (vdev_readable(child) &&
+		    !vdev_draid_missing(child, physical_offset, txg, size))
+			return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Returns true if the txg is only partially replicated on the leaf vdevs.
+ */
+static boolean_t
+vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+    uint64_t size)
+{
+	vdev_t *child;
+
+	if (vd->vdev_ops == &vdev_draid_spare_ops) {
+		/*
+		 * A sequential resilver has no proper txg range, so every
+		 * txg is presumed missing until it completes.  Otherwise
+		 * DTL_MISSING, which is set for all prior txgs when a
+		 * resilver is started in spa_vdev_attach(), decides.
+		 */
+		if (vd->vdev_rebuild_txg != 0 ||
+		    vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+			return (B_TRUE);
+
+		/*
+		 * Consult the vdev backing this offset.  It may be a leaf
+		 * or a spare/replacing vdev, so recurse on whatever comes
+		 * back.
+		 */
+		child = vdev_draid_spare_get_child(vd, physical_offset);
+		if (child == NULL)
+			return (B_TRUE);
+
+		return (vdev_draid_partial(child, physical_offset, txg, size));
+	}
+
+	/* Any other vdev type is answered directly by its own DTL. */
+	if (vd->vdev_ops != &vdev_spare_ops &&
+	    vd->vdev_ops != &vdev_replacing_ops)
+		return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+
+	/*
+	 * Spare or replacing vdev: the txg range is partially replicated
+	 * as soon as one readable child is missing it.
+	 */
+	for (int c = 0; c < vd->vdev_children; c++) {
+		child = vd->vdev_child[c];
+
+		if (vdev_readable(child) &&
+		    vdev_draid_partial(child, physical_offset, txg, size))
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Determine if the vdev is readable at the given offset.
+ */
+boolean_t
+vdev_draid_readable(vdev_t *vd, uint64_t physical_offset)
+{
+	/* Resolve a distributed spare to the child backing this offset. */
+	if (vd->vdev_ops == &vdev_draid_spare_ops) {
+		vd = vdev_draid_spare_get_child(vd, physical_offset);
+		if (vd == NULL)
+			return (B_FALSE);
+	}
+
+	/* Anything but a spare/replacing vdev answers for itself. */
+	if (vd->vdev_ops != &vdev_spare_ops &&
+	    vd->vdev_ops != &vdev_replacing_ops)
+		return (vdev_readable(vd));
+
+	/* One readable child which is readable at the offset is enough. */
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (vdev_readable(cvd) &&
+		    vdev_draid_readable(cvd, physical_offset))
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Depth-first search of the vdev tree rooted at 'vd' which returns the first
+ * distributed spare encountered, or NULL when the tree contains none.
+ */
+static vdev_t *
+vdev_draid_find_spare(vdev_t *vd)
+{
+	vdev_t *found = NULL;
+
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return (vd);
+
+	for (int c = 0; found == NULL && c < vd->vdev_children; c++)
+		found = vdev_draid_find_spare(vd->vdev_child[c]);
+
+	return (found);
+}
+
+/*
+ * Returns B_TRUE if the passed in vdev is currently "faulted".
+ * Faulted, in this context, means that the vdev represents a
+ * replacing or sparing vdev tree.
+ */
+static boolean_t
+vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset)
+{
+	vdev_t *target = vd;
+
+	if (vd->vdev_ops == &vdev_draid_spare_ops) {
+		vdev_t *leaf = vdev_draid_spare_get_child(vd, physical_offset);
+
+		if (leaf == NULL)
+			return (B_FALSE);
+
+		/*
+		 * With the distributed spare resolved to a leaf vdev it is
+		 * the leaf's parent which tells us whether it is "faulted".
+		 */
+		target = leaf->vdev_parent;
+	}
+
+	return (target->vdev_ops == &vdev_spare_ops ||
+	    target->vdev_ops == &vdev_replacing_ops);
+}
+
+/*
+ * Determine if the dRAID block at the logical offset is degraded.
+ * Used by sequential resilver.
+ */
+static boolean_t
+vdev_draid_group_degraded(vdev_t *vd, uint64_t offset)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+	uint64_t first, perm, iter;
+	uint8_t *row;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+	    offset, &perm, &first);
+
+	vdev_draid_get_perm(vdc, perm, &row, &iter);
+
+	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+		uint64_t slot = (first + i) % vdc->vdc_ndisks;
+		uint64_t id = vdev_draid_permute_id(vdc, row, iter, slot);
+		vdev_t *cvd = vd->vdev_child[id];
+
+		/*
+		 * The group is degraded when it contains a faulted vdev.
+		 * Groups with active distributed spares are always reported
+		 * as well because any vdev failure in the pool will affect
+		 * them.
+		 */
+		if (vdev_draid_faulted(cvd, physical_offset) ||
+		    vdev_draid_find_spare(cvd) != NULL)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Determine if the txg is missing.  Used by healing resilver.
+ */
+static boolean_t
+vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
+    uint64_t size)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+	uint64_t first, perm, iter;
+	uint8_t *row;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+	ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+	uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+	    offset, &perm, &first);
+
+	vdev_draid_get_perm(vdc, perm, &row, &iter);
+
+	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+		uint64_t slot = (first + i) % vdc->vdc_ndisks;
+		uint64_t id = vdev_draid_permute_id(vdc, row, iter, slot);
+		vdev_t *cvd = vd->vdev_child[id];
+
+		/*
+		 * The txg is missing when it is known to be only partially
+		 * replicated.  Groups with active distributed spares are
+		 * always reported as well because any vdev failure in the
+		 * pool will affect them.
+		 */
+		if (vdev_draid_partial(cvd, physical_offset, txg, size) ||
+		    vdev_draid_find_spare(cvd) != NULL)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Find the smallest child asize and largest sector size to calculate the
+ * available capacity.  Distributed spares are ignored since their capacity
+ * is also based on the minimum child size in the top-level dRAID.
+ */
+static void
+vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
+    uint64_t *logical_ashiftp, uint64_t *physical_ashiftp)
+{
+	uint64_t min_asize = 0, min_max_asize = 0;
+	uint64_t lashift = 0, pashift = 0;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (cvd->vdev_ops != &vdev_draid_spare_ops) {
+			/*
+			 * The running minimums start at zero.  Subtracting
+			 * one before the comparison wraps that zero around,
+			 * so the first child examined always replaces it.
+			 */
+			min_asize = MIN(min_asize - 1,
+			    cvd->vdev_asize - 1) + 1;
+			min_max_asize = MIN(min_max_asize - 1,
+			    cvd->vdev_max_asize - 1) + 1;
+			lashift = MAX(lashift, cvd->vdev_ashift);
+			pashift = MAX(pashift, cvd->vdev_physical_ashift);
+		}
+	}
+
+	*asizep = min_asize;
+	*max_asizep = min_max_asize;
+	*logical_ashiftp = lashift;
+	*physical_ashiftp = pashift;
+}
+
+/*
+ * Selects the spare vdevs: distributed spares as well as replacing and
+ * spare vdevs.
+ */
+static boolean_t
+vdev_draid_open_spares(vdev_t *vd)
+{
+	if (vd->vdev_ops == &vdev_draid_spare_ops)
+		return (B_TRUE);
+
+	if (vd->vdev_ops == &vdev_replacing_ops)
+		return (B_TRUE);
+
+	if (vd->vdev_ops == &vdev_spare_ops)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+/*
+ * Selects all children, excluding spares.
+ */
+static boolean_t
+vdev_draid_open_children(vdev_t *vd)
+{
+	boolean_t is_spare = vdev_draid_open_spares(vd);
+
+	return (!is_spare);
+}
+
+/*
+ * Open a top-level dRAID vdev.
+ */
+static int
+vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+	uint64_t nparity = vdc->vdc_nparity;
+	int open_errors = 0;
+
+	if (nparity > VDEV_DRAID_MAXPARITY ||
+	    vd->vdev_children < nparity + 1) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * First open the normal children then the distributed spares. This
+	 * ordering is important to ensure the distributed spares calculate
+	 * the correct psize in the event that the dRAID vdevs were expanded.
+	 */
+	vdev_open_children_subset(vd, vdev_draid_open_children);
+	vdev_open_children_subset(vd, vdev_draid_open_spares);
+
+	/*
+	 * Verify enough of the children are available to continue. More
+	 * than nparity children with an open error fails the open with
+	 * ENXIO and VDEV_AUX_NO_REPLICAS.
+	 */
+	for (int c = 0; c < vd->vdev_children; c++) {
+		if (vd->vdev_child[c]->vdev_open_error != 0) {
+			if ((++open_errors) > nparity) {
+				vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+				return (SET_ERROR(ENXIO));
+			}
+		}
+	}
+
+	/*
+	 * Allocatable capacity is the sum of the space on all children less
+	 * the number of distributed spares rounded down to last full row
+	 * and then to the last full group. An additional 32MB of scratch
+	 * space is reserved at the end of each child for use by the dRAID
+	 * expansion feature.
+	 */
+	uint64_t child_asize, child_max_asize;
+	vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize,
+	    logical_ashift, physical_ashift);
+
+	/*
+	 * Should be unreachable since the minimum child size is 64MB, but
+	 * we want to make sure an underflow absolutely cannot occur here.
+	 */
+	if (child_asize < VDEV_DRAID_REFLOW_RESERVE ||
+	    child_max_asize < VDEV_DRAID_REFLOW_RESERVE) {
+		return (SET_ERROR(ENXIO));
+	}
+
+	child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) /
+	    VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+	child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) /
+	    VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+
+	*asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+	    vdc->vdc_groupsz);
+	*max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+	    vdc->vdc_groupsz);
+
+	return (0);
+}
+
+/*
+ * Close a top-level dRAID vdev by closing each of its non-NULL children.
+ */
+static void
+vdev_draid_close(vdev_t *vd)
+{
+	for (int c = 0; c < vd->vdev_children; c++) {
+		if (vd->vdev_child[c] != NULL)
+			vdev_close(vd->vdev_child[c]);
+	}
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range
+ * given the following constraints. A dRAID chunk may not:
+ *
+ * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
+ * - Span dRAID redundancy groups.
+ *
+ * The start offset is asserted to already be group aligned and asize is
+ * asserted to be a multiple of the group width in bytes.
+ */
+static uint64_t
+vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+    uint64_t max_segment)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	uint64_t ashift = vd->vdev_ashift;
+	uint64_t ndata = vdc->vdc_ndata;
+	uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
+	    SPA_MAXBLOCKSIZE);
+
+	ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
+	ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
+
+	/* Chunks must evenly span all data columns in the group. */
+	psize = (((psize >> ashift) / ndata) * ndata) << ashift;
+	uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize));
+
+	/* Reduce the chunk size to the group space remaining. */
+	uint64_t group = vdev_draid_offset_to_group(vd, start);
+	uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
+	chunk_size = MIN(chunk_size, left);
+
+	ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
+	ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
+	    vdev_draid_offset_to_group(vd, start + chunk_size - 1));
+
+	return (chunk_size);
+}
+
+/*
+ * Align the start of the metaslab to the group width and slightly reduce
+ * its size to a multiple of the group width. Since full stripe writes are
+ * required by dRAID this space is unallocable. Furthermore, aligning the
+ * metaslab start is important for vdev initialize and TRIM which both operate
+ * on metaslab boundaries which vdev_xlate() expects to be aligned.
+ *
+ * Both *ms_start and *ms_size are updated in place.
+ */
+static void
+vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+	uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift;
+	uint64_t astart = vdev_draid_get_astart(vd, *ms_start);
+	uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz;
+
+	*ms_start = astart;
+	*ms_size = asize;
+
+	ASSERT0(*ms_start % sz);
+	ASSERT0(*ms_size % sz);
+}
+
+/*
+ * Add virtual dRAID spares to the list of valid spares. In order to accomplish
+ * this the existing array must be freed and reallocated with the additional
+ * entries.
+ */
+int
+vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
+    uint64_t next_vdev_id)
+{
+	uint64_t draid_nspares = 0;
+	uint64_t ndraid = 0;
+	int error;
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		vdev_t *cvd = vd->vdev_child[i];
+
+		if (cvd->vdev_ops == &vdev_draid_ops) {
+			vdev_draid_config_t *vdc = cvd->vdev_tsd;
+			draid_nspares += vdc->vdc_nspares;
+			ndraid++;
+		}
+	}
+
+	if (draid_nspares == 0) {
+		*ndraidp = ndraid;
+		return (0);
+	}
+
+	nvlist_t **old_spares, **new_spares;
+	uint_t old_nspares;
+	error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &old_spares, &old_nspares);
+	if (error)
+		old_nspares = 0;	/* lookup failed: no existing spares */
+
+	/* Allocate memory and copy the existing spares. */
+	new_spares = kmem_alloc(sizeof (nvlist_t *) *
+	    (draid_nspares + old_nspares), KM_SLEEP);
+	for (uint_t i = 0; i < old_nspares; i++)
+		new_spares[i] = fnvlist_dup(old_spares[i]);
+
+	/*
+	 * Add new distributed spares to ZPOOL_CONFIG_SPARES. Each one is
+	 * named "<VDEV_TYPE_DRAID><nparity>-<vdev id>-<spare id>", where the
+	 * vdev id is next_vdev_id plus the child index.
+	 */
+	uint64_t n = old_nspares;
+	for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) {
+		vdev_t *cvd = vd->vdev_child[vdev_id];
+		char path[64];
+
+		if (cvd->vdev_ops != &vdev_draid_ops)
+			continue;
+
+		vdev_draid_config_t *vdc = cvd->vdev_tsd;
+		uint64_t nspares = vdc->vdc_nspares;
+		uint64_t nparity = vdc->vdc_nparity;
+
+		for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
+			bzero(path, sizeof (path));
+			(void) snprintf(path, sizeof (path) - 1,
+			    "%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
+			    (u_longlong_t)nparity,
+			    (u_longlong_t)next_vdev_id + vdev_id,
+			    (u_longlong_t)spare_id);
+
+			nvlist_t *spare = fnvlist_alloc();
+			fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path);
+			fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
+			    VDEV_TYPE_DRAID_SPARE);
+			fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID,
+			    cvd->vdev_guid);
+			fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID,
+			    spare_id);
+			fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0);
+			fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1);
+			fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1);
+			fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT,
+			    cvd->vdev_ashift);
+
+			new_spares[n] = spare;
+			n++;
+		}
+	}
+
+	if (n > 0) {
+		(void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
+		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+		    new_spares, n);
+	}
+
+	for (int i = 0; i < n; i++)
+		nvlist_free(new_spares[i]);
+
+	/* n equals draid_nspares + old_nspares here, the allocated count. */
+	kmem_free(new_spares, sizeof (*new_spares) * n);
+	*ndraidp = ndraid;
+
+	return (0);
+}
+
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered.
+ *
+ * A phys_birth of TXG_UNKNOWN selects the sequential resilver check,
+ * any other value selects the healing resilver check.
+ */
+static boolean_t
+vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+	uint64_t offset = DVA_GET_OFFSET(dva);
+	uint64_t asize = vdev_draid_asize(vd, psize);
+
+	if (phys_birth == TXG_UNKNOWN) {
+		/*
+		 * Sequential resilver. There is no meaningful phys_birth
+		 * for this block, we can only determine if block resides
+		 * in a degraded group in which case it must be resilvered.
+		 */
+		ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==,
+		    vdev_draid_offset_to_group(vd, offset + asize - 1));
+
+		return (vdev_draid_group_degraded(vd, offset));
+	} else {
+		/*
+		 * Healing resilver. TXGs not in DTL_PARTIAL are intact,
+		 * as are blocks in non-degraded groups.
+		 */
+		if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+			return (B_FALSE);
+
+		if (vdev_draid_group_missing(vd, offset, phys_birth, 1))
+			return (B_TRUE);
+
+		/*
+		 * The block may span groups in which case check both.
+		 * NOTE(review): the span test uses offset + asize - 1 while
+		 * the lookup below uses offset + asize; confirm which
+		 * offset is intended for the second group.
+		 */
+		if (vdev_draid_offset_to_group(vd, offset) !=
+		    vdev_draid_offset_to_group(vd, offset + asize - 1)) {
+			if (vdev_draid_group_missing(vd,
+			    offset + asize, phys_birth, 1))
+				return (B_TRUE);
+		}
+
+		return (B_FALSE);
+	}
+}
+
+static boolean_t
+vdev_draid_rebuilding(vdev_t *vd)
+{
+	/* A leaf with a non-zero vdev_rebuild_txg counts as rebuilding. */
+	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
+		return (B_TRUE);
+
+	for (int i = 0; i < vd->vdev_children; i++) {
+		if (vdev_draid_rebuilding(vd->vdev_child[i])) {
+			return (B_TRUE);
+		}
+	}
+
+	return (B_FALSE);
+}
+
+static void
+vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
+{
+#ifdef ZFS_DEBUG	/* the checks below exist only when ZFS_DEBUG is defined */
+	range_seg64_t logical_rs, physical_rs, remain_rs;
+	logical_rs.rs_start = rr->rr_offset;
+	logical_rs.rs_end = logical_rs.rs_start +
+	    vdev_draid_asize(vd, rr->rr_size);
+
+	raidz_col_t *rc = &rr->rr_col[col];
+	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
+	ASSERT(vdev_xlate_is_empty(&remain_rs));
+	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+	ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end);
+#endif
+}
+
+/*
+ * For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ *    data and parity. A gang ABD is allocated by vdev_draid_map_alloc()
+ *    if a skip sector needs to be added to a column.
+ */
+static void
+vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
+{
+	vdev_t *vd = zio->io_vd;
+	raidz_map_t *rm = zio->io_vsd;
+
+	vdev_raidz_generate_parity_row(rm, rr);
+
+	for (int c = 0; c < rr->rr_cols; c++) {
+		raidz_col_t *rc = &rr->rr_col[c];
+
+		/*
+		 * Empty columns are zero filled and included in the parity
+		 * calculation and therefore must be written.
+		 */
+		ASSERT3U(rc->rc_size, !=, 0);
+
+		/* Verify physical to logical translation */
+		vdev_draid_io_verify(vd, rr, c);
+
+		zio_nowait(zio_vdev_child_io(zio, NULL,
+		    vd->vdev_child[rc->rc_devidx], rc->rc_offset,
+		    rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
+		    0, vdev_raidz_child_done, rc));
+	}
+}
+
+/*
+ * For read operations:
+ * 1. The vdev_draid_map_alloc() function will create a minimal raidz
+ *    mapping for the read based on the zio->io_flags. There are two
+ *    possible mappings either 1) a normal read, or 2) a scrub/resilver.
+ * 2. Create the zio read operations. This will include all parity
+ *    columns and skip sectors for a scrub/resilver.
+ */
+static void
+vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
+{
+	vdev_t *vd = zio->io_vd;
+
+	/* Sequential rebuild must do IO at redundancy group boundary. */
+	IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);
+
+	/*
+	 * Iterate over the columns in reverse order so that we hit the parity
+	 * last. Any errors along the way will force us to read the parity.
+	 * For scrub/resilver IOs which verify skip sectors, a gang ABD will
+	 * have been allocated to store them and rc->rc_size is increased.
+	 */
+	for (int c = rr->rr_cols - 1; c >= 0; c--) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+		if (!vdev_draid_readable(cvd, rc->rc_offset)) {
+			if (c >= rr->rr_firstdatacol)
+				rr->rr_missingdata++;
+			else
+				rr->rr_missingparity++;
+			rc->rc_error = SET_ERROR(ENXIO);
+			rc->rc_tried = 1;
+			rc->rc_skipped = 1;
+			continue;
+		}
+
+		if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
+			if (c >= rr->rr_firstdatacol)
+				rr->rr_missingdata++;
+			else
+				rr->rr_missingparity++;
+			rc->rc_error = SET_ERROR(ESTALE);
+			rc->rc_skipped = 1;
+			continue;
+		}
+
+		/*
+		 * Empty columns may be read during vdev_draid_io_done().
+		 * Only skip them after the readable and missing checks
+		 * verify they are available.
+		 */
+		if (rc->rc_size == 0) {
+			rc->rc_skipped = 1;
+			continue;
+		}
+
+		if (zio->io_flags & ZIO_FLAG_RESILVER) {
+			vdev_t *svd;
+
+			/*
+			 * If this child is a distributed spare then the
+			 * offset might reside on the vdev being replaced.
+			 * In which case this data must be written to the
+			 * new device. Failure to do so would result in
+			 * checksum errors when the old device is detached
+			 * and the pool is scrubbed.
+			 */
+			if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
+				svd = vdev_draid_spare_get_child(svd,
+				    rc->rc_offset);
+				if (svd && (svd->vdev_ops == &vdev_spare_ops ||
+				    svd->vdev_ops == &vdev_replacing_ops)) {
+					rc->rc_repair = 1;
+				}
+			}
+
+			/*
+			 * Always issue a repair IO to this child when its
+			 * a spare or replacing vdev with an active rebuild.
+			 */
+			if ((cvd->vdev_ops == &vdev_spare_ops ||
+			    cvd->vdev_ops == &vdev_replacing_ops) &&
+			    vdev_draid_rebuilding(cvd)) {
+				rc->rc_repair = 1;
+			}
+		}
+	}
+
+	/*
+	 * Either a parity or data column is missing this means a repair
+	 * may be attempted by vdev_draid_io_done(). Expand the raid map
+	 * to read in empty columns which are needed along with the parity
+	 * during reconstruction.
+	 */
+	if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
+	    rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
+		vdev_draid_map_alloc_empty(zio, rr);
+	}
+
+	for (int c = rr->rr_cols - 1; c >= 0; c--) {
+		raidz_col_t *rc = &rr->rr_col[c];
+		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+		if (rc->rc_error || rc->rc_size == 0)
+			continue;
+
+		/* Parity columns are read only when data is missing or for scrub/resilver. */
+		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+			    rc->rc_offset, rc->rc_abd, rc->rc_size,
+			    zio->io_type, zio->io_priority, 0,
+			    vdev_raidz_child_done, rc));
+		}
+	}
+}
+
+/*
+ * Start an IO operation to a dRAID vdev.
+ */
+static void
+vdev_draid_io_start(zio_t *zio)
+{
+	vdev_t *vd __maybe_unused = zio->io_vd;
+	raidz_map_t *rm;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+	ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
+
+	rm = vdev_draid_map_alloc(zio);
+
+	if (zio->io_type == ZIO_TYPE_WRITE) {
+		for (int i = 0; i < rm->rm_nrows; i++) {
+			vdev_draid_io_start_write(zio, rm->rm_row[i]);
+		}
+	} else {
+		ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+		for (int i = 0; i < rm->rm_nrows; i++) {
+			vdev_draid_io_start_read(zio, rm->rm_row[i]);
+		}
+	}
+
+	zio_execute(zio);
+}
+
+/*
+ * Complete an IO operation on a dRAID vdev. The raidz logic can be applied
+ * to dRAID since the layout is fully described by the raidz_map_t.
+ */
+static void
+vdev_draid_io_done(zio_t *zio)
+{
+	vdev_raidz_io_done(zio);
+}
+
+static void
+vdev_draid_state_change(vdev_t *vd, int faulted, int degraded)
+{
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+	ASSERT(vd->vdev_ops == &vdev_draid_ops);
+
+	if (faulted > vdc->vdc_nparity)	/* more faults than parity */
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_NO_REPLICAS);
+	else if (degraded + faulted != 0)
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+	else
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+static void
+vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
+    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+	vdev_t *raidvd = cvd->vdev_parent;
+	ASSERT(raidvd->vdev_ops == &vdev_draid_ops);
+
+	vdev_draid_config_t *vdc = raidvd->vdev_tsd;
+	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+	/* Make sure the offsets are block-aligned */
+	ASSERT0(logical_rs->rs_start % (1 << ashift));
+	ASSERT0(logical_rs->rs_end % (1 << ashift));
+
+	uint64_t logical_start = logical_rs->rs_start;
+	uint64_t logical_end = logical_rs->rs_end;
+
+	/*
+	 * Unaligned ranges must be skipped. All metaslabs are correctly
+	 * aligned so this should not happen, but this case is handled in
+	 * case it's needed by future callers.
+	 */
+	uint64_t astart = vdev_draid_get_astart(raidvd, logical_start);
+	if (astart != logical_start) {
+		physical_rs->rs_start = logical_start;
+		physical_rs->rs_end = logical_start;
+		remain_rs->rs_start = MIN(astart, logical_end);
+		remain_rs->rs_end = logical_end;
+		return;
+	}
+
+	/*
+	 * Unlike with mirrors and raidz a dRAID logical range can map
+	 * to multiple non-contiguous physical ranges. This is handled by
+	 * limiting the size of the logical range to a single group and
+	 * setting the remain argument such that it describes the remaining
+	 * unmapped logical range. This is stricter than absolutely
+	 * necessary but helps simplify the logic below.
+	 */
+	uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start);
+	uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1);
+	if (logical_end > nextstart)
+		logical_end = nextstart;
+
+	/* Find the starting offset for each vdev in the group */
+	uint64_t perm, groupstart;
+	uint64_t start = vdev_draid_logical_to_physical(raidvd,
+	    logical_start, &perm, &groupstart);
+	uint64_t end = start;
+
+	uint8_t *base;
+	uint64_t iter, id;
+	vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+	/*
+	 * Check if the passed child falls within the group. If it does
+	 * update the start and end to reflect the physical range.
+	 * Otherwise, leave them unmodified which will result in an empty
+	 * (zero-length) physical range being returned.
+	 */
+	for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+		uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+
+		if (c == 0 && i != 0) {
+			/* the group wrapped, increment the start */
+			start += VDEV_DRAID_ROWHEIGHT;
+			end = start;
+		}
+
+		id = vdev_draid_permute_id(vdc, base, iter, c);
+		if (id == cvd->vdev_id) {
+			uint64_t b_size = (logical_end >> ashift) -
+			    (logical_start >> ashift);
+			ASSERT3U(b_size, >, 0);
+			end = start + ((((b_size - 1) /
+			    vdc->vdc_groupwidth) + 1) << ashift);
+			break;
+		}
+	}
+	physical_rs->rs_start = start;
+	physical_rs->rs_end = end;
+
+	/*
+	 * Only top-level vdevs are allowed to set remain_rs because
+	 * when .vdev_op_xlate() is called for their children the full
+	 * logical range is not provided by vdev_xlate().
+	 */
+	remain_rs->rs_start = logical_end;
+	remain_rs->rs_end = logical_rs->rs_end;
+
+	ASSERT3U(physical_rs->rs_start, <=, logical_start);
+	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
+	    logical_end - logical_start);
+}
+
+/*
+ * Add dRAID specific fields to the config nvlist: the parity level, data
+ * width, distributed spare count and group count from the vdev's config.
+ */
+static void
+vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+	vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups);
+}
+
+/*
+ * Initialize private dRAID specific fields from the nvlist.
+ */ +static int +vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + uint64_t ndata, nparity, nspares, ngroups; + int error; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata)) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) || + nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) { + return (SET_ERROR(EINVAL)); + } + + uint_t children; + nvlist_t **child; + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0 || children == 0 || + children > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) || + nspares > 100 || nspares > (children - (ndata + nparity))) { + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) || + ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) { + return (SET_ERROR(EINVAL)); + } + + /* + * Validate the minimum number of children exist per group for the + * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4). + */ + if (children < (ndata + nparity + nspares)) + return (SET_ERROR(EINVAL)); + + /* + * Create the dRAID configuration using the pool nvlist configuration + * and the fixed mapping for the correct number of children. + */ + vdev_draid_config_t *vdc; + const draid_map_t *map; + + error = vdev_draid_lookup_map(children, &map); + if (error) + return (SET_ERROR(EINVAL)); + + vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP); + vdc->vdc_ndata = ndata; + vdc->vdc_nparity = nparity; + vdc->vdc_nspares = nspares; + vdc->vdc_children = children; + vdc->vdc_ngroups = ngroups; + vdc->vdc_nperms = map->dm_nperms; + + error = vdev_draid_generate_perms(map, &vdc->vdc_perms); + if (error) { + kmem_free(vdc, sizeof (*vdc)); + return (SET_ERROR(EINVAL)); + } + + /* + * Derived constants. 
+ */ + vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity; + vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares; + vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT; + vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) / + vdc->vdc_ndisks; + + ASSERT3U(vdc->vdc_groupwidth, >=, 2); + ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks); + ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT); + ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT); + ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0); + ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) % + vdc->vdc_ndisks, ==, 0); + + *tsd = vdc; + + return (0); +} + +static void +vdev_draid_fini(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + vmem_free(vdc->vdc_perms, sizeof (uint8_t) * + vdc->vdc_children * vdc->vdc_nperms); + kmem_free(vdc, sizeof (*vdc)); +} + +static uint64_t +vdev_draid_nparity(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + return (vdc->vdc_nparity); +} + +static uint64_t +vdev_draid_ndisks(vdev_t *vd) +{ + vdev_draid_config_t *vdc = vd->vdev_tsd; + + return (vdc->vdc_ndisks); +} + +vdev_ops_t vdev_draid_ops = { + .vdev_op_init = vdev_draid_init, + .vdev_op_fini = vdev_draid_fini, + .vdev_op_open = vdev_draid_open, + .vdev_op_close = vdev_draid_close, + .vdev_op_asize = vdev_draid_asize, + .vdev_op_min_asize = vdev_draid_min_asize, + .vdev_op_min_alloc = vdev_draid_min_alloc, + .vdev_op_io_start = vdev_draid_io_start, + .vdev_op_io_done = vdev_draid_io_done, + .vdev_op_state_change = vdev_draid_state_change, + .vdev_op_need_resilver = vdev_draid_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_draid_xlate, + .vdev_op_rebuild_asize = vdev_draid_rebuild_asize, + .vdev_op_metaslab_init = vdev_draid_metaslab_init, + .vdev_op_config_generate = vdev_draid_config_generate, + .vdev_op_nparity = vdev_draid_nparity, + .vdev_op_ndisks = vdev_draid_ndisks, + .vdev_op_type = 
VDEV_TYPE_DRAID, + .vdev_op_leaf = B_FALSE, +}; + + +/* + * A dRAID distributed spare is a virtual leaf vdev which is included in the + * parent dRAID configuration. The last N columns of the dRAID permutation + * table are used to determine on which dRAID children a specific offset + * should be written. These spare leaf vdevs can only be used to replace + * faulted children in the same dRAID configuration. + */ + +/* + * Distributed spare state. All fields are set when the distributed spare is + * first opened and are immutable. + */ +typedef struct { + vdev_t *vds_draid_vdev; /* top-level parent dRAID vdev */ + uint64_t vds_top_guid; /* top-level parent dRAID guid */ + uint64_t vds_spare_id; /* spare id (0 - vdc->vdc_nspares-1) */ +} vdev_draid_spare_t; + +/* + * Returns the parent dRAID vdev to which the distributed spare belongs. + * This may be safely called even when the vdev is not open. + */ +vdev_t * +vdev_draid_spare_get_parent(vdev_t *vd) +{ + vdev_draid_spare_t *vds = vd->vdev_tsd; + + ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops); + + if (vds->vds_draid_vdev != NULL) + return (vds->vds_draid_vdev); + + return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev, + vds->vds_top_guid)); +} + +/* + * A dRAID space is active when it's the child of a vdev using the + * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops. + */ +static boolean_t +vdev_draid_spare_is_active(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + + if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops || + pvd->vdev_ops == &vdev_replacing_ops || + pvd->vdev_ops == &vdev_draid_ops)) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +/* + * Given a dRAID distribute spare vdev, returns the physical child vdev + * on which the provided offset resides. This may involve recursing through + * multiple layers of distributed spares. Note that offset is relative to + * this vdev. 
+ */
+vdev_t *
+vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+	/* The vdev is closed */
+	if (vds->vds_draid_vdev == NULL)
+		return (NULL);
+
+	vdev_t *tvd = vds->vds_draid_vdev;
+	vdev_draid_config_t *vdc = tvd->vdev_tsd;
+
+	ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
+	ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);
+
+	uint8_t *base;
+	uint64_t iter;
+	uint64_t perm = physical_offset / vdc->vdc_devslicesz;
+
+	vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+	uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
+	    (tvd->vdev_children - 1) - vds->vds_spare_id);
+	vdev_t *cvd = tvd->vdev_child[cid];
+
+	if (cvd->vdev_ops == &vdev_draid_spare_ops)
+		return (vdev_draid_spare_get_child(cvd, physical_offset));
+
+	return (cvd);
+}
+
+/* ARGSUSED */
+static void
+vdev_draid_spare_close(vdev_t *vd)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+	vds->vds_draid_vdev = NULL;	/* NULL marks the spare as closed */
+}
+
+/*
+ * Opening a dRAID spare device is done by looking up the associated dRAID
+ * top-level vdev guid from the spare configuration.
+ *
+ * On success the parent dRAID vdev is cached in vds_draid_vdev and the
+ * reported sizes are the parent's per-child asize plus the label space.
+ */
+static int
+vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	uint64_t asize, max_asize;
+
+	vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid);
+	if (tvd == NULL) {
+		/*
+		 * When spa_vdev_add() is labeling new spares the
+		 * associated dRAID is not attached to the root vdev
+		 * nor does this spare have a parent. Simulate a valid
+		 * device in order to allow the label to be initialized
+		 * and the distributed spare added to the configuration.
+		 */
+		if (vd->vdev_parent == NULL) {
+			*psize = *max_psize = SPA_MINDEVSIZE;
+			*logical_ashift = *physical_ashift = ASHIFT_MIN;
+			return (0);
+		}
+
+		return (SET_ERROR(EINVAL));
+	}
+
+	vdev_draid_config_t *vdc = tvd->vdev_tsd;
+	if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL)
+		return (SET_ERROR(EINVAL));
+
+	if (vds->vds_spare_id >= vdc->vdc_nspares)
+		return (SET_ERROR(EINVAL));
+
+	/*
+	 * Neither tvd->vdev_asize or tvd->vdev_max_asize can be used here
+	 * because the caller may be vdev_draid_open() in which case the
+	 * values are stale as they haven't yet been updated by vdev_open().
+	 * To avoid this always recalculate the dRAID asize and max_asize.
+	 */
+	vdev_draid_calculate_asize(tvd, &asize, &max_asize,
+	    logical_ashift, physical_ashift);
+
+	*psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+	*max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+
+	vds->vds_draid_vdev = tvd;
+
+	return (0);
+}
+
+/*
+ * Completed distributed spare IO. Store the result in the parent zio
+ * as if it had performed the operation itself. Only the first error is
+ * preserved if there are multiple errors.
+ */
+static void
+vdev_draid_spare_child_done(zio_t *zio)
+{
+	zio_t *pio = zio->io_private;
+
+	/*
+	 * IOs are issued to non-writable vdevs in order to keep their
+	 * DTLs accurate. However, we don't want to propagate the
+	 * error into the distributed spare's DTL. When resilvering
+	 * vdev_draid_need_resilver() will consult the relevant DTL
+	 * to determine if the data is missing and must be repaired.
+	 */
+	if (!vdev_writeable(zio->io_vd))
+		return;
+
+	if (pio->io_error == 0)
+		pio->io_error = zio->io_error;
+}
+
+/*
+ * Returns a valid label nvlist for the distributed spare vdev. This is
+ * used to bypass the IO pipeline to avoid the complexity of constructing
+ * a complete label with valid checksum to return when read.
+ */
+nvlist_t *
+vdev_draid_read_config_spare(vdev_t *vd)
+{
+	spa_t *spa = vd->vdev_spa;
+	spa_aux_vdev_t *sav = &spa->spa_spares;
+	uint64_t guid = vd->vdev_guid;
+
+	nvlist_t *nv = fnvlist_alloc();
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
+	fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE,
+	    vdev_draid_spare_is_active(vd) ?
+	    POOL_STATE_ACTIVE : POOL_STATE_SPARE);
+
+	/*
+	 * Set the vdev guid based on the vdev list in sav_vdevs. The first
+	 * distributed spare whose path matches this vdev's path supplies
+	 * the guid; otherwise this vdev's own guid is used.
+	 */
+	for (int i = 0; i < sav->sav_count; i++) {
+		if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops &&
+		    strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) {
+			guid = sav->sav_vdevs[i]->vdev_guid;
+			break;
+		}
+	}
+
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);
+
+	return (nv);
+}
+
+/*
+ * Handle any ioctl requested of the distributed spare. Only flushes
+ * are supported in which case all children must be flushed.
+ *
+ * Returns 0 for DKIOCFLUSHWRITECACHE and ENOTSUP for any other command.
+ */
+static int
+vdev_draid_spare_ioctl(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	int error = 0;
+
+	if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
+		for (int c = 0; c < vd->vdev_children; c++) {
+			zio_nowait(zio_vdev_child_io(zio, NULL,
+			    vd->vdev_child[c], zio->io_offset, zio->io_abd,
+			    zio->io_size, zio->io_type, zio->io_priority, 0,
+			    vdev_draid_spare_child_done, zio));
+		}
+	} else {
+		error = SET_ERROR(ENOTSUP);
+	}
+
+	return (error);
+}
+
+/*
+ * Initiate an IO to the distributed spare. For normal IOs this entails using
+ * the zio->io_offset and permutation table to calculate which child dRAID vdev
+ * is responsible for the data.
Then passing along the zio to that child to + * perform the actual IO. The label ranges are not stored on disk and require + * some special handling which is described below. + */ +static void +vdev_draid_spare_io_start(zio_t *zio) +{ + vdev_t *cvd = NULL, *vd = zio->io_vd; + vdev_draid_spare_t *vds = vd->vdev_tsd; + uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (vds == NULL) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + zio->io_error = vdev_draid_spare_ioctl(zio); + break; + + case ZIO_TYPE_WRITE: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs and config writers to simulate the + * existence of an on disk label. vdev_label_sync(), + * vdev_uberblock_sync() and vdev_copy_uberblocks() + * skip the distributed spares. This only leaves + * vdev_label_init() which is allowed to succeed to + * avoid adding special cases the function. + */ + if (zio->io_flags & ZIO_FLAG_PROBE || + zio->io_flags & ZIO_FLAG_CONFIG_WRITER) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_READ: + if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) { + /* + * Accept probe IOs to simulate the existence of a + * label. vdev_label_read_config() bypasses the + * pipeline to read the label configuration and + * vdev_uberblock_load() skips distributed spares + * when attempting to locate the best uberblock. 
+ */ + if (zio->io_flags & ZIO_FLAG_PROBE) { + zio->io_error = 0; + } else { + zio->io_error = SET_ERROR(EIO); + } + } else { + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !vdev_readable(cvd)) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + } + break; + + case ZIO_TYPE_TRIM: + /* The vdev label ranges are never trimmed */ + ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)); + + cvd = vdev_draid_spare_get_child(vd, offset); + + if (cvd == NULL || !cvd->vdev_has_trim) { + zio->io_error = SET_ERROR(ENXIO); + } else { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_draid_spare_child_done, zio)); + } + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + zio_execute(zio); +} + +/* ARGSUSED */ +static void +vdev_draid_spare_io_done(zio_t *zio) +{ +} + +/* + * Lookup the full spare config in spa->spa_spares.sav_config and + * return the top_guid and spare_id for the named spare. 
+ */
+static int
+vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
+    uint64_t *spare_idp)
+{
+	nvlist_t **spares;
+	uint_t nspares;
+	int error;
+
+	if ((spa->spa_spares.sav_config == NULL) ||
+	    (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+	    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) {
+		return (SET_ERROR(ENOENT));
+	}
+
+	char *spare_name;
+	error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name);
+	if (error != 0)
+		return (SET_ERROR(EINVAL));
+
+	for (int i = 0; i < nspares; i++) {
+		nvlist_t *spare = spares[i];
+		uint64_t top_guid, spare_id;
+		char *type, *path;
+
+		/* Skip non-distributed spares */
+		error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type);
+		if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0)
+			continue;
+
+		/* Skip spares with the wrong name */
+		error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path);
+		if (error != 0 || strcmp(path, spare_name) != 0)
+			continue;
+
+		/* Found the matching spare; both ids must be present */
+		error = nvlist_lookup_uint64(spare,
+		    ZPOOL_CONFIG_TOP_GUID, &top_guid);
+		if (error == 0) {
+			error = nvlist_lookup_uint64(spare,
+			    ZPOOL_CONFIG_SPARE_ID, &spare_id);
+		}
+
+		if (error != 0) {
+			return (SET_ERROR(EINVAL));
+		} else {
+			*top_guidp = top_guid;
+			*spare_idp = spare_id;
+			return (0);
+		}
+	}
+
+	return (SET_ERROR(ENOENT));
+}
+
+/*
+ * Initialize private dRAID spare specific fields from the nvlist.
+ * Returns EINVAL when neither the spa spare list nor the passed config
+ * supplies both the top-level guid and the spare id.
+ */
+static int
+vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+	vdev_draid_spare_t *vds;
+	uint64_t top_guid = 0;
+	uint64_t spare_id;
+
+	/*
+	 * In the normal case check the list of spares stored in the spa
+	 * to lookup the top_guid and spare_id for provided spare config.
+	 * When creating a new pool or adding vdevs the spare list is not
+	 * yet populated and the values are provided in the passed config.
+	 */
+	if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) {
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
+		    &top_guid) != 0)
+			return (SET_ERROR(EINVAL));
+
+		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID,
+		    &spare_id) != 0)
+			return (SET_ERROR(EINVAL));
+	}
+
+	vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP);
+	vds->vds_draid_vdev = NULL;	/* not opened yet */
+	vds->vds_top_guid = top_guid;
+	vds->vds_spare_id = spare_id;
+
+	*tsd = vds;
+
+	return (0);
+}
+
+static void
+vdev_draid_spare_fini(vdev_t *vd)
+{
+	kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t));
+}
+
+static void
+vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+	vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+	ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid);
+	fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id);
+}
+
+vdev_ops_t vdev_draid_spare_ops = {
+	.vdev_op_init = vdev_draid_spare_init,
+	.vdev_op_fini = vdev_draid_spare_fini,
+	.vdev_op_open = vdev_draid_spare_open,
+	.vdev_op_close = vdev_draid_spare_close,
+	.vdev_op_asize = vdev_default_asize,
+	.vdev_op_min_asize = vdev_default_min_asize,
+	.vdev_op_min_alloc = NULL,
+	.vdev_op_io_start = vdev_draid_spare_io_start,
+	.vdev_op_io_done = vdev_draid_spare_io_done,
+	.vdev_op_state_change = NULL,
+	.vdev_op_need_resilver = NULL,
+	.vdev_op_hold = NULL,
+	.vdev_op_rele = NULL,
+	.vdev_op_remap = NULL,
+	.vdev_op_xlate = vdev_default_xlate,
+	.vdev_op_rebuild_asize = NULL,
+	.vdev_op_metaslab_init = NULL,
+	.vdev_op_config_generate = vdev_draid_spare_config_generate,
+	.vdev_op_nparity = NULL,
+	.vdev_op_ndisks = NULL,
+	.vdev_op_type = VDEV_TYPE_DRAID_SPARE,
+	.vdev_op_leaf = B_TRUE,
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c
new file mode 100644
index 000000000000..fe1a75c11312
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c
@@ -0,0 +1,40
@@ +/* + * Xorshift Pseudo Random Number Generator based on work by David Blackman + * and Sebastiano Vigna (vigna@acm.org). + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * http://prng.di.unimi.it/xoroshiro128plusplus.c + * + * To the extent possible under law, the author has dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide. This software is distributed without any warranty. + * + * See . + * + * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid, + * small-state generators. It is extremely (sub-ns) fast and it passes all + * tests we are aware of, but its state space is large enough only for + * mild parallelism. + */ + +#include + +static inline uint64_t rotl(const uint64_t x, int k) +{ + return (x << k) | (x >> (64 - k)); +} + +uint64_t +vdev_draid_rand(uint64_t *s) +{ + const uint64_t s0 = s[0]; + uint64_t s1 = s[1]; + const uint64_t result = rotl(s0 + s1, 17) + s0; + + s1 ^= s0; + s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b + s[1] = rotl(s1, 28); // c + + return (result); +} diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c index 12ee393bd5db..07d1c922a50c 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_indirect.c +++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c @@ -239,6 +239,7 @@ typedef struct indirect_child { */ struct indirect_child *ic_duplicate; list_node_t ic_node; /* node on is_unique_child */ + int ic_error; /* set when a child does not contain the data */ } indirect_child_t; /* @@ -1272,15 +1273,14 @@ vdev_indirect_read_all(zio_t *zio) continue; /* - * Note, we may read from a child whose DTL - * indicates that the data may not be present here. - * While this might result in a few i/os that will - * likely return incorrect data, it simplifies the - * code since we can treat scrub and resilver - * identically. 
(The incorrect data will be - * detected and ignored when we verify the - * checksum.) + * If a child is missing the data, set ic_error. Used + * in vdev_indirect_repair(). We perform the read + * nevertheless which provides the opportunity to + * reconstruct the split block if at all possible. */ + if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING, + zio->io_txg, 1)) + ic->ic_error = SET_ERROR(ESTALE); ic->ic_data = abd_alloc_sametype(zio->io_abd, is->is_size); @@ -1410,7 +1410,11 @@ vdev_indirect_checksum_error(zio_t *zio, * Issue repair i/os for any incorrect copies. We do this by comparing * each split segment's correct data (is_good_child's ic_data) with each * other copy of the data. If they differ, then we overwrite the bad data - * with the good copy. Note that we do this without regard for the DTL's, + * with the good copy. The DTL is checked in vdev_indirect_read_all() and + * if a vdev is missing a copy of the data we set ic_error and the read is + * performed. This provides the opportunity to reconstruct the split block + * if at all possible. ic_error is checked here and if set it suppresses + * incrementing the checksum counter. Aside from this DTLs are not checked, * which simplifies this code and also issues the optimal number of writes * (based on which copies actually read bad data, as opposed to which we * think might be wrong). For the same reason, we always use @@ -1447,6 +1451,14 @@ vdev_indirect_repair(zio_t *zio) ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, NULL, NULL)); + /* + * If ic_error is set the current child does not have + * a copy of the data, so suppress incrementing the + * checksum counter. 
+ */ + if (ic->ic_error == ESTALE) + continue; + vdev_indirect_checksum_error(zio, is, ic); } } @@ -1844,9 +1856,13 @@ vdev_indirect_io_done(zio_t *zio) } vdev_ops_t vdev_indirect_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_indirect_open, .vdev_op_close = vdev_indirect_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_indirect_io_start, .vdev_op_io_done = vdev_indirect_io_done, .vdev_op_state_change = NULL, @@ -1855,6 +1871,11 @@ vdev_ops_t vdev_indirect_ops = { .vdev_op_rele = NULL, .vdev_op_remap = vdev_indirect_remap, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c index 7ff7fffcc80e..083ad2861b5b 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_initialize.c +++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c @@ -121,6 +121,8 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { vd->vdev_initialize_action_time = gethrestime_sec(); } + + vdev_initializing_state_t old_state = vd->vdev_initialize_state; vd->vdev_initialize_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); @@ -138,8 +140,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) "vdev=%s suspended", vd->vdev_path); break; case VDEV_INITIALIZE_CANCELED: - spa_history_log_internal(spa, "initialize", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_INITIALIZE_ACTIVE || + old_state == VDEV_INITIALIZE_SUSPENDED) + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s 
canceled", vd->vdev_path); break; case VDEV_INITIALIZE_COMPLETE: spa_history_log_internal(spa, "initialize", tx, @@ -317,6 +321,32 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data) return (0); } +static void +vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + *last_rs_end = physical_rs->rs_end; +} + +static void +vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_initialize_bytes_est += size; + + if (vd->vdev_initialize_last_offset > physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start && + vd->vdev_initialize_last_offset < physical_rs->rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - physical_rs->rs_start; + } +} + static void vdev_initialize_calculate_progress(vdev_t *vd) { @@ -331,28 +361,35 @@ vdev_initialize_calculate_progress(vdev_t *vd) metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. 
*/ - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been initialized */ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been initialized */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_initialize_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_initialize_last_offset > last_rs_end) { vd->vdev_initialize_bytes_done += ms_free; vd->vdev_initialize_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -374,22 +411,9 @@ vdev_initialize_calculate_progress(vdev_t *vd) &where)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); - vdev_xlate(vd, &logical_rs, &physical_rs); - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_initialize_bytes_est += size; - if (vd->vdev_initialize_last_offset > - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += size; - } else if (vd->vdev_initialize_last_offset > - physical_rs.rs_start && - vd->vdev_initialize_last_offset < - physical_rs.rs_end) { - vd->vdev_initialize_bytes_done += - vd->vdev_initialize_last_offset - - physical_rs.rs_start; - } + vdev_xlate_walk(vd, &logical_rs, + vdev_initialize_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -419,6 +443,34 @@ vdev_initialize_load(vdev_t *vd) return (err); } +static void +vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = arg; + + /* Only add segments that we have not visited yet */ + if (physical_rs->rs_end <= 
vd->vdev_initialize_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_initialize_last_offset > physical_rs->rs_start) { + zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " + "(%llu, %llu)", vd->vdev_path, + (u_longlong_t)physical_rs->rs_start, + (u_longlong_t)physical_rs->rs_end, + (u_longlong_t)vd->vdev_initialize_last_offset, + (u_longlong_t)physical_rs->rs_end); + ASSERT3U(physical_rs->rs_end, >, + vd->vdev_initialize_last_offset); + physical_rs->rs_start = vd->vdev_initialize_last_offset; + } + + ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); + + range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start, + physical_rs->rs_end - physical_rs->rs_start); +} + /* * Convert the logical range into a physical range and add it to our * avl tree. @@ -427,47 +479,12 @@ static void vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); - - /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) - return; - - /* Pick up where we left off mid-range. 
*/ - if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { - zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " - "(%llu, %llu)", vd->vdev_path, - (u_longlong_t)physical_rs.rs_start, - (u_longlong_t)physical_rs.rs_end, - (u_longlong_t)vd->vdev_initialize_last_offset, - (u_longlong_t)physical_rs.rs_end); - ASSERT3U(physical_rs.rs_end, >, - vd->vdev_initialize_last_offset); - physical_rs.rs_start = vd->vdev_initialize_last_offset; - } - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. - */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } + vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg); } static void diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c index d063b77ea836..fbd117d2d9ae 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_label.c +++ b/sys/contrib/openzfs/module/zfs/vdev_label.c @@ -142,6 +142,7 @@ #include #include #include +#include #include #include #include @@ -453,31 +454,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_fru != NULL) fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); - if (vd->vdev_nparity != 0) { - ASSERT(strcmp(vd->vdev_ops->vdev_op_type, - VDEV_TYPE_RAIDZ) == 0); + if (vd->vdev_ops->vdev_op_config_generate != NULL) + vd->vdev_ops->vdev_op_config_generate(vd, nv); - /* - * Make sure someone hasn't managed to sneak a fancy new vdev - * into a crufty old storage pool. 
- */ - ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity <= 2 && - spa_version(spa) >= SPA_VERSION_RAIDZ2) || - (vd->vdev_nparity <= 3 && - spa_version(spa) >= SPA_VERSION_RAIDZ3)); - - /* - * Note that we'll add the nparity tag even on storage pools - * that only support a single parity device -- older software - * will just ignore it. - */ - fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); - } - - if (vd->vdev_wholedisk != -1ULL) + if (vd->vdev_wholedisk != -1ULL) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk); + } if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); @@ -785,6 +768,14 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); + /* + * The label for a dRAID distributed spare is not stored on disk. + * Instead it is generated when needed which allows us to bypass + * the pipeline when reading the config from the label. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return (vdev_draid_read_config_spare(vd)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); vp = abd_to_buf(vp_abd); @@ -1497,7 +1488,8 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (int c = 0; c < vd->vdev_children; c++) vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); - if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) && + vd->vdev_ops != &vdev_draid_spare_ops) { for (int l = 0; l < VDEV_LABELS; l++) { for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, @@ -1586,6 +1578,13 @@ vdev_copy_uberblocks(vdev_t *vd) SCL_STATE); ASSERT(vd->vdev_ops->vdev_op_leaf); + /* + * No uberblocks are stored on distributed spares, they may be + * safely skipped when expanding a leaf vdev. 
+ */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); @@ -1647,6 +1646,15 @@ vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, if (!vdev_writeable(vd)) return; + /* + * There's no need to write uberblocks to a distributed spare, they + * are already stored on all the leaves of the parent dRAID. For + * this same reason vdev_uberblock_load_impl() skips distributed + * spares when reading uberblocks. + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + /* If the vdev was expanded, need to copy uberblock rings. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && vd->vdev_copy_uberblocks == B_TRUE) { @@ -1763,6 +1771,14 @@ vdev_label_sync(zio_t *zio, uint64_t *good_writes, if (!vdev_writeable(vd)) return; + /* + * The top-level config never needs to be written to a distributed + * spare. When read vdev_draid_read_config_spare() will generate + * the config for the vdev_label_read_config(). + */ + if (vd->vdev_ops == &vdev_draid_spare_ops) + return; + /* * Generate a label describing the top-level config to which we belong. */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c index 71b5adbbd06a..71ca43caec1a 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_mirror.c +++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -99,7 +100,6 @@ vdev_mirror_stat_fini(void) /* * Virtual device vector for mirroring.
*/ - typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; @@ -108,6 +108,7 @@ typedef struct mirror_child { uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; + uint8_t mc_rebuilding; } mirror_child_t; typedef struct mirror_map { @@ -115,6 +116,7 @@ typedef struct mirror_map { int mm_preferred_cnt; int mm_children; boolean_t mm_resilvering; + boolean_t mm_rebuilding; boolean_t mm_root; mirror_child_t mm_child[]; } mirror_map_t; @@ -239,6 +241,21 @@ vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) return (load + zfs_vdev_mirror_rotating_seek_inc); } +static boolean_t +vdev_mirror_rebuilding(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (vdev_mirror_rebuilding(vd->vdev_child[i])) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Avoid inlining the function to keep vdev_mirror_io_start(), which * is this functions only caller, as small as possible on the stack. 
@@ -356,6 +373,9 @@ vdev_mirror_map_init(zio_t *zio) mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; + + if (vdev_mirror_rebuilding(mc->mc_vd)) + mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE; } } @@ -493,12 +513,37 @@ vdev_mirror_preferred_child_randomize(zio_t *zio) return (mm->mm_preferred[p]); } +static boolean_t +vdev_mirror_child_readable(mirror_child_t *mc) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_readable(vd, mc->mc_offset)); + else + return (vdev_readable(vd)); +} + +static boolean_t +vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size) +{ + vdev_t *vd = mc->mc_vd; + + if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops) + return (vdev_draid_missing(vd, mc->mc_offset, txg, size)); + else + return (vdev_dtl_contains(vd, DTL_MISSING, txg, size)); +} + /* * Try to find a vdev whose DTL doesn't contain the block we want to read - * preferring vdevs based on determined load. + * preferring vdevs based on determined load. If we can't, try the read on + * any vdev we haven't already tried. * - * Try to find a child whose DTL doesn't contain the block we want to read. - * If we can't, try the read on any vdev we haven't already tried. + * Distributed spares are an exception to the above load rule. They are + * always preferred in order to detect gaps in the distributed spare which + * are created when another disk in the dRAID fails. In order to restore + * redundancy those gaps must be read to trigger the required repair IO. 
*/ static int vdev_mirror_child_select(zio_t *zio) @@ -518,20 +563,27 @@ vdev_mirror_child_select(zio_t *zio) if (mc->mc_tried || mc->mc_skipped) continue; - if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + if (mc->mc_vd == NULL || + !vdev_mirror_child_readable(mc)) { mc->mc_error = SET_ERROR(ENXIO); mc->mc_tried = 1; /* don't even try */ mc->mc_skipped = 1; continue; } - if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + if (vdev_mirror_child_missing(mc, txg, 1)) { mc->mc_error = SET_ERROR(ESTALE); mc->mc_skipped = 1; mc->mc_speculative = 1; continue; } + if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) { + mm->mm_preferred[0] = c; + mm->mm_preferred_cnt = 1; + break; + } + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); if (mc->mc_load > lowest_load) continue; @@ -625,11 +677,25 @@ vdev_mirror_io_start(zio_t *zio) while (children--) { mc = &mm->mm_child[c]; + c++; + + /* + * When sequentially resilvering only issue write repair + * IOs to the vdev which is being rebuilt since performance + * is limited by the slowest child. This is an issue for + * faster replacement devices such as distributed spares. + */ + if ((zio->io_priority == ZIO_PRIORITY_REBUILD) && + (zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SCRUB) && + mm->mm_rebuilding && !mc->mc_rebuilding) { + continue; + } + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_child_done, mc)); - c++; } zio_execute(zio); @@ -744,6 +810,8 @@ vdev_mirror_io_done(zio_t *zio) mc = &mm->mm_child[c]; if (mc->mc_error == 0) { + vdev_ops_t *ops = mc->mc_vd->vdev_ops; + if (mc->mc_tried) continue; /* @@ -752,15 +820,16 @@ vdev_mirror_io_done(zio_t *zio) * 1. it's a scrub (in which case we have * tried everything that was healthy) * - or - - * 2. it's an indirect vdev (in which case - * it could point to any other vdev, which - * might have a bad DTL) + * 2. 
it's an indirect or distributed spare + * vdev (in which case it could point to any + * other vdev, which might have a bad DTL) * - or - * 3. the DTL indicates that this data is * missing from this vdev */ if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - mc->mc_vd->vdev_ops != &vdev_indirect_ops && + ops != &vdev_indirect_ops && + ops != &vdev_draid_spare_ops && !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; @@ -796,50 +865,90 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) } } +/* + * Return the maximum asize for a rebuild zio in the provided range. + */ +static uint64_t +vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize, + uint64_t max_segment) +{ + uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift), + SPA_MAXBLOCKSIZE); + + return (MIN(asize, vdev_psize_to_asize(vd, psize))); +} + vdev_ops_t vdev_mirror_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = 
NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_mirror_open, .vdev_op_close = vdev_mirror_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_mirror_io_start, .vdev_op_io_done = vdev_mirror_io_done, .vdev_op_state_change = vdev_mirror_state_change, - .vdev_op_need_resilver = NULL, + .vdev_op_need_resilver = vdev_default_need_resilver, .vdev_op_hold = NULL, .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, + .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_missing.c b/sys/contrib/openzfs/module/zfs/vdev_missing.c index ce90df6e8d95..e9145fd012d7 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_missing.c +++ b/sys/contrib/openzfs/module/zfs/vdev_missing.c @@ -81,9 +81,13 @@ vdev_missing_io_done(zio_t *zio) } vdev_ops_t vdev_missing_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, .vdev_op_asize = vdev_default_asize, + 
.vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, @@ -92,14 +96,23 @@ vdev_ops_t vdev_missing_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_missing_open, .vdev_op_close = vdev_missing_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_missing_io_start, .vdev_op_io_done = vdev_missing_io_done, .vdev_op_state_change = NULL, @@ -108,6 +121,11 @@ vdev_ops_t vdev_hole_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index a8ef3d7474c9..02040c3ee198 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -121,16 +121,17 @@ /* * The maximum number of i/os active to each device. Ideally, this will be >= - * the sum of each queue's max_active. It must be at least the sum of each - * queue's min_active. + * the sum of each queue's max_active. */ uint32_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. 
If the * number of active i/os is < zfs_vdev_max_active, then the min_active comes - * into play. We will send min_active from each queue, and then select from - * queues in the order defined by zio_priority_t. + * into play. We will send min_active from each queue round-robin, and then + * send from queues in the order defined by zio_priority_t up to max_active. + * Some queues have additional mechanisms to limit number of active I/Os in + * addition to min_active and max_active, see below. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. Larger max_active's may lead to higher overall throughput, @@ -151,7 +152,7 @@ uint32_t zfs_vdev_async_read_max_active = 3; uint32_t zfs_vdev_async_write_min_active = 2; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; -uint32_t zfs_vdev_scrub_max_active = 2; +uint32_t zfs_vdev_scrub_max_active = 3; uint32_t zfs_vdev_removal_min_active = 1; uint32_t zfs_vdev_removal_max_active = 2; uint32_t zfs_vdev_initializing_min_active = 1; @@ -171,6 +172,28 @@ uint32_t zfs_vdev_rebuild_max_active = 3; int zfs_vdev_async_write_active_min_dirty_percent = 30; int zfs_vdev_async_write_active_max_dirty_percent = 60; +/* + * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), + * the number of concurrently-active I/O's is limited to *_min_active, unless + * the vdev is "idle". When there are no interactive I/Os active (sync or + * async), and zfs_vdev_nia_delay I/Os have completed since the last + * interactive I/O, then the vdev is considered to be "idle", and the number + * of concurrently-active non-interactive I/O's is increased to *_max_active. + */ +uint_t zfs_vdev_nia_delay = 5; + +/* + * Some HDDs tend to prioritize sequential I/O so high that concurrent + * random I/O latency reaches several seconds. On some HDDs it happens + * even if sequential I/Os are submitted one at a time, and so setting + * *_max_active to 1 does not help. 
To prevent non-interactive I/Os, like + * scrub, from monopolizing the device no more than zfs_vdev_nia_credit + * I/Os can be sent while there are outstanding incomplete interactive + * I/Os. This enforced wait ensures the HDD services the interactive I/O + * within a reasonable amount of time. + */ +uint_t zfs_vdev_nia_credit = 5; + /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes @@ -261,7 +284,7 @@ vdev_queue_timestamp_compare(const void *x1, const void *x2) } static int -vdev_queue_class_min_active(zio_priority_t p) +vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -273,15 +296,19 @@ vdev_queue_class_min_active(zio_priority_t p) case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: - return (zfs_vdev_scrub_min_active); + return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); case ZIO_PRIORITY_REMOVAL: - return (zfs_vdev_removal_min_active); + return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); case ZIO_PRIORITY_INITIALIZING: - return (zfs_vdev_initializing_min_active); + return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active: + MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); case ZIO_PRIORITY_REBUILD: - return (zfs_vdev_rebuild_min_active); + return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active : + MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); default: panic("invalid priority %u", p); return (0); @@ -311,14 +338,12 @@ vdev_queue_max_async_writes(spa_t *spa) * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible. 
*/ - if (spa_has_pending_synctask(spa)) + dirty = dp->dp_dirty_total; + if (dirty > max_bytes || spa_has_pending_synctask(spa)) return (zfs_vdev_async_write_max_active); - dirty = dp->dp_dirty_total; if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); - if (dirty > max_bytes) - return (zfs_vdev_async_write_max_active); /* * linear interpolation: @@ -337,7 +362,7 @@ vdev_queue_max_async_writes(spa_t *spa) } static int -vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) +vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -349,14 +374,34 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_scrub_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_scrub_min_active)); return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_REMOVAL: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_removal_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_removal_min_active)); return (zfs_vdev_removal_max_active); case ZIO_PRIORITY_INITIALIZING: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_initializing_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_initializing_min_active)); return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); case ZIO_PRIORITY_REBUILD: + if (vq->vq_ia_active > 0) { + return (MIN(vq->vq_nia_credit, + zfs_vdev_rebuild_min_active)); + } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) + return (MAX(1, zfs_vdev_rebuild_min_active)); return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); @@ -372,17 +417,24 @@ static zio_priority_t 
vdev_queue_class_to_issue(vdev_queue_t *vq) { spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p; + zio_priority_t p, n; if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); - /* find a queue that has not reached its minimum # outstanding i/os */ - for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + /* + * Find a queue that has not reached its minimum # outstanding i/os. + * Do round-robin to reduce starvation due to zfs_vdev_max_active + * and vq_nia_credit limits. + */ + for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { + p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(p)) + vdev_queue_class_min_active(vq, p)) { + vq->vq_last_prio = p; return (p); + } } /* @@ -392,8 +444,10 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, p)) + vdev_queue_class_max_active(spa, vq, p)) { + vq->vq_last_prio = p; return (p); + } } /* No eligible queued i/os */ @@ -493,6 +547,20 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) } } +static boolean_t +vdev_queue_is_interactive(zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SCRUB: + case ZIO_PRIORITY_REMOVAL: + case ZIO_PRIORITY_INITIALIZING: + case ZIO_PRIORITY_REBUILD: + return (B_FALSE); + default: + return (B_TRUE); + } +} + static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { @@ -502,6 +570,12 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active++; + if (vdev_queue_is_interactive(zio->io_priority)) { + if (++vq->vq_ia_active == 1) + vq->vq_nia_credit = 1; + } else if (vq->vq_ia_active > 0) { + vq->vq_nia_credit--; + } 
avl_add(&vq->vq_active_tree, zio); if (shk->kstat != NULL) { @@ -520,6 +594,13 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active--; + if (vdev_queue_is_interactive(zio->io_priority)) { + if (--vq->vq_ia_active == 0) + vq->vq_nia_credit = 0; + else + vq->vq_nia_credit = zfs_vdev_nia_credit; + } else if (vq->vq_ia_active == 0) + vq->vq_nia_credit++; avl_remove(&vq->vq_active_tree, zio); if (shk->kstat != NULL) { @@ -593,6 +674,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) return (NULL); + /* + * I/Os to distributed spares are directly dispatched to the dRAID + * leaf vdevs for aggregation. See the comment at the end of the + * zio_vdev_io_start() function. + */ + ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); + first = last = zio; if (zio->io_type == ZIO_TYPE_READ) @@ -1065,6 +1153,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, "Min active rebuild I/Os per vdev"); +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW, + "Number of non-interactive I/Os to allow in sequence"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW, + "Number of non-interactive I/Os before _max_active"); + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c index 47312e02f70a..989b90dc2635 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -134,25 +135,51 @@ 
VDEV_RAIDZ_64MUL_2((x), mask); \ } -void -vdev_raidz_map_free(raidz_map_t *rm) +static void +vdev_raidz_row_free(raidz_row_t *rr) { int c; - for (c = 0; c < rm->rm_firstdatacol; c++) { - abd_free(rm->rm_col[c].rc_abd); + for (c = 0; c < rr->rr_firstdatacol && c < rr->rr_cols; c++) { + abd_free(rr->rr_col[c].rc_abd); - if (rm->rm_col[c].rc_gdata != NULL) - abd_free(rm->rm_col[c].rc_gdata); + if (rr->rr_col[c].rc_gdata != NULL) { + abd_free(rr->rr_col[c].rc_gdata); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } + } + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_size != 0) { + if (abd_is_gang(rr->rr_col[c].rc_abd)) + abd_free(rr->rr_col[c].rc_abd); + else + abd_put(rr->rr_col[c].rc_abd); + } + if (rr->rr_col[c].rc_orig_data != NULL) { + zio_buf_free(rr->rr_col[c].rc_orig_data, + rr->rr_col[c].rc_size); + } } - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - abd_put(rm->rm_col[c].rc_abd); + if (rr->rr_abd_copy != NULL) + abd_free(rr->rr_abd_copy); - if (rm->rm_abd_copy != NULL) - abd_free(rm->rm_abd_copy); + if (rr->rr_abd_empty != NULL) + abd_free(rr->rr_abd_empty); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); + kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); +} + +void +vdev_raidz_map_free(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) + vdev_raidz_row_free(rm->rm_row[i]); + + kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); } static void @@ -161,10 +188,11 @@ vdev_raidz_map_free_vsd(zio_t *zio) raidz_map_t *rm = zio->io_vsd; ASSERT0(rm->rm_freed); - rm->rm_freed = 1; + rm->rm_freed = B_TRUE; - if (rm->rm_reports == 0) + if (rm->rm_reports == 0) { vdev_raidz_map_free(rm); + } } /*ARGSUSED*/ @@ -175,7 +203,7 @@ vdev_raidz_cksum_free(void *arg, size_t ignored) ASSERT3U(rm->rm_reports, >, 0); - if (--rm->rm_reports == 0 && rm->rm_freed != 0) + if (--rm->rm_reports == 0 && rm->rm_freed) 
vdev_raidz_map_free(rm); } @@ -186,77 +214,79 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) const size_t c = zcr->zcr_cbinfo; size_t x, offset; - const abd_t *good = NULL; - const abd_t *bad = rm->rm_col[c].rc_abd; - if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); return; } - if (c < rm->rm_firstdatacol) { + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + const abd_t *good = NULL; + const abd_t *bad = rr->rr_col[c].rc_abd; + + if (c < rr->rr_firstdatacol) { /* * The first time through, calculate the parity blocks for * the good data (this relies on the fact that the good * data never changes for a given logical ZIO) */ - if (rm->rm_col[0].rc_gdata == NULL) { + if (rr->rr_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; /* - * Set up the rm_col[]s to generate the parity for + * Set up the rr_col[]s to generate the parity for * good_data, first saving the parity bufs and * replacing them with buffers to hold the result. */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_abd = - rm->rm_col[x].rc_gdata = - abd_alloc_sametype(rm->rm_col[x].rc_abd, - rm->rm_col[x].rc_size); + for (x = 0; x < rr->rr_firstdatacol; x++) { + bad_parity[x] = rr->rr_col[x].rc_abd; + rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata = + abd_alloc_sametype(rr->rr_col[x].rc_abd, + rr->rr_col[x].rc_size); } /* fill in the data columns from good_data */ offset = 0; - for (; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); + for (; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); - rm->rm_col[x].rc_abd = + rr->rr_col[x].rc_abd = abd_get_offset_size((abd_t *)good_data, - offset, rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; + offset, rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; } /* * Construct the parity from the good data. 
*/ - vdev_raidz_generate_parity(rm); + vdev_raidz_generate_parity_row(rm, rr); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) - rm->rm_col[x].rc_abd = bad_parity[x]; + for (x = 0; x < rr->rr_firstdatacol; x++) + rr->rr_col[x].rc_abd = bad_parity[x]; offset = 0; - for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset_size( - rm->rm_abd_copy, offset, - rm->rm_col[x].rc_size); - offset += rm->rm_col[x].rc_size; + for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) { + abd_put(rr->rr_col[x].rc_abd); + rr->rr_col[x].rc_abd = abd_get_offset_size( + rr->rr_abd_copy, offset, + rr->rr_col[x].rc_size); + offset += rr->rr_col[x].rc_size; } } - ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, - rm->rm_col[c].rc_size); + ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL); + good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0, + rr->rr_col[c].rc_size); } else { /* adjust good_data to point at the start of our column */ offset = 0; - for (x = rm->rm_firstdatacol; x < c; x++) - offset += rm->rm_col[x].rc_size; + for (x = rr->rr_firstdatacol; x < c; x++) + offset += rr->rr_col[x].rc_size; good = abd_get_offset_size((abd_t *)good_data, offset, - rm->rm_col[c].rc_size); + rr->rr_col[c].rc_size); } /* we drop the ereport if it ends up that the data was good */ @@ -274,10 +304,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - size_t offset; - raidz_map_t *rm = zio->io_vsd; - size_t size; /* set up the report and bump the refcount */ zcr->zcr_cbdata = rm; @@ -287,8 +314,9 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) rm->rm_reports++; ASSERT3U(rm->rm_reports, >, 0); + ASSERT3U(rm->rm_nrows, ==, 1); - if (rm->rm_abd_copy != NULL) + if (rm->rm_row[0]->rr_abd_copy != NULL) return; /* @@ -299,26 +327,30 @@ 
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) * Our parity data is already in separate buffers, so there's no need * to copy them. */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + size_t offset = 0; + size_t size = 0; - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) + size += rr->rr_col[c].rc_size; - rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); + rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE); - for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, - col->rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; + abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy, + offset, col->rc_size); - abd_copy(tmp, col->rc_abd, col->rc_size); + abd_copy(tmp, col->rc_abd, col->rc_size); - abd_put(col->rc_abd); - col->rc_abd = tmp; + abd_put(col->rc_abd); + col->rc_abd = tmp; - offset += col->rc_size; + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); } - ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -337,7 +369,7 @@ noinline raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t nparity) { - raidz_map_t *rm; + raidz_row_t *rr; /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = zio->io_offset >> ashift; /* The zio's size in units of the vdev's minimum sector size. 
*/ @@ -349,6 +381,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; uint64_t off = 0; + raidz_map_t *rm = + kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); + rm->rm_nrows = 1; + /* * "Quotient": The number of data sectors for this stripe on all but * the "big column" child vdevs that also contain "remainder" data. @@ -370,8 +406,10 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); - /* acols: The columns that will be accessed. */ - /* scols: The columns that will be accessed or skipped. */ + /* + * acols: The columns that will be accessed. + * scols: The columns that will be accessed or skipped. + */ if (q == 0) { /* Our I/O request doesn't span all child vdevs. */ acols = bc; @@ -383,65 +421,70 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP); + rm->rm_row[0] = rr; - rm->rm_cols = acols; - rm->rm_scols = scols; - rm->rm_bigcols = bc; - rm->rm_skipstart = bc; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - rm->rm_firstdatacol = nparity; - rm->rm_abd_copy = NULL; - rm->rm_reports = 0; - rm->rm_freed = 0; - rm->rm_ecksuminjected = 0; + rr->rr_cols = acols; + rr->rr_scols = scols; + rr->rr_bigcols = bc; + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + rr->rr_firstdatacol = nparity; + rr->rr_abd_copy = NULL; + rr->rr_abd_empty = NULL; + rr->rr_nempty = 0; +#ifdef ZFS_DEBUG + rr->rr_offset = zio->io_offset; + rr->rr_size = zio->io_size; +#endif asize = 0; for (c = 0; c < scols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << ashift; } - rm->rm_col[c].rc_devidx = col; - rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_abd = NULL; - 
rm->rm_col[c].rc_gdata = NULL; - rm->rm_col[c].rc_error = 0; - rm->rm_col[c].rc_tried = 0; - rm->rm_col[c].rc_skipped = 0; + rc->rc_devidx = col; + rc->rc_offset = coff; + rc->rc_abd = NULL; + rc->rc_gdata = NULL; + rc->rc_orig_data = NULL; + rc->rc_error = 0; + rc->rc_tried = 0; + rc->rc_skipped = 0; + rc->rc_repair = 0; + rc->rc_need_orig_restore = B_FALSE; if (c >= acols) - rm->rm_col[c].rc_size = 0; + rc->rc_size = 0; else if (c < bc) - rm->rm_col[c].rc_size = (q + 1) << ashift; + rc->rc_size = (q + 1) << ashift; else - rm->rm_col[c].rc_size = q << ashift; + rc->rc_size = q << ashift; - asize += rm->rm_col[c].rc_size; + asize += rc->rc_size; } ASSERT3U(asize, ==, tot << ashift); - rm->rm_asize = roundup(asize, (nparity + 1) << ashift); rm->rm_nskip = roundup(tot, nparity + 1) - tot; - ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); - ASSERT3U(rm->rm_nskip, <=, nparity); + rm->rm_skipstart = bc; - for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); + for (c = 0; c < rr->rr_firstdatacol; c++) + rr->rr_col[c].rc_abd = + abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, - rm->rm_col[c].rc_size); - off = rm->rm_col[c].rc_size; + rr->rr_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, + rr->rr_col[c].rc_size); + off = rr->rr_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, - rm->rm_col[c].rc_size); - off += rm->rm_col[c].rc_size; + raidz_col_t *rc = &rr->rr_col[c]; + rc->rc_abd = abd_get_offset_size(zio->io_abd, off, rc->rc_size); + off += rc->rc_size; } /* @@ -464,24 +507,21 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, * skip the first column since at least one data and one parity * column must appear in each row. 
*/ - ASSERT(rm->rm_cols >= 2); - ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + ASSERT(rr->rr_cols >= 2); + ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); - if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { - devidx = rm->rm_col[0].rc_devidx; - o = rm->rm_col[0].rc_offset; - rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; - rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; - rm->rm_col[1].rc_devidx = devidx; - rm->rm_col[1].rc_offset = o; + if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + devidx = rr->rr_col[0].rc_devidx; + o = rr->rr_col[0].rc_offset; + rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; + rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; + rr->rr_col[1].rc_devidx = devidx; + rr->rr_col[1].rc_offset = o; if (rm->rm_skipstart == 0) rm->rm_skipstart = 1; } - zio->io_vsd = rm; - zio->io_vsd_ops = &vdev_raidz_vsd_ops; - /* init RAIDZ parity ops */ rm->rm_ops = vdev_raidz_math_get_ops(); @@ -550,50 +590,43 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) } static void -vdev_raidz_generate_parity_p(raidz_map_t *rm) +vdev_raidz_generate_parity_p(raidz_row_t *rr) { - uint64_t *p; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + if (c == rr->rr_firstdatacol) { + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_p_func, &pqr); } } } static void -vdev_raidz_generate_parity_pq(raidz_map_t *rm) +vdev_raidz_generate_parity_pq(raidz_row_t *rr) { - uint64_t *p, *q, pcnt, 
ccnt, mask, i; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } @@ -601,14 +634,15 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) struct pqr_struct pqr = { p, q, NULL }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pq_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); } } @@ -616,33 +650,29 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) } static void -vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { - uint64_t *p, *q, *r, pcnt, ccnt, mask, i; - int c; - abd_t *src; + uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); + uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == + rr->rr_col[VDEV_RAIDZ_R].rc_size); - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_Q].rc_size); - ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == - rm->rm_col[VDEV_RAIDZ_R].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + abd_t *src = rr->rr_col[c].rc_abd; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - src = rm->rm_col[c].rc_abd; - p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { ASSERT(ccnt == pcnt || ccnt == 0); - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - (void) memcpy(r, p, rm->rm_col[c].rc_size); + abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); + (void) memcpy(q, p, rr->rr_col[c].rc_size); + (void) memcpy(r, p, rr->rr_col[c].rc_size); - for (i = ccnt; i < pcnt; i++) { + for (uint64_t i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; @@ -651,14 +681,15 @@ 
vdev_raidz_generate_parity_pqr(raidz_map_t *rm) struct pqr_struct pqr = { p, q, r }; ASSERT(ccnt <= pcnt); - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, vdev_raidz_pqr_func, &pqr); /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. */ - for (i = ccnt; i < pcnt; i++) { + uint64_t mask; + for (uint64_t i = ccnt; i < pcnt; i++) { VDEV_RAIDZ_64MUL_2(q[i], mask); VDEV_RAIDZ_64MUL_4(r[i], mask); } @@ -671,27 +702,38 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) * parity columns available. */ void -vdev_raidz_generate_parity(raidz_map_t *rm) +vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) { + ASSERT3U(rr->rr_cols, !=, 0); + /* Generate using the new math implementation */ - if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) + if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) return; - switch (rm->rm_firstdatacol) { + switch (rr->rr_firstdatacol) { case 1: - vdev_raidz_generate_parity_p(rm); + vdev_raidz_generate_parity_p(rr); break; case 2: - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); break; case 3: - vdev_raidz_generate_parity_pqr(rm); + vdev_raidz_generate_parity_pqr(rr); break; default: cmn_err(CE_PANIC, "invalid RAID-Z configuration"); } } +void +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_generate_parity_row(rm, rr); + } +} + /* ARGSUSED */ static int vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) @@ -809,30 +851,27 @@ vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) } static int -vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; - int c; abd_t *dst, *src; - ASSERT(ntgts == 1); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(x < 
rm->rm_cols); + ASSERT3U(ntgts, ==, 1); + ASSERT3U(x, >=, rr->rr_firstdatacol); + ASSERT3U(x, <, rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); - ASSERT(rm->rm_col[x].rc_size > 0); + ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); - src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + dst = rr->rr_col[x].rc_abd; - abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); + abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; if (c == x) continue; @@ -845,7 +884,7 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) } static int -vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) { int x = tgts[0]; int c, exp; @@ -853,44 +892,44 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 1); - ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, - rm->rm_col[c].rc_size); + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + uint64_t size = (c == x) ? 
0 : MIN(rr->rr_col[x].rc_size, + rr->rr_col[c].rc_size); - src = rm->rm_col[c].rc_abd; - dst = rm->rm_col[x].rc_abd; + src = rr->rr_col[c].rc_abd; + dst = rr->rr_col[x].rc_abd; - if (c == rm->rm_firstdatacol) { + if (c == rr->rr_firstdatacol) { abd_copy(dst, src, size); - if (rm->rm_col[x].rc_size > size) + if (rr->rr_col[x].rc_size > size) { abd_zero_off(dst, size, - rm->rm_col[x].rc_size - size); - + rr->rr_col[x].rc_size - size); + } } else { - ASSERT3U(size, <=, rm->rm_col[x].rc_size); + ASSERT3U(size, <=, rr->rr_col[x].rc_size); (void) abd_iterate_func2(dst, src, 0, 0, size, vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, - size, rm->rm_col[x].rc_size - size, + size, rr->rr_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); } } - src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - dst = rm->rm_col[x].rc_abd; - exp = 255 - (rm->rm_cols - 1 - x); + src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + dst = rr->rr_col[x].rc_abd; + exp = 255 - (rr->rr_cols - 1 - x); struct reconst_q_struct rq = { abd_to_buf(src), exp }; - (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } static int -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; abd_t *pdata, *qdata; @@ -901,10 +940,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ASSERT(ntgts == 2); ASSERT(x < y); - ASSERT(x >= rm->rm_firstdatacol); - ASSERT(y < rm->rm_cols); + ASSERT(x >= rr->rr_firstdatacol); + ASSERT(y < rr->rr_cols); - ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); /* * Move the parity data aside -- we're going to compute parity as @@ -913,29 +952,29 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) * parity so we make those 
columns appear to be full of zeros by * setting their lengths to zero. */ - pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; - qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; - xsize = rm->rm_col[x].rc_size; - ysize = rm->rm_col[y].rc_size; + pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; + qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rr->rr_col[x].rc_size; + ysize = rr->rr_col[y].rc_size; - rm->rm_col[VDEV_RAIDZ_P].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = - abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); - rm->rm_col[x].rc_size = 0; - rm->rm_col[y].rc_size = 0; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rr->rr_col[x].rc_size = 0; + rr->rr_col[y].rc_size = 0; - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity_pq(rr); - rm->rm_col[x].rc_size = xsize; - rm->rm_col[y].rc_size = ysize; + rr->rr_col[x].rc_size = xsize; + rr->rr_col[y].rc_size = ysize; p = abd_to_buf(pdata); q = abd_to_buf(qdata); - pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); - xd = rm->rm_col[x].rc_abd; - yd = rm->rm_col[y].rc_abd; + pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); + xd = rr->rr_col[x].rc_abd; + yd = rr->rr_col[y].rc_abd; /* * We now have: @@ -953,7 +992,7 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) */ a = vdev_raidz_pow2[255 + x - y]; - b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; tmp = 255 - vdev_raidz_log2[a ^ 1]; aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; @@ -967,14 +1006,14 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) (void) abd_iterate_func(xd, ysize, xsize - ysize, vdev_raidz_reconst_pq_tail_func, &rpq); - 
abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); - abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); /* * Restore the saved parity data. */ - rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; - rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; + rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; + rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); } @@ -1134,13 +1173,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) /* END CSTYLED */ static void -vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, +vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, uint8_t **rows) { int i, j; int pow; - ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); /* * Fill in the missing rows of interest. @@ -1164,7 +1203,7 @@ vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, } static void -vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, +vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **rows, uint8_t **invrows, const uint8_t *used) { int i, j, ii, jj; @@ -1176,10 +1215,10 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, * correspond to data columns. 
*/ for (i = 0; i < nmissing; i++) { - ASSERT3S(used[i], <, rm->rm_firstdatacol); + ASSERT3S(used[i], <, rr->rr_firstdatacol); } for (; i < n; i++) { - ASSERT3S(used[i], >=, rm->rm_firstdatacol); + ASSERT3S(used[i], >=, rr->rr_firstdatacol); } /* @@ -1196,8 +1235,8 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, */ for (i = 0; i < nmissing; i++) { for (j = nmissing; j < n; j++) { - ASSERT3U(used[j], >=, rm->rm_firstdatacol); - jj = used[j] - rm->rm_firstdatacol; + ASSERT3U(used[j], >=, rr->rr_firstdatacol); + jj = used[j] - rr->rr_firstdatacol; ASSERT3S(jj, <, n); invrows[i][j] = rows[i][jj]; rows[i][jj] = 0; @@ -1258,7 +1297,7 @@ vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, } static void -vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, +vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, int *missing, uint8_t **invrows, const uint8_t *used) { int i, j, x, cc, c; @@ -1290,22 +1329,24 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, for (i = 0; i < n; i++) { c = used[i]; - ASSERT3U(c, <, rm->rm_cols); + ASSERT3U(c, <, rr->rr_cols); - src = abd_to_buf(rm->rm_col[c].rc_abd); - ccount = rm->rm_col[c].rc_size; + ccount = rr->rr_col[c].rc_size; + ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); + if (ccount == 0) + continue; + src = abd_to_buf(rr->rr_col[c].rc_abd); for (j = 0; j < nmissing; j++) { - cc = missing[j] + rm->rm_firstdatacol; - ASSERT3U(cc, >=, rm->rm_firstdatacol); - ASSERT3U(cc, <, rm->rm_cols); + cc = missing[j] + rr->rr_firstdatacol; + ASSERT3U(cc, >=, rr->rr_firstdatacol); + ASSERT3U(cc, <, rr->rr_cols); ASSERT3U(cc, !=, c); - dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); - dcount[j] = rm->rm_col[cc].rc_size; + dcount[j] = rr->rr_col[cc].rc_size; + if (dcount[j] != 0) + dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); } - ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); - for (x = 0; x < ccount; x++, src++) { if (*src != 
0) log = vdev_raidz_log2[*src]; @@ -1334,16 +1375,14 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, } static int -vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int n, i, c, t, tt; int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; - uint8_t *p, *pp; size_t psize; - uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; @@ -1354,30 +1393,39 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Matrix reconstruction can't use scatter ABDs yet, so we allocate - * temporary linear ABDs. + * temporary linear ABDs if any non-linear ABDs are found. */ - if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { - bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { + if (!abd_is_linear(rr->rr_col[i].rc_abd)) { + bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), + KM_PUSHPAGE); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - bufs[c] = col->rc_abd; - col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); + bufs[c] = col->rc_abd; + if (bufs[c] != NULL) { + col->rc_abd = abd_alloc_linear( + col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], + col->rc_size); + } + } + + break; } } - n = rm->rm_cols - rm->rm_firstdatacol; + n = rr->rr_cols - rr->rr_firstdatacol; /* * Figure out which data columns are missing. 
*/ nmissing_rows = 0; for (t = 0; t < ntgts; t++) { - if (tgts[t] >= rm->rm_firstdatacol) { + if (tgts[t] >= rr->rr_firstdatacol) { missing_rows[nmissing_rows++] = - tgts[t] - rm->rm_firstdatacol; + tgts[t] - rr->rr_firstdatacol; } } @@ -1387,7 +1435,7 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) */ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { ASSERT(tt < ntgts); - ASSERT(c < rm->rm_firstdatacol); + ASSERT(c < rr->rr_firstdatacol); /* * Skip any targeted parity columns. @@ -1422,9 +1470,9 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) used[i] = parity_map[i]; } - for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { if (tt < nmissing_rows && - c == missing_rows[tt] + rm->rm_firstdatacol) { + c == missing_rows[tt] + rr->rr_firstdatacol) { tt++; continue; } @@ -1437,18 +1485,18 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) /* * Initialize the interesting rows of the matrix. */ - vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); /* * Invert the matrix. */ - vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, invrows, used); /* * Reconstruct the missing data using the generated matrix. 
*/ - vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, invrows, used); kmem_free(p, psize); @@ -1457,21 +1505,24 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) * copy back from temporary linear abds and free them */ if (bufs) { - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - raidz_col_t *col = &rm->rm_col[c]; + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *col = &rr->rr_col[c]; - abd_copy(bufs[c], col->rc_abd, col->rc_size); - abd_free(col->rc_abd); + if (bufs[c] != NULL) { + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + } col->rc_abd = bufs[c]; } - kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); } return (code); } -int -vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +static int +vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, + const int *t, int nt) { int tgts[VDEV_RAIDZ_MAXPARITY], *dt; int ntgts; @@ -1480,26 +1531,19 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) int nbadparity, nbaddata; int parity_valid[VDEV_RAIDZ_MAXPARITY]; - /* - * The tgts list must already be sorted. 
- */ - for (i = 1; i < nt; i++) { - ASSERT(t[i] > t[i - 1]); - } - - nbadparity = rm->rm_firstdatacol; - nbaddata = rm->rm_cols - nbadparity; + nbadparity = rr->rr_firstdatacol; + nbaddata = rr->rr_cols - nbadparity; ntgts = 0; - for (i = 0, c = 0; c < rm->rm_cols; c++) { - if (c < rm->rm_firstdatacol) + for (i = 0, c = 0; c < rr->rr_cols; c++) { + if (c < rr->rr_firstdatacol) parity_valid[c] = B_FALSE; if (i < nt && c == t[i]) { tgts[ntgts++] = c; i++; - } else if (rm->rm_col[c].rc_error != 0) { + } else if (rr->rr_col[c].rc_error != 0) { tgts[ntgts++] = c; - } else if (c >= rm->rm_firstdatacol) { + } else if (c >= rr->rr_firstdatacol) { nbaddata--; } else { parity_valid[c] = B_TRUE; @@ -1514,7 +1558,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) dt = &tgts[nbadparity]; /* Reconstruct using the new math implementation */ - ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); if (ret != RAIDZ_ORIGINAL_IMPL) return (ret); @@ -1524,29 +1568,29 @@ vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) switch (nbaddata) { case 1: if (parity_valid[VDEV_RAIDZ_P]) - return (vdev_raidz_reconstruct_p(rm, dt, 1)); + return (vdev_raidz_reconstruct_p(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_q(rm, dt, 1)); + return (vdev_raidz_reconstruct_q(rr, dt, 1)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; case 2: - ASSERT(rm->rm_firstdatacol > 1); + ASSERT(rr->rr_firstdatacol > 1); if (parity_valid[VDEV_RAIDZ_P] && parity_valid[VDEV_RAIDZ_Q]) - return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + return (vdev_raidz_reconstruct_pq(rr, dt, 2)); - ASSERT(rm->rm_firstdatacol > 2); + ASSERT(rr->rr_firstdatacol > 2); break; } - code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + code = vdev_raidz_reconstruct_general(rr, tgts, ntgts); 
ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); ASSERT(code > 0); return (code); @@ -1556,8 +1600,8 @@ static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, uint64_t *logical_ashift, uint64_t *physical_ashift) { - vdev_t *cvd; - uint64_t nparity = vd->vdev_nparity; + vdev_raidz_t *vdrz = vd->vdev_tsd; + uint64_t nparity = vdrz->vd_nparity; int c; int lasterror = 0; int numerrors = 0; @@ -1573,7 +1617,7 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, vdev_open_children(vd); for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_t *cvd = vd->vdev_child[c]; if (cvd->vdev_open_error != 0) { lasterror = cvd->vdev_open_error; @@ -1602,19 +1646,20 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, static void vdev_raidz_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) - vdev_close(vd->vdev_child[c]); + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c] != NULL) + vdev_close(vd->vdev_child[c]); + } } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; - uint64_t cols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t cols = vdrz->vd_logical_width; + uint64_t nparity = vdrz->vd_nparity; asize = ((psize - 1) >> ashift) + 1; asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); @@ -1623,7 +1668,18 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize) return (asize); } -static void +/* + * The allocatable space for a raidz vdev is N * sizeof(smallest child) + * so each child must provide at least 1/Nth of its asize. 
+ */ +static uint64_t +vdev_raidz_min_asize(vdev_t *vd) +{ + return ((vd->vdev_min_asize + vd->vdev_children - 1) / + vd->vdev_children); +} + +void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; @@ -1634,21 +1690,21 @@ vdev_raidz_child_done(zio_t *zio) } static void -vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col) { #ifdef ZFS_DEBUG - vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - range_seg64_t logical_rs, physical_rs; - logical_rs.rs_start = zio->io_offset; + range_seg64_t logical_rs, physical_rs, remain_rs; + logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + - vdev_raidz_asize(zio->io_vd, zio->io_size); + vdev_raidz_asize(vd, rr->rr_size); - raidz_col_t *rc = &rm->rm_col[col]; + raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - vdev_xlate(cvd, &logical_rs, &physical_rs); + vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); + ASSERT(vdev_xlate_is_empty(&remain_rs)); ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); /* @@ -1666,6 +1722,91 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) #endif } +static void +vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift) +{ + vdev_t *vd = zio->io_vd; + raidz_map_t *rm = zio->io_vsd; + int c, i; + + vdev_raidz_generate_parity_row(rm, rr); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + + /* Verify physical to logical translation */ + vdev_raidz_io_verify(vd, rr, c); + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], rc->rc_offset, + rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority, + 0, vdev_raidz_child_done, rc)); + } + + /* + * Generate optional I/Os for skip sectors to improve aggregation + * contiguity. 
+ */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rr->rr_scols); + if (c == rr->rr_scols) + c = 0; + + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } +} + +static void +vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (int c = rr->rr_cols - 1; c >= 0; c--) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_size == 0) + continue; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_readable(cvd)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + if (c >= rr->rr_firstdatacol) + rr->rr_missingdata++; + else + rr->rr_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } +} + /* * Start an IO operation on a RAIDZ VDev * @@ -1688,96 +1829,32 @@ vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - vdev_t *cvd; + vdev_raidz_t *vdrz = vd->vdev_tsd; raidz_map_t *rm; - raidz_col_t *rc; - int c, i; - rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, - vd->vdev_nparity); - - ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); 
- - if (zio->io_type == ZIO_TYPE_WRITE) { - vdev_raidz_generate_parity(rm); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - - /* - * Verify physical to logical translation. - */ - vdev_raidz_io_verify(zio, rm, c); - - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } - - /* - * Generate optional I/Os for any skipped sectors to improve - * aggregation contiguity. - */ - for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { - ASSERT(c <= rm->rm_scols); - if (c == rm->rm_scols) - c = 0; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset + rc->rc_size, NULL, - 1 << tvd->vdev_ashift, - zio->io_type, zio->io_priority, - ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); - } - - zio_execute(zio); - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, + vdrz->vd_logical_width, vdrz->vd_nparity); /* - * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity. + * Until raidz expansion is implemented all maps for a raidz vdev + * contain a single row. 
*/ - for (c = rm->rm_cols - 1; c >= 0; c--) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (!vdev_readable(cvd)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ENXIO); - rc->rc_tried = 1; /* don't even try */ - rc->rc_skipped = 1; - continue; - } - if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { - if (c >= rm->rm_firstdatacol) - rm->rm_missingdata++; - else - rm->rm_missingparity++; - rc->rc_error = SET_ERROR(ESTALE); - rc->rc_skipped = 1; - continue; - } - if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { - zio_nowait(zio_vdev_child_io(zio, NULL, cvd, - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } + ASSERT3U(rm->rm_nrows, ==, 1); + raidz_row_t *rr = rm->rm_row[0]; + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift); + } else { + ASSERT(zio->io_type == ZIO_TYPE_READ); + vdev_raidz_io_start_read(zio, rr); } zio_execute(zio); } - /* * Report a checksum error for a child of a RAID-Z device. */ @@ -1786,7 +1863,8 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && + zio->io_priority != ZIO_PRIORITY_REBUILD) { zio_bad_cksum_t zbc; raidz_map_t *rm = zio->io_vsd; @@ -1827,13 +1905,14 @@ raidz_checksum_verify(zio_t *zio) * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the * data we read. If it doesn't, we fire off a checksum error. Return the - * number such failures. + * number of such failures. 
*/ static int -raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +raidz_parity_verify(zio_t *zio, raidz_row_t *rr) { abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; + raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; blkptr_t *bp = zio->io_bp; @@ -1843,8 +1922,18 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (checksum == ZIO_CHECKSUM_NOPARITY) return (ret); - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; + /* + * All data columns must have been successfully read in order + * to use them to generate parity columns for comparison. + */ + for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + rc = &rr->rr_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + return (ret); + } + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; @@ -1852,12 +1941,19 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) abd_copy(orig[c], rc->rc_abd, rc->rc_size); } - vdev_raidz_generate_parity(rm); + /* + * Regenerates parity even for !tried||rc_error!=0 columns. This + * isn't harmful but it does have the side effect of fixing stuff + * we didn't realize was necessary (i.e. even if we return 0). 
+ */ + vdev_raidz_generate_parity_row(rm, rr); + + for (c = 0; c < rr->rr_firstdatacol; c++) { + rc = &rr->rr_col[c]; - for (c = 0; c < rm->rm_firstdatacol; c++) { - rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; + if (abd_cmp(orig[c], rc->rc_abd) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); @@ -1870,464 +1966,606 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) } static int -vdev_raidz_worst_error(raidz_map_t *rm) +vdev_raidz_worst_error(raidz_row_t *rr) { int error = 0; - for (int c = 0; c < rm->rm_cols; c++) - error = zio_worst_error(error, rm->rm_col[c].rc_error); + for (int c = 0; c < rr->rr_cols; c++) + error = zio_worst_error(error, rr->rr_col[c].rc_error); return (error); } -/* - * Iterate over all combinations of bad data and attempt a reconstruction. - * Note that the algorithm below is non-optimal because it doesn't take into - * account how reconstruction is actually performed. For example, with - * triple-parity RAID-Z the reconstruction procedure is the same if column 4 - * is targeted as invalid as if columns 1 and 4 are targeted since in both - * cases we'd only use parity information in column 0. - */ -static int -vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) -{ - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc; - abd_t *orig[VDEV_RAIDZ_MAXPARITY]; - int tstore[VDEV_RAIDZ_MAXPARITY + 2]; - int *tgts = &tstore[1]; - int curr, next, i, c, n; - int code, ret = 0; - - ASSERT(total_errors < rm->rm_firstdatacol); - - /* - * This simplifies one edge condition. - */ - tgts[-1] = -1; - - for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { - /* - * Initialize the targets array by finding the first n columns - * that contain no error. - * - * If there were no data errors, we need to ensure that we're - * always explicitly attempting to reconstruct at least one - * data column. To do this, we simply push the highest target - * up into the data columns. 
- */ - for (c = 0, i = 0; i < n; i++) { - if (i == n - 1 && data_errors == 0 && - c < rm->rm_firstdatacol) { - c = rm->rm_firstdatacol; - } - - while (rm->rm_col[c].rc_error != 0) { - c++; - ASSERT3S(c, <, rm->rm_cols); - } - - tgts[i] = c++; - } - - /* - * Setting tgts[n] simplifies the other edge condition. - */ - tgts[n] = rm->rm_cols; - - /* - * These buffers were allocated in previous iterations. - */ - for (i = 0; i < n - 1; i++) { - ASSERT(orig[i] != NULL); - } - - orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, - rm->rm_col[0].rc_size); - - curr = 0; - next = tgts[curr]; - - while (curr != n) { - tgts[curr] = next; - curr = 0; - - /* - * Save off the original data that we're going to - * attempt to reconstruct. - */ - for (i = 0; i < n; i++) { - ASSERT(orig[i] != NULL); - c = tgts[i]; - ASSERT3S(c, >=, 0); - ASSERT3S(c, <, rm->rm_cols); - rc = &rm->rm_col[c]; - abd_copy(orig[i], rc->rc_abd, rc->rc_size); - } - - /* - * Attempt a reconstruction and exit the outer loop on - * success. - */ - code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(zio) == 0) { - - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - ASSERT(rc->rc_error == 0); - if (rc->rc_tried) - raidz_checksum_error(zio, rc, - orig[i]); - rc->rc_error = SET_ERROR(ECKSUM); - } - - ret = code; - goto done; - } - - /* - * Restore the original data. - */ - for (i = 0; i < n; i++) { - c = tgts[i]; - rc = &rm->rm_col[c]; - abd_copy(rc->rc_abd, orig[i], rc->rc_size); - } - - do { - /* - * Find the next valid column after the curr - * position.. - */ - for (next = tgts[curr] + 1; - next < rm->rm_cols && - rm->rm_col[next].rc_error != 0; next++) - continue; - - ASSERT(next <= tgts[curr + 1]); - - /* - * If that spot is available, we're done here. - */ - if (next != tgts[curr + 1]) - break; - - /* - * Otherwise, find the next valid column after - * the previous position. 
- */ - for (c = tgts[curr - 1] + 1; - rm->rm_col[c].rc_error != 0; c++) - continue; - - tgts[curr] = c; - curr++; - - } while (curr != n); - } - } - n--; -done: - for (i = 0; i < n; i++) - abd_free(orig[i]); - - return (ret); -} - -/* - * Complete an IO operation on a RAIDZ VDev - * - * Outline: - * - For write operations: - * 1. Check for errors on the child IOs. - * 2. Return, setting an error code if too few child VDevs were written - * to reconstruct the data later. Note that partial writes are - * considered successful if they can be reconstructed at all. - * - For read operations: - * 1. Check for errors on the child IOs. - * 2. If data errors occurred: - * a. Try to reassemble the data from the parity available. - * b. If we haven't yet read the parity drives, read them now. - * c. If all parity drives have been read but the data still doesn't - * reassemble with a correct checksum, then try combinatorial - * reconstruction. - * d. If that doesn't work, return an error. - * 3. If there were unexpected errors or this is a resilver operation, - * rewrite the vdevs that had errors. 
- */ static void -vdev_raidz_io_done(zio_t *zio) +vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) { - vdev_t *vd = zio->io_vd; - vdev_t *cvd; - raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc = NULL; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; - int total_errors = 0; - int n, c; - int tgts[VDEV_RAIDZ_MAXPARITY]; - int code; - ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); - ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); - - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; if (rc->rc_error) { - ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ - - if (c < rm->rm_firstdatacol) + if (c < rr->rr_firstdatacol) parity_errors++; else data_errors++; if (!rc->rc_skipped) unexpected_errors++; - - total_errors++; - } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { parity_untried++; } } - if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * XXX -- for now, treat partial writes as a success. - * (If we couldn't write enough columns to reconstruct - * the data, the I/O failed. Otherwise, good enough.) - * - * Now that we support write reallocation, it would be better - * to treat partial failure as real failure unless there are - * no non-degraded top-level vdevs left, and not update DTLs - * if we intend to reallocate. - */ - /* XXPOLICY */ - if (total_errors > rm->rm_firstdatacol) - zio->io_error = vdev_raidz_worst_error(rm); - - return; - } - - ASSERT(zio->io_type == ZIO_TYPE_READ); /* - * There are three potential phases for a read: - * 1. produce valid data from the columns read - * 2. read all disks and try again - * 3. 
perform combinatorial reconstruction + * If we read more parity disks than were used for + * reconstruction, confirm that the other parity disks produced + * correct data. * - * Each phase is progressively both more expensive and less likely to - * occur. If we encounter more errors than we can repair or all phases - * fail, we have no choice but to return an error. + * Note that we also regenerate parity when resilvering so we + * can write it out to failed devices later. */ - - /* - * If the number of errors we saw was correctable -- less than or equal - * to the number of parity disks read -- attempt to produce data that - * has a valid checksum. Naturally, this case applies in the absence of - * any errors. - */ - if (total_errors <= rm->rm_firstdatacol - parity_untried) { - if (data_errors == 0) { - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read parity information (unnecessarily - * as it happens since no reconstruction was - * needed) regenerate and verify the parity. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. - */ - if (parity_errors + parity_untried < - rm->rm_firstdatacol || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - goto done; - } - } else { - /* - * We either attempt to read all the parity columns or - * none of them. If we didn't try to read parity, we - * wouldn't be here in the correctable case. There must - * also have been fewer parity errors than parity - * columns or, again, we wouldn't be in this code path. - */ - ASSERT(parity_untried == 0); - ASSERT(parity_errors < rm->rm_firstdatacol); - - /* - * Identify the data columns that reported an error. 
- */ - n = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) { - ASSERT(n < VDEV_RAIDZ_MAXPARITY); - tgts[n++] = c; - } - } - - ASSERT(rm->rm_firstdatacol >= n); - - code = vdev_raidz_reconstruct(rm, tgts, n); - - if (raidz_checksum_verify(zio) == 0) { - /* - * If we read more parity disks than were used - * for reconstruction, confirm that the other - * parity disks produced correct data. This - * routine is suboptimal in that it regenerates - * the parity that we already used in addition - * to the parity that we're attempting to - * verify, but this should be a relatively - * uncommon case, and can be optimized if it - * becomes a problem. Note that we regenerate - * parity when resilvering so we can write it - * out to failed devices later. - */ - if (parity_errors < rm->rm_firstdatacol - n || - (zio->io_flags & ZIO_FLAG_RESILVER)) { - n = raidz_parity_verify(zio, rm); - unexpected_errors += n; - ASSERT(parity_errors + n <= - rm->rm_firstdatacol); - } - - goto done; - } - } + if (parity_errors + parity_untried < + rr->rr_firstdatacol - data_errors || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + int n = raidz_parity_verify(zio, rr); + unexpected_errors += n; + ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); } - /* - * This isn't a typical situation -- either we got a read error or - * a child silently returned bad data. Read every block so we can - * try again with as much data and parity as we can track down. If - * we've already been through once before, all children will be marked - * as tried so we'll proceed to combinatorial reconstruction. 
- */ - unexpected_errors = 1; - rm->rm_missingdata = 0; - rm->rm_missingparity = 0; - - for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) - continue; - - zio_vdev_io_redone(zio); - do { - rc = &rm->rm_col[c]; - if (rc->rc_tried) - continue; - zio_nowait(zio_vdev_child_io(zio, NULL, - vd->vdev_child[rc->rc_devidx], - rc->rc_offset, rc->rc_abd, rc->rc_size, - zio->io_type, zio->io_priority, 0, - vdev_raidz_child_done, rc)); - } while (++c < rm->rm_cols); - - return; - } - - /* - * At this point we've attempted to reconstruct the data given the - * errors we detected, and we've attempted to read all columns. There - * must, therefore, be one or more additional problems -- silent errors - * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. We check if there is enough additional data to - * possibly reconstruct the data and then perform combinatorial - * reconstruction over all possible combinations. If that fails, - * we're cooked. - */ - if (total_errors > rm->rm_firstdatacol) { - zio->io_error = vdev_raidz_worst_error(rm); - - } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { - /* - * If we didn't use all the available parity for the - * combinatorial reconstruction, verify that the remaining - * parity is correct. - */ - if (code != (1 << rm->rm_firstdatacol) - 1) - (void) raidz_parity_verify(zio, rm); - } else { - /* - * We're here because either: - * - * total_errors == rm_first_datacol, or - * vdev_raidz_combrec() failed - * - * In either case, there is enough bad data to prevent - * reconstruction. - * - * Start checksum ereports for all children which haven't - * failed, and the IO wasn't speculative. 
- */ - zio->io_error = SET_ERROR(ECKSUM); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - vdev_t *cvd; - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error != 0) - continue; - - zio_bad_cksum_t zbc; - zbc.zbc_has_cksum = 0; - zbc.zbc_injected = rm->rm_ecksuminjected; - - int ret = zfs_ereport_start_checksum( - zio->io_spa, cvd, &zio->io_bookmark, zio, - rc->rc_offset, rc->rc_size, - (void *)(uintptr_t)c, &zbc); - if (ret != EALREADY) { - mutex_enter(&cvd->vdev_stat_lock); - cvd->vdev_stat.vs_checksum_errors++; - mutex_exit(&cvd->vdev_stat_lock); - } - } - } - } - -done: - zio_checksum_verified(zio); - if (zio->io_error == 0 && spa_writeable(zio->io_spa) && - (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. */ - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - cvd = vd->vdev_child[rc->rc_devidx]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *vd = zio->io_vd; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; - if (rc->rc_error == 0) + if ((rc->rc_error == 0 || rc->rc_size == 0) && + (rc->rc_repair == 0)) { continue; + } zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_abd, rc->rc_size, - ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } +static void +raidz_restore_orig_data(raidz_map_t *rm) +{ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + abd_copy_from_buf(rc->rc_abd, + rc->rc_orig_data, rc->rc_size); + rc->rc_need_orig_restore = B_FALSE; + } + } + } +} + +/* + * returns EINVAL if reconstruction of the block will not be possible + * returns ECKSUM if this specific reconstruction failed + * returns 0 on successful reconstruction + */ +static int +raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) +{ + raidz_map_t *rm = zio->io_vsd; + + /* Reconstruct each row */ + for (int r = 0; r < rm->rm_nrows; r++) { + raidz_row_t *rr = rm->rm_row[r]; + int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ + int t = 0; + int dead = 0; + int dead_data = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT0(rc->rc_need_orig_restore); + if (rc->rc_error != 0) { + dead++; + if (c >= nparity) + dead_data++; + continue; + } + if (rc->rc_size == 0) + continue; + for (int lt = 0; lt < ntgts; lt++) { + if (rc->rc_devidx == ltgts[lt]) { + if (rc->rc_orig_data == NULL) { + rc->rc_orig_data = + zio_buf_alloc(rc->rc_size); + abd_copy_to_buf( + rc->rc_orig_data, + rc->rc_abd, rc->rc_size); + } + rc->rc_need_orig_restore = B_TRUE; + + dead++; + if (c >= nparity) + dead_data++; + my_tgts[t++] = c; + break; + } + } + } + if (dead > nparity) { + /* reconstruction not possible */ + raidz_restore_orig_data(rm); + return (EINVAL); + } + rr->rr_code = 0; + if (dead_data > 0) + rr->rr_code = vdev_raidz_reconstruct_row(rm, rr, + my_tgts, t); + } + + /* Check for success */ + if (raidz_checksum_verify(zio) == 0) { + + /* Reconstruction succeeded - report errors */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + 
raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_need_orig_restore) { + /* + * Note: if this is a parity column, + * we don't really know if it's wrong. + * We need to let + * vdev_raidz_io_done_verified() check + * it, and if we set rc_error, it will + * think that it is a "known" error + * that doesn't need to be checked + * or corrected. + */ + if (rc->rc_error == 0 && + c >= rr->rr_firstdatacol) { + raidz_checksum_error(zio, + rc, rc->rc_gdata); + rc->rc_error = + SET_ERROR(ECKSUM); + } + rc->rc_need_orig_restore = B_FALSE; + } + } + + vdev_raidz_io_done_verified(zio, rr); + } + + zio_checksum_verified(zio); + + return (0); + } + + /* Reconstruction failed - restore original data */ + raidz_restore_orig_data(rm); + return (ECKSUM); +} + +/* + * Iterate over all combinations of N bad vdevs and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. 
+ * + * The order that we find the various possible combinations of failed + * disks is dictated by these rules: + * - Examine each "slot" (the "i" in tgts[i]) + * - Try to increment this slot (tgts[i] = tgts[i] + 1) + * - if we can't increment because it runs into the next slot, + * reset our slot to the minimum, and examine the next slot + * + * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose + * 3 columns to reconstruct), we will generate the following sequence: + * + * STATE ACTION + * 0 1 2 special case: skip since these are all parity + * 0 1 3 first slot: reset to 0; middle slot: increment to 2 + * 0 2 3 first slot: increment to 1 + * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 + * 0 1 4 first: reset to 0; middle: increment to 2 + * 0 2 4 first: increment to 1 + * 1 2 4 first: reset to 0; middle: increment to 3 + * 0 3 4 first: increment to 1 + * 1 3 4 first: increment to 2 + * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 + * 0 1 5 first: reset to 0; middle: increment to 2 + * 0 2 5 first: increment to 1 + * 1 2 5 first: reset to 0; middle: increment to 3 + * 0 3 5 first: increment to 1 + * 1 3 5 first: increment to 2 + * 2 3 5 first: reset to 0; middle: increment to 4 + * 0 4 5 first: increment to 1 + * 1 4 5 first: increment to 2 + * 2 4 5 first: increment to 3 + * 3 4 5 done + * + * This strategy works for dRAID but is less efficient when there are a large + * number of child vdevs and therefore permutations to check. Furthermore, + * since the raidz_map_t rows likely do not overlap, reconstruction would be + * possible as long as there are no more than nparity data errors per row. + * These additional permutations are not currently checked but could be as + * a future improvement. + */ +static int +vdev_raidz_combrec(zio_t *zio) +{ + int nparity = vdev_get_nparity(zio->io_vd); + raidz_map_t *rm = zio->io_vsd; + + /* Check if there's enough data to attempt reconstruction.
*/ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + int total_errors = 0; + + for (int c = 0; c < rr->rr_cols; c++) { + if (rr->rr_col[c].rc_error) + total_errors++; + } + + if (total_errors > nparity) + return (vdev_raidz_worst_error(rr)); + } + + for (int num_failures = 1; num_failures <= nparity; num_failures++) { + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *ltgts = &tstore[1]; /* value is logical child ID */ + + /* Determine number of logical children, n */ + int n = zio->io_vd->vdev_children; + + ASSERT3U(num_failures, <=, nparity); + ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); + + /* Handle corner cases in combrec logic */ + ltgts[-1] = -1; + for (int i = 0; i < num_failures; i++) { + ltgts[i] = i; + } + ltgts[num_failures] = n; + + for (;;) { + int err = raidz_reconstruct(zio, ltgts, num_failures, + nparity); + if (err == EINVAL) { + /* + * Reconstruction not possible with this # + * failures; try more failures. + */ + break; + } else if (err == 0) + return (0); + + /* Compute next targets to try */ + for (int t = 0; ; t++) { + ASSERT3U(t, <, num_failures); + ltgts[t]++; + if (ltgts[t] == n) { + /* try more failures */ + ASSERT3U(t, ==, num_failures - 1); + break; + } + + ASSERT3U(ltgts[t], <, n); + ASSERT3U(ltgts[t], <=, ltgts[t + 1]); + + /* + * If that spot is available, we're done here. + * Try the next combination. + */ + if (ltgts[t] != ltgts[t + 1]) + break; + + /* + * Otherwise, reset this tgt to the minimum, + * and move on to the next tgt. + */ + ltgts[t] = ltgts[t - 1] + 1; + ASSERT3U(ltgts[t], ==, t); + } + + /* Increase the number of failures and keep trying. 
*/ + if (ltgts[num_failures - 1] == n) + break; + } + } + + return (ECKSUM); +} + +void +vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +{ + for (uint64_t row = 0; row < rm->rm_nrows; row++) { + raidz_row_t *rr = rm->rm_row[row]; + vdev_raidz_reconstruct_row(rm, rr, t, nt); + } +} + +/* + * Complete a write IO operation on a RAIDZ VDev + * + * Outline: + * 1. Check for errors on the child IOs. + * 2. Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + */ +static void +vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) +{ + int total_errors = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + total_errors++; + } + } + + /* + * Treat partial writes as a success. If we couldn't write enough + * columns to reconstruct the data, the I/O failed. Otherwise, + * good enough. + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + if (total_errors > rr->rr_firstdatacol) { + zio->io_error = zio_worst_error(zio->io_error, + vdev_raidz_worst_error(rr)); + } +} + +/* + * return 0 if no reconstruction occurred, otherwise the "code" from + * vdev_raidz_reconstruct(). 
+ */ +static int +vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, + raidz_row_t *rr) +{ + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + int total_errors = 0; + int code = 0; + + ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); + ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + if (c < rr->rr_firstdatacol) + parity_errors++; + else + data_errors++; + + total_errors++; + } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + /* + * If there were data errors and the number of errors we saw was + * correctable -- less than or equal to the number of parity disks read + * -- reconstruct based on the missing data. + */ + if (data_errors != 0 && + total_errors <= rr->rr_firstdatacol - parity_untried) { + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rr->rr_firstdatacol); + + /* + * Identify the data columns that reported an error. + */ + int n = 0; + int tgts[VDEV_RAIDZ_MAXPARITY]; + for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } + } + + ASSERT(rr->rr_firstdatacol >= n); + + code = vdev_raidz_reconstruct_row(rm, rr, tgts, n); + } + + return (code); +} + +/* + * Return the number of reads issued. 
+ */ +static int +vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) +{ + vdev_t *vd = zio->io_vd; + int nread = 0; + + rr->rr_missingdata = 0; + rr->rr_missingparity = 0; + + /* + * If this row contains empty sectors which are not required + * for a normal read then allocate an ABD for them now so they + * may be read, verified, and any needed repairs performed. + */ + if (rr->rr_nempty && rr->rr_abd_empty == NULL) + vdev_draid_map_alloc_empty(zio, rr); + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (rc->rc_tried || rc->rc_size == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + nread++; + } + return (nread); +} + +/* + * We're here because either there were too many errors to even attempt + * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() + * failed. In either case, there is enough bad data to prevent reconstruction. + * Start checksum ereports for all children which haven't failed. 
+ */ +static void +vdev_raidz_io_done_unrecoverable(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error != 0) + continue; + + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + int ret = zfs_ereport_start_checksum(zio->io_spa, + cvd, &zio->io_bookmark, zio, rc->rc_offset, + rc->rc_size, (void *)(uintptr_t)c, &zbc); + if (ret != EALREADY) { + mutex_enter(&cvd->vdev_stat_lock); + cvd->vdev_stat.vs_checksum_errors++; + mutex_exit(&cvd->vdev_stat_lock); + } + } + } +} + +void +vdev_raidz_io_done(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + if (zio->io_type == ZIO_TYPE_WRITE) { + for (int i = 0; i < rm->rm_nrows; i++) { + vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); + } + } else { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + rr->rr_code = + vdev_raidz_io_done_reconstruct_known_missing(zio, + rm, rr); + } + + if (raidz_checksum_verify(zio) == 0) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + vdev_raidz_io_done_verified(zio, rr); + } + zio_checksum_verified(zio); + } else { + /* + * A sequential resilver has no checksum which makes + * combinatorial reconstruction impossible. This code + * path is unreachable since raidz_checksum_verify() + * has no checksum to verify and must succeed. + */ + ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); + + /* + * This isn't a typical situation -- either we got a + * read error or a child silently returned bad data. + * Read every block so we can try again with as much + * data and parity as we can track down. If we've + * already been through once before, all children will + * be marked as tried so we'll proceed to combinatorial + * reconstruction. 
+ */ + int nread = 0; + for (int i = 0; i < rm->rm_nrows; i++) { + nread += vdev_raidz_read_all(zio, + rm->rm_row[i]); + } + if (nread != 0) { + /* + * Normally our stage is VDEV_IO_DONE, but if + * we've already called redone(), it will have + * changed to VDEV_IO_START, in which case we + * don't want to call redone() again. + */ + if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) + zio_vdev_io_redone(zio); + return; + } + + zio->io_error = vdev_raidz_combrec(zio); + if (zio->io_error == ECKSUM && + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + vdev_raidz_io_done_unrecoverable(zio); + } + } + } +} + static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { - if (faulted > vd->vdev_nparity) + vdev_raidz_t *vdrz = vd->vdev_tsd; + if (faulted > vdrz->vd_nparity) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) @@ -2343,18 +2581,26 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) * width blocks must be resilvered. */ static boolean_t -vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, + uint64_t phys_birth) { + vdev_raidz_t *vdrz = vd->vdev_tsd; uint64_t dcols = vd->vdev_children; - uint64_t nparity = vd->vdev_nparity; + uint64_t nparity = vdrz->vd_nparity; uint64_t ashift = vd->vdev_top->vdev_ashift; /* The starting RAIDZ (parent) vdev sector of the block. */ - uint64_t b = offset >> ashift; + uint64_t b = DVA_GET_OFFSET(dva) >> ashift; /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = ((psize - 1) >> ashift) + 1; /* The first column for this stripe. */ uint64_t f = b % dcols; + /* Unreachable by sequential resilver. 
*/ + ASSERT3U(phys_birth, !=, TXG_UNKNOWN); + + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + if (s + nparity >= dcols) return (B_TRUE); @@ -2375,7 +2621,8 @@ vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) } static void -vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) +vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs, range_seg64_t *remain_rs) { vdev_t *raidvd = cvd->vdev_parent; ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); @@ -2385,10 +2632,10 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) uint64_t ashift = raidvd->vdev_top->vdev_ashift; /* make sure the offsets are block-aligned */ - ASSERT0(in->rs_start % (1 << ashift)); - ASSERT0(in->rs_end % (1 << ashift)); - uint64_t b_start = in->rs_start >> ashift; - uint64_t b_end = in->rs_end >> ashift; + ASSERT0(logical_rs->rs_start % (1 << ashift)); + ASSERT0(logical_rs->rs_end % (1 << ashift)); + uint64_t b_start = logical_rs->rs_start >> ashift; + uint64_t b_end = logical_rs->rs_end >> ashift; uint64_t start_row = 0; if (b_start > tgt_col) /* avoid underflow */ @@ -2398,17 +2645,119 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) if (b_end > tgt_col) end_row = ((b_end - tgt_col - 1) / width) + 1; - res->rs_start = start_row << ashift; - res->rs_end = end_row << ashift; + physical_rs->rs_start = start_row << ashift; + physical_rs->rs_end = end_row << ashift; - ASSERT3U(res->rs_start, <=, in->rs_start); - ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); + ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); + ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, + logical_rs->rs_end - logical_rs->rs_start); +} + +/* + * Initialize private RAIDZ specific fields from the nvlist. 
+ */ +static int +vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + vdev_raidz_t *vdrz; + uint64_t nparity; + + uint_t children; + nvlist_t **child; + int error = nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &children); + if (error != 0) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) + return (SET_ERROR(EINVAL)); + + /* + * Previous versions could only support 1 or 2 parity + * devices. + */ + if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) + return (SET_ERROR(EINVAL)); + else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) + return (SET_ERROR(EINVAL)); + } else { + /* + * We require the parity to be specified for SPAs that + * support multiple parity levels. + */ + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) + return (SET_ERROR(EINVAL)); + + /* + * Otherwise, we default to 1 parity device for RAID-Z. + */ + nparity = 1; + } + + vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); + vdrz->vd_logical_width = children; + vdrz->vd_nparity = nparity; + + *tsd = vdrz; + + return (0); +} + +static void +vdev_raidz_fini(vdev_t *vd) +{ + kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t)); +} + +/* + * Add RAIDZ specific fields to the config nvlist. + */ +static void +vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) +{ + ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); + vdev_raidz_t *vdrz = vd->vdev_tsd; + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vdrz->vd_nparity == 1 || + (vdrz->vd_nparity <= 2 && + spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || + (vdrz->vd_nparity <= 3 && + spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); + + /* + * Note that we'll add these even on storage pools where they + * aren't strictly required -- older software will just ignore + * it. 
+ */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); +} + +static uint64_t +vdev_raidz_nparity(vdev_t *vd) +{ + vdev_raidz_t *vdrz = vd->vdev_tsd; + return (vdrz->vd_nparity); +} + +static uint64_t +vdev_raidz_ndisks(vdev_t *vd) +{ + return (vd->vdev_children); } vdev_ops_t vdev_raidz_ops = { + .vdev_op_init = vdev_raidz_init, + .vdev_op_fini = vdev_raidz_fini, .vdev_op_open = vdev_raidz_open, .vdev_op_close = vdev_raidz_close, .vdev_op_asize = vdev_raidz_asize, + .vdev_op_min_asize = vdev_raidz_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_raidz_io_start, .vdev_op_io_done = vdev_raidz_io_done, .vdev_op_state_change = vdev_raidz_state_change, @@ -2417,6 +2766,11 @@ vdev_ops_t vdev_raidz_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_raidz_xlate, + .vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = vdev_raidz_config_generate, + .vdev_op_nparity = vdev_raidz_nparity, + .vdev_op_ndisks = vdev_raidz_ndisks, .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c index 9595a7b95251..25d76970e99a 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c @@ -149,7 +149,7 @@ vdev_raidz_math_get_ops(void) * Select parity generation method for raidz_map */ int -vdev_raidz_math_generate(raidz_map_t *rm) +vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr) { raidz_gen_f gen_parity = NULL; @@ -174,7 +174,7 @@ vdev_raidz_math_generate(raidz_map_t *rm) if (gen_parity == NULL) return (RAIDZ_ORIGINAL_IMPL); - gen_parity(rm); + gen_parity(rr); return (0); } @@ -241,8 +241,8 @@ reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, * @nbaddata - Number of failed data columns */ int -vdev_raidz_math_reconstruct(raidz_map_t *rm, const int 
*parity_valid, - const int *dt, const int nbaddata) +vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr, + const int *parity_valid, const int *dt, const int nbaddata) { raidz_rec_f rec_fn = NULL; @@ -265,7 +265,7 @@ vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, if (rec_fn == NULL) return (RAIDZ_ORIGINAL_IMPL); else - return (rec_fn(rm, dt)); + return (rec_fn(rr, dt)); } const char *raidz_gen_name[] = { @@ -360,7 +360,7 @@ raidz_math_kstat_addr(kstat_t *ksp, loff_t n) #define BENCH_D_COLS (8ULL) #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ -#define BENCH_NS MSEC2NSEC(25) /* 25ms */ +#define BENCH_NS MSEC2NSEC(1) /* 1ms */ typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); @@ -410,7 +410,7 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) t_start = gethrtime(); do { - for (i = 0; i < 25; i++, run_cnt++) + for (i = 0; i < 5; i++, run_cnt++) bench_fn(bench_rm, fn); t_diff = gethrtime() - t_start; diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h index 89c2082c4ab9..35e016fc65a5 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h +++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h @@ -26,6 +26,7 @@ #define _VDEV_RAIDZ_MATH_IMPL_H #include +#include #define raidz_inline inline __attribute__((always_inline)) #ifndef noinline @@ -36,33 +37,33 @@ * Functions calculate multiplication constants for data reconstruction. * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and * used parity columns for reconstruction. - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes * @coeff output array of coefficients. Array must be provided by * user and must hold minimum MUL_CNT values. 
*/ static noinline void -raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1)); } static noinline void -raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1)); } static noinline void -raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; gf_t a, b, e; @@ -76,9 +77,9 @@ raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; @@ -93,9 +94,9 @@ raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) } static noinline void -raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; @@ -114,9 +115,9 @@ raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, 
unsigned *coeff) } static noinline void -raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff) { - const unsigned ncols = raidz_ncols(rm); + const unsigned ncols = rr->rr_cols; const unsigned x = tgtidx[TARGET_X]; const unsigned y = tgtidx[TARGET_Y]; const unsigned z = tgtidx[TARGET_Z]; @@ -347,26 +348,26 @@ raidz_mul_abd_cb(void *dc, size_t size, void *private) /* * Generate P parity (RAIDZ1) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_p_impl(raidz_map_t * const rm) +raidz_generate_p_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t psize = rm->rm_col[CODE_P].rc_size; - abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + const size_t ncols = rr->rr_cols; + const size_t psize = rr->rr_col[CODE_P].rc_size; + abd_t *pabd = rr->rr_col[CODE_P].rc_abd; size_t size; abd_t *dabd; raidz_math_begin(); /* start with first data column */ - raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); + raidz_copy(pabd, rr->rr_col[1].rc_abd, psize); for (c = 2; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - size = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + size = rr->rr_col[c].rc_size; /* add data column */ raidz_add(pabd, dabd, size); @@ -414,29 +415,29 @@ raidz_gen_pq_add(void **c, const void *dc, const size_t csize, /* * Generate PQ parity (RAIDZ2) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_pq_impl(raidz_map_t * const rm) +raidz_generate_pq_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t csize = rm->rm_col[CODE_P].rc_size; + const size_t ncols = rr->rr_cols; + const size_t csize = rr->rr_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd }; raidz_math_begin(); - 
raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize); for (c = 3; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, raidz_gen_pq_add); @@ -487,31 +488,31 @@ raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, /* * Generate PQR parity (RAIDZ2) * - * @rm RAIDZ map + * @rr RAIDZ row */ static raidz_inline void -raidz_generate_pqr_impl(raidz_map_t * const rm) +raidz_generate_pqr_impl(raidz_row_t * const rr) { size_t c; - const size_t ncols = raidz_ncols(rm); - const size_t csize = rm->rm_col[CODE_P].rc_size; + const size_t ncols = rr->rr_cols; + const size_t csize = rr->rr_col[CODE_P].rc_size; size_t dsize; abd_t *dabd; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; raidz_math_begin(); - raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); - raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize); for (c = 4; c < ncols; c++) { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, raidz_gen_pqr_add); @@ -579,33 +580,36 @@ raidz_generate_pqr_impl(raidz_map_t * const rm) * @syn_method raidz_add_abd() * @rec_method not applicable * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int 
-raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - const size_t xsize = rm->rm_col[x].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; size_t size; abd_t *dabd; + if (xabd == NULL) + return (1 << CODE_P); + raidz_math_begin(); /* copy P into target */ - raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize); /* generate p_syndrome */ for (c = firstdc; c < ncols; c++) { if (c == x) continue; - dabd = rm->rm_col[c].rc_abd; - size = MIN(rm->rm_col[c].rc_size, xsize); + dabd = rr->rr_col[c].rc_abd; + size = MIN(rr->rr_col[c].rc_size, xsize); raidz_add(xabd, dabd, size); } @@ -653,30 +657,33 @@ raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - abd_t *xabd = rm->rm_col[x].rc_abd; - const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; abd_t *tabds[] = { xabd }; + if (xabd == NULL) + return (1 << CODE_Q); + unsigned coeff[MUL_CNT]; - raidz_rec_q_coeff(rm, tgtidx, coeff); + raidz_rec_q_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first 
data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } @@ -687,8 +694,8 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, @@ -696,7 +703,7 @@ raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) } /* add Q to the syndrome */ - raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); @@ -744,30 +751,33 @@ raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, * @syn_method raidz_add_abd() * @rec_method raidz_mul_abd_cb() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; - const size_t xsize = rm->rm_col[x].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; abd_t *tabds[] = { xabd }; + if (xabd == NULL) + return (1 << CODE_R); + unsigned coeff[MUL_CNT]; - raidz_rec_r_coeff(rm, tgtidx, coeff); + raidz_rec_r_coeff(rr, tgtidx, coeff); raidz_math_begin(); /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); } @@ 
-779,8 +789,8 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, @@ -788,7 +798,7 @@ raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) } /* add R to the syndrome */ - raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize); /* transform the syndrome */ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); @@ -881,31 +891,34 @@ raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, * @syn_method raidz_syn_pq_abd() * @rec_method raidz_rec_pq_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd }; + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_Q)); + unsigned coeff[MUL_CNT]; - raidz_rec_pq_coeff(rm, tgtidx, coeff); + raidz_rec_pq_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -921,8 
+934,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -934,8 +947,8 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -946,7 +959,7 @@ raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) /* Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); @@ -1038,30 +1051,34 @@ raidz_rec_pr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_pr_abd() * @rec_method raidz_rec_pr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[0]; const size_t y = tgtidx[1]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; 
abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_pr_coeff(rm, tgtidx, coeff); + raidz_rec_pr_coeff(rr, tgtidx, coeff); /* * Check if some of targets are shorter then others. @@ -1077,8 +1094,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1090,8 +1107,8 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -1104,14 +1121,14 @@ raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); if (ysize < xsize) abd_free(yabd); - return ((1 << CODE_P) | (1 << CODE_Q)); + return ((1 << CODE_P) | (1 << CODE_R)); } @@ -1201,30 +1218,34 @@ raidz_rec_qr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_qr_abd() * @rec_method raidz_rec_qr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const 
size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; abd_t *tabds[2] = { xabd, yabd }; abd_t *cabds[] = { - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_Q) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_qr_coeff(rm, tgtidx, coeff); + raidz_rec_qr_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -1240,8 +1261,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1253,8 +1274,8 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, @@ -1267,7 +1288,7 @@ raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); raidz_math_end(); @@ -1384,34 +1405,38 @@ raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, * @syn_method raidz_syn_pqr_abd() * 
@rec_method raidz_rec_pqr_abd() * - * @rm RAIDZ map + * @rr RAIDZ row * @tgtidx array of missing data indexes */ static raidz_inline int -raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) +raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx) { size_t c; size_t dsize; abd_t *dabd; - const size_t firstdc = raidz_parity(rm); - const size_t ncols = raidz_ncols(rm); + const size_t firstdc = rr->rr_firstdatacol; + const size_t ncols = rr->rr_cols; const size_t x = tgtidx[TARGET_X]; const size_t y = tgtidx[TARGET_Y]; const size_t z = tgtidx[TARGET_Z]; - const size_t xsize = rm->rm_col[x].rc_size; - const size_t ysize = rm->rm_col[y].rc_size; - const size_t zsize = rm->rm_col[z].rc_size; - abd_t *xabd = rm->rm_col[x].rc_abd; - abd_t *yabd = rm->rm_col[y].rc_abd; - abd_t *zabd = rm->rm_col[z].rc_abd; + const size_t xsize = rr->rr_col[x].rc_size; + const size_t ysize = rr->rr_col[y].rc_size; + const size_t zsize = rr->rr_col[z].rc_size; + abd_t *xabd = rr->rr_col[x].rc_abd; + abd_t *yabd = rr->rr_col[y].rc_abd; + abd_t *zabd = rr->rr_col[z].rc_abd; abd_t *tabds[] = { xabd, yabd, zabd }; abd_t *cabds[] = { - rm->rm_col[CODE_P].rc_abd, - rm->rm_col[CODE_Q].rc_abd, - rm->rm_col[CODE_R].rc_abd + rr->rr_col[CODE_P].rc_abd, + rr->rr_col[CODE_Q].rc_abd, + rr->rr_col[CODE_R].rc_abd }; + + if (xabd == NULL) + return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); + unsigned coeff[MUL_CNT]; - raidz_rec_pqr_coeff(rm, tgtidx, coeff); + raidz_rec_pqr_coeff(rr, tgtidx, coeff); /* * Check if some of targets is shorter then others @@ -1431,9 +1456,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) /* Start with first data column if present */ if (firstdc != x) { - raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); - raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize); + raidz_copy(zabd, 
rr->rr_col[firstdc].rc_abd, xsize); } else { raidz_zero(xabd, xsize); raidz_zero(yabd, xsize); @@ -1446,8 +1471,8 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) dabd = NULL; dsize = 0; } else { - dabd = rm->rm_col[c].rc_abd; - dsize = rm->rm_col[c].rc_size; + dabd = rr->rr_col[c].rc_abd; + dsize = rr->rr_col[c].rc_size; } abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, @@ -1460,9 +1485,9 @@ raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) * Copy shorter targets back to the original abd buffer */ if (ysize < xsize) - raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize); if (zsize < xsize) - raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); + raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize); raidz_math_end(); diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c index 3362d608c037..784d1af15a81 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c +++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c @@ -25,6 +25,7 @@ */ #include +#include #include #include #include @@ -63,13 +64,15 @@ * * Limitations: * - * - Only supported for mirror vdev types. Due to the variable stripe - * width used by raidz sequential reconstruction is not possible. + * - Sequential reconstruction is not possible on RAIDZ due to its + * variable stripe width. Note dRAID uses a fixed stripe width which + * avoids this issue, but comes at the expense of some usable capacity. * - * - Block checksums are not verified during sequential reconstuction. + * - Block checksums are not verified during sequential reconstruction. * Similar to traditional RAID the parity/mirror data is reconstructed * but cannot be immediately double checked. For this reason when the - * last active resilver completes the pool is automatically scrubbed. + * last active resilver completes the pool is automatically scrubbed + * by default. 
* * - Deferred resilvers using sequential reconstruction are not currently * supported. When adding another vdev to an active top-level resilver @@ -77,8 +80,8 @@ * * Advantages: * - * - Sequential reconstuction is performed in LBA order which may be faster - * than healing reconstuction particularly when using using HDDs (or + * - Sequential reconstruction is performed in LBA order which may be faster + * than healing reconstruction particularly when using HDDs (or * especially with SMR devices). Only allocated capacity is resilvered. * * - Sequential reconstruction is not constrained by ZFS block boundaries. @@ -86,9 +89,9 @@ * allowing all of these logical blocks to be repaired with a single IO. * * - Unlike a healing resilver or scrub which are pool wide operations, - * sequential reconstruction is handled by the top-level mirror vdevs. - * This allows for it to be started or canceled on a top-level vdev - * without impacting any other top-level vdevs in the pool. + * sequential reconstruction is handled by the top-level vdevs. This + * allows for it to be started or canceled on a top-level vdev without + * impacting any other top-level vdevs in the pool. * * - Data only referenced by a pool checkpoint will be repaired because * that space is reflected in the space maps. This differs for a @@ -97,18 +100,36 @@ /* - * Maximum number of queued rebuild I/Os top-level vdev. The number of - * concurrent rebuild I/Os issued to the device is controlled by the - * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module - * options. - */ -unsigned int zfs_rebuild_queue_limit = 20; - -/* - * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE. + * Size of rebuild reads; defaults to 1MiB per data disk and is capped at + * SPA_MAXBLOCKSIZE. */ unsigned long zfs_rebuild_max_segment = 1024 * 1024; +/* + * Maximum number of parallelly executed bytes per leaf vdev caused by a + * sequential resilver. 
We attempt to strike a balance here between keeping + * the vdev queues full of I/Os at all times and not overflowing the queues + * to cause long latency, which would cause long txg sync times. + * + * A large default value can be safely used here because the default target + * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep + * the queue depth short. + * + * 32MB was selected as the default value to achieve good performance with + * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential + * rebuild was unable to saturate all of the drives using smaller values. + * With a value of 32MB the sequential resilver write rate was measured at + * 800MB/s sustained while rebuilding to a distributed spare. + */ +unsigned long zfs_rebuild_vdev_limit = 32 << 20; + +/* + * Automatically start a pool scrub when the last active sequential resilver + * completes in order to verify the checksums of all blocks which have been + * resilvered. This option is enabled by default and is strongly recommended. + */ +int zfs_rebuild_scrub_enabled = 1; + /* * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). */ @@ -293,7 +314,7 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), REBUILD_PHYS_ENTRIES, vrp, tx)); - vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); + vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); spa_history_log_internal(spa, "rebuild", tx, @@ -306,7 +327,16 @@ vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) vd->vdev_rebuilding = B_FALSE; mutex_exit(&vd->vdev_rebuild_lock); - spa_notify_waiters(spa); + /* + * While we're in syncing context take the opportunity to + * setup the scrub when there are no more active rebuilds. 
+ */ + if (!vdev_rebuild_active(spa->spa_root_vdev) && + zfs_rebuild_scrub_enabled) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_setup_sync(&func, tx); + } + cv_broadcast(&vd->vdev_rebuild_cv); } @@ -438,7 +468,7 @@ vdev_rebuild_cb(zio_t *zio) vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; vdev_t *vd = vr->vr_top_vdev; - mutex_enter(&vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); if (zio->io_error == ENXIO && !vdev_writeable(vd)) { /* * The I/O failed because the top-level vdev was unavailable. @@ -455,34 +485,30 @@ vdev_rebuild_cb(zio_t *zio) abd_free(zio->io_abd); - ASSERT3U(vd->vdev_rebuild_inflight, >, 0); - vd->vdev_rebuild_inflight--; - cv_broadcast(&vd->vdev_rebuild_io_cv); - mutex_exit(&vd->vdev_rebuild_io_lock); + ASSERT3U(vr->vr_bytes_inflight, >, 0); + vr->vr_bytes_inflight -= zio->io_size; + cv_broadcast(&vr->vr_io_cv); + mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } /* - * Rebuild the data in this range by constructing a special dummy block - * pointer for the given range. It has no relation to any existing blocks - * in the pool. But by disabling checksum verification and issuing a scrub - * I/O mirrored vdevs will replicate the block using any available mirror - * leaf vdevs. + * Initialize a block pointer that can be used to read the given segment + * for sequential rebuild. */ static void -vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, - uint64_t txg) +vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, + uint64_t asize) { - vdev_t *vd = vr->vr_top_vdev; - spa_t *spa = vd->vdev_spa; - uint64_t psize = asize; - - ASSERT(vd->vdev_ops == &vdev_mirror_ops || + ASSERT(vd->vdev_ops == &vdev_draid_ops || + vd->vdev_ops == &vdev_mirror_ops || vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); - blkptr_t blk, *bp = &blk; + uint64_t psize = vd->vdev_ops == &vdev_draid_ops ? 
+ vdev_draid_asize_to_psize(vd, asize) : asize; + BP_ZERO(bp); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); @@ -499,19 +525,6 @@ vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); - - /* - * We increment the issued bytes by the asize rather than the psize - * so the scanned and issued bytes may be directly compared. This - * is consistent with the scrub/resilver issued reporting. - */ - vr->vr_pass_bytes_issued += asize; - vr->vr_rebuild_phys.vrp_bytes_issued += asize; - - zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp, - abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, - ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | - ZIO_FLAG_RESILVER, NULL)); } /* @@ -525,6 +538,7 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; vdev_t *vd = vr->vr_top_vdev; spa_t *spa = vd->vdev_spa; + blkptr_t blk; ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); @@ -532,14 +546,26 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vr->vr_pass_bytes_scanned += size; vr->vr_rebuild_phys.vrp_bytes_scanned += size; - mutex_enter(&vd->vdev_rebuild_io_lock); + /* + * Rebuild the data in this range by constructing a special block + * pointer. It has no relation to any existing blocks in the pool. + * However, by disabling checksum verification and issuing a scrub IO + * we can reconstruct and repair any children with missing data. 
+ */ + vdev_rebuild_blkptr_init(&blk, vd, start, size); + uint64_t psize = BP_GET_PSIZE(&blk); + + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + return (0); + + mutex_enter(&vr->vr_io_lock); /* Limit in flight rebuild I/Os */ - while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit) - cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max) + cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); - vd->vdev_rebuild_inflight++; - mutex_exit(&vd->vdev_rebuild_io_lock); + vr->vr_bytes_inflight += psize; + mutex_exit(&vr->vr_io_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); @@ -558,45 +584,29 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) /* When exiting write out our progress. */ if (vdev_rebuild_should_stop(vd)) { - mutex_enter(&vd->vdev_rebuild_io_lock); - vd->vdev_rebuild_inflight--; - mutex_exit(&vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); + vr->vr_bytes_inflight -= psize; + mutex_exit(&vr->vr_io_lock); spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); mutex_exit(&vd->vdev_rebuild_lock); dmu_tx_commit(tx); return (SET_ERROR(EINTR)); } mutex_exit(&vd->vdev_rebuild_lock); - - vr->vr_scan_offset[txg & TXG_MASK] = start + size; - vdev_rebuild_rebuild_block(vr, start, size, txg); - dmu_tx_commit(tx); + vr->vr_scan_offset[txg & TXG_MASK] = start + size; + vr->vr_pass_bytes_issued += size; + vr->vr_rebuild_phys.vrp_bytes_issued += size; + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk, + abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, + ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER, NULL)); + return (0); } -/* - * Split range into legally-sized logical chunks given the constraints of the - * top-level mirror vdev type. 
- */ -static uint64_t -vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size) -{ - uint64_t chunk_size, max_asize, max_segment; - - ASSERT(vd->vdev_ops == &vdev_mirror_ops || - vd->vdev_ops == &vdev_replacing_ops || - vd->vdev_ops == &vdev_spare_ops); - - max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment, - 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); - max_asize = vdev_psize_to_asize(vd, max_segment); - chunk_size = MIN(size, max_asize); - - return (chunk_size); -} - /* * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. */ @@ -625,7 +635,14 @@ vdev_rebuild_ranges(vdev_rebuild_t *vr) while (size > 0) { uint64_t chunk_size; - chunk_size = vdev_rebuild_chunk_size(vd, start, size); + /* + * Split range into legally-sized logical chunks + * given the constraints of the top-level vdev + * being rebuilt (dRAID or mirror). + */ + ASSERT3P(vd->vdev_ops, !=, NULL); + chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd, + start, size, zfs_rebuild_max_segment); error = vdev_rebuild_range(vr, start, chunk_size); if (error != 0) @@ -747,10 +764,16 @@ vdev_rebuild_thread(void *arg) vr->vr_top_vdev = vd; vr->vr_scan_msp = NULL; vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); + vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; + vr->vr_bytes_inflight_max = MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children); + uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -780,21 +803,32 @@ vdev_rebuild_thread(void *arg) ASSERT0(range_tree_space(vr->vr_scan_tree)); - /* - * Disable any new allocations to this metaslab and wait - * for any writes inflight to complete. This is needed to - * ensure all allocated ranges are rebuilt. 
- */ + /* Disable any new allocations to this metaslab */ metaslab_disable(msp); spa_config_exit(spa, SCL_CONFIG, FTAG); - txg_wait_synced(dsl, 0); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + /* + * If there are outstanding allocations wait for them to be + * synced. This is needed to ensure all allocated ranges are + * on disk and therefore will be rebuilt. + */ + for (int j = 0; j < TXG_SIZE; j++) { + if (range_tree_space(msp->ms_allocating[j])) { + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + txg_wait_synced(dsl, 0); + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + break; + } + } + /* * When a metaslab has been allocated from read its allocated - * ranges from the space map object in to the vr_scan_tree. + * ranges from the space map object into the vr_scan_tree. * Then add inflight / unflushed ranges and remove inflight / * unflushed frees. This is the minimum range to be rebuilt. */ @@ -827,7 +861,7 @@ vdev_rebuild_thread(void *arg) /* * To provide an accurate estimate re-calculate the estimated * size every 5 minutes to account for recent allocations and - * frees made space maps which have not yet been rebuilt. + * frees made to space maps which have not yet been rebuilt. 
*/ if (gethrtime() > update_est_time + SEC2NSEC(300)) { update_est_time = gethrtime(); @@ -851,11 +885,14 @@ vdev_rebuild_thread(void *arg) spa_config_exit(spa, SCL_CONFIG, FTAG); /* Wait for any remaining rebuild I/O to complete */ - mutex_enter(&vd->vdev_rebuild_io_lock); - while (vd->vdev_rebuild_inflight > 0) - cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + mutex_enter(&vr->vr_io_lock); + while (vr->vr_bytes_inflight > 0) + cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); - mutex_exit(&vd->vdev_rebuild_io_lock); + mutex_exit(&vr->vr_io_lock); + + mutex_destroy(&vr->vr_io_lock); + cv_destroy(&vr->vr_io_cv); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1100,5 +1137,11 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, - "Max segment size in bytes of rebuild reads"); + "Max segment size in bytes of rebuild reads"); + +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, + "Max bytes in flight per leaf vdev for sequential resilvers"); + +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, + "Automatically scrub after sequential resilver completes"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index ed7d1d4b3030..6eaaddd3979f 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -250,7 +250,7 @@ vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) spa_vdev_removal_t *svr = NULL; uint64_t txg __maybe_unused = dmu_tx_get_txg(tx); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT0(vdev_get_nparity(vd)); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); @@ -993,7 +993,7 @@ spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, * An allocation class might not have any remaining vdevs or space */ metaslab_class_t *mc = mg->mg_class; - if (mc != spa_normal_class(spa) && 
mc->mc_groups <= 1) + if (mc->mc_groups == 0) mc = spa_normal_class(spa); int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, zal, 0); @@ -1120,7 +1120,7 @@ static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); - ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + ASSERT0(vdev_get_nparity(vd)); if (vd->vdev_leaf_zap != 0) { char zkey[32]; @@ -1976,32 +1976,38 @@ spa_vdev_remove_top_check(vdev_t *vd) if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); - /* available space in the pool's normal class */ - uint64_t available = dsl_dir_space_available( - spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); metaslab_class_t *mc = vd->vdev_mg->mg_class; - - /* - * When removing a vdev from an allocation class that has - * remaining vdevs, include available space from the class. - */ - if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { - uint64_t class_avail = metaslab_class_get_space(mc) - - metaslab_class_get_alloc(mc); - - /* add class space, adjusted for overhead */ - available += (class_avail * 94) / 100; - } - - /* - * There has to be enough free space to remove the - * device and leave double the "slop" space (i.e. we - * must leave at least 3% of the pool free, in addition to - * the normal slop space). - */ - if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { - return (SET_ERROR(ENOSPC)); + metaslab_class_t *normal = spa_normal_class(spa); + if (mc != normal) { + /* + * Space allocated from the special (or dedup) class is + * included in the DMU's space usage, but it's not included + * in spa_dspace (or dsl_pool_adjustedsize()). Therefore + * there is always at least as much free space in the normal + * class, as is allocated from the special (and dedup) class. + * As a backup check, we will return ENOSPC if this is + * violated. See also spa_update_dspace(). 
+ */ + uint64_t available = metaslab_class_get_space(normal) - + metaslab_class_get_alloc(normal); + ASSERT3U(available, >=, vd->vdev_stat.vs_alloc); + if (available < vd->vdev_stat.vs_alloc) + return (SET_ERROR(ENOSPC)); + } else { + /* available space in the pool's normal class */ + uint64_t available = dsl_dir_space_available( + spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); + if (available < + vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { + /* + * This is a normal device. There has to be enough free + * space to remove the device and leave double the + * "slop" space (i.e. we must leave at least 3% of the + * pool free, in addition to the normal slop space). + */ + return (SET_ERROR(ENOSPC)); + } } /* @@ -2041,7 +2047,7 @@ spa_vdev_remove_top_check(vdev_t *vd) /* * All vdevs in normal class must have the same ashift - * and not be raidz. + * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; int num_indirect = 0; @@ -2064,7 +2070,7 @@ spa_vdev_remove_top_check(vdev_t *vd) num_indirect++; if (!vdev_is_concrete(cvd)) continue; - if (cvd->vdev_ops == &vdev_raidz_ops) + if (vdev_get_nparity(cvd) != 0) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only @@ -2217,18 +2223,30 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * in this pool. 
*/ if (vd == NULL || unspare) { - if (vd == NULL) - vd = spa_lookup_by_guid(spa, guid, B_TRUE); - ev = spa_event_create(spa, vd, NULL, - ESC_ZFS_VDEV_REMOVE_AUX); + char *type; + boolean_t draid_spare = B_FALSE; - vd_type = VDEV_TYPE_SPARE; - vd_path = spa_strdup(fnvlist_lookup_string( - nv, ZPOOL_CONFIG_PATH)); - spa_vdev_remove_aux(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares, nv); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) + == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) + draid_spare = B_TRUE; + + if (vd == NULL && draid_spare) { + error = SET_ERROR(ENOTSUP); + } else { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, + guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + + vd_type = VDEV_TYPE_SPARE; + vd_path = spa_strdup(fnvlist_lookup_string( + nv, ZPOOL_CONFIG_PATH)); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } } else { error = SET_ERROR(EBUSY); } diff --git a/sys/contrib/openzfs/module/zfs/vdev_root.c b/sys/contrib/openzfs/module/zfs/vdev_root.c index 9e8aac7d03de..45ddc2f71927 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_root.c +++ b/sys/contrib/openzfs/module/zfs/vdev_root.c @@ -142,9 +142,13 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_root_ops = { + .vdev_op_init = NULL, + .vdev_op_fini = NULL, .vdev_op_open = vdev_root_open, .vdev_op_close = vdev_root_close, .vdev_op_asize = vdev_default_asize, + .vdev_op_min_asize = vdev_default_min_asize, + .vdev_op_min_alloc = NULL, .vdev_op_io_start = NULL, /* not applicable to the root */ .vdev_op_io_done = NULL, /* not applicable to the root */ .vdev_op_state_change = vdev_root_state_change, @@ -153,6 +157,11 @@ vdev_ops_t vdev_root_ops = { .vdev_op_rele = NULL, .vdev_op_remap = NULL, .vdev_op_xlate = NULL, + 
.vdev_op_rebuild_asize = NULL, + .vdev_op_metaslab_init = NULL, + .vdev_op_config_generate = NULL, + .vdev_op_nparity = NULL, + .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c index 02b42ddd5a6c..895957bda195 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_trim.c +++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c @@ -311,7 +311,8 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, vd->vdev_trim_secure = secure; } - boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED); + vdev_trim_state_t old_state = vd->vdev_trim_state; + boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED); vd->vdev_trim_state = new_state; dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); @@ -332,9 +333,12 @@ vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, "vdev=%s suspended", vd->vdev_path); break; case VDEV_TRIM_CANCELED: - spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); - spa_history_log_internal(spa, "trim", tx, - "vdev=%s canceled", vd->vdev_path); + if (old_state == VDEV_TRIM_ACTIVE || + old_state == VDEV_TRIM_SUSPENDED) { + spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); + spa_history_log_internal(spa, "trim", tx, + "vdev=%s canceled", vd->vdev_path); + } break; case VDEV_TRIM_COMPLETE: spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH); @@ -601,6 +605,32 @@ vdev_trim_ranges(trim_args_t *ta) return (0); } +static void +vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs) +{ + uint64_t *last_rs_end = (uint64_t *)arg; + + if (physical_rs->rs_end > *last_rs_end) + *last_rs_end = physical_rs->rs_end; +} + +static void +vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs) +{ + vdev_t *vd = (vdev_t *)arg; + + uint64_t size = physical_rs->rs_end - physical_rs->rs_start; + vd->vdev_trim_bytes_est += size; + + if 
(vd->vdev_trim_last_offset >= physical_rs->rs_end) { + vd->vdev_trim_bytes_done += size; + } else if (vd->vdev_trim_last_offset > physical_rs->rs_start && + vd->vdev_trim_last_offset <= physical_rs->rs_end) { + vd->vdev_trim_bytes_done += + vd->vdev_trim_last_offset - physical_rs->rs_start; + } +} + /* * Calculates the completion percentage of a manual TRIM. */ @@ -618,27 +648,35 @@ vdev_trim_calculate_progress(vdev_t *vd) metaslab_t *msp = vd->vdev_top->vdev_ms[i]; mutex_enter(&msp->ms_lock); - uint64_t ms_free = msp->ms_size - - metaslab_allocated_space(msp); - - if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) - ms_free /= vd->vdev_top->vdev_children; + uint64_t ms_free = (msp->ms_size - + metaslab_allocated_space(msp)) / + vdev_get_ndisks(vd->vdev_top); /* * Convert the metaslab range to a physical range * on our vdev. We use this to determine if we are * in the middle of this metaslab range. */ - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs, physical_rs, remain_rs; logical_rs.rs_start = msp->ms_start; logical_rs.rs_end = msp->ms_start + msp->ms_size; - vdev_xlate(vd, &logical_rs, &physical_rs); + /* Metaslab space after this offset has not been trimmed. 
*/ + vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs); if (vd->vdev_trim_last_offset <= physical_rs.rs_start) { vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); continue; - } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) { + } + + /* Metaslab space before this offset has been trimmed */ + uint64_t last_rs_end = physical_rs.rs_end; + if (!vdev_xlate_is_empty(&remain_rs)) { + vdev_xlate_walk(vd, &remain_rs, + vdev_trim_xlate_last_rs_end, &last_rs_end); + } + + if (vd->vdev_trim_last_offset > last_rs_end) { vd->vdev_trim_bytes_done += ms_free; vd->vdev_trim_bytes_est += ms_free; mutex_exit(&msp->ms_lock); @@ -659,21 +697,9 @@ vdev_trim_calculate_progress(vdev_t *vd) rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { logical_rs.rs_start = rs_get_start(rs, rt); logical_rs.rs_end = rs_get_end(rs, rt); - vdev_xlate(vd, &logical_rs, &physical_rs); - uint64_t size = physical_rs.rs_end - - physical_rs.rs_start; - vd->vdev_trim_bytes_est += size; - if (vd->vdev_trim_last_offset >= physical_rs.rs_end) { - vd->vdev_trim_bytes_done += size; - } else if (vd->vdev_trim_last_offset > - physical_rs.rs_start && - vd->vdev_trim_last_offset <= - physical_rs.rs_end) { - vd->vdev_trim_bytes_done += - vd->vdev_trim_last_offset - - physical_rs.rs_start; - } + vdev_xlate_walk(vd, &logical_rs, + vdev_trim_xlate_progress, vd); } mutex_exit(&msp->ms_lock); } @@ -741,8 +767,38 @@ vdev_trim_load(vdev_t *vd) return (err); } +static void +vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs) +{ + trim_args_t *ta = arg; + vdev_t *vd = ta->trim_vdev; + + /* + * Only a manual trim will be traversing the vdev sequentially. + * For an auto trim all valid ranges should be added. + */ + if (ta->trim_type == TRIM_TYPE_MANUAL) { + + /* Only add segments that we have not visited yet */ + if (physical_rs->rs_end <= vd->vdev_trim_last_offset) + return; + + /* Pick up where we left off mid-range. 
*/ + if (vd->vdev_trim_last_offset > physical_rs->rs_start) { + ASSERT3U(physical_rs->rs_end, >, + vd->vdev_trim_last_offset); + physical_rs->rs_start = vd->vdev_trim_last_offset; + } + } + + ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start); + + range_tree_add(ta->trim_tree, physical_rs->rs_start, + physical_rs->rs_end - physical_rs->rs_start); +} + /* - * Convert the logical range into a physical range and add it to the + * Convert the logical range into physical ranges and add them to the * range tree passed in the trim_args_t. */ static void @@ -750,7 +806,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) { trim_args_t *ta = arg; vdev_t *vd = ta->trim_vdev; - range_seg64_t logical_rs, physical_rs; + range_seg64_t logical_rs; logical_rs.rs_start = start; logical_rs.rs_end = start + size; @@ -767,44 +823,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) } ASSERT(vd->vdev_ops->vdev_op_leaf); - vdev_xlate(vd, &logical_rs, &physical_rs); - - IMPLY(vd->vdev_top == vd, - logical_rs.rs_start == physical_rs.rs_start); - IMPLY(vd->vdev_top == vd, - logical_rs.rs_end == physical_rs.rs_end); - - /* - * Only a manual trim will be traversing the vdev sequentially. - * For an auto trim all valid ranges should be added. - */ - if (ta->trim_type == TRIM_TYPE_MANUAL) { - - /* Only add segments that we have not visited yet */ - if (physical_rs.rs_end <= vd->vdev_trim_last_offset) - return; - - /* Pick up where we left off mid-range. */ - if (vd->vdev_trim_last_offset > physical_rs.rs_start) { - ASSERT3U(physical_rs.rs_end, >, - vd->vdev_trim_last_offset); - physical_rs.rs_start = vd->vdev_trim_last_offset; - } - } - - ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); - - /* - * With raidz, it's possible that the logical range does not live on - * this leaf vdev. We only add the physical range to this vdev's if it - * has a length greater than 0. 
- */ - if (physical_rs.rs_end > physical_rs.rs_start) { - range_tree_add(ta->trim_tree, physical_rs.rs_start, - physical_rs.rs_end - physical_rs.rs_start); - } else { - ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); - } + vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg); } /* diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c index 793e0e4f0b75..1ad53eae1eef 100644 --- a/sys/contrib/openzfs/module/zfs/zcp.c +++ b/sys/contrib/openzfs/module/zfs/zcp.c @@ -722,8 +722,6 @@ static void * zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) { zcp_alloc_arg_t *allocargs = ud; - int flags = (allocargs->aa_must_succeed) ? - KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI); if (nsize == 0) { if (ptr != NULL) { @@ -746,10 +744,7 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) return (NULL); } - allocbuf = vmem_alloc(allocsize, flags); - if (allocbuf == NULL) { - return (NULL); - } + allocbuf = vmem_alloc(allocsize, KM_SLEEP); allocargs->aa_alloc_remaining -= allocsize; *allocbuf = allocsize; diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c index a8341f50ba09..ea71ef325c89 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_fm.c +++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c @@ -1111,7 +1111,9 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, bcopy(info, report->zcr_ckinfo, sizeof (*info)); } - report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_align = + vdev_psize_to_asize(vd->vdev_top, report->zcr_sector); report->zcr_length = length; #ifdef _KERNEL diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 8703290020a5..8eb9474cadb0 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -231,6 +231,13 @@ zfsdev_state_t *zfsdev_state_list; */ 
unsigned long zfs_max_nvlist_src_size = 0; +/* + * When logging the output nvlist of an ioctl in the on-disk history, limit + * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. + * This applies primarily to zfs_ioc_channel_program(). + */ +unsigned long zfs_history_output_max = 1024 * 1024; + uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; @@ -5851,7 +5858,6 @@ zfs_ioc_userspace_many(zfs_cmd_t *zc) static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { - objset_t *os; int error = 0; zfsvfs_t *zfsvfs; @@ -5872,19 +5878,54 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) error = zfs_resume_fs(zfsvfs, newds); } } - if (error == 0) - error = dmu_objset_userspace_upgrade(zfsvfs->z_os); + if (error == 0) { + mutex_enter(&zfsvfs->z_os->os_upgrade_lock); + if (zfsvfs->z_os->os_upgrade_id == 0) { + /* clear potential error code and retry */ + zfsvfs->z_os->os_upgrade_status = 0; + mutex_exit(&zfsvfs->z_os->os_upgrade_lock); + + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_userspace_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } else { + mutex_exit(&zfsvfs->z_os->os_upgrade_lock); + } + + taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, + zfsvfs->z_os->os_upgrade_id); + error = zfsvfs->z_os->os_upgrade_status; + } zfs_vfs_rele(zfsvfs); } else { + objset_t *os; + /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); - error = dmu_objset_userspace_upgrade(os); - dmu_objset_rele_flags(os, B_TRUE, FTAG); - } + mutex_enter(&os->os_upgrade_lock); + if (os->os_upgrade_id == 0) { + /* clear potential error code and retry */ + os->os_upgrade_status = 0; + mutex_exit(&os->os_upgrade_lock); + dmu_objset_userspace_upgrade(os); + } else { + mutex_exit(&os->os_upgrade_lock); + } + + dsl_pool_rele(dmu_objset_pool(os), FTAG); + + taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); + 
error = os->os_upgrade_status; + + dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, + FTAG); + } return (error); } @@ -6609,14 +6650,17 @@ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { int err; - boolean_t force = B_FALSE; + boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); - if (innvl) - force = fnvlist_lookup_boolean_value(innvl, "force"); + if (innvl) { + err = nvlist_lookup_boolean_value(innvl, "force", &rc); + if (err == 0) + force = rc; + } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); @@ -6627,7 +6671,7 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) spa_close(spa, FTAG); - return (err); + return (0); } /* @@ -7519,8 +7563,14 @@ zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { - fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, - outnvl); + size_t out_size = fnvlist_size(outnvl); + if (out_size > zfs_history_output_max) { + fnvlist_add_int64(lognv, + ZPOOL_HIST_OUTPUT_SIZE, out_size); + } else { + fnvlist_add_nvlist(lognv, + ZPOOL_HIST_OUTPUT_NVL, outnvl); + } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, @@ -7629,4 +7679,7 @@ zfs_kmod_fini(void) /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); + +ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW, + "Maximum size in bytes of ZFS ioctl output that will be logged"); /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c new file mode 100644 index 000000000000..3b7c52b8dd34 --- /dev/null +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -0,0 +1,895 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development 
and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static ulong_t zfs_fsync_sync_cnt = 4; + +int +zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + tsd_set(zfs_fsyncer_key, NULL); + + return (0); +} + + +#if defined(SEEK_HOLE) && defined(SEEK_DATA) +/* + * Lseek support for finding holes (cmd == SEEK_HOLE) and + * data (cmd == SEEK_DATA). "off" is an in/out parameter. 
+ */ +static int +zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) +{ + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == F_SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* file was dirty, so fall back to using generic logic */ + if (error == EBUSY) { + if (hole) + *off = file_sz; + + return (0); + } + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +int +zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_holey_common(zp, cmd, off); + + ZFS_EXIT(zfsvfs); + return (error); +} +#endif /* SEEK_HOLE && SEEK_DATA */ + +/*ARGSUSED*/ +int +zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: zp - inode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - O_SYNC flags; used to provide FRSYNC semantics. 
+ * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. + * + * Side Effects: + * inode - atime updated if byte count > 0 + */ +/* ARGSUSED */ +int +zfs_read(struct znode *zp, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + boolean_t frsync = B_FALSE; + + zfsvfs_t *zfsvfs = ZTOZSB(zp); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* We don't copy out anything useful for directories. */ + if (Z_ISDIR(ZTOTYPE(zp))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + +#ifdef FRSYNC + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + * Only do this for non-snapshots. + * + * Some platforms do not support FRSYNC and instead map it + * to O_SYNC, which results in unnecessary calls to zil_commit. We + * only honor FRSYNC requests on platforms which support it. + */ + frsync = !!(ioflag & FRSYNC); +#endif + if (zfsvfs->z_log && + (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, + uio->uio_loffset, uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. 
+ */ + if (uio->uio_loffset >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_size); + ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + ssize_t start_resid = n; + + while (n > 0) { + ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_vnops_read_chunk_size)); +#ifdef UIO_NOCOPY + if (uio->uio_segflg == UIO_NOCOPY) + error = mappedread_sf(zp, nbytes, uio); + else +#endif + if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) { + error = mappedread(zp, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } + + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } + + int64_t nread = start_resid - n; + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); + task_io_account_read(nread); +out: + zfs_rangelock_exit(lr); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: zp - znode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - O_APPEND flag set if in append mode. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +int +zfs_write(znode_t *zp, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + ssize_t start_resid = uio->uio_resid; + + /* + * Fasttrack empty write + */ + ssize_t n = start_resid; + if (n == 0) + return (0); + + zfsvfs_t *zfsvfs = ZTOZSB(zp); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + sa_bulk_attr_t bulk[4]; + int count = 0; + uint64_t mtime[2], ctime[2]; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM + */ + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && + (uio->uio_loffset < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Validate file offset + */ + offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + const uint64_t max_blksz = zfsvfs->z_max_blksz; + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFAULT)); + } + + /* + * If in append mode, set the io offset pointer to eof. + */ + zfs_locked_range_t *lr; + if (ioflag & O_APPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. 
We reset the write offset once we have the lock. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; + } + uio->uio_loffset = woff; + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + } + + if (zn_rlimit_fsize(zp, uio, uio->uio_td)) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + const rlim64_t limit = MAXOFFSET_T; + + if (woff >= limit) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if (n > limit - woff) + n = limit - woff; + + uint64_t end_size = MAX(zp->z_size, woff + n); + zilog_t *zilog = zfsvfs->z_log; + + const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); + const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); + const uint64_t projid = zp->z_projid; + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + woff = uio->uio_loffset; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || + (projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + arc_buf_t *abuf = NULL; + if (n >= max_blksz && woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. 
"Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if ((error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes))) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT3S(cbytes, ==, max_blksz); + } + + /* + * Start a transaction. + */ + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } + + /* + * If rangelock_enter() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since rangelock_reduce() will + * shrink down lr_length to the appropriate size. + */ + if (lr->lr_length == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, + 1 << highbit64(zp->z_blksz)); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + zfs_rangelock_reduce(lr, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
+ */ + const ssize_t nbytes = + MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + + ssize_t tx_bytes; + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + uio_fault_disable(uio, B_TRUE); + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); + uio_fault_disable(uio, B_FALSE); +#ifdef __linux__ + if (error == EFAULT) { + dmu_tx_commit(tx); + /* + * Account for partial writes before + * continuing the loop. + * Update needs to occur before the next + * uio_prefaultpages, or prefaultpages may + * error, and we may break the loop early. + */ + if (tx_bytes != uio->uio_resid) + n -= tx_bytes - uio->uio_resid; + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + break; + } + continue; + } +#endif + if (error != 0) { + dmu_tx_commit(tx); + break; + } + tx_bytes -= uio->uio_resid; + } else { + /* Implied by abuf != NULL: */ + ASSERT3S(n, >=, max_blksz); + ASSERT0(P2PHASE(woff, max_blksz)); + /* + * We can simplify nbytes to MIN(n, max_blksz) since + * P2PHASE(woff, max_blksz) is 0, and knowing + * n >= max_blksz lets us simplify further: + */ + ASSERT3S(nbytes, ==, max_blksz); + /* + * Thus, we're writing a full block at a block-aligned + * offset and extending the file past EOF. + * + * dmu_assign_arcbuf_by_dbuf() will directly assign the + * arc buffer to a dbuf. + */ + error = dmu_assign_arcbuf_by_dbuf( + sa_get_db(zp->z_sa_hdl), woff, abuf, tx); + if (error != 0) { + dmu_return_arcbuf(abuf); + dmu_tx_commit(tx); + break; + } + ASSERT3S(nbytes, <=, uio->uio_resid); + uioskip(uio, nbytes); + tx_bytes = nbytes; + } + if (tx_bytes && zn_has_cached_data(zp) && + !(ioflag & O_DIRECT)) { + update_pages(zp, woff, tx_bytes, zfsvfs->z_os); + } + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. 
+ */ + if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the execute bits is set. + * + * It would be nice to do this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(zp, cr, + ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { + zp->z_mode &= ~(S_ISUID | S_ISGID); + uint64_t newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (z_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_size) < uio->uio_loffset) { + (void) atomic_cas_64(&zp->z_size, end_size, + uio->uio_loffset); + ASSERT(error == 0); + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. 
+ */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + NULL, NULL); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT3S(tx_bytes, ==, nbytes); + n -= nbytes; + + if (n > 0) { + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + error = SET_ERROR(EFAULT); + break; + } + } + } + + zfs_inode_update(zp); + zfs_rangelock_exit(lr); + + /* + * If we're in replay mode, or we made no progress, or the + * uio data is inaccessible return an error. Otherwise, it's + * at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio->uio_resid == start_resid || + error == EFAULT) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (O_SYNC | O_DSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + const int64_t nwritten = start_resid - uio->uio_resid; + dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); + task_io_account_write(nwritten); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +int +zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; + zilog_t *zilog = zfsvfs->z_log; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifdef ZFS_DEBUG +static int zil_fault_io = 0; +#endif + +static void zfs_get_done(zgd_t *zgd, int error); + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_zrele_async(zp); + return (SET_ERROR(ENOENT)); + } + + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zp; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. 
+ */ + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + /* test for truncation needs to be done while range locked */ + if (offset >= zp->z_size) { + error = SET_ERROR(ENOENT); + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } + ASSERT(error == 0 || error == ENOENT); + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's + * written out and its checksum is being calculated + * that no one can change the data. We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); + } + /* test for truncation needs to be done while range locked */ + if (lr->lr_offset >= zp->z_size) + error = SET_ERROR(ENOENT); +#ifdef ZFS_DEBUG + if (zil_fault_io) { + error = SET_ERROR(EIO); + zil_fault_io = 0; + } +#endif + if (error == 0) + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. 
We zero out the BP because + * it is the old, currently-on-disk BP. + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } + } + } + + zfs_get_done(zgd, error); + + return (error); +} + + +/* ARGSUSED */ +static void +zfs_get_done(zgd_t *zgd, int error) +{ + znode_t *zp = zgd->zgd_private; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_zrele_async(zp); + + kmem_free(zgd, sizeof (zgd_t)); +} + +EXPORT_SYMBOL(zfs_access); +EXPORT_SYMBOL(zfs_fsync); +EXPORT_SYMBOL(zfs_holey); +EXPORT_SYMBOL(zfs_read); +EXPORT_SYMBOL(zfs_write); +EXPORT_SYMBOL(zfs_getsecattr); +EXPORT_SYMBOL(zfs_setsecattr); + +ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW, + "Bytes to read per chunk"); diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c index 933aedebd084..dfd92b893b9f 100644 --- a/sys/contrib/openzfs/module/zfs/zio.c +++ b/sys/contrib/openzfs/module/zfs/zio.c @@ -1301,7 +1301,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); - ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ + ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, @@ -1733,16 +1733,16 @@ zio_write_compress(zio_t *zio) return (zio); } else { /* - * Round up compressed size up to the ashift - * of the smallest-ashift device, and zero the tail. - * This ensures that the compressed size of the BP - * (and thus compressratio property) are correct, + * Round compressed size up to the minimum allocation + * size of the smallest-ashift device, and zero the + * tail. 
This ensures that the compressed size of the + * BP (and thus compressratio property) are correct, * in that we charge for the padding used to fill out * the last sector. */ - ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); - size_t rounded = (size_t)P2ROUNDUP(psize, - 1ULL << spa->spa_min_ashift); + ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT); + size_t rounded = (size_t)roundup(psize, + spa->spa_min_alloc); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); @@ -2275,9 +2275,7 @@ zio_nowait(zio_t *zio) * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; - kpreempt_disable(); - pio = spa->spa_async_zio_root[CPU_SEQID]; - kpreempt_enable(); + pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE]; zio_add_child(pio, zio); } @@ -2816,8 +2814,8 @@ zio_write_gang_block(zio_t *pio) ASSERT(has_data); flags |= METASLAB_ASYNC_ALLOC; - VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], - pio)); + VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator]. + mca_alloc_slots, pio)); /* * The logical zio has already placed a reservation for @@ -3618,17 +3616,16 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * of, so we just hash the objset ID to pick the allocator to get * some parallelism. 
*/ - error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, - txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, - cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % - spa->spa_alloc_count); + int flags = METASLAB_FASTWRITE | METASLAB_ZIL; + int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % + spa->spa_alloc_count; + error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, + 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); if (error == 0) { *slog = TRUE; } else { - error = metaslab_alloc(spa, spa_normal_class(spa), size, - new_bp, 1, txg, NULL, METASLAB_FASTWRITE, - &io_alloc_list, NULL, cityhash4(0, 0, 0, - os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); + error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, + 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); if (error == 0) *slog = FALSE; } @@ -3787,19 +3784,37 @@ zio_vdev_io_start(zio_t *zio) * However, indirect vdevs point off to other vdevs which may have * DTL's, so we never bypass them. The child i/os on concrete vdevs * will be properly bypassed instead. + * + * Leaf DTL_PARTIAL can be empty when a legitimate write comes from + * a dRAID spare vdev. For example, when a dRAID spare is first + * used, its spare blocks need to be written to but the leaf vdevs + * of such blocks can have empty DTL_PARTIAL. + * + * There seems to be no clean way to allow such writes while bypassing + * spurious ones. At this point, just avoid all bypassing for dRAID + * for correctness. 
*/ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && zio->io_txg != 0 && /* not a delegated i/o */ vd->vdev_ops != &vdev_indirect_ops && + vd->vdev_top->vdev_ops != &vdev_draid_ops && !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { ASSERT(zio->io_type == ZIO_TYPE_WRITE); zio_vdev_io_bypass(zio); return (zio); } - if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || - zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { + /* + * Select the next best leaf I/O to process. Distributed spares are + * excluded since they dispatch the I/O directly to a leaf vdev after + * applying the dRAID mapping. + */ + if (vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops && + (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_TRIM)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) return (zio); @@ -3836,8 +3851,8 @@ zio_vdev_io_done(zio_t *zio) if (zio->io_delay) zio->io_delay = gethrtime() - zio->io_delay; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { - + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); if (zio->io_type == ZIO_TYPE_WRITE) @@ -4239,7 +4254,7 @@ zio_checksum_verify(zio_t *zio) if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) return (zio); - ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); + ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL); } if ((error = zio_checksum_error(zio, &info)) != 0) { @@ -4483,9 +4498,8 @@ zio_done(zio_t *zio) metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, zio->io_allocator); - VERIFY(zfs_refcount_not_held( - &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], - zio)); + VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class-> + mc_allocator[zio->io_allocator].mca_alloc_slots, zio)); } diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c 
index fb8ce0916eb5..e56ea88682ff 100644 --- a/sys/contrib/openzfs/module/zfs/zio_inject.c +++ b/sys/contrib/openzfs/module/zfs/zio_inject.c @@ -265,6 +265,12 @@ zio_handle_fault_injection(zio_t *zio, int error) if (zio->io_type != ZIO_TYPE_READ) return (0); + /* + * A rebuild I/O has no checksum to verify. + */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM) + return (0); + rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 2b20b02e4942..7c6dae8650c7 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -772,7 +772,7 @@ zvol_setup_zv(zvol_state_t *zv) if (error) return (SET_ERROR(error)); - error = dnode_hold(os, ZVOL_OBJ, FTAG, &zv->zv_dn); + error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (SET_ERROR(error)); @@ -807,7 +807,7 @@ zvol_shutdown_zv(zvol_state_t *zv) zv->zv_zilog = NULL; - dnode_rele(zv->zv_dn, FTAG); + dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* @@ -1376,7 +1376,9 @@ typedef struct zvol_volmode_cb_arg { static void zvol_set_volmode_impl(char *name, uint64_t volmode) { - fstrans_cookie_t cookie = spl_fstrans_mark(); + fstrans_cookie_t cookie; + uint64_t old_volmode; + zvol_state_t *zv; if (strchr(name, '@') != NULL) return; @@ -1386,9 +1388,18 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) * this is necessary because our backing gendisk (zvol_state->zv_disk) * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). - * A possible optimization is to modify our consumers so we don't get - * called when "volmode" does not change. 
*/ + zv = zvol_find_by_name(name, RW_NONE); + if (zv == NULL && volmode == ZFS_VOLMODE_NONE) + return; + if (zv != NULL) { + old_volmode = zv->zv_volmode; + mutex_exit(&zv->zv_state_lock); + if (old_volmode == volmode) + return; + zvol_wait_close(zv); + } + cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: (void) zvol_remove_minor_impl(name); @@ -1406,7 +1417,6 @@ zvol_set_volmode_impl(char *name, uint64_t volmode) (void) ops->zv_create_minor(name); break; } - spl_fstrans_unmark(cookie); } diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c index 3d1805f49cca..69ebf252d1ba 100644 --- a/sys/contrib/openzfs/module/zstd/zfs_zstd.c +++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c @@ -202,6 +202,34 @@ static struct zstd_fallback_mem zstd_dctx_fallback; static struct zstd_pool *zstd_mempool_cctx; static struct zstd_pool *zstd_mempool_dctx; + +static void +zstd_mempool_reap(struct zstd_pool *zstd_mempool) +{ + struct zstd_pool *pool; + + if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) { + return; + } + + /* free obsolete slots */ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + pool = &zstd_mempool[i]; + if (pool->mem && mutex_tryenter(&pool->barrier)) { + /* Free memory if unused object older than 2 minutes */ + if (pool->mem && gethrestime_sec() > pool->timeout) { + vmem_free(pool->mem, pool->size); + ZSTDSTAT_SUB(zstd_stat_buffers, 1); + ZSTDSTAT_SUB(zstd_stat_size, pool->size); + pool->mem = NULL; + pool->size = 0; + pool->timeout = 0; + } + mutex_exit(&pool->barrier); + } + } +} + /* * Try to get a cached allocated buffer from memory pool or allocate a new one * if necessary. If a object is older than 2 minutes and does not fit the @@ -215,6 +243,7 @@ static struct zstd_pool *zstd_mempool_dctx; * * The scheduled release will be updated every time a object is reused. 
*/ + static void * zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) { @@ -242,31 +271,16 @@ zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size) * Check if objects fits the size, if so we take it and * update the timestamp. */ - if (size && !mem && pool->mem && size <= pool->size) { + if (pool->mem && size <= pool->size) { pool->timeout = gethrestime_sec() + ZSTD_POOL_TIMEOUT; mem = pool->mem; - continue; + return (mem); } - - /* Free memory if unused object older than 2 minutes */ - if (pool->mem && gethrestime_sec() > pool->timeout) { - vmem_free(pool->mem, pool->size); - ZSTDSTAT_SUB(zstd_stat_buffers, 1); - ZSTDSTAT_SUB(zstd_stat_size, pool->size); - pool->mem = NULL; - pool->size = 0; - pool->timeout = 0; - } - mutex_exit(&pool->barrier); } } - if (!size || mem) { - return (mem); - } - /* * If no preallocated slot was found, try to fill in a new one. * @@ -711,8 +725,8 @@ zfs_zstd_cache_reap_now(void) * calling alloc with zero size seeks * and releases old unused objects */ - zstd_mempool_alloc(zstd_mempool_cctx, 0); - zstd_mempool_alloc(zstd_mempool_dctx, 0); + zstd_mempool_reap(zstd_mempool_cctx); + zstd_mempool_reap(zstd_mempool_dctx); } extern int __init diff --git a/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in b/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in index 782ad465e36d..6e4bfdcfedc0 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in @@ -88,10 +88,6 @@ BuildRequires: %{_bindir}/kmodtool %global __global_ldflags %{nil} %endif -%if 0%{?fedora} >= 17 -%define prefix /usr -%endif - # Kmodtool does its magic here. A patched version of kmodtool is shipped # with the source rpm until kmod development packages are supported upstream. 
# https://bugzilla.rpmfusion.org/show_bug.cgi?id=2714 diff --git a/sys/contrib/openzfs/rpm/generic/zfs.spec.in b/sys/contrib/openzfs/rpm/generic/zfs.spec.in index 86e983718ee8..ef0699d36e11 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs.spec.in @@ -120,10 +120,10 @@ License: @ZFS_META_LICENSE@ URL: https://github.com/openzfs/zfs Source0: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) -Requires: libzpool2 = %{version} -Requires: libnvpair1 = %{version} -Requires: libuutil1 = %{version} -Requires: libzfs2 = %{version} +Requires: libzpool4 = %{version} +Requires: libnvpair3 = %{version} +Requires: libuutil3 = %{version} +Requires: libzfs4 = %{version} Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version} Obsoletes: spl @@ -162,35 +162,38 @@ Requires: sysstat %description This package contains the core ZFS command line utilities. -%package -n libzpool2 +%package -n libzpool4 Summary: Native ZFS pool library for Linux Group: System Environment/Kernel +Obsoletes: libzpool2 -%description -n libzpool2 +%description -n libzpool4 This package contains the zpool library, which provides support for managing zpools -%post -n libzpool2 -p /sbin/ldconfig -%postun -n libzpool2 -p /sbin/ldconfig +%post -n libzpool4 -p /sbin/ldconfig +%postun -n libzpool4 -p /sbin/ldconfig -%package -n libnvpair1 +%package -n libnvpair3 Summary: Solaris name-value library for Linux Group: System Environment/Kernel +Obsoletes: libnvpair1 -%description -n libnvpair1 +%description -n libnvpair3 This package contains routines for packing and unpacking name-value pairs. This functionality is used to portably transport data across process boundaries, between kernel and user space, and can be used to write self describing data structures on disk. 
-%post -n libnvpair1 -p /sbin/ldconfig -%postun -n libnvpair1 -p /sbin/ldconfig +%post -n libnvpair3 -p /sbin/ldconfig +%postun -n libnvpair3 -p /sbin/ldconfig -%package -n libuutil1 +%package -n libuutil3 Summary: Solaris userland utility library for Linux Group: System Environment/Kernel +Obsoletes: libuutil1 -%description -n libuutil1 +%description -n libuutil3 This library provides a variety of compatibility functions for OpenZFS: * libspl: The Solaris Porting Layer userland library, which provides APIs that make it possible to run Solaris user code in a Linux environment @@ -201,32 +204,34 @@ This library provides a variety of compatibility functions for OpenZFS: partitioning. * libshare: NFS, SMB, and iSCSI service integration for ZFS. -%post -n libuutil1 -p /sbin/ldconfig -%postun -n libuutil1 -p /sbin/ldconfig +%post -n libuutil3 -p /sbin/ldconfig +%postun -n libuutil3 -p /sbin/ldconfig -%package -n libzfs2 +%package -n libzfs4 Summary: Native ZFS filesystem library for Linux Group: System Environment/Kernel +Obsoletes: libzfs2 -%description -n libzfs2 +%description -n libzfs4 This package provides support for managing ZFS filesystems -%post -n libzfs2 -p /sbin/ldconfig -%postun -n libzfs2 -p /sbin/ldconfig +%post -n libzfs4 -p /sbin/ldconfig +%postun -n libzfs4 -p /sbin/ldconfig -%package -n libzfs2-devel +%package -n libzfs4-devel Summary: Development headers Group: System Environment/Kernel -Requires: libzfs2 = %{version} -Requires: libzpool2 = %{version} -Requires: libnvpair1 = %{version} -Requires: libuutil1 = %{version} -Provides: libzpool2-devel -Provides: libnvpair1-devel -Provides: libuutil1-devel +Requires: libzfs4 = %{version} +Requires: libzpool4 = %{version} +Requires: libnvpair3 = %{version} +Requires: libuutil3 = %{version} +Provides: libzpool4-devel +Provides: libnvpair3-devel +Provides: libuutil3-devel Obsoletes: zfs-devel +Obsoletes: libzfs2-devel -%description -n libzfs2-devel +%description -n libzfs4-devel This package contains the 
header files needed for building additional applications against the ZFS libraries. @@ -273,8 +278,8 @@ Summary: Python %{python_version} wrapper for libzfs_core Group: Development/Languages/Python License: Apache-2.0 BuildArch: noarch -Requires: libzfs2 = %{version} -Requires: libnvpair1 = %{version} +Requires: libzfs4 = %{version} +Requires: libnvpair3 = %{version} Requires: libffi Requires: python%{__python_pkg_version} Requires: %{__python_cffi_pkg} @@ -477,19 +482,19 @@ systemctl --system daemon-reload >/dev/null || true %{_datadir}/pam-configs/* %endif -%files -n libzpool2 +%files -n libzpool4 %{_libdir}/libzpool.so.* -%files -n libnvpair1 +%files -n libnvpair3 %{_libdir}/libnvpair.so.* -%files -n libuutil1 +%files -n libuutil3 %{_libdir}/libuutil.so.* -%files -n libzfs2 +%files -n libzfs4 %{_libdir}/libzfs*.so.* -%files -n libzfs2-devel +%files -n libzfs4-devel %{_pkgconfigdir}/libzfs.pc %{_pkgconfigdir}/libzfsbootenv.pc %{_pkgconfigdir}/libzfs_core.pc diff --git a/sys/contrib/openzfs/scripts/Makefile.am b/sys/contrib/openzfs/scripts/Makefile.am index 9d39947525a3..2deece7f4122 100644 --- a/sys/contrib/openzfs/scripts/Makefile.am +++ b/sys/contrib/openzfs/scripts/Makefile.am @@ -36,6 +36,7 @@ export ZPOOL_SCRIPT_DIR=$$CMD_DIR/zpool/zpool.d export ZPOOL_SCRIPTS_PATH=$$CMD_DIR/zpool/zpool.d export CONTRIB_DIR=@abs_top_builddir@/contrib export LIB_DIR=@abs_top_builddir@/lib +export SYSCONF_DIR=@abs_top_builddir@/etc export INSTALL_UDEV_DIR=@udevdir@ export INSTALL_UDEV_RULE_DIR=@udevruledir@ @@ -60,6 +61,7 @@ export EXTRA_ENVIRONMENT all-local: -$(SED) -e '\|^export BIN_DIR=|s|$$|@abs_top_builddir@/bin|' \ -e '\|^export SBIN_DIR=|s|$$|@abs_top_builddir@/bin|' \ + -e '\|^export LIBEXEC_DIR=|s|$$|@abs_top_builddir@/bin|' \ -e '\|^export ZTS_DIR=|s|$$|@abs_top_srcdir@/tests|' \ -e '\|^export SCRIPT_DIR=|s|$$|@abs_top_srcdir@/scripts|' \ $(abs_top_srcdir)/scripts/common.sh.in >common.sh @@ -71,6 +73,7 @@ clean-local: install-data-hook: -$(SED) -e '\|^export 
BIN_DIR=|s|$$|@bindir@|' \ -e '\|^export SBIN_DIR=|s|$$|@sbindir@|' \ + -e '\|^export LIBEXEC_DIR=|s|$$|@zfsexecdir@|' \ -e '\|^export ZTS_DIR=|s|$$|@datadir@/@PACKAGE@|' \ -e '\|^export SCRIPT_DIR=|s|$$|@datadir@/@PACKAGE@|' \ $(abs_top_srcdir)/scripts/common.sh.in \ diff --git a/sys/contrib/openzfs/scripts/commitcheck.sh b/sys/contrib/openzfs/scripts/commitcheck.sh index c7515c23e1d0..71cf521666a6 100755 --- a/sys/contrib/openzfs/scripts/commitcheck.sh +++ b/sys/contrib/openzfs/scripts/commitcheck.sh @@ -1,23 +1,10 @@ -#!/usr/bin/env bash +#!/bin/sh REF="HEAD" -# test a url -function test_url() -{ - url="$1" - if ! curl --output /dev/null --max-time 60 \ - --silent --head --fail "$url" ; then - echo "\"$url\" is unreachable" - return 1 - fi - - return 0 -} - # test commit body for length # lines containing urls are exempt for the length limit. -function test_commit_bodylength() +test_commit_bodylength() { length="72" body=$(git log -n 1 --pretty=%b "$REF" | grep -Ev "http(s)*://" | grep -E -m 1 ".{$((length + 1))}") @@ -30,9 +17,9 @@ function test_commit_bodylength() } # check for a tagged line -function check_tagged_line() +check_tagged_line() { - regex='^\s*'"$1"':\s[[:print:]]+\s<[[:graph:]]+>$' + regex='^[[:space:]]*'"$1"':[[:space:]][[:print:]]+[[:space:]]<[[:graph:]]+>$' foundline=$(git log -n 1 "$REF" | grep -E -m 1 "$regex") if [ -z "$foundline" ]; then echo "error: missing \"$1\"" @@ -42,30 +29,8 @@ function check_tagged_line() return 0 } -# check for a tagged line and check that the link is valid -function check_tagged_line_with_url() -{ - regex='^\s*'"$1"':\s\K([[:graph:]]+)$' - foundline=$(git log -n 1 "$REF" | grep -Po "$regex") - if [ -z "$foundline" ]; then - echo "error: missing \"$1\"" - return 1 - fi - - OLDIFS=$IFS - IFS=$'\n' - for url in $(echo -e "$foundline"); do - if ! 
test_url "$url"; then - return 1 - fi - done - IFS=$OLDIFS - - return 0 -} - # check commit message for a normal commit -function new_change_commit() +new_change_commit() { error=0 @@ -89,57 +54,7 @@ function new_change_commit() return $error } -function is_openzfs_port() -{ - # subject starts with OpenZFS means it's an openzfs port - subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^OpenZFS') - if [ -n "$subject" ]; then - return 0 - fi - - return 1 -} - -function openzfs_port_commit() -{ - error=0 - - # subject starts with OpenZFS dddd - subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^OpenZFS [[:digit:]]+(, [[:digit:]]+)* - ') - if [ -z "$subject" ]; then - echo "error: OpenZFS patch ports must have a subject line that starts with \"OpenZFS dddd - \"" - error=1 - fi - - # need an authored by line - if ! check_tagged_line "Authored by" ; then - error=1 - fi - - # need a reviewed by line - if ! check_tagged_line "Reviewed by" ; then - error=1 - fi - - # need ported by line - if ! check_tagged_line "Ported-by" ; then - error=1 - fi - - # need a url to openzfs commit and it should be valid - if ! check_tagged_line_with_url "OpenZFS-commit" ; then - error=1 - fi - - # need a url to illumos issue and it should be valid - if ! 
check_tagged_line_with_url "OpenZFS-issue" ; then - error=1 - fi - - return $error -} - -function is_coverity_fix() +is_coverity_fix() { # subject starts with Fix coverity defects means it's a coverity fix subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^Fix coverity defects') @@ -150,7 +65,7 @@ function is_coverity_fix() return 1 } -function coverity_fix_commit() +coverity_fix_commit() { error=0 @@ -169,11 +84,12 @@ function coverity_fix_commit() # test each summary line for the proper format OLDIFS=$IFS - IFS=$'\n' + IFS=' +' for line in $(git log -n 1 --pretty=%b "$REF" | grep -E '^CID'); do echo "$line" | grep -E '^CID [[:digit:]]+: ([[:graph:]]+|[[:space:]])+ \(([[:upper:]]|\_)+\)' > /dev/null # shellcheck disable=SC2181 - if [[ $? -ne 0 ]]; then + if [ $? -ne 0 ]; then echo "error: commit message has an improperly formatted CID defect line" error=1 fi @@ -192,15 +108,6 @@ if [ -n "$1" ]; then REF="$1" fi -# if openzfs port, test against that -if is_openzfs_port; then - if ! openzfs_port_commit ; then - exit 1 - else - exit 0 - fi -fi - # if coverity fix, test against that if is_coverity_fix; then if ! coverity_fix_commit; then diff --git a/sys/contrib/openzfs/scripts/common.sh.in b/sys/contrib/openzfs/scripts/common.sh.in index 2d9d9c786622..8268315b3361 100644 --- a/sys/contrib/openzfs/scripts/common.sh.in +++ b/sys/contrib/openzfs/scripts/common.sh.in @@ -3,6 +3,7 @@ # Directories export BIN_DIR= export SBIN_DIR= +export LIBEXEC_DIR= export ZTS_DIR= export SCRIPT_DIR= diff --git a/sys/contrib/openzfs/scripts/dkms.mkconf b/sys/contrib/openzfs/scripts/dkms.mkconf index 28d9609f721b..8649b93183a2 100755 --- a/sys/contrib/openzfs/scripts/dkms.mkconf +++ b/sys/contrib/openzfs/scripts/dkms.mkconf @@ -22,6 +22,7 @@ cat >${filename} < +#include +#include +#include +#include +#include + +/* + * The number of rows to generate for new permutation maps. + */ +#define MAP_ROWS_DEFAULT 256 + +/* + * Key values for dRAID maps when stored as nvlists. 
+ */ +#define MAP_SEED "seed" +#define MAP_CHECKSUM "checksum" +#define MAP_WORST_RATIO "worst_ratio" +#define MAP_AVG_RATIO "avg_ratio" +#define MAP_CHILDREN "children" +#define MAP_NPERMS "nperms" +#define MAP_PERMS "perms" + +static void +draid_usage(void) +{ + (void) fprintf(stderr, + "usage: draid command args ...\n" + "Available commands are:\n" + "\n" + "\tdraid generate [-cv] [-m min] [-n max] [-p passes] FILE\n" + "\tdraid verify [-rv] FILE\n" + "\tdraid dump [-v] [-m min] [-n max] FILE\n" + "\tdraid table FILE\n" + "\tdraid merge FILE SRC SRC...\n"); + exit(1); +} + +static int +read_map(const char *filename, nvlist_t **allcfgs) +{ + int block_size = 131072; + int buf_size = 131072; + int tmp_size, error; + char *tmp_buf; + + struct stat64 stat; + if (lstat64(filename, &stat) != 0) + return (errno); + + if (stat.st_size == 0 || + !(S_ISREG(stat.st_mode) || S_ISLNK(stat.st_mode))) { + return (EINVAL); + } + + gzFile fp = gzopen(filename, "rb"); + if (fp == Z_NULL) + return (errno); + + char *buf = malloc(buf_size); + if (buf == NULL) { + (void) gzclose(fp); + return (ENOMEM); + } + + ssize_t rc, bytes = 0; + while (!gzeof(fp)) { + rc = gzread(fp, buf + bytes, block_size); + if ((rc < 0) || (rc == 0 && !gzeof(fp))) { + free(buf); + (void) gzclose(fp); + (void) gzerror(fp, &error); + return (error); + } else { + bytes += rc; + + if (bytes + block_size >= buf_size) { + tmp_size = 2 * buf_size; + tmp_buf = malloc(tmp_size); + if (tmp_buf == NULL) { + free(buf); + (void) gzclose(fp); + return (ENOMEM); + } + + memcpy(tmp_buf, buf, bytes); + free(buf); + buf = tmp_buf; + buf_size = tmp_size; + } + } + } + + (void) gzclose(fp); + + error = nvlist_unpack(buf, bytes, allcfgs, 0); + free(buf); + + return (error); +} + +/* + * Read a map from the specified filename. A file contains multiple maps + * which are indexed by the number of children. The caller is responsible + * for freeing the configuration returned. 
+ */ +static int +read_map_key(const char *filename, char *key, nvlist_t **cfg) +{ + nvlist_t *allcfgs, *foundcfg = NULL; + int error; + + error = read_map(filename, &allcfgs); + if (error != 0) + return (error); + + nvlist_lookup_nvlist(allcfgs, key, &foundcfg); + if (foundcfg != NULL) { + nvlist_dup(foundcfg, cfg, KM_SLEEP); + error = 0; + } else { + error = ENOENT; + } + + nvlist_free(allcfgs); + + return (error); +} + +/* + * Write all mappings to the map file. + */ +static int +write_map(const char *filename, nvlist_t *allcfgs) +{ + size_t buflen = 0; + int error; + + error = nvlist_size(allcfgs, &buflen, NV_ENCODE_XDR); + if (error) + return (error); + + char *buf = malloc(buflen); + if (buf == NULL) + return (ENOMEM); + + error = nvlist_pack(allcfgs, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); + if (error) { + free(buf); + return (error); + } + + /* + * Atomically update the file using a temporary file and the + * traditional unlink then rename steps. This code provides + * no locking, it only guarantees the packed nvlist on disk + * is updated atomically and is internally consistent. 
+ */ + char *tmpname = calloc(MAXPATHLEN, 1); + if (tmpname == NULL) { + free(buf); + return (ENOMEM); + } + + snprintf(tmpname, MAXPATHLEN - 1, "%s.XXXXXX", filename); + + int fd = mkstemp(tmpname); + if (fd < 0) { + error = errno; + free(buf); + free(tmpname); + return (error); + } + (void) close(fd); + + gzFile fp = gzopen(tmpname, "w9b"); + if (fp == Z_NULL) { + error = errno; + free(buf); + free(tmpname); + return (errno); + } + + ssize_t rc, bytes = 0; + while (bytes < buflen) { + size_t size = MIN(buflen - bytes, 131072); + rc = gzwrite(fp, buf + bytes, size); + if (rc < 0) { + free(buf); + (void) gzerror(fp, &error); + (void) gzclose(fp); + (void) unlink(tmpname); + free(tmpname); + return (error); + } else if (rc == 0) { + break; + } else { + bytes += rc; + } + } + + free(buf); + (void) gzclose(fp); + + if (bytes != buflen) { + (void) unlink(tmpname); + free(tmpname); + return (EIO); + } + + /* + * Unlink the previous config file and replace it with the updated + * version. If we're able to unlink the file then directory is + * writable by us and the subsequent rename should never fail. + */ + error = unlink(filename); + if (error != 0 && errno != ENOENT) { + error = errno; + (void) unlink(tmpname); + free(tmpname); + return (error); + } + + error = rename(tmpname, filename); + if (error != 0) { + error = errno; + (void) unlink(tmpname); + free(tmpname); + return (error); + } + + free(tmpname); + + return (0); +} + +/* + * Add the dRAID map to the file and write it out. + */ +static int +write_map_key(const char *filename, char *key, draid_map_t *map, + double worst_ratio, double avg_ratio) +{ + nvlist_t *nv_cfg, *allcfgs; + int error; + + /* + * Add the configuration to an existing or new file. The new + * configuration will replace an existing configuration with the + * same key if it has a lower ratio and is therefore better. 
+ */ + error = read_map(filename, &allcfgs); + if (error == ENOENT) { + allcfgs = fnvlist_alloc(); + } else if (error != 0) { + return (error); + } + + error = nvlist_lookup_nvlist(allcfgs, key, &nv_cfg); + if (error == 0) { + uint64_t nv_cfg_worst_ratio = fnvlist_lookup_uint64(nv_cfg, + MAP_WORST_RATIO); + double nv_worst_ratio = (double)nv_cfg_worst_ratio / 1000.0; + + if (worst_ratio < nv_worst_ratio) { + /* Replace old map with the more balanced new map. */ + fnvlist_remove(allcfgs, key); + } else { + /* The old map is preferable, keep it. */ + nvlist_free(allcfgs); + return (EEXIST); + } + } + + nvlist_t *cfg = fnvlist_alloc(); + fnvlist_add_uint64(cfg, MAP_SEED, map->dm_seed); + fnvlist_add_uint64(cfg, MAP_CHECKSUM, map->dm_checksum); + fnvlist_add_uint64(cfg, MAP_CHILDREN, map->dm_children); + fnvlist_add_uint64(cfg, MAP_NPERMS, map->dm_nperms); + fnvlist_add_uint8_array(cfg, MAP_PERMS, map->dm_perms, + map->dm_children * map->dm_nperms * sizeof (uint8_t)); + + fnvlist_add_uint64(cfg, MAP_WORST_RATIO, + (uint64_t)(worst_ratio * 1000.0)); + fnvlist_add_uint64(cfg, MAP_AVG_RATIO, + (uint64_t)(avg_ratio * 1000.0)); + + error = nvlist_add_nvlist(allcfgs, key, cfg); + if (error == 0) + error = write_map(filename, allcfgs); + + nvlist_free(cfg); + nvlist_free(allcfgs); + return (error); +} + +static void +dump_map(draid_map_t *map, char *key, double worst_ratio, double avg_ratio, + int verbose) +{ + if (verbose == 0) { + return; + } else if (verbose == 1) { + printf(" \"%s\": seed: 0x%016llx worst_ratio: %2.03f " + "avg_ratio: %2.03f\n", key, (u_longlong_t)map->dm_seed, + worst_ratio, avg_ratio); + return; + } else { + printf(" \"%s\":\n" + " seed: 0x%016llx\n" + " checksum: 0x%016llx\n" + " worst_ratio: %2.03f\n" + " avg_ratio: %2.03f\n" + " children: %llu\n" + " nperms: %llu\n", + key, (u_longlong_t)map->dm_seed, + (u_longlong_t)map->dm_checksum, worst_ratio, avg_ratio, + (u_longlong_t)map->dm_children, + (u_longlong_t)map->dm_nperms); + + if (verbose > 2) { + 
printf(" perms = {\n"); + for (int i = 0; i < map->dm_nperms; i++) { + printf(" { "); + for (int j = 0; j < map->dm_children; j++) { + printf("%3d%s ", map->dm_perms[ + i * map->dm_children + j], + j < map->dm_children - 1 ? + "," : ""); + } + printf(" },\n"); + } + printf(" }\n"); + } else if (verbose == 2) { + printf(" draid_perms = \n"); + } + } +} + +static void +dump_map_nv(char *key, nvlist_t *cfg, int verbose) +{ + draid_map_t map; + uint_t c; + + uint64_t worst_ratio = fnvlist_lookup_uint64(cfg, MAP_WORST_RATIO); + uint64_t avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO); + + map.dm_seed = fnvlist_lookup_uint64(cfg, MAP_SEED); + map.dm_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM); + map.dm_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN); + map.dm_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS); + nvlist_lookup_uint8_array(cfg, MAP_PERMS, &map.dm_perms, &c); + + dump_map(&map, key, (double)worst_ratio / 1000.0, + avg_ratio / 1000.0, verbose); +} + +/* + * Print a summary of the mapping. + */ +static int +dump_map_key(const char *filename, char *key, int verbose) +{ + nvlist_t *cfg; + int error; + + error = read_map_key(filename, key, &cfg); + if (error != 0) + return (error); + + dump_map_nv(key, cfg, verbose); + + return (0); +} + +/* + * Allocate a new permutation map for evaluation. + */ +static int +alloc_new_map(uint64_t children, uint64_t nperms, uint64_t seed, + draid_map_t **mapp) +{ + draid_map_t *map; + int error; + + map = malloc(sizeof (draid_map_t)); + if (map == NULL) + return (ENOMEM); + + map->dm_children = children; + map->dm_nperms = nperms; + map->dm_seed = seed; + map->dm_checksum = 0; + + error = vdev_draid_generate_perms(map, &map->dm_perms); + if (error) { + free(map); + return (error); + } + + *mapp = map; + + return (0); +} + +/* + * Allocate the fixed permutation map for N children. 
+ */ +static int +alloc_fixed_map(uint64_t children, draid_map_t **mapp) +{ + const draid_map_t *fixed_map; + draid_map_t *map; + int error; + + error = vdev_draid_lookup_map(children, &fixed_map); + if (error) + return (error); + + map = malloc(sizeof (draid_map_t)); + if (map == NULL) + return (ENOMEM); + + memcpy(map, fixed_map, sizeof (draid_map_t)); + VERIFY3U(map->dm_checksum, !=, 0); + + error = vdev_draid_generate_perms(map, &map->dm_perms); + if (error) { + free(map); + return (error); + } + + *mapp = map; + + return (0); +} + +/* + * Free a permutation map. + */ +static void +free_map(draid_map_t *map) +{ + free(map->dm_perms); + free(map); +} + +/* + * Check if dev is in the provided list of faulted devices. + */ +static inline boolean_t +is_faulted(int *faulted_devs, int nfaulted, int dev) +{ + for (int i = 0; i < nfaulted; i++) + if (faulted_devs[i] == dev) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Evaluate how resilvering I/O will be distributed given a list of faulted + * vdevs. As a simplification we assume one IO is sufficient to repair each + * damaged device in a group. + */ +static double +eval_resilver(draid_map_t *map, uint64_t groupwidth, uint64_t nspares, + int *faulted_devs, int nfaulted, int *min_child_ios, int *max_child_ios) +{ + uint64_t children = map->dm_children; + uint64_t ngroups = 1; + uint64_t ndisks = children - nspares; + + /* + * Calculate the minimum number of groups required to fill a slice. 
+ */ + while (ngroups * (groupwidth) % (children - nspares) != 0) + ngroups++; + + int *ios = calloc(map->dm_children, sizeof (uint64_t)); + + /* Resilver all rows */ + for (int i = 0; i < map->dm_nperms; i++) { + uint8_t *row = &map->dm_perms[i * map->dm_children]; + + /* Resilver all groups with faulted drives */ + for (int j = 0; j < ngroups; j++) { + uint64_t spareidx = map->dm_children - nspares; + boolean_t repair_needed = B_FALSE; + + /* See if any devices in this group are faulted */ + uint64_t groupstart = (j * groupwidth) % ndisks; + + for (int k = 0; k < groupwidth; k++) { + uint64_t groupidx = (groupstart + k) % ndisks; + + repair_needed = is_faulted(faulted_devs, + nfaulted, row[groupidx]); + if (repair_needed) + break; + } + + if (repair_needed == B_FALSE) + continue; + + /* + * This group is degraded. Calculate the number of + * reads the non-faulted drives require and the number + * of writes to the distributed hot spare for this row. + */ + for (int k = 0; k < groupwidth; k++) { + uint64_t groupidx = (groupstart + k) % ndisks; + + if (!is_faulted(faulted_devs, nfaulted, + row[groupidx])) { + ios[row[groupidx]]++; + } else if (nspares > 0) { + while (is_faulted(faulted_devs, + nfaulted, row[spareidx])) { + spareidx++; + } + + ASSERT3U(spareidx, <, map->dm_children); + ios[row[spareidx]]++; + spareidx++; + } + } + } + } + + *min_child_ios = INT_MAX; + *max_child_ios = 0; + + /* + * Find the drives with fewest and most required I/O. These values + * are used to calculate the imbalance ratio. To avoid returning an + * infinite value for permutations which have children that perform + * no IO a floor of 1 IO per child is set. This ensures a meaningful + * ratio is returned for comparison and it is not an uncommon when + * there are a large number of children. 
+ */ + for (int i = 0; i < map->dm_children; i++) { + + if (is_faulted(faulted_devs, nfaulted, i)) { + ASSERT0(ios[i]); + continue; + } + + if (ios[i] == 0) + ios[i] = 1; + + if (ios[i] < *min_child_ios) + *min_child_ios = ios[i]; + + if (ios[i] > *max_child_ios) + *max_child_ios = ios[i]; + } + + ASSERT3S(*min_child_ios, !=, INT_MAX); + ASSERT3S(*max_child_ios, !=, 0); + + double ratio = (double)(*max_child_ios) / (double)(*min_child_ios); + + free(ios); + + return (ratio); +} + +/* + * Evaluate the quality of the permutation mapping by considering possible + * device failures. Returns the imbalance ratio for the worst mapping which + * is defined to be the largest number of child IOs over the fewest number + * child IOs. A value of 1.0 indicates the mapping is perfectly balance and + * all children perform an equal amount of work during reconstruction. + */ +static void +eval_decluster(draid_map_t *map, double *worst_ratiop, double *avg_ratiop) +{ + uint64_t children = map->dm_children; + double worst_ratio = 1.0; + double sum = 0; + int worst_min_ios = 0, worst_max_ios = 0; + int n = 0; + + /* + * When there are only 2 children there can be no distributed + * spare and no resilver to evaluate. Default to a ratio of 1.0 + * for this degenerate case. + */ + if (children == VDEV_DRAID_MIN_CHILDREN) { + *worst_ratiop = 1.0; + *avg_ratiop = 1.0; + return; + } + + /* + * Score the mapping as if it had either 1 or 2 distributed spares. + */ + for (int nspares = 1; nspares <= 2; nspares++) { + uint64_t faults = nspares; + + /* + * Score groupwidths up to 19. This value was choosen as the + * largest reasonable width (16d+3p). dRAID pools may be still + * be created with wider stripes but they are not considered in + * this analysis in order to optimize for the most common cases. + */ + for (uint64_t groupwidth = 2; + groupwidth <= MIN(children - nspares, 19); + groupwidth++) { + int faulted_devs[2]; + int min_ios, max_ios; + + /* + * Score possible devices faults. 
This is limited + * to exactly one fault per distributed spare for + * the purposes of this similation. + */ + for (int f1 = 0; f1 < children; f1++) { + faulted_devs[0] = f1; + double ratio; + + if (faults == 1) { + ratio = eval_resilver(map, groupwidth, + nspares, faulted_devs, faults, + &min_ios, &max_ios); + + if (ratio > worst_ratio) { + worst_ratio = ratio; + worst_min_ios = min_ios; + worst_max_ios = max_ios; + } + + sum += ratio; + n++; + } else if (faults == 2) { + for (int f2 = f1 + 1; f2 < children; + f2++) { + faulted_devs[1] = f2; + + ratio = eval_resilver(map, + groupwidth, nspares, + faulted_devs, faults, + &min_ios, &max_ios); + + if (ratio > worst_ratio) { + worst_ratio = ratio; + worst_min_ios = min_ios; + worst_max_ios = max_ios; + } + + sum += ratio; + n++; + } + } + } + } + } + + *worst_ratiop = worst_ratio; + *avg_ratiop = sum / n; + + /* + * Log the min/max io values for particularly unbalanced maps. + * Since the maps are generated entirely randomly these are possible + * be exceedingly unlikely. We log it for possible investigation. + */ + if (worst_ratio > 100.0) { + dump_map(map, "DEBUG", worst_ratio, *avg_ratiop, 2); + printf("worst_min_ios=%d worst_max_ios=%d\n", + worst_min_ios, worst_max_ios); + } +}
+
+/*
+ * Evaluate 'passes' randomly generated permutation maps for the given
+ * number of children and keep the one with the lowest worst-case ratio
+ * as reported by eval_decluster().
+ *
+ * On success 0 is returned, the winning map (with dm_checksum filled in)
+ * is stored in *best_mapp, and its worst and average ratios are stored in
+ * *best_ratiop and *avg_ratiop. The caller releases the map with
+ * free_map(). EINVAL is returned when passes is less than one, since no
+ * map would be generated; otherwise any alloc_new_map() error is returned.
+ */
+static int
+eval_maps(uint64_t children, int passes, uint64_t *map_seed,
+    draid_map_t **best_mapp, double *best_ratiop, double *avg_ratiop)
+{
+	draid_map_t *best_map = NULL;
+	double best_worst_ratio = 1000.0;
+	double best_avg_ratio = 1000.0;
+
+	/*
+	 * Without at least one pass there is no map to checksum or return.
+	 */
+	if (passes < 1)
+		return (EINVAL);
+
+	/*
+	 * Perform the requested number of passes evaluating randomly
+	 * generated permutation maps. Only the best version is kept.
+	 */
+	for (int i = 0; i < passes; i++) {
+		double worst_ratio, avg_ratio;
+		draid_map_t *map;
+		int error;
+
+		/*
+		 * Calculate the next seed and generate a new candidate map.
+		 */
+		error = alloc_new_map(children, MAP_ROWS_DEFAULT,
+		    vdev_draid_rand(map_seed), &map);
+		if (error) {
+			/* Do not leak the best map found so far. */
+			if (best_map != NULL)
+				free_map(best_map);
+			return (error);
+		}
+
+		/*
+		 * Consider maps with a lower worst_ratio to be of higher
+		 * quality. Some maps may have a lower avg_ratio but they
+		 * are discarded since they might include some particularly
+		 * imbalanced permutations. The average is tracked in
+		 * order to get a sense of the average permutation quality.
+		 */
+		eval_decluster(map, &worst_ratio, &avg_ratio);
+
+		if (best_map == NULL || worst_ratio < best_worst_ratio) {
+
+			if (best_map != NULL)
+				free_map(best_map);
+
+			best_map = map;
+			best_worst_ratio = worst_ratio;
+			best_avg_ratio = avg_ratio;
+		} else {
+			free_map(map);
+		}
+	}
+
+	/*
+	 * After determining the best map generate a checksum over the full
+	 * permutation array. This checksum is verified when opening a dRAID
+	 * pool to ensure the generated in memory permutations are correct.
+	 */
+	zio_cksum_t cksum;
+	fletcher_4_native_varsize(best_map->dm_perms,
+	    sizeof (uint8_t) * best_map->dm_children * best_map->dm_nperms,
+	    &cksum);
+	best_map->dm_checksum = cksum.zc_word[0];
+
+	*best_mapp = best_map;
+	*best_ratiop = best_worst_ratio;
+	*avg_ratiop = best_avg_ratio;
+
+	return (0);
+}
+
+/*
+ * Generate permutation maps for every child count in the requested range
+ * and record the best map found for each count in FILE.
+ *
+ *	-c	restart from the minimum child count after each full run
+ *	-m	minimum number of children (at least VDEV_DRAID_MIN_CHILDREN)
+ *	-n	maximum number of children (at most VDEV_DRAID_MAX_CHILDREN)
+ *	-p	number of candidate maps evaluated per child count (>= 1)
+ *	-v	increase verbosity, may be repeated
+ *
+ * Returns 0 after a successful run without -c and 1 on any usage or
+ * runtime error. With -c the function only returns on error.
+ */
+static int
+draid_generate(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	uint64_t map_seed;
+	int c, fd, error, verbose = 0, passes = 1, continuous = 0;
+	int min_children = VDEV_DRAID_MIN_CHILDREN;
+	int max_children = VDEV_DRAID_MAX_CHILDREN;
+	int restarts = 0;
+
+	while ((c = getopt(argc, argv, ":cm:n:p:v")) != -1) {
+		switch (c) {
+		case 'c':
+			continuous++;
+			break;
+		case 'm':
+			min_children = (int)strtol(optarg, NULL, 0);
+			if (min_children < VDEV_DRAID_MIN_CHILDREN) {
+				(void) fprintf(stderr, "A minimum of 2 "
+				    "children are required.\n");
+				return (1);
+			}
+
+			break;
+		case 'n':
+			max_children = (int)strtol(optarg, NULL, 0);
+			if (max_children > VDEV_DRAID_MAX_CHILDREN) {
+				(void) fprintf(stderr, "A maximum of %d "
+				    "children are allowed.\n",
+				    VDEV_DRAID_MAX_CHILDREN);
+				return (1);
+			}
+			break;
+		case 'p':
+			passes = (int)strtol(optarg, NULL, 0);
+			/* eval_maps() needs at least one candidate map. */
+			if (passes < 1) {
+				(void) fprintf(stderr, "At least 1 pass "
+				    "is required.\n");
+				return (1);
+			}
+			break;
+		case 'v':
+			/*
+			 * 0 - Only log when a better map is added to the file.
+			 * 1 - Log the current best map for each child count.
+			 *     Minimal output on a single summary line.
+			 * 2 - Log the current best map for each child count.
+			 *     More verbose includes most map fields.
+			 * 3 - Log the current best map for each child count.
+			 *     Very verbose all fields including the full map.
+			 */
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	/*
+	 * An empty range would silently do nothing, and with -c it would
+	 * restart forever without generating a single map.
+	 */
+	if (min_children > max_children) {
+		(void) fprintf(stderr, "The minimum number of children (%d) "
+		    "exceeds the maximum (%d).\n", min_children, max_children);
+		return (1);
+	}
+
+	if (argc > optind) {
+		bzero(filename, MAXPATHLEN);
+		strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+restart:
+	/*
+	 * Start with a fresh seed from /dev/urandom.
+	 */
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0) {
+		printf("Unable to open /dev/urandom: %s\n", strerror(errno));
+		return (1);
+	} else {
+		ssize_t bytes = sizeof (map_seed);
+		ssize_t bytes_read = 0;
+
+		while (bytes_read < bytes) {
+			ssize_t rc = read(fd, ((char *)&map_seed) + bytes_read,
+			    bytes - bytes_read);
+
+			/* An interrupted read is simply retried. */
+			if (rc < 0 && errno == EINTR)
+				continue;
+
+			/*
+			 * A return of zero makes no progress, treat it as
+			 * an error rather than looping forever.
+			 */
+			if (rc <= 0) {
+				printf("Unable to read /dev/urandom: %s\n",
+				    rc == 0 ? "unexpected end of file" :
+				    strerror(errno));
+				(void) close(fd);
+				return (1);
+			}
+			bytes_read += rc;
+		}
+
+		(void) close(fd);
+	}
+
+	if (restarts == 0)
+		printf("Writing generated mappings to '%s':\n", filename);
+
+	/*
+	 * Generate maps for all requested child counts. The best map for
+	 * each child count is written out to the specified file. If the file
+	 * already contains a better mapping this map will not be added.
+	 */
+	for (uint64_t children = min_children;
+	    children <= max_children; children++) {
+		char key[8] = { 0 };
+		draid_map_t *map;
+		double worst_ratio = 1000.0;
+		double avg_ratio = 1000.0;
+
+		error = eval_maps(children, passes, &map_seed, &map,
+		    &worst_ratio, &avg_ratio);
+		if (error) {
+			printf("Error eval_maps(): %s\n", strerror(error));
+			return (1);
+		}
+
+		if (worst_ratio < 1.0 || avg_ratio < 1.0) {
+			printf("Error ratio < 1.0: worst_ratio = %2.03f "
+			    "avg_ratio = %2.03f\n", worst_ratio, avg_ratio);
+			free_map(map);
+			return (1);
+		}
+
+		snprintf(key, sizeof (key) - 1, "%llu", (u_longlong_t)children);
+		error = write_map_key(filename, key, map, worst_ratio,
+		    avg_ratio);
+		if (error == 0) {
+			/* The new map was added to the file. */
+			dump_map(map, key, worst_ratio, avg_ratio,
+			    MAX(verbose, 1));
+		} else if (error == EEXIST) {
+			/* The existing map was preferable and kept. */
+			if (verbose > 0)
+				dump_map_key(filename, key, verbose);
+		} else {
+			printf("Error write_map_key(): %s\n", strerror(error));
+			free_map(map);
+			return (1);
+		}
+
+		free_map(map);
+	}
+
+	/*
+	 * When the continuous option is set restart at the minimum number of
+	 * children instead of exiting. This option is useful as a mechanism
+	 * to continuously try and refine the discovered permutations.
+	 */
+	if (continuous) {
+		restarts++;
+		printf("Restarting by request (-c): %d\n", restarts);
+		goto restart;
+	}
+
+	return (0);
+}
+
+/*
+ * Verify each map in the file by generating its in-memory permutation array
+ * and confirming its checksum is correct.
+ */
+static int
+draid_verify(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int n = 0, c, error, verbose = 1;
+	int check_ratios = 0;
+	/* One map is expected for every supported child count. */
+	int expected = VDEV_DRAID_MAX_CHILDREN - VDEV_DRAID_MIN_CHILDREN + 1;
+
+	while ((c = getopt(argc, argv, ":rv")) != -1) {
+		switch (c) {
+		case 'r':
+			check_ratios++;
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	if (argc > optind) {
+		char *abspath = malloc(MAXPATHLEN);
+		if (abspath == NULL)
+			return (ENOMEM);
+
+		/* Prefer the resolved path, fall back to the name as given. */
+		bzero(filename, MAXPATHLEN);
+		if (realpath(argv[optind], abspath) != NULL)
+			strncpy(filename, abspath, MAXPATHLEN - 1);
+		else
+			strncpy(filename, argv[optind], MAXPATHLEN - 1);
+
+		free(abspath);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	printf("Verifying permutation maps: '%s'\n", filename);
+
+	/*
+	 * Lookup hardcoded permutation map for each valid number of children
+	 * and verify a generated map has the correct checksum. Then compare
+	 * the generated map values with the nvlist map values read from the
+	 * reference file to cross-check the permutation.
+	 */
+	for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
+	    children <= VDEV_DRAID_MAX_CHILDREN;
+	    children++) {
+		draid_map_t *map;
+		char key[8];
+
+		bzero(key, sizeof (key));
+		snprintf(key, sizeof (key), "%llu", (u_longlong_t)children);
+
+		error = alloc_fixed_map(children, &map);
+		if (error) {
+			printf("Error alloc_fixed_map() failed: %s\n",
+			    error == ECKSUM ? "Invalid checksum" :
+			    strerror(error));
+			return (1);
+		}
+
+		uint64_t nv_seed, nv_checksum, nv_children, nv_nperms;
+		uint64_t nelems = 0;
+		uint8_t *nv_perms = NULL;
+		uint_t nv_nelems = 0;
+		nvlist_t *cfg;
+
+		error = read_map_key(filename, key, &cfg);
+		if (error != 0) {
+			printf("Error read_map_key() failed: %s\n",
+			    strerror(error));
+			free_map(map);
+			return (1);
+		}
+
+		nv_seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
+		nv_checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
+		nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
+		nv_nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
+
+		/*
+		 * Without the permutation array nv_perms would be left
+		 * uninitialized, so a failed lookup is fatal.
+		 */
+		error = nvlist_lookup_uint8_array(cfg, MAP_PERMS, &nv_perms,
+		    &nv_nelems);
+		if (error != 0) {
+			printf("Error nvlist_lookup_uint8_array() failed: "
+			    "%s\n", strerror(error));
+			free_map(map);
+			nvlist_free(cfg);
+			return (1);
+		}
+
+		/*
+		 * Compare draid_map_t and nvlist reference values.
+		 */
+		if (map->dm_seed != nv_seed) {
+			printf("Error different seeds: 0x%016llx != "
+			    "0x%016llx\n", (u_longlong_t)map->dm_seed,
+			    (u_longlong_t)nv_seed);
+			error = EINVAL;
+		}
+
+		if (map->dm_checksum != nv_checksum) {
+			printf("Error different checksums: 0x%016llx "
+			    "!= 0x%016llx\n",
+			    (u_longlong_t)map->dm_checksum,
+			    (u_longlong_t)nv_checksum);
+			error = EINVAL;
+		}
+
+		if (map->dm_children != nv_children) {
+			printf("Error different children: %llu "
+			    "!= %llu\n", (u_longlong_t)map->dm_children,
+			    (u_longlong_t)nv_children);
+			error = EINVAL;
+		}
+
+		if (map->dm_nperms != nv_nperms) {
+			printf("Error different nperms: %llu "
+			    "!= %llu\n", (u_longlong_t)map->dm_nperms,
+			    (u_longlong_t)nv_nperms);
+			error = EINVAL;
+		}
+
+		/*
+		 * Only walk the permutation arrays when both describe the
+		 * same number of entries, otherwise the comparison would
+		 * read past the end of the shorter array. A mismatch in
+		 * children or nperms has already been reported above.
+		 */
+		if (map->dm_children == nv_children &&
+		    map->dm_nperms == nv_nperms) {
+			nelems = nv_children * nv_nperms;
+
+			if ((uint64_t)nv_nelems != nelems) {
+				printf("Error different perms length: %llu "
+				    "!= %llu\n", (u_longlong_t)nelems,
+				    (u_longlong_t)nv_nelems);
+				error = EINVAL;
+				nelems = 0;
+			}
+		}
+
+		for (uint64_t i = 0; i < nelems; i++) {
+			if (map->dm_perms[i] != nv_perms[i]) {
+				printf("Error different perms[%llu]: "
+				    "%d != %d\n", (u_longlong_t)i,
+				    (int)map->dm_perms[i],
+				    (int)nv_perms[i]);
+				error = EINVAL;
+				break;
+			}
+		}
+
+		/*
+		 * For good measure recalculate the worst and average
+		 * ratios and confirm they match the nvlist values.
+		 */
+		if (check_ratios) {
+			uint64_t nv_worst_ratio, nv_avg_ratio;
+			double worst_ratio, avg_ratio;
+
+			eval_decluster(map, &worst_ratio, &avg_ratio);
+
+			nv_worst_ratio = fnvlist_lookup_uint64(cfg,
+			    MAP_WORST_RATIO);
+			nv_avg_ratio = fnvlist_lookup_uint64(cfg,
+			    MAP_AVG_RATIO);
+
+			if (worst_ratio < 1.0 || avg_ratio < 1.0) {
+				printf("Error ratio out of range %2.03f, "
+				    "%2.03f\n", worst_ratio, avg_ratio);
+				error = EINVAL;
+			}
+
+			/* The file stores the ratios scaled by 1000. */
+			if ((uint64_t)(worst_ratio * 1000.0) !=
+			    nv_worst_ratio) {
+				printf("Error different worst_ratio %2.03f "
+				    "!= %2.03f\n", (double)nv_worst_ratio /
+				    1000.0, worst_ratio);
+				error = EINVAL;
+			}
+
+			if ((uint64_t)(avg_ratio * 1000.0) != nv_avg_ratio) {
+				printf("Error different average_ratio %2.03f "
+				    "!= %2.03f\n", (double)nv_avg_ratio /
+				    1000.0, avg_ratio);
+				error = EINVAL;
+			}
+		}
+
+		if (error) {
+			free_map(map);
+			nvlist_free(cfg);
+			return (1);
+		}
+
+		if (verbose > 0) {
+			printf("- %llu children: good\n",
+			    (u_longlong_t)children);
+		}
+		n++;
+
+		free_map(map);
+		nvlist_free(cfg);
+	}
+
+	if (n != expected) {
+		printf("Error permutation maps missing: %d / %d checked\n",
+		    n, expected);
+		return (1);
+	}
+
+	printf("Successfully verified %d / %d permutation maps\n",
+	    n, expected);
+
+	return (0);
+}
+
+/*
+ * Dump the contents of the specified mapping(s) for inspection.
+ */
+static int
+draid_dump(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int c, error, verbose = 1;
+	int min_children = VDEV_DRAID_MIN_CHILDREN;
+	int max_children = VDEV_DRAID_MAX_CHILDREN;
+
+	/*
+	 * -m and -n bound the child counts to dump, -v may be repeated to
+	 * increase the verbosity passed to dump_map_key().
+	 */
+	while ((c = getopt(argc, argv, ":vm:n:")) != -1) {
+		switch (c) {
+		case 'm':
+			min_children = (int)strtol(optarg, NULL, 0);
+			/* Same lower bound as enforced by 'draid generate'. */
+			if (min_children < VDEV_DRAID_MIN_CHILDREN) {
+				(void) fprintf(stderr, "A minimum of 2 "
+				    "children are required.\n");
+				return (1);
+			}
+
+			break;
+		case 'n':
+			max_children = (int)strtol(optarg, NULL, 0);
+			if (max_children > VDEV_DRAID_MAX_CHILDREN) {
+				(void) fprintf(stderr, "A maximum of %d "
+				    "children are allowed.\n",
+				    VDEV_DRAID_MAX_CHILDREN);
+				return (1);
+			}
+			break;
+		case 'v':
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	if (argc > optind) {
+		bzero(filename, MAXPATHLEN);
+		strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	/*
+	 * Dump maps for the requested child counts.
+	 */
+	for (uint64_t children = min_children;
+	    children <= max_children; children++) {
+		char key[8] = { 0 };
+
+		/* The key is the child count rendered as a decimal string. */
+		snprintf(key, sizeof (key) - 1, "%llu", (u_longlong_t)children);
+		error = dump_map_key(filename, key, verbose);
+		if (error) {
+			printf("Error dump_map_key(): %s\n", strerror(error));
+			return (1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Print all of the mappings as a C formatted draid_map_t array. This table
+ * is found in the module/zcommon/zfs_draid.c file and is the definitive
+ * source for all mapping used by dRAID. It cannot be updated without
+ * changing the dRAID on disk format.
+ */
+static int
+draid_table(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int error;
+
+	if (argc > optind) {
+		bzero(filename, MAXPATHLEN);
+		strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	} else {
+		(void) fprintf(stderr, "A FILE must be specified.\n");
+		return (1);
+	}
+
+	printf("static const draid_map_t "
+	    "draid_maps[VDEV_DRAID_MAX_MAPS] = {\n");
+
+	for (uint64_t children = VDEV_DRAID_MIN_CHILDREN;
+	    children <= VDEV_DRAID_MAX_CHILDREN;
+	    children++) {
+		uint64_t seed, checksum, nv_children, nperms, avg_ratio;
+		nvlist_t *cfg;
+		char key[8];
+
+		bzero(key, sizeof (key));
+		snprintf(key, sizeof (key), "%llu", (u_longlong_t)children);
+
+		error = read_map_key(filename, key, &cfg);
+		if (error != 0) {
+			printf("Error read_map_key() failed: %s\n",
+			    strerror(error));
+			return (1);
+		}
+
+		/*
+		 * The stored child count is kept in its own variable so a
+		 * value read from the file can never move the loop counter.
+		 */
+		seed = fnvlist_lookup_uint64(cfg, MAP_SEED);
+		checksum = fnvlist_lookup_uint64(cfg, MAP_CHECKSUM);
+		nv_children = fnvlist_lookup_uint64(cfg, MAP_CHILDREN);
+		nperms = fnvlist_lookup_uint64(cfg, MAP_NPERMS);
+		avg_ratio = fnvlist_lookup_uint64(cfg, MAP_AVG_RATIO);
+
+		printf("\t{ %3llu, %3llu, 0x%016llx, 0x%016llx },\t"
+		    "/* %2.03f */\n", (u_longlong_t)nv_children,
+		    (u_longlong_t)nperms, (u_longlong_t)seed,
+		    (u_longlong_t)checksum, (double)avg_ratio / 1000.0);
+
+		nvlist_free(cfg);
+	}
+
+	printf("};\n");
+
+	return (0);
+}
+
+/*
+ * Merge every map found in 'srcfilename' into 'allcfgs'. A source map is
+ * added when 'allcfgs' has no entry for its key, or replaces the existing
+ * entry when its worst ratio is strictly lower. The number of maps added
+ * or replaced is stored in *mergedp.
+ *
+ * Returns 0 on success, otherwise the error from read_map() or from the
+ * nvlist lookup/add which failed.
+ */
+static int
+draid_merge_impl(nvlist_t *allcfgs, const char *srcfilename, int *mergedp)
+{
+	nvlist_t *srccfgs;
+	nvpair_t *elem = NULL;
+	int error, merged = 0;
+
+	error = read_map(srcfilename, &srccfgs);
+	if (error != 0)
+		return (error);
+
+	while ((elem = nvlist_next_nvpair(srccfgs, elem)) != NULL) {
+		uint64_t nv_worst_ratio;
+		uint64_t allcfg_worst_ratio;
+		nvlist_t *cfg, *allcfg;
+		char *key;
+
+		/* Only nested nvlists describe a map, skip anything else. */
+		if (nvpair_type(elem) != DATA_TYPE_NVLIST)
+			continue;
+
+		(void) nvpair_value_nvlist(elem, &cfg);
+		key = nvpair_name(elem);
+
+		nv_worst_ratio = fnvlist_lookup_uint64(cfg,
+		    MAP_WORST_RATIO);
+
+		error = nvlist_lookup_nvlist(allcfgs, key, &allcfg);
+		if (error == 0) {
+			allcfg_worst_ratio = fnvlist_lookup_uint64(
+			    allcfg, MAP_WORST_RATIO);
+
+			/* The existing map is at least as good, keep it. */
+			if (nv_worst_ratio >= allcfg_worst_ratio)
+				continue;
+
+			fnvlist_remove(allcfgs, key);
+		} else if (error != ENOENT) {
+			break;
+		}
+
+		/* Count the map only once it was actually added. */
+		error = nvlist_add_nvlist(allcfgs, key, cfg);
+		if (error != 0)
+			break;
+
+		merged++;
+	}
+
+	/* Released on every path, including the error exits above. */
+	nvlist_free(srccfgs);
+
+	*mergedp = merged;
+
+	return (error);
+}
+
+/*
+ * Merge the best map for each child count found in the listed files into
+ * a new file. This allows 'draid generate' to be run in parallel and for
+ * the resulting maps to be combined.
+ */
+static int
+draid_merge(int argc, char *argv[])
+{
+	char filename[MAXPATHLEN];
+	int c, error, total_merged = 0, verbose = 0;
+	nvlist_t *allcfgs;
+
+	while ((c = getopt(argc, argv, ":v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose++;
+			break;
+		case ':':
+			(void) fprintf(stderr,
+			    "missing argument for '%c' option\n", optopt);
+			draid_usage();
+			break;
+		case '?':
+			(void) fprintf(stderr, "invalid option '%c'\n",
+			    optopt);
+			draid_usage();
+			break;
+		}
+	}
+
+	/*
+	 * Count only the positional arguments left after option parsing.
+	 * argc alone also counts any flags, so it cannot guarantee that
+	 * argv[optind] (FILE) and the SRCs which follow it exist.
+	 */
+	if (argc - optind < 3) {
+		(void) fprintf(stderr,
+		    "A FILE and multiple SRCs must be specified.\n");
+		return (1);
+	}
+
+	bzero(filename, MAXPATHLEN);
+	strncpy(filename, argv[optind], MAXPATHLEN - 1);
+	optind++;
+
+	/* A missing destination file simply starts from an empty set. */
+	error = read_map(filename, &allcfgs);
+	if (error == ENOENT) {
+		allcfgs = fnvlist_alloc();
+	} else if (error != 0) {
+		printf("Error read_map(): %s\n", strerror(error));
+		return (1);
+	}
+
+	while (optind < argc) {
+		char srcfilename[MAXPATHLEN];
+		int merged = 0;
+
+		bzero(srcfilename, MAXPATHLEN);
+		strncpy(srcfilename, argv[optind], MAXPATHLEN - 1);
+
+		error = draid_merge_impl(allcfgs, srcfilename, &merged);
+		if (error) {
+			printf("Error draid_merge_impl(): %s\n",
+			    strerror(error));
+			nvlist_free(allcfgs);
+			return (1);
+		}
+
+		total_merged += merged;
+		printf("Merged %d key(s) from '%s' into '%s'\n", merged,
+		    srcfilename, filename);
+
+		optind++;
+	}
+
+	if (total_merged > 0) {
+		/*
+		 * NOTE(review): assumes write_map() returns 0 or an errno
+		 * value like read_map() does -- confirm against its
+		 * definition.
+		 */
+		error = write_map(filename, allcfgs);
+		if (error != 0) {
+			printf("Error write_map(): %s\n", strerror(error));
+			nvlist_free(allcfgs);
+			return (1);
+		}
+	}
+
+	printf("Merged a total of %d key(s) into '%s'\n", total_merged,
+	    filename);
+
+	nvlist_free(allcfgs);
+
+	return (0);
+}
+
+/*
+ * Dispatch to the handler for the requested subcommand. Each handler is
+ * passed the argument vector starting at the subcommand name so getopt()
+ * sees the subcommand as argv[0].
+ */
+int
+main(int argc, char *argv[])
+{
+	/* Never read argv[1] unless a subcommand was actually given. */
+	if (argc < 2) {
+		draid_usage();
+		return (1);
+	}
+
+	char *subcommand = argv[1];
+
+	if (strcmp(subcommand, "generate") == 0) {
+		return (draid_generate(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "verify") == 0) {
+		return (draid_verify(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "dump") == 0) {
+		return (draid_dump(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "table") == 0) {
+		return (draid_table(argc - 1, argv + 1));
+	} else if (strcmp(subcommand, "merge") == 0) {
+		return (draid_merge(argc - 1, argv + 1));
+	} else {
+		draid_usage();
+	}
+
+	/* Only reached for an unknown subcommand. */
+	return (1);
+}
diff --git a/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c index 63217104f3fe..b671af7d8f42 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c +++ b/sys/contrib/openzfs/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -796,7 +796,7 @@ zfs_ioc_input_tests(const char *pool) (void) snprintf(clonesnap, sizeof (clonesnap), "%s@snap", clone); (void) snprintf(backup, sizeof (backup), "%s/backup", pool); - err = lzc_create(dataset, DMU_OST_ZFS, NULL, NULL, 0); + err = lzc_create(dataset, LZC_DATSET_TYPE_ZFS, NULL, NULL, -1); if (err) { (void) fprintf(stderr, "could not create '%s': %s\n", dataset, strerror(errno)); diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib index b34f2c04d743..3f29d4f594a1 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib @@ -548,22 +548,37 @@ function list_file_blocks # input_file # # Establish a mapping between vdev ids as shown in
a DVA and the - # pathnames they correspond to in ${VDEV_MAP[]}. + # pathnames they correspond to in ${VDEV_MAP[][]}. + # + # The vdev bits in a DVA refer to the top level vdev id. + # ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev. # eval $(zdb -C $pool | awk ' - BEGIN { - printf("typeset VDEV_MAP\n"); - looking = 0; - } - /^ children/ { - id = $1; - looking = 1; - } - /path: / && looking == 1 { - print id" "$2; - looking = 0; - } - ' | sed -n 's/^children\[\([0-9]\)\]: \(.*\)$/VDEV_MAP[\1]=\2/p') + BEGIN { printf "typeset -a VDEV_MAP;" } + function subscript(s) { + # "[#]" is more convenient than the bare "#" + match(s, /\[[0-9]*\]/) + return substr(s, RSTART, RLENGTH) + } + id && !/^ / { + # left a top level vdev + id = 0 + } + id && $1 ~ /^path:$/ { + # found a vdev path; save it in the map + printf "VDEV_MAP%s%s=%s;", id, child, $2 + } + /^ children/ { + # entering a top level vdev + id = subscript($0) + child = "[0]" # default in case there is no nested vdev + printf "typeset -a VDEV_MAP%s;", id + } + /^ children/ { + # entering a nested vdev (e.g. child of a top level mirror) + child = subscript($0) + } + ') # # The awk below parses the output of zdb, printing out the level @@ -571,22 +586,40 @@ function list_file_blocks # input_file # two are converted to decimal in the while loop. 4M is added to # the offset to compensate for the first two labels and boot # block. Lastly, the offset and length are printed in units of - # 512b blocks for ease of use with dd. + # 512B blocks for ease of use with dd. 
# + typeset level vdev path offset length + if awk -n '' 2>/dev/null; then + # gawk needs -n to decode hex + AWK='awk -n' + else + AWK='awk' + fi log_must zpool sync -f - typeset level path offset length - zdb -ddddd $ds $objnum | awk -F: ' - BEGIN { looking = 0 } - /^Indirect blocks:/ { looking = 1} - /^\t\tsegment / { looking = 0} - /L[0-8]/ && looking == 1 { print $0} - ' | sed -n 's/^.*\(L[0-9]\) \([0-9]*\):\([0-9a-f]*\):\([0-9a-f]*\) .*$/\1 \2 \3 \4/p' | \ - while read level path offset length; do - offset=$((16#$offset)) # Conversion from hex - length=$((16#$length)) - offset="$(((offset + 4 * 1024 * 1024) / 512))" - length="$((length / 512))" - echo "$level ${VDEV_MAP[$path]} $offset $length" + zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 ' + /^$/ { looking = 0 } + looking { + level = $2 + field = 3 + while (split($field, dva, ":") == 3) { + # top level vdev id + vdev = int(dva[1]) + # offset + 4M label/boot pad in 512B blocks + offset = (int("0x"dva[2]) + pad) / bs + # length in 512B blocks + len = int("0x"dva[3]) / bs + + print level, vdev, offset, len + + ++field + } + } + /^Indirect blocks:/ { looking = 1 } + ' | \ + while read level vdev offset length; do + for path in ${VDEV_MAP[$vdev][@]}; do + echo "$level $path $offset $length" + done done 2>/dev/null } diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg index 5a507b94ab6c..299653547759 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg @@ -184,17 +184,20 @@ export ZFS_FILES='zdb arc_summary arcstat dbufstat + mount.zfs zed zgenhostid zstream zstreamdump - zfs_ids_to_path' + zfs_ids_to_path + zpool_influxdb' export ZFSTEST_FILES='badsend btree_test chg_usr_exec devname2devid dir_rd_update + draid file_check file_trunc file_write diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib 
b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib index dec723e9a477..d494eda5533f 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/libtest.shlib @@ -2336,7 +2336,7 @@ function check_pool_status # pool token keyword function is_pool_resilvering #pool { check_pool_status "$1" "scan" \ - "resilver[ ()0-9A-Za-z_-]* in progress since" $2 + "resilver[ ()0-9A-Za-z:_-]* in progress since" $2 return $? } @@ -4191,6 +4191,45 @@ function get_arcstat # stat esac } +# +# Wait for the specified arcstat to reach non-zero quiescence. +# If echo is 1 echo the value after reaching quiescence, otherwise +# if echo is 0 print the arcstat we are waiting on. +# +function arcstat_quiescence # stat echo +{ + typeset stat=$1 + typeset echo=$2 + typeset do_once=true + + if [[ $echo -eq 0 ]]; then + echo "Waiting for arcstat $1 quiescence." + fi + + while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do + typeset stat1=$(get_arcstat $stat) + sleep 2 + typeset stat2=$(get_arcstat $stat) + do_once=false + done + + if [[ $echo -eq 1 ]]; then + echo $stat2 + fi +} + +function arcstat_quiescence_noecho # stat +{ + typeset stat=$1 + arcstat_quiescence $stat 0 +} + +function arcstat_quiescence_echo # stat +{ + typeset stat=$1 + arcstat_quiescence $stat 1 +} + # # Given an array of pids, wait until all processes # have completed and check their return status. 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index fab852a0a607..e93e299ea25a 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -36,6 +36,7 @@ INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit +L2ARC_MFUONLY l2arc.mfuonly l2arc_mfuonly L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled @@ -58,6 +59,8 @@ MULTIHOST_HISTORY multihost.history zfs_multihost_history MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize +PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable +REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/Makefile.am index c56518c55a03..3a5b7b0b9747 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/Makefile.am @@ -31,6 +31,7 @@ SUBDIRS = \ inheritance \ inuse \ io \ + l2arc \ large_files \ largest_pool \ libzfs \ @@ -47,7 +48,6 @@ SUBDIRS = \ nopwrite \ online_offline \ pam \ - 
persist_l2arc \ pool_checkpoint \ pool_names \ poolversion \ @@ -82,6 +82,7 @@ SUBDIRS = \ vdev_zaps \ write_dirs \ xattr \ + zpool_influxdb \ zvol if BUILD_LINUX diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh index 1cfe6642d8a8..b49a8919ed8c 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh @@ -33,8 +33,9 @@ function file_in_special_vdev # { typeset dataset="$1" typeset inum="$2" + typeset num_normal=$(echo $ZPOOL_DISKS | wc -w | xargs) - zdb -dddddd $dataset $inum | awk '{ + zdb -dddddd $dataset $inum | awk -v d=$num_normal '{ # find DVAs from string "offset level dva" only for L0 (data) blocks if (match($0,"L0 [0-9]+")) { dvas[0]=$3 @@ -49,7 +50,7 @@ if (match($0,"L0 [0-9]+")) { exit 1; } # verify vdev is "special" - if (arr[1] < 3) { + if (arr[1] < d) { exit 1; } } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/Makefile.am index 3ad48ccd4d96..ddabc0302010 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/Makefile.am @@ -12,7 +12,8 @@ dist_pkgdata_SCRIPTS = \ run_edonr_test.ksh \ run_sha2_test.ksh \ run_skein_test.ksh \ - filetest_001_pos.ksh + filetest_001_pos.ksh \ + filetest_002_pos.ksh dist_pkgdata_DATA = \ default.cfg diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh index 0cad8047cdee..615b41f312b6 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh @@ -62,6 +62,7 @@ log_assert "Create and read back files with using different checksum algorithms" log_onexit cleanup WRITESZ=1048576 +NWRITES=5 # Get a list of vdevs in our pool set -A array $(get_disklist_fullpath) @@ -75,7 +76,7 @@ while [[ $i -lt ${#CHECKSUM_TYPES[*]} ]]; do type=${CHECKSUM_TYPES[i]} log_must zfs set checksum=$type $TESTPOOL log_must file_write -o overwrite -f $TESTDIR/test_$type \ - -b $WRITESZ -c 5 -d R + -b $WRITESZ -c $NWRITES -d R (( i = i + 1 )) done @@ -96,7 +97,7 @@ while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do type=${CHECKSUM_TYPES[$j]} log_must zfs set checksum=$type $TESTPOOL log_must file_write -o overwrite -f $TESTDIR/test_$type \ - -b $WRITESZ -c 5 -d R + -b $WRITESZ -c $NWRITES -d R # Corrupt the level 0 blocks of this file corrupt_blocks_at_level $TESTDIR/test_$type diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh new file mode 100644 index 000000000000..921a4b392a45 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/checksum/filetest_002_pos.ksh @@ -0,0 +1,91 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018, 2019 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/checksum/default.cfg + +# DESCRIPTION: +# Sanity test to make sure checksum algorithms work. +# For each checksum, create a file in the pool using that checksum. Verify +# that there are no checksum errors. Next, for each checksum, create a single +# file in the pool using that checksum, corrupt the file, and verify that we +# correctly catch the checksum errors. +# +# STRATEGY: +# Test 1 +# 1. For each checksum: +# 2. Create a file using the checksum +# 3. Corrupt all level 1 blocks in the file +# 4. Export and import the pool +# 5. Verify that there are checksum errors + +verify_runnable "both" + +function cleanup +{ + rm -fr $TESTDIR/* +} + +log_assert "Test corrupting files at L1 and seeing checksum errors" + +log_onexit cleanup + +WRITESZ=1048576 +NWRITES=5 + +# Get a list of vdevs in our pool +set -A array $(get_disklist_fullpath) + +# Get the first vdev, since we will corrupt it later +firstvdev=${array[0]} + +typeset -i j=1 +while [[ $j -lt ${#CHECKSUM_TYPES[*]} ]]; do + type=${CHECKSUM_TYPES[$j]} + log_must zfs set checksum=$type $TESTPOOL + log_must file_write -o overwrite -f $TESTDIR/test_$type \ + -b $WRITESZ -c $NWRITES -d R + + # Corrupt the level 1 blocks of this file + corrupt_blocks_at_level $TESTDIR/test_$type 1 + + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + + log_mustnot eval "cat $TESTDIR/test_$type >/dev/null" + + cksum=$(zpool status -P -v $TESTPOOL | grep "$firstvdev" | \ + awk '{print $5}') + + log_assert "Checksum '$type' caught $cksum checksum errors" + log_must [ $cksum -ne 0 ] + + rm -f $TESTDIR/test_$type + 
log_must zpool clear $TESTPOOL + + (( j = j + 1 )) +done diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh index 0c949f9839e1..8d677affb9fe 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh @@ -209,11 +209,11 @@ function histo_check_test_pool # 4096 blocksize count for asize. For verification we stick # to just lsize counts. # - # The max_variance is hard-coded here at 10%. testing so far - # has shown this to be in the range of 2%-8% so we leave a - # generous allowance... This might need changes in the future + # The max_variance is hard-coded here at 12% to leave us some + # margin. Testing has shown this normally to be in the range + # of 2%-8%, but it may be as large as 11%. ################### - let max_variance=10 + let max_variance=12 let fail_value=0 let error_count=0 log_note "Comparisons for ${pool}" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh index c88e300412f1..6e293ca63829 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh @@ -92,7 +92,7 @@ for val in 1 2 3; do check_used $used $val done -log_note "Verify df(1M) can correctly display the space charged." +log_note "Verify df(1) can correctly display the space charged." 
for val in 1 2 3; do if is_freebsd; then used=`df -m /$TESTPOOL/fs_$val | grep $TESTPOOL/fs_$val \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am index cb65507ae711..7515753c1bc2 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am @@ -19,6 +19,7 @@ dist_pkgdata_SCRIPTS = \ zfs_create_encrypted.ksh \ zfs_create_crypt_combos.ksh \ zfs_create_dryrun.ksh \ + zfs_create_nomount.ksh \ zfs_create_verbose.ksh dist_pkgdata_DATA = \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh index 0e580a8474ca..d0807ac8d176 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh @@ -51,6 +51,8 @@ function cleanup log_must zfs destroy -f ${datasets[$i]} ((i = i + 1)) done + + zfs destroy -f "$TESTPOOL/with a space" } log_onexit cleanup @@ -68,4 +70,8 @@ while (( $i < ${#datasets[*]} )); do ((i = i + 1)) done +log_must zfs create "$TESTPOOL/with a space" +log_must zfs unmount "$TESTPOOL/with a space" +log_must zfs mount "$TESTPOOL/with a space" + log_pass "'zfs create ' works as expected." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_nomount.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_nomount.ksh new file mode 100644 index 000000000000..e1fbbe63ad31 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_nomount.ksh @@ -0,0 +1,51 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020 iXsystems, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# zfs create -u should leave the new file system unmounted. +# It should not work for a volume. +# +# STRATEGY: +# 1. Create a file system using -u and make sure the file system is not mounted. +# 2. Do it for a volume to verify it fails.
+# + +verify_runnable "both" + +function cleanup +{ + local ds + + for ds in "$fs" "$vol"; do + datasetexists "$ds" && destroy_dataset "$ds" + done +} +log_onexit cleanup + +log_assert "zfs create -u leaves the new file system unmounted" + +typeset fs="$TESTPOOL/$TESTFS1" +typeset vol="$TESTPOOL/$TESTVOL1" + +log_must create_dataset "$fs" "-u" +log_mustnot ismounted "$fs" + +log_mustnot zfs create -V $VOLSIZE -u "$vol" + +log_pass "zfs create -u leaves the new file system unmounted" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am index 37c0942381cc..8c90b2e75e5a 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am @@ -14,6 +14,8 @@ dist_pkgdata_SCRIPTS = \ zfs_mount_010_neg.ksh \ zfs_mount_011_neg.ksh \ zfs_mount_012_pos.ksh \ + zfs_mount_013_pos.ksh \ + zfs_mount_014_neg.ksh \ zfs_mount_all_001_pos.ksh \ zfs_mount_all_fail.ksh \ zfs_mount_all_mountpoints.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib index bd45fabbcde9..85566e565319 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib @@ -66,7 +66,8 @@ function setup_filesystem #disklist #pool #fs #mntpoint #type #vdev if [[ $vdev != "" && \ $vdev != "mirror" && \ - $vdev != "raidz" ]] ; then + $vdev != "raidz" && \ + $vdev != "draid" ]] ; then log_note "Wrong vdev: (\"$vdev\")" return 1 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh new file mode 100644 index 000000000000..810a69470d34 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib + +# +# DESCRIPTION: +# Verify zfs mount helper functions for both devices and pools. +# + +verify_runnable "both" + +set -A vdevs $(get_disklist_fullpath $TESTPOOL) +typeset -r mntpoint=$(get_prop mountpoint $TESTPOOL) +typeset -r helper="mount.zfs -o zfsutil" +typeset -r fs=$TESTPOOL/$TESTFS + +function cleanup +{ + cd $STF_SUITE + [[ -d $TESTDIR/$$ ]] && (rm -rf $TESTDIR/$$ || log_fail) + mounted && zfs $mountcmd $TESTPOOL + return 0 +} +log_onexit cleanup + +log_note "Verify zfs mount helper functions for both devices and pools" + +# Ensure that the ZFS filesystem is unmounted +force_unmount $TESTPOOL + +log_note "Verify ' '" +log_must $helper $fs $mntpoint +log_must ismounted $fs +force_unmount $fs + +log_note "Verify mount(8) does not canonicalize before calling helper" +# Canonicalization is confused by files in PWD matching [device|mountpoint] +mkdir -p $TESTDIR/$$/$TESTPOOL && cd $TESTDIR/$$ || log_fail +# The env flag directs zfs to exec /bin/mount, which then calls helper +log_must eval ZFS_MOUNT_HELPER=1 zfs $mountcmd -v $TESTPOOL +# mount (2.35.2) still suffers from a cosmetic PWD prefix bug +log_must mounted 
$TESTPOOL +force_unmount $TESTPOOL + +log_note "Verify '-f ' fakemount" +log_must $helper -f $fs $mntpoint +log_mustnot ismounted $fs + +log_note "Verify '-o ro -v ' verbose RO" +log_must ${helper},ro -v $fs $mntpoint +log_must ismounted $fs +force_unmount $fs + +log_note "Verify '-o abc -s ' sloppy option" +log_must ${helper},abc -s ${vdevs[0]} $mntpoint +log_must mounted $mntpoint +force_unmount $TESTPOOL + +log_note "Verify ' '" +log_must $helper ${vdevs[0]} $mntpoint +log_must mounted $mntpoint + +log_pass "zfs mount helper correctly handles both device and pool strings" \ No newline at end of file diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh new file mode 100644 index 000000000000..5cf0bc7b3a05 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib + +# +# DESCRIPTION: +# Verify zfs mount helper failure on known bad parameters +# + +verify_runnable "both" + +set -A vdevs $(get_disklist_fullpath $TESTPOOL) +vdev=${vdevs[0]} + +mntpoint="$(get_prop mountpoint $TESTPOOL)" +helper="mount.zfs -o zfsutil" +fs=$TESTPOOL/$TESTFS + +function cleanup +{ + log_must force_unmount $vdev + return 0 +} +log_onexit cleanup + +log_note "Verify zfs mount helper failure on known bad parameters" + +# Ensure that the ZFS filesystem is unmounted. +force_unmount $fs + +log_note "Verify failure without '-o zfsutil'" +log_mustnot mount.zfs $fs $mntpoint + +log_note "Verify '-o abc ' bad option fails" +log_mustnot ${helper},abc $vdev $mntpoint + +log_note "Verify '\$NONEXISTFSNAME ' fails" +log_mustnot $helper $NONEXISTFSNAME $mntpoint + +log_note "Verify ' (\$NONEXISTFSNAME|/dev/null)' fails" +log_mustnot $helper $fs $NONEXISTFSNAME +log_mustnot $helper $fs /dev/null + +log_note "Verify '/dev/null ' fails" +log_mustnot $helper /dev/null $mntpoint + +log_note "Verify '[device|pool]' fails" +log_mustnot mount.zfs +log_mustnot $helper +log_mustnot $helper $vdev +log_mustnot $helper $TESTPOOL + +log_pass "zfs mount helper fails when expected" \ No newline at end of file diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh index 3e6a24bbcda3..faeae4227acd 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh @@ -109,6 +109,8 @@ function cleanup_all export __ZFS_POOL_RESTRICT="$TESTPOOL" log_must zfs $unmountall unset __ZFS_POOL_RESTRICT + # make sure we leave $TESTPOOL mounted + log_must zfs mount $TESTPOOL for fs in ${filesystems[@]}; do 
cleanup_filesystem "$TESTPOOL" "$fs" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh index bc45820a1bad..762436678dcb 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh @@ -29,7 +29,7 @@ # # DESCRIPTION: # Verify that 'zfs set sharenfs=on', 'zfs share', and 'zfs unshare' can -# run concurrently. The test creates 300 filesystem and 300 threads. +# run concurrently. The test creates 50 filesystem and 50 threads. # Each thread will run through the test strategy in parallel. # # STRATEGY: @@ -47,7 +47,7 @@ verify_runnable "global" function cleanup { wait - for fs in $(seq 0 100) + for fs in $(seq 0 50) do log_must zfs set sharenfs=off $TESTPOOL/$TESTFS1/$fs log_must zfs set sharenfs=off $TESTPOOL/$TESTFS2/$fs @@ -79,7 +79,7 @@ function cleanup function create_filesystems { - for fs in $(seq 0 100) + for fs in $(seq 0 50) do log_must zfs create -p $TESTPOOL/$TESTFS1/$fs log_must zfs create -p $TESTPOOL/$TESTFS2/$fs @@ -137,7 +137,7 @@ log_onexit cleanup create_filesystems child_pids=() -for fs in $(seq 0 100) +for fs in $(seq 0 50) do test_share $TESTPOOL/$TESTFS1/$fs & child_pids+=($!) @@ -158,7 +158,7 @@ log_note "Verify 'zfs share -a' succeeds." # Unshare each of the file systems. # child_pids=() -for fs in $(seq 0 100) +for fs in $(seq 0 50) do unshare_fs $TESTPOOL/$TESTFS1/$fs & child_pids+=($!) 
@@ -181,7 +181,7 @@ log_must zfs share -a # unset __ZFS_POOL_EXCLUDE -for fs in $(seq 0 100) +for fs in $(seq 0 50) do is_shared $TESTPOOL/$TESTFS1/$fs || \ log_fail "File system $TESTPOOL/$TESTFS1/$fs is not shared" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh index f0682b816ae8..a20fcc4ce224 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh @@ -12,6 +12,7 @@ # # Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 by Datto Inc. All rights reserved. # # @@ -23,6 +24,8 @@ # 2. Create multiple snapshots with a list of valid and invalid # snapshot names # 3. Verify the valid snapshot creation +# 4. Verify creation of snapshots report the correct numbers by +# performing a snapshot directory listing . 
$STF_SUITE/include/libtest.shlib @@ -34,6 +37,7 @@ function cleanup datasetexists $ds && log_must zfs destroy -r $ds done zfs destroy -r $TESTPOOL/TESTFS4 + zfs destroy -r $TESTPOOL/TESTFS5 } datasets="$TESTPOOL/$TESTFS1 $TESTPOOL/$TESTFS2 $TESTPOOL/$TESTFS3" @@ -112,4 +116,17 @@ log_must zfs rename $TESTPOOL/$TESTFS3/TESTFSA$DATASET_XXX \ log_must zfs snapshot -r $TESTPOOL/$TESTFS1@snap1 $TESTPOOL/$TESTFS2@snap1 \ $TESTPOOL/$TESTFS3@snap1 $TESTPOOL/TESTFS4@snap1 +MYTEST="TESTFS5" +ITERATIONS=10 +NUM_SNAPS=5 +for x in {1..$ITERATIONS}; do + log_must zfs create $TESTPOOL/$MYTEST + for y in {1..$NUM_SNAPS}; do + log_must zfs snapshot $TESTPOOL/$MYTEST@$y + done; + n=$(ls -1 /$TESTPOOL/$MYTEST/.zfs/snapshot | wc -l) + verify_eq $n $NUM_SNAPS "count" + zfs destroy -r $TESTPOOL/$MYTEST; +done; + log_pass "zfs multiple snapshot verified correctly" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am index a7f62b6f9f79..8d54d13f7207 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am @@ -14,7 +14,8 @@ dist_pkgdata_SCRIPTS = \ zpool_add_010_pos.ksh \ add-o_ashift.ksh \ add_prop_ashift.ksh \ - add_nested_replacing_spare.ksh + add_nested_replacing_spare.ksh \ + zpool_add_dryrun_output.ksh dist_pkgdata_DATA = \ zpool_add.cfg \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh index aa50de3be290..191ec839a955 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh @@ -55,23 +55,26 @@ log_assert "'zpool add ...' 
can add devices to the pool." log_onexit cleanup -set -A keywords "" "mirror" "raidz" "raidz1" "spare" +set -A keywords "" "mirror" "raidz" "raidz1" "draid:1s" "draid1:1s" "spare" pooldevs="${DISK0} \ \"${DISK0} ${DISK1}\" \ \"${DISK0} ${DISK1} ${DISK2}\"" mirrordevs="\"${DISK0} ${DISK1}\"" raidzdevs="\"${DISK0} ${DISK1}\"" +draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" disk0=$TEST_BASE_DIR/disk0 disk1=$TEST_BASE_DIR/disk1 -truncate -s $MINVDEVSIZE $disk0 $disk1 +disk2=$TEST_BASE_DIR/disk2 +truncate -s $MINVDEVSIZE $disk0 $disk1 $disk2 typeset -i i=0 typeset vdev eval set -A poolarray $pooldevs eval set -A mirrorarray $mirrordevs eval set -A raidzarray $raidzdevs +eval set -A draidarray $draiddevs while (( $i < ${#keywords[*]} )); do @@ -107,6 +110,19 @@ while (( $i < ${#keywords[*]} )); do destroy_pool "$TESTPOOL" done + ;; + draid:1s|draid1:1s) + for vdev in "${draidarray[@]}"; do + create_pool "$TESTPOOL" "${keywords[i]}" \ + "$disk0" "$disk1" "$disk2" + log_must poolexists "$TESTPOOL" + log_must zpool add "$TESTPOOL" ${keywords[i]} $vdev + log_must vdevs_in_pool "$TESTPOOL" "$vdev" + log_must vdevs_in_pool "$TESTPOOL" "draid1-0-0" + log_must vdevs_in_pool "$TESTPOOL" "draid1-1-0" + destroy_pool "$TESTPOOL" + done + ;; esac diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh index f270041308b7..a6b03ff3257f 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh @@ -61,7 +61,7 @@ log_onexit cleanup typeset TMPFILE_PREFIX="$TEST_BASE_DIR/zpool_add_003" typeset STR_DRYRUN="would update '$TESTPOOL' to the following configuration:" typeset VDEV_PREFIX="$TEST_BASE_DIR/filedev" -typeset -a VDEV_TYPES=("" "dedup" "special" "log" "cache") +typeset -a VDEV_TYPES=("" "dedup" 
"special" "log" "cache" "spare") vdevs="" config="" @@ -91,7 +91,7 @@ log_must zpool add -f $TESTPOOL $config zpool status $TESTPOOL | awk 'NR == 1, /NAME/ { next } /^$/ {exit} {print $1}' > "$TMPFILE_PREFIX-vdevtree" cat "$TMPFILE_PREFIX-dryrun" | awk 'NR == 1, /would/ {next} - {print $1}' > "$TMPFILE_PREFIX-vdevtree-n" + /^$/ {next} {print $1}' > "$TMPFILE_PREFIX-vdevtree-n" log_must eval "diff $TMPFILE_PREFIX-vdevtree-n $TMPFILE_PREFIX-vdevtree" log_pass "'zpool add -n ...' executes successfully." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh new file mode 100644 index 000000000000..dbf81262ee1a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh @@ -0,0 +1,175 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Attila Fülöp +# + +. $STF_SUITE/include/libtest.shlib + +typeset STR_DRYRUN="would update '$TESTPOOL' to the following configuration:" +typeset VDEV_PREFIX="$TEST_BASE_DIR/filedev" + +# +# DESCRIPTION: +# 'zpool add -n ...' 
can display the correct configuration +# +# STRATEGY: +# 1. Create different storage pools, use -n to add devices to the pool and +# verify the output is as expected. +# 2. Create a pool with a hole vdev and verify it's not listed with add -n. +# + +typeset -a dev=( + "${VDEV_PREFIX}00" "${VDEV_PREFIX}01" "${VDEV_PREFIX}02" + "${VDEV_PREFIX}03" "${VDEV_PREFIX}04" "${VDEV_PREFIX}05" + "${VDEV_PREFIX}06" "${VDEV_PREFIX}07" "${VDEV_PREFIX}08" + "${VDEV_PREFIX}09" "${VDEV_PREFIX}10" "${VDEV_PREFIX}11" +) + +typeset -a tests=( + ( + tree="'${dev[0]}' log '${dev[1]}' special '${dev[2]}' dedup '${dev[3]}'" + add="spare '${dev[4]}' cache '${dev[5]}'" + want="$STR_DRYRUN + + $TESTPOOL + ${dev[0]} + dedup + ${dev[3]} + special + ${dev[2]} + logs + ${dev[1]} + cache + ${dev[5]} + spares + ${dev[4]}" + ) + ( + tree="'${dev[0]}' log '${dev[1]}' special '${dev[2]}' dedup '${dev[3]}' \ + spare '${dev[4]}' cache '${dev[5]}'" + + add="'${dev[6]}' log '${dev[7]}' special '${dev[8]}' dedup '${dev[9]}' \ + spare '${dev[10]}' cache '${dev[11]}'" + + want="$STR_DRYRUN + + $TESTPOOL + ${dev[0]} + ${dev[6]} + dedup + ${dev[3]} + ${dev[9]} + special + ${dev[2]} + ${dev[8]} + logs + ${dev[1]} + ${dev[7]} + cache + ${dev[5]} + ${dev[11]} + spares + ${dev[4]} + ${dev[10]}" + ) + ( + tree="mirror '${dev[0]}' '${dev[1]}' \ + log mirror '${dev[2]}' '${dev[3]}' \ + dedup mirror '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}'" + + add="special mirror '${dev[4]}' '${dev[5]}' \ + spare '${dev[9]}' cache '${dev[10]}' '${dev[11]}'" + + want="$STR_DRYRUN + + $TESTPOOL + mirror-0 + ${dev[0]} + ${dev[1]} + dedup + mirror + ${dev[6]} + ${dev[7]} + special + mirror + ${dev[4]} + ${dev[5]} + logs + mirror + ${dev[2]} + ${dev[3]} + cache + ${dev[10]} + ${dev[11]} + spares + ${dev[8]} + ${dev[9]}" + ) +) + +verify_runnable "global" + +function cleanup +{ + destroy_pool "$TESTPOOL" + rm -f "$VDEV_PREFIX"* +} + +log_assert "'zpool add -n ...' 
can display the configuration" + +log_onexit cleanup + +# Create needed file vdevs. +for (( i=0; i < ${#dev[@]}; i+=1 )); do + log_must truncate -s $SPA_MINDEVSIZE "${dev[$i]}" +done + +# For each test create pool, add -n devices and check output. +for (( i=0; i < ${#tests[@]}; i+=1 )); do + typeset tree="${tests[$i].tree}" + typeset add="${tests[$i].add}" + typeset want="${tests[$i].want}" + + log_must eval zpool create "$TESTPOOL" $tree + log_must poolexists "$TESTPOOL" + typeset out="$(log_must eval "zpool add -n '$TESTPOOL' $add" | \ + sed /^SUCCESS/d)" + + if [[ "$out" != "$want" ]]; then + log_fail "Got:\n" "$out" "\nbut expected:\n" "$want" + fi + log_must destroy_pool "$TESTPOOL" +done + +# Make sure hole vdevs are skipped in output. +log_must eval "zpool create '$TESTPOOL' '${dev[0]}' log '${dev[1]}' \ + cache '${dev[2]}'" + +# Create a hole vdev. +log_must eval "zpool remove '$TESTPOOL' '${dev[1]}'" +log_mustnot eval "zpool add -n '$TESTPOOL' '${dev[1]}' | \ + grep -qE '[[:space:]]+hole'" + +log_pass "'zpool add -n ...' displays config correctly." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am index 3c595935a1f2..ea0cc49b0945 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am @@ -27,14 +27,20 @@ dist_pkgdata_SCRIPTS = \ zpool_create_024_pos.ksh \ zpool_create_encrypted.ksh \ zpool_create_crypt_combos.ksh \ + zpool_create_draid_001_pos.ksh \ + zpool_create_draid_002_pos.ksh \ + zpool_create_draid_003_pos.ksh \ + zpool_create_draid_004_pos.ksh \ zpool_create_features_001_pos.ksh \ zpool_create_features_002_pos.ksh \ zpool_create_features_003_pos.ksh \ zpool_create_features_004_neg.ksh \ zpool_create_features_005_pos.ksh \ create-o_ashift.ksh \ - zpool_create_tempname.ksh + zpool_create_tempname.ksh \ + zpool_create_dryrun_output.ksh dist_pkgdata_DATA = \ + draidcfg.gz \ zpool_create.cfg \ zpool_create.shlib diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh index 7991607221cb..42f57beae2a3 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh @@ -64,14 +64,16 @@ pooldevs="${DISK0} \ \"${DISK0} ${DISK1}\" \ \"${DISK0} ${DISK1} ${DISK2}\" \ \"$disk1 $disk2\"" -raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\"" mirrordevs="\"${DISK0} ${DISK1}\" \ $raidzdevs \ \"$disk1 $disk2\"" +raidzdevs="\"${DISK0} ${DISK1} ${DISK2}\"" +draiddevs="\"${DISK0} ${DISK1} ${DISK2}\"" create_pool_test "$TESTPOOL" "" "$pooldevs" create_pool_test "$TESTPOOL" "mirror" "$mirrordevs" create_pool_test "$TESTPOOL" "raidz" "$raidzdevs" 
create_pool_test "$TESTPOOL" "raidz1" "$raidzdevs" +create_pool_test "$TESTPOOL" "draid" "$draiddevs" log_pass "'zpool create ...' success." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh index 165453e8bb56..e1d8cc474545 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh @@ -54,7 +54,7 @@ log_assert "'zpool create [-R root][-m mountpoint] ...' can create "an alternate pool or a new pool mounted at the specified mountpoint." log_onexit cleanup -set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" +set -A pooltype "" "mirror" "raidz" "raidz1" "raidz2" "draid" "draid2" # # cleanup the pools created in previous case if zpool_create_004_pos timedout @@ -67,8 +67,8 @@ done rm -rf $TESTDIR log_must mkdir -p $TESTDIR typeset -i i=1 -while (( i < 4 )); do - log_must mkfile $FILESIZE $TESTDIR/file.$i +while (( i < 5 )); do + log_must truncate -s $FILESIZE $TESTDIR/file.$i (( i = i + 1 )) done @@ -87,7 +87,7 @@ do log_must zpool destroy -f $TESTPOOL [[ -d $TESTDIR1 ]] && rm -rf $TESTDIR1 log_must zpool create $opt $TESTPOOL ${pooltype[i]} \ - $file.1 $file.2 $file.3 + $file.1 $file.2 $file.3 $file.4 ! poolexists $TESTPOOL && \ log_fail "Creating pool with $opt fails." 
mpt=`zfs mount | egrep "^$TESTPOOL[^/]" | awk '{print $2}'` diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh index 15cd23e4450a..79b41fdaec90 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh @@ -97,6 +97,20 @@ set -A valid_args \ "raidz2 $vdev0 $vdev1 $vdev2 spare $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ "raidz3 $vdev0 $vdev1 $vdev2 $vdev3 \ mirror $vdev4 $vdev5 $vdev6 $vdev7" \ + "draid $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4" \ + "draid $vdev0 $vdev1 $vdev2 raidz1 $vdev3 $vdev4 $vdev5" \ + "draid $vdev0 $vdev1 $vdev2 draid1 $vdev3 $vdev4 $vdev5" \ + "draid $vdev0 $vdev1 $vdev2 special mirror $vdev3 $vdev4" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 mirror $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 draid2 $vdev4 $vdev5 $vdev6 $vdev7"\ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 \ + special mirror $vdev4 $vdev5 $vdev6" \ + "draid2 $vdev0 $vdev1 $vdev2 $vdev3 \ + special mirror $vdev4 $vdev5 $vdev6 \ + cache $vdev7 log mirror $vdev8 $vdev9" \ + "draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 $vdev7 \ + special mirror $vdev8 $vdev9" \ "spare $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4 raidz $vdev5 $vdev6" set -A forced_args \ @@ -109,11 +123,19 @@ set -A forced_args \ "raidz $vdev0 $vdev1 raidz2 $vdev2 $vdev3 $vdev4" \ "raidz $vdev0 $vdev1 raidz2 $vdev2 $vdev3 $vdev4 spare $vdev5" \ "raidz $vdev0 $vdev1 spare $vdev2 raidz2 $vdev3 $vdev4 $vdev5" \ + "raidz $vdev0 $vdev1 draid2 $vdev2 $vdev3 $vdev4 $vdev5" \ + "raidz $vdev0 $vdev1 draid3 $vdev2 $vdev3 $vdev4 $vdev5 $vdev6" \ "mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 raidz2 $vdev4 $vdev5 $vdev6" \ "mirror 
$vdev0 $vdev1 raidz $vdev2 $vdev3 \ raidz2 $vdev4 $vdev5 $vdev6 spare $vdev7" \ "mirror $vdev0 $vdev1 raidz $vdev2 $vdev3 \ spare $vdev4 raidz2 $vdev5 $vdev6 $vdev7" \ + "mirror $vdev0 $vdev1 draid $vdev2 $vdev3 $vdev4 \ + draid2 $vdev5 $vdev6 $vdev7 $vdev8 spare $vdev9" \ + "draid $vdev0 $vdev1 $vdev2 $vdev3 \ + draid2 $vdev4 $vdev5 $vdev6 $vdev7 $vdev8" \ + "draid $vdev0 $vdev1 $vdev2 draid $vdev4 $vdev5 $vdev6 \ + special mirror $vdev7 $vdev8 $vdev9" \ "spare $vdev0 $vdev1 $vdev2 mirror $vdev3 $vdev4 \ raidz2 $vdev5 $vdev6 $vdev7" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh index bafc238ea289..2873202cce91 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh @@ -54,13 +54,16 @@ set -A args "" "-?" "-n" "-f" "-nf" "-fn" "-f -n" "--f" "-e" "-s" \ "$TESTPOOL c0txd0" "$TESTPOOL c0t0dx" "$TESTPOOL cxtxdx" \ "$TESTPOOL mirror" "$TESTPOOL raidz" "$TESTPOOL mirror raidz" \ "$TESTPOOL raidz1" "$TESTPOOL mirror raidz1" \ + "$TESTPOOL draid1" "$TESTPOOL mirror draid1" \ "$TESTPOOL mirror c?t?d?" "$TESTPOOL mirror $DISK0 c0t1d?" 
\ "$TESTPOOL RAIDZ $DISK0 $DISK1" \ "$TESTPOOL $DISK0 log $DISK1 log $DISK2" \ "$TESTPOOL $DISK0 spare $DISK1 spare $DISK2" \ - "$TESTPOOL RAIDZ1 $DISK0 $DISK1" \ - "$TESTPOOL MIRROR $DISK0" "$TESTPOOL raidz $DISK0" \ - "$TESTPOOL raidz1 $DISK0" \ + "$TESTPOOL RAIDZ1 $DISK0 $DISK1" "$TESTPOOL MIRROR $DISK0" \ + "$TESTPOOL DRAID $DISK1 $DISK2 $DISK3" "$TESTPOOL raidz $DISK0" \ + "$TESTPOOL raidz1 $DISK0" "$TESTPOOL draid $DISK0" \ + "$TESTPOOL draid2 $DISK0 $DISK1" \ + "$TESTPOOL draid $DISK0 $DISK1 $DISK2 spare s0-draid1-0" \ "1tank $DISK0" "1234 $DISK0" "?tank $DISK0" \ "tan%k $DISK0" "ta@# $DISK0" "tan+k $DISK0" \ "$BYND_MAX_NAME $DISK0" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh index 0d7acdb4089c..e2f38990314c 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh @@ -63,7 +63,7 @@ log_onexit cleanup unset NOINUSE_CHECK typeset opt -for opt in "" "mirror" "raidz" "raidz1"; do +for opt in "" "mirror" "raidz" "draid"; do if [[ $opt == "" ]]; then typeset disks=$DISK0 else diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh index e0b3850e4a95..36bbaa7de33a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh @@ -63,15 +63,16 @@ log_must zfs create $TESTPOOL/$TESTFS log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS typeset -l devsize=$(($SPA_MINDEVSIZE - 1024 * 1024)) -for files in $TESTDIR/file1 
$TESTDIR/file2 +for files in $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3 do - log_must mkfile $devsize $files + log_must truncate -s $devsize $files done set -A args \ "$TOOSMALL $TESTDIR/file1" "$TESTPOOL1 $TESTDIR/file1 $TESTDIR/file2" \ "$TOOSMALL mirror $TESTDIR/file1 $TESTDIR/file2" \ - "$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" + "$TOOSMALL raidz $TESTDIR/file1 $TESTDIR/file2" \ + "$TOOSMALL draid $TESTDIR/file1 $TESTDIR/file2 $TESTDIR/file3" typeset -i i=0 while [[ $i -lt ${#args[*]} ]]; do diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh index 140771d4f82d..9437033ae547 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh @@ -54,7 +54,7 @@ function cleanup destroy_pool $pool done - rm -rf $disk1 $disk2 $disk3 + rm -rf $disk1 $disk2 $disk3 $disk4 if [[ -n $saved_dump_dev ]]; then log_must dumpadm -u -d $saved_dump_dev @@ -66,12 +66,16 @@ log_onexit cleanup disk1=$(create_blockfile $FILESIZE) disk2=$(create_blockfile $FILESIZE) -disk3=$(create_blockfile $FILESIZE1) +disk3=$(create_blockfile $FILESIZE) +disk4=$(create_blockfile $FILESIZE1) mirror1="$DISK0 $DISK1" mirror2="$disk1 $disk2" raidz1=$mirror1 raidz2=$mirror2 -diff_size_dev="$disk2 $disk3" +draid1="$DISK0 $DISK1 $DISK2" +draid2="$disk1 $disk2 $disk3" +diff_size_dev="$disk2 $disk4" +draid_diff_size_dev="$disk1 $disk2 $disk4" vfstab_dev=$(find_vfstab_dev) if is_illumos; then @@ -91,13 +95,17 @@ set -A arg \ "$TESTPOOL1 mirror mirror $mirror1 mirror $mirror2" \ "$TESTPOOL1 raidz raidz $raidz1 raidz $raidz2" \ "$TESTPOOL1 raidz1 raidz1 $raidz1 raidz1 $raidz2" \ + "$TESTPOOL1 draid draid $draid draid $draid2" \ "$TESTPOOL1 mirror raidz $raidz1 raidz $raidz2" \ "$TESTPOOL1 
mirror raidz1 $raidz1 raidz1 $raidz2" \ + "$TESTPOOL1 mirror draid $draid1 draid $draid2" \ "$TESTPOOL1 raidz mirror $mirror1 mirror $mirror2" \ "$TESTPOOL1 raidz1 mirror $mirror1 mirror $mirror2" \ + "$TESTPOOL1 draid1 mirror $mirror1 mirror $mirror2" \ "$TESTPOOL1 mirror $diff_size_dev" \ "$TESTPOOL1 raidz $diff_size_dev" \ "$TESTPOOL1 raidz1 $diff_size_dev" \ + "$TESTPOOL1 draid1 $draid_diff_size_dev" \ "$TESTPOOL1 mirror $mirror1 spare $mirror2 spare $diff_size_dev" \ "$TESTPOOL1 $vfstab_dev" \ "$TESTPOOL1 ${DISK0}s10" \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh new file mode 100644 index 000000000000..9717af505267 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Create a variety of dRAID pools using the minimal dRAID vdev syntax. 
+# +# STRATEGY: +# 1) Create the required number of allowed dRAID vdevs. +# 2) Create few pools of various sizes using the draid1|draid2|draid3 syntax. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $all_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create ...' can create a pool." + +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/file.{01..84}) + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $all_vdevs + +# Verify all configurations up to 24 vdevs. +for parity in {1..3}; do + for children in {$((parity + 2))..24}; do + vdevs=$(echo $TESTDIR/file.{01..${children}}) + log_must zpool create $TESTPOOL draid$parity $vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + done +done + +# Spot check a few large configurations. +children_counts="53 84" +for children in $children_counts; do + vdevs=$(echo $TESTDIR/file.{01..${children}}) + log_must zpool create $TESTPOOL draid $vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL +done + +log_pass "'zpool create ...' success." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh new file mode 100644 index 000000000000..2e1ff39311ab --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh @@ -0,0 +1,82 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Create dRAID pool using the maximum number of vdevs (255). Then verify +# that creating a pool with 256 fails as expected. +# +# STRATEGY: +# 1) Verify a pool with fewer than the required vdevs fails. +# 2) Verify pools with a valid number of vdevs succeed. +# 3) Verify a pool which exceeds the maximum number of vdevs fails. +# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $all_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create draid '" + +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/file.{01..256}) + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $all_vdevs + +# Below minimum dRAID vdev count for specified parity level. +log_mustnot zpool create $TESTPOOL draid1 $(echo $TESTDIR/file.{01..01}) +log_mustnot zpool create $TESTPOOL draid2 $(echo $TESTDIR/file.{01..02}) +log_mustnot zpool create $TESTPOOL draid3 $(echo $TESTDIR/file.{01..03}) + +# Verify pool sizes from 2-10. Values in between are skipped to speed +# up the test case but will be exercised by the random pool creation +# done in zpool_create_draid_003_pos.ksh. +for (( i=2; i<=10; i++ )); do + log_must zpool create $TESTPOOL draid:${i}c \ + $(echo $TESTDIR/file.{01..$i}) + log_must destroy_pool $TESTPOOL +done + +# Verify pool sizes from 254-255. 
+for (( i=254; i<=255; i++ )); do + log_must zpool create $TESTPOOL draid:${i}c \ + $(echo $TESTDIR/file.{01..$i}) + log_must destroy_pool $TESTPOOL +done + +# Exceeds maximum dRAID vdev count (256). +log_mustnot zpool create $TESTPOOL draid $(echo $TESTDIR/file.{01..256}) + +log_pass "'zpool create draid '" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh new file mode 100644 index 000000000000..52cd00cf4ee4 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify allowed striped widths (data+parity) and hot spares may be +# configured at pool creation time. +# +# STRATEGY: +# 1) Test valid stripe/spare combinations given the number of children. +# 2) Test invalid stripe/spare/children combinations outside the allow limits. 
+# + +verify_runnable "global" + +function cleanup +{ + poolexists $TESTPOOL && destroy_pool $TESTPOOL + + rm -f $draid_vdevs + rmdir $TESTDIR +} + +log_assert "'zpool create draid:#d:#c:#s '" + +log_onexit cleanup + +mkdir $TESTDIR + +# Generate 10 random valid configurations to test. +for (( i=0; i<10; i++ )); do + parity=$(random_int_between 1 3) + spares=$(random_int_between 0 3) + data=$(random_int_between 1 16) + + (( min_children = (data + parity + spares) )) + children=$(random_int_between $min_children 32) + + draid="draid${parity}:${data}d:${children}c:${spares}s" + + draid_vdevs=$(echo $TESTDIR/file.{01..$children}) + log_must truncate -s $MINVDEVSIZE $draid_vdevs + + log_must zpool create $TESTPOOL $draid $draid_vdevs + log_must poolexists $TESTPOOL + destroy_pool $TESTPOOL + + rm -f $draid_vdevs +done + +children=32 +draid_vdevs=$(echo $TESTDIR/file.{01..$children}) +log_must truncate -s $MINVDEVSIZE $draid_vdevs + +mkdir $TESTDIR +log_must truncate -s $MINVDEVSIZE $draid_vdevs + +# Out of order and unknown suffixes should fail. +log_mustnot zpool create $TESTPOOL draid:d8 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid:s3 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid:c32 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid:10x $draid_vdevs +log_mustnot zpool create $TESTPOOL draid:x10 $draid_vdevs + +# Exceeds maximum data disks (limited by total children) +log_must zpool create $TESTPOOL draid2:30d $draid_vdevs +log_must destroy_pool $TESTPOOL +log_mustnot zpool create $TESTPOOL draid2:31d $draid_vdevs + +# At least one data disk must be requested. +log_mustnot zpool create $TESTPOOL draid2:0d $draid_vdevs + +# Check invalid parity levels. +log_mustnot zpool create $TESTPOOL draid0 $draid_vdevs +log_mustnot zpool create $TESTPOOL draid4 $draid_vdevs + +# Spares are limited: spares < children - (parity + data). 
+log_must zpool create $TESTPOOL draid2:20d:10s $draid_vdevs +log_must destroy_pool $TESTPOOL +log_mustnot zpool create $TESTPOOL draid2:20d:11s $draid_vdevs + +# The required children argument is enforced. +log_mustnot zpool create $TESTPOOL draid2:0c $draid_vdevs +log_mustnot zpool create $TESTPOOL draid2:31c $draid_vdevs +log_must zpool create $TESTPOOL draid2:32c $draid_vdevs +destroy_pool $TESTPOOL + +log_pass "'zpool create draid:#d:#c:#s '" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh new file mode 100644 index 000000000000..6b700fa362a4 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh @@ -0,0 +1,43 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify generated dRAID permutation maps against the authoritative +# reference file contains the full permutations. 
+# + +verify_runnable "global" + +log_assert "'draid verify'" + +DRAIDCFG="$STF_SUITE/tests/functional/cli_root/zpool_create/draidcfg.gz" + +log_must draid verify $DRAIDCFG + +log_pass "'draid verify'" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh new file mode 100644 index 000000000000..1e4db20cfedf --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh @@ -0,0 +1,138 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Attila Fülöp +# + +. $STF_SUITE/include/libtest.shlib + +typeset STR_DRYRUN="would create '$TESTPOOL' with the following layout:" +typeset VDEV_PREFIX="$TEST_BASE_DIR/filedev" + +# +# DESCRIPTION: +# 'zpool create -n ...' can display the correct configuration +# +# STRATEGY: +# 1. Create -n a storage pool and verify the output is as expected. 
+# + +typeset -a dev=( + "${VDEV_PREFIX}00" "${VDEV_PREFIX}01" "${VDEV_PREFIX}02" + "${VDEV_PREFIX}03" "${VDEV_PREFIX}04" "${VDEV_PREFIX}05" + "${VDEV_PREFIX}06" "${VDEV_PREFIX}07" "${VDEV_PREFIX}08" + "${VDEV_PREFIX}09" "${VDEV_PREFIX}10" "${VDEV_PREFIX}11" +) + +typeset -a tests=( + ( + tree="'${dev[0]}' '${dev[1]}' log '${dev[2]}' '${dev[3]}' \ + special '${dev[4]}' '${dev[5]}' dedup '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}' '${dev[9]}' cache '${dev[10]}' '${dev[11]}'" + + want="$STR_DRYRUN + + $TESTPOOL + ${dev[0]} + ${dev[1]} + dedup + ${dev[6]} + ${dev[7]} + special + ${dev[4]} + ${dev[5]} + logs + ${dev[2]} + ${dev[3]} + cache + ${dev[10]} + ${dev[11]} + spares + ${dev[8]} + ${dev[9]}" + ) + ( + tree="mirror '${dev[0]}' '${dev[1]}' \ + log mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}' \ + dedup mirror '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}' '${dev[9]}' \ + cache '${dev[10]}' '${dev[11]}'" + + want="$STR_DRYRUN + + $TESTPOOL + mirror + ${dev[0]} + ${dev[1]} + dedup + mirror + ${dev[6]} + ${dev[7]} + special + mirror + ${dev[4]} + ${dev[5]} + logs + mirror + ${dev[2]} + ${dev[3]} + cache + ${dev[10]} + ${dev[11]} + spares + ${dev[8]} + ${dev[9]}" + ) +) + +verify_runnable "global" + +function cleanup +{ + rm -f "$VDEV_PREFIX"* +} + +log_assert "'zpool create -n ...' can display the configuration" + +log_onexit cleanup + +# Create needed file vdevs. +for (( i=0; i < ${#dev[@]}; i+=1 )); do + log_must truncate -s $SPA_MINDEVSIZE "${dev[$i]}" +done + +# For each test run 'zpool create -n' on the layout and check the output. +for (( i=0; i < ${#tests[@]}; i+=1 )); do + typeset tree="${tests[$i].tree}" + typeset want="${tests[$i].want}" + + typeset out="$(log_must eval "zpool create -n '$TESTPOOL' $tree" | \ + sed /^SUCCESS/d)" + + if [[ "$out" != "$want" ]]; then + log_fail "Got:\n" "$out" "\nbut expected:\n" "$want" + fi +done + +log_pass "'zpool create -n ...' displays config correctly." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh index f39e6267bc3a..922e35125e4a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh @@ -72,7 +72,7 @@ log_onexit cleanup log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) @@ -144,6 +144,16 @@ for type in " " mirror raidz raidz2; do if [[ $? -ne 0 ]] ; then log_fail "pool $TESTPOOL1 has not expanded" fi + elif [[ $type == "draid" ]]; then + typeset expansion_size=$((2*($exp_size-$org_size))) + zpool history -il $TESTPOOL1 | \ + grep "pool '$TESTPOOL1' size:" | \ + grep "vdev online" | \ + grep "(+${expansion_size})" >/dev/null 2>&1 + + if [[ $? 
-ne 0 ]]; then + log_fail "pool $TESTPOOL1 has not expanded" + fi else typeset expansion_size=$((3*($exp_size-$org_size))) zpool history -il $TESTPOOL1 | \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh index a49d4fc17068..62843b062291 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh @@ -63,7 +63,7 @@ log_onexit cleanup log_assert "zpool can expand after zpool online -e zvol vdevs on vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid:1s; do # Initialize the file devices and the pool for i in 1 2 3; do log_must truncate -s $org_size ${TEMPFILE}.$i @@ -92,6 +92,8 @@ for type in " " mirror raidz raidz2; do if [[ $type == "mirror" ]]; then typeset expected_zpool_expandsize=$(($exp_size-$org_size)) + elif [[ $type == "draid:1s" ]]; then + typeset expected_zpool_expandsize=$((2*($exp_size-$org_size))) else typeset expected_zpool_expandsize=$((3*($exp_size-$org_size))) fi @@ -147,6 +149,17 @@ for type in " " mirror raidz raidz2; do log_fail "pool $TESTPOOL1 has not expanded " \ "after zpool online -e" fi + elif [[ $type == "draid:1s" ]]; then + typeset expansion_size=$((2*($exp_size-$org_size))) + zpool history -il $TESTPOOL1 | \ + grep "pool '$TESTPOOL1' size:" | \ + grep "vdev online" | \ + grep "(+${expansion_size})" >/dev/null 2>&1 + + if [[ $? 
-ne 0 ]] ; then + log_fail "pool $TESTPOOL1 has not expanded " \ + "after zpool online -e" + fi else typeset expansion_size=$((3*($exp_size-$org_size))) zpool history -il $TESTPOOL1 | \ @@ -160,9 +173,17 @@ for type in " " mirror raidz raidz2; do fi fi else - log_fail "pool $TESTPOOL1 did not expand after vdev expansion " \ - "and zpool online -e" + log_fail "pool $TESTPOOL1 did not expand after vdev " \ + "expansion and zpool online -e" fi + + # For dRAID pools verify the distributed spare was resized after + # expansion and it is large enough to be used to replace a pool vdev. + if [[ $type == "draid:1s" ]]; then + log_must zpool replace -w $TESTPOOL1 $TEMPFILE.3 draid1-0-0 + verify_pool $TESTPOOL1 + fi + log_must zpool destroy $TESTPOOL1 done log_pass "zpool can expand after zpool online -e" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh index 323d0b907bd0..b3c71b666a59 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh @@ -73,7 +73,7 @@ log_onexit cleanup log_assert "zpool can not expand if set autoexpand=off after vdev expansion" -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh index 8a4db824bc9c..09e2b6da2148 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh @@ -61,7 +61,7 @@ log_onexit cleanup log_assert "After vdev expansion, all 4 labels have the same set of uberblocks." -for type in " " mirror raidz raidz2; do +for type in " " mirror raidz draid; do for i in 1 2 3; do log_must truncate -s $org_size ${TEMPFILE}.$i done diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 8abef65de19f..3c536ca12eac 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -80,6 +80,7 @@ typeset -a properties=( "feature@bookmark_written" "feature@log_spacemap" "feature@device_rebuild" + "feature@draid" ) if is_linux || is_freebsd; then diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am index ad0f9c46edc7..a99c5011e250 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am @@ -29,6 +29,8 @@ dist_pkgdata_SCRIPTS = \ zpool_import_013_neg.ksh \ zpool_import_014_pos.ksh \ zpool_import_015_pos.ksh \ + zpool_import_016_pos.ksh \ + zpool_import_017_pos.ksh \ zpool_import_all_001_pos.ksh \ zpool_import_features_001_pos.ksh \ zpool_import_features_002_neg.ksh \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh index ab72042a219c..3238faaa9a52 100755 --- 
a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh @@ -69,6 +69,8 @@ test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" \ "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" \ "$VDEV0 raidz $VDEV1 $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "draid $VDEV1 $VDEV2 $VDEV3" \ + "$VDEV0 draid $VDEV1 $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "log $VDEV1" "$VDEV0 log $VDEV1" test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" "$VDEV0 $VDEV2 log $VDEV1" test_add_vdevs "$VDEV0" "$VDEV1 log $VDEV2" "$VDEV0 $VDEV1 log $VDEV2" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh index a42c6974762f..8a81c18cd8ca 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh @@ -155,6 +155,12 @@ test_replacing_vdevs "raidz $VDEV0 $VDEV1 $VDEV2" \ "$VDEV0 $VDEV1 $VDEV2" \ true 20 +test_replacing_vdevs "draid:1s $VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4" \ + "$VDEV1" "$VDEV5" \ + "draid $VDEV0 $VDEV5 $VDEV2 $VDEV3 $VDEV4 spares draid1-0-0" \ + "$VDEV0 $VDEV1 $VDEV2 $VDEV3 $VDEV4" \ + true 30 + set_zfs_txg_timeout $ZFS_TXG_TIMEOUT log_pass "zpool import -c cachefile_unaware_of_replace passed." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh index 887993dfd1ec..87942b4a52e4 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh @@ -108,6 +108,7 @@ test_shared_device "mirror $VDEV0 $VDEV1" "mirror $VDEV1 $VDEV2" "$VDEV1" test_shared_device "mirror $VDEV0 $VDEV1 $VDEV2" "mirror $VDEV2 $VDEV3" \ "$VDEV2" test_shared_device "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV2" +test_shared_device "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV2" "$VDEV2" test_shared_device "$VDEV0 log $VDEV1" "$VDEV2 log $VDEV1" "$VDEV1" "-m" log_pass "Pool doesn't write to a device it doesn't own anymore." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh index 7ee306e26d58..15f3a0a7b400 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh @@ -89,9 +89,11 @@ test_new_paths "$VDEV0 $VDEV1" "$VDEV0 $VDEV1" test_new_paths "mirror $VDEV0 $VDEV1" "$VDEV0 $VDEV1" test_new_paths "$VDEV0 log $VDEV1" "$VDEV1" test_new_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV1" +test_new_paths "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV1" test_swap_paths "$VDEV0 $VDEV1" "$VDEV0" "$VDEV1" test_swap_paths "raidz $VDEV0 $VDEV1 $VDEV2" "$VDEV0" "$VDEV1" +test_swap_paths "draid $VDEV0 $VDEV1 $VDEV2" "$VDEV0" "$VDEV1" test_swap_paths "mirror $VDEV0 $VDEV1 mirror $VDEV2 $VDEV3" \ "$VDEV0" "$VDEV2" diff --git 
a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh index 74d75b6cdec0..3ac8c104f1ca 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh @@ -220,6 +220,7 @@ test_add_vdevs "$VDEV0 $VDEV1" "$VDEV2" test_add_vdevs "$VDEV0" "$VDEV1 $VDEV2" test_add_vdevs "mirror $VDEV0 $VDEV1" "mirror $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "raidz $VDEV1 $VDEV2 $VDEV3" +test_add_vdevs "$VDEV0" "draid $VDEV1 $VDEV2 $VDEV3" test_add_vdevs "$VDEV0" "log $VDEV1" test_add_vdevs "$VDEV0 log $VDEV1" "$VDEV2" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh index 94d1cb25d184..b03b39d178ca 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh @@ -176,6 +176,11 @@ test_replace_vdev "raidz $VDEV0 $VDEV1 $VDEV2" \ "raidz $VDEV0 $VDEV3 $VDEV2" \ "$VDEV0 $VDEV1 $VDEV2" 10 +test_replace_vdev "draid $VDEV0 $VDEV1 $VDEV2 $VDEV3" \ + "$VDEV1" "$VDEV4" \ + "draid $VDEV0 $VDEV4 $VDEV2 $VDEV3 spares draid1-0-0" \ + "$VDEV0 $VDEV1 $VDEV2 $VDEV3" 10 + set_zfs_txg_timeout $ZFS_TXG_TIMEOUT log_pass "zpool import rewind after device replacement passed." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh index 74324c84eac4..22e619d7411b 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh @@ -49,7 +49,7 @@ log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i (( i = i + 1 )) done diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg index 6c1ab194e92c..25f541ebf185 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg @@ -34,7 +34,7 @@ export DISK=${DISKS%% *} export FS_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 32))m" export FILE_SIZE="$((MINVDEVSIZE))" export SLICE_SIZE="$((($MINVDEVSIZE / (1024 * 1024)) * 2))m" -export MAX_NUM=5 +export MAX_NUM=6 export DEVICE_DIR=$TEST_BASE_DIR/dev_import-test export BACKUP_DEVICE_DIR=$TEST_BASE_DIR/bakdev_import-test export DEVICE_FILE=disk @@ -60,5 +60,6 @@ export VDEV1=$DEVICE_DIR/${DEVICE_FILE}1 export VDEV2=$DEVICE_DIR/${DEVICE_FILE}2 export VDEV3=$DEVICE_DIR/${DEVICE_FILE}3 export VDEV4=$DEVICE_DIR/${DEVICE_FILE}4 +export VDEV5=$DEVICE_DIR/${DEVICE_FILE}5 export ALTER_ROOT=/alter_import-test diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib index 48794c982cf4..8bbd668a9317 100644 --- 
a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib @@ -31,7 +31,7 @@ function cleanup log_must rm -rf $DEVICE_DIR/* typeset i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done is_linux && set_tunable32 TXG_HISTORY 0 @@ -163,7 +163,7 @@ function increase_device_sizes typeset -i i=0 while (( i < $MAX_NUM )); do - log_must mkfile $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $newfilesize ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done } @@ -171,15 +171,18 @@ function increase_device_sizes # # Translate vdev names returned by zpool status into more generic names. # -# eg: mirror-2 --> mirror -# function _translate_vdev { typeset vdev=$1 - typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect" + # + # eg: mirror-2 --> mirror + # eg: draid2:4d:12c:1s-0 --> draid2 + # + typeset keywords="mirror replacing raidz1 raidz2 raidz3 indirect draid1 draid2 draid3" for word in $keywords; do - echo $vdev | egrep "^${word}-[0-9]+\$" > /dev/null + echo $vdev | grep -qE \ + "^${word}-[0-9]+\$|^${word}:[0-9]+d:[0-9]+c:[0-9]+s-[0-9]+\$" if [[ $? 
-eq 0 ]]; then vdev=$word break @@ -188,6 +191,7 @@ function _translate_vdev [[ $vdev == "logs" ]] && echo "log" && return 0 [[ $vdev == "raidz1" ]] && echo "raidz" && return 0 + [[ $vdev == "draid1" ]] && echo "draid" && return 0 echo $vdev return 0 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh index 6e93fd471171..928efebdd2d5 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh @@ -63,7 +63,7 @@ log_assert "For raidz, one destroyed pools devices was removed or used by " \ "other pool, it still can be imported correctly." log_onexit cleanup -log_must zpool create $TESTPOOL1 raidz $VDEV0 $VDEV1 $VDEV2 $VDIV3 +log_must zpool create $TESTPOOL1 raidz $VDEV0 $VDEV1 $VDEV2 $VDEV3 typeset guid=$(get_config $TESTPOOL1 pool_guid) typeset target=$TESTPOOL1 if (( RANDOM % 2 == 0 )) ; then diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh index 096bbe8114ac..f8da584aad1c 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh @@ -63,7 +63,7 @@ log_assert "For raidz2, two destroyed pools devices was removed or used by " \ "other pool, it still can be imported correctly." 
log_onexit cleanup -log_must zpool create $TESTPOOL1 raidz2 $VDEV0 $VDEV1 $VDEV2 $VDIV3 +log_must zpool create $TESTPOOL1 raidz2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 typeset guid=$(get_config $TESTPOOL1 pool_guid) typeset target=$TESTPOOL1 if (( RANDOM % 2 == 0 )) ; then diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh index b337bd00f1e5..212024dfcb34 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh @@ -39,7 +39,7 @@ # STRATEGY: # 1. Create a 5 ways mirror pool A with dev0/1/2/3/4, then destroy it. # 2. Create a stripe pool B with dev1. Then destroy it. -# 3. Create a raidz2 pool C with dev2/3/4. Then destroy it. +# 3. Create a draid2 pool C with dev2/3/4/5. Then destroy it. # 4. Create a raidz pool D with dev3/4. Then destroy it. # 5. Create a stripe pool E with dev4. Then destroy it. # 6. Verify 'zpool import -D -a' recover all the pools. 
@@ -74,7 +74,7 @@ log_must zpool destroy $poolA log_must zpool create $poolB $VDEV1 log_must zpool destroy $poolB -log_must zpool create $poolC raidz2 $VDEV2 $VDEV3 $VDEV4 +log_must zpool create $poolC draid2 $VDEV2 $VDEV3 $VDEV4 $VDEV5 log_must zpool destroy $poolC log_must zpool create $poolD raidz $VDEV3 $VDEV4 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh new file mode 100644 index 000000000000..5434625cb985 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_016_pos.ksh @@ -0,0 +1,91 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# For draid, one destroyed pools devices was removed or used by other +# pool, it still can be imported correctly. +# +# STRATEGY: +# 1. Create a draid pool A with N disks. +# 2. Destroy this pool A. +# 3. Create another pool B with 1 disk which was used by pool A. +# 4. Verify import this draid pool can succeed. +# + +verify_runnable "global" + +function cleanup +{ + destroy_pool $TESTPOOL2 + destroy_pool $TESTPOOL1 + + log_must rm -rf $DEVICE_DIR/* + typeset i=0 + while (( i < $MAX_NUM )); do + log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + ((i += 1)) + done +} + +log_assert "For draid, one destroyed pools devices was removed or used by " \ + "other pool, it still can be imported correctly." +log_onexit cleanup + +log_must zpool create $TESTPOOL1 draid $VDEV0 $VDEV1 $VDEV2 $VDEV3 +typeset guid=$(get_config $TESTPOOL1 pool_guid) +typeset target=$TESTPOOL1 +if (( RANDOM % 2 == 0 )) ; then + target=$guid + log_note "Import by guid." +fi +log_must zpool destroy $TESTPOOL1 + +log_must zpool create $TESTPOOL2 $VDEV0 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_must zpool destroy $TESTPOOL2 +log_must rm -rf $VDEV0 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_note "For draid, two destroyed pool's devices were used, import failed." +log_must mkfile $FILE_SIZE $VDEV0 +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 +log_mustnot zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL2 + +log_pass "zpool import -D draid passed." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh new file mode 100644 index 000000000000..2e6cef265c4f --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_017_pos.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2012, 2016 by Delphix. All rights reserved. +# Copyright (c) 2020 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# For draid2, two destroyed pool's devices were removed or used by other +# pool, it still can be imported correctly. +# +# STRATEGY: +# 1. Create a draid2 pool A with N disks. +# 2. Destroy this pool A. +# 3. Create another pool B with two disks which were used by pool A. +# 4. Verify import this draid2 pool can succeed. 
+# + +verify_runnable "global" + +function cleanup +{ + destroy_pool $TESTPOOL2 + destroy_pool $TESTPOOL1 + + log_must rm -rf $DEVICE_DIR/* + typeset i=0 + while (( i < $MAX_NUM )); do + log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + ((i += 1)) + done +} + +log_assert "For draid2, two destroyed pools devices was removed or used by " \ + "other pool, it still can be imported correctly." +log_onexit cleanup + +log_must zpool create $TESTPOOL1 draid2 $VDEV0 $VDEV1 $VDEV2 $VDEV3 +typeset guid=$(get_config $TESTPOOL1 pool_guid) +typeset target=$TESTPOOL1 +if (( RANDOM % 2 == 0 )) ; then + target=$guid + log_note "Import by guid." +fi +log_must zpool destroy $TESTPOOL1 + +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_must zpool destroy $TESTPOOL2 +log_must rm -rf $VDEV0 $VDEV1 +log_must zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL1 + +log_note "For draid2, more than two destroyed pool's devices were used, " \ + "import failed." +log_must mkfile $FILE_SIZE $VDEV0 $VDEV1 +log_must zpool create $TESTPOOL2 $VDEV0 $VDEV1 $VDEV2 +log_mustnot zpool import -d $DEVICE_DIR -D -f $target +log_must zpool destroy $TESTPOOL2 + +log_pass "zpool import -D draid2 passed." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh index 78e9bbf689d0..3b5167ff0374 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh @@ -57,8 +57,8 @@ # Using the various combinations. 
# - Regular import # - Alternate Root Specified -# It should be succeed with single d/m device upon 'raidz' & 'mirror', -# but failed against 'regular' or more d/m devices. +# It should succeed with single d/m device upon 'raidz', 'mirror', +# 'draid' but failed against 'regular' or more d/m devices. # 6. If import succeed, verify following is true: # - The pool shows up under 'zpool list'. # - The pool's health should be DEGRADED. @@ -67,7 +67,16 @@ verify_runnable "global" -set -A vdevs "" "mirror" "raidz" +# Randomly test a subset of combinations to speed up the test. +(( rc=RANDOM % 3 )) +if [[ $rc == 0 ]] ; then + set -A vdevs "" "mirror" "raidz" +elif [[ $rc == 1 ]] ; then + set -A vdevs "" "mirror" "draid" +else + set -A vdevs "" "raidz" "draid" +fi + set -A options "" "-R $ALTER_ROOT" function cleanup @@ -89,7 +98,8 @@ function recreate_files log_must rm -rf $DEVICE_DIR/* typeset i=0 while (( i < $MAX_NUM )); do - log_must mkfile $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must rm -f ${DEVICE_DIR}/${DEVICE_FILE}$i + log_must truncate -s $FILE_SIZE ${DEVICE_DIR}/${DEVICE_FILE}$i ((i += 1)) done } @@ -157,6 +167,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( count > 1 )) && \ action=log_mustnot ;; + 'draid') (( count > 1 )) && \ + action=log_mustnot + ;; '') action=log_mustnot ;; esac diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh index c6d2637074fe..60af3f321947 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh @@ -43,6 +43,8 @@ # before data integrity is compromised # - Raidz could withstand one devices failing # before data integrity is compromised +# - dRAID could withstand one devices 
failing +# before data integrity is compromised # Verify that is true. # # STRATEGY: @@ -50,6 +52,7 @@ # - Regular pool # - Mirror # - Raidz +# - dRAID # 2. Create necessary filesystem and test files. # 3. Export the test pool. # 4. Move one or more device files to other directory @@ -62,7 +65,16 @@ verify_runnable "global" -set -A vdevs "" "mirror" "raidz" +# Randomly test a subset of combinations to speed up the test. +(( rc=RANDOM % 3 )) +if [[ $rc == 0 ]] ; then + set -A vdevs "" "mirror" "raidz" +elif [[ $rc == 1 ]] ; then + set -A vdevs "" "mirror" "draid" +else + set -A vdevs "" "raidz" "draid" +fi + set -A options "" "-R $ALTER_ROOT" function cleanup @@ -88,7 +100,8 @@ function cleanup_all while (( i < $MAX_NUM )); do typeset dev_file=${DEVICE_DIR}/${DEVICE_FILE}$i if [[ ! -e ${dev_file} ]]; then - log_must mkfile $FILE_SIZE ${dev_file} + log_must rm -f ${dev_file} + log_must truncate -s $FILE_SIZE ${dev_file} fi ((i += 1)) done @@ -158,7 +171,8 @@ while (( i < ${#vdevs[*]} )); do # Backup all device files while filesystem prepared. # if [[ -z $backup ]] ; then - log_must tar cf $DEVICE_DIR/$DEVICE_ARCHIVE ${DEVICE_FILE}* + log_must tar cf $DEVICE_DIR/$DEVICE_ARCHIVE \ + ${DEVICE_FILE}0 ${DEVICE_FILE}1 ${DEVICE_FILE}2 backup="true" fi @@ -174,6 +188,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( count == 1 )) && \ action=log_must ;; + 'draid') (( count == 1 )) && \ + action=log_must + ;; esac typeset target=$TESTPOOL1 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh index 6fa55250a77d..9d4629a77912 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh @@ -64,7 +64,7 @@ if ! 
is_illumos; then log_unsupported "Test case may be slow" fi -set -A vdevs "" "mirror" "raidz" +set -A vdevs "" "mirror" "raidz" "draid" function verify { @@ -207,6 +207,9 @@ while (( i < ${#vdevs[*]} )); do 'raidz') (( overlap > 1 )) && \ action=log_mustnot ;; + 'draid') (( overlap > 1 )) && \ + action=log_mustnot + ;; '') action=log_mustnot ;; esac diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh index 6a8f7d49f2b2..f774970a71be 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh @@ -24,7 +24,6 @@ # Copyright (c) 2016 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib # # DESCRIPTION: @@ -33,8 +32,8 @@ # STRATEGY: # 1. Create a one-disk pool. # 2. Initialize the disk to completion. -# 3. Load all metaslabs that don't have a spacemap, and make sure the entire -# metaslab has been filled with the initializing pattern (deadbeef). +# 3. Load all metaslabs and make sure that each contains at least +# one instance of the initializing pattern (deadbeef). 
# function cleanup @@ -58,32 +57,34 @@ ORIG_PATTERN=$(get_tunable INITIALIZE_VALUE) log_must set_tunable64 INITIALIZE_VALUE $(printf %llu 0x$PATTERN) log_must mkdir "$TESTDIR" -log_must mkfile $MINVDEVSIZE "$SMALLFILE" +log_must truncate -s $MINVDEVSIZE "$SMALLFILE" log_must zpool create $TESTPOOL "$SMALLFILE" -log_must zpool initialize $TESTPOOL - -while [[ "$(initialize_progress $TESTPOOL $SMALLFILE)" -lt "100" ]]; do - sleep 0.5 -done - +log_must zpool initialize -w $TESTPOOL log_must zpool export $TESTPOOL -spacemaps=0 +metaslabs=0 bs=512 -while read -r sm; do - typeset offset="$(echo $sm | cut -d ' ' -f1)" - typeset size="$(echo $sm | cut -d ' ' -f2)" +zdb -p $TESTDIR -Pme $TESTPOOL | awk '/metaslab[ ]+[0-9]+/ { print $4, $8 }' | +while read -r offset_size; do + typeset offset=$(echo $offset_size | cut -d ' ' -f1) + typeset size=$(echo $offset_size | cut -d ' ' -f2) - spacemaps=$((spacemaps + 1)) - offset=$(((4 * 1024 * 1024) + 16#$offset)) - out=$(dd if=$SMALLFILE skip=$(($offset / $bs)) \ - count=$(($size / $bs)) bs=$bs 2>/dev/null | od -t x8 -Ad) - echo "$out" | log_must egrep "$PATTERN|\*|$size" -done <<< "$(zdb -p $TESTDIR -Pme $TESTPOOL | egrep 'spacemap[ ]+0 ' | \ - awk '{print $4, $8}')" + log_note "offset: '$offset'" + log_note "size: '$size'" -if [[ $spacemaps -eq 0 ]];then - log_fail "Did not find any empty space maps to check" + metaslabs=$((metaslabs + 1)) + offset=$(((4 * 1024 * 1024) + 16#$offset)) + log_note "vdev file offset: '$offset'" + + # Note we use '-t x4' instead of '-t x8' here because x8 is not + # a supported format on FreeBSD. 
+ dd if=$SMALLFILE skip=$((offset / bs)) count=$((size / bs)) bs=$bs | + od -t x4 -Ad | egrep -q "deadbeef +deadbeef +deadbeef +deadbeef" || + log_fail "Pattern not found in metaslab free space" +done + +if [[ $metaslabs -eq 0 ]]; then + log_fail "Did not find any metaslabs to check" else - log_pass "Initializing wrote appropriate amount to disk" + log_pass "Initializing wrote to each metaslab" fi diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am index 1ca05a4e8e8d..aac5e0d6e7b1 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am @@ -12,7 +12,8 @@ dist_pkgdata_SCRIPTS = \ zpool_split_vdevs.ksh \ zpool_split_resilver.ksh \ zpool_split_wholedisk.ksh \ - zpool_split_indirect.ksh + zpool_split_indirect.ksh \ + zpool_split_dryrun_output.ksh dist_pkgdata_DATA = \ zpool_split.cfg diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh new file mode 100644 index 000000000000..2267ea7bd895 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh @@ -0,0 +1,152 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Attila Fülöp +# + +. $STF_SUITE/include/libtest.shlib + +typeset NEWPOOL="${TESTPOOL}split" +typeset STR_DRYRUN="would create '$NEWPOOL' with the following layout:" +typeset VDEV_PREFIX="$TEST_BASE_DIR/filedev" + +# +# DESCRIPTION: +# 'zpool split -n <pool> <newpool> [<device> ...]' can display the correct +# configuration +# +# STRATEGY: +# 1. Create a mirrored storage pool, split -n and verify the output is as +# expected. +# + +typeset -a dev=( + "${VDEV_PREFIX}00" "${VDEV_PREFIX}01" "${VDEV_PREFIX}02" + "${VDEV_PREFIX}03" "${VDEV_PREFIX}04" "${VDEV_PREFIX}05" + "${VDEV_PREFIX}06" "${VDEV_PREFIX}07" "${VDEV_PREFIX}08" + "${VDEV_PREFIX}09" "${VDEV_PREFIX}10" "${VDEV_PREFIX}11" +) + +typeset -a tests=( + # Test for hole. + ( + tree="mirror '${dev[0]}' '${dev[1]}' log mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}'" + + devs="" + want="$STR_DRYRUN + + $NEWPOOL + ${dev[1]} + special + ${dev[5]}" + ) + ( + tree="mirror '${dev[0]}' '${dev[1]}' log mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}'" + + devs="'${dev[0]}' '${dev[4]}'" + want="$STR_DRYRUN + + $NEWPOOL + ${dev[0]} + special + ${dev[4]}" + ) + + # Full set of vdev types. 
+ ( + tree="mirror '${dev[0]}' '${dev[1]}' + dedup mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}' \ + cache '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}' '${dev[9]}'\ + log mirror '${dev[10]}' '${dev[11]}'" + + devs="" + want="$STR_DRYRUN + + $NEWPOOL + ${dev[1]} + dedup + ${dev[3]} + special + ${dev[5]}" + ) + ( + tree="mirror '${dev[0]}' '${dev[1]}' + dedup mirror '${dev[2]}' '${dev[3]}' \ + special mirror '${dev[4]}' '${dev[5]}' \ + cache '${dev[6]}' '${dev[7]}' \ + spare '${dev[8]}' '${dev[9]}'\ + log mirror '${dev[10]}' '${dev[11]}'" + + devs="'${dev[0]}' '${dev[2]}' '${dev[4]}'" + want="$STR_DRYRUN + + $NEWPOOL + ${dev[0]} + dedup + ${dev[2]} + special + ${dev[4]}" + ) +) + +verify_runnable "global" + +function cleanup +{ + destroy_pool "$TESTPOOL" + rm -f "$VDEV_PREFIX"* +} + +log_assert \ +"'zpool split -n []...' can display the configuration" + +log_onexit cleanup + +# Create needed file vdevs. +for (( i=0; i < ${#dev[@]}; i+=1 )); do + log_must truncate -s $SPA_MINDEVSIZE "${dev[$i]}" +done + +# Foreach test create pool, add -n devices and check output. +for (( i=0; i < ${#tests[@]}; i+=1 )); do + typeset tree="${tests[$i].tree}" + typeset devs="${tests[$i].devs}" + typeset want="${tests[$i].want}" + + log_must eval zpool create "$TESTPOOL" $tree + log_must poolexists "$TESTPOOL" + typeset out="$(log_must eval "zpool split -n \ + '$TESTPOOL' '$NEWPOOL' $devs" | sed /^SUCCESS/d)" + + if [[ "$out" != "$want" ]]; then + log_fail "Got:\n" "$out" "\nbut expected:\n" "$want" + fi + log_must destroy_pool "$TESTPOOL" +done + +log_pass \ +"'zpool split -n []...' displays config correctly." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib index 1c54c66c129c..e8d43cc8c740 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib @@ -30,6 +30,23 @@ function trim_progress # pool disk trim_prog_line "$1" "$2" | sed 's/.*(\([0-9]\{1,\}\)% trimmed.*/\1/g' } +# Rewrite /$pool/tmpfile with random data, calling sync_pool after each pass. +# $1: pool name; $2: upper loop bound (default 20, iterating over {0..$2}). +# Always returns 0; dd and sync_pool failures are not checked. +function sync_and_rewrite_some_data_a_few_times +{ + typeset pool=$1 + typeset -i a_few_times=${2:-20} + + typeset file="/$pool/tmpfile" + for i in {0..$a_few_times}; do + dd if=/dev/urandom of=${file} bs=128k count=10 + sync_pool "$pool" + done + + return 0 +} + function cleanup { if poolexists $TESTPOOL; then diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh index 681cd12f71c5..afc9a2ed19bd 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh @@ -27,7 +27,7 @@ # Trimming automatically resumes across offline/online. # # STRATEGY: -# 1. Create a pool with a two-way mirror. +# 1. Create a pool with a two-way mirror, prepare blocks to trim. # 2. Start trimming one of the disks and verify that trimming is active. # 3. Offline the disk. # 4. Online the disk. 
@@ -39,8 +39,10 @@ DISK1=${DISKS%% *} DISK2="$(echo $DISKS | cut -d' ' -f2)" -log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 -log_must zpool trim -r 128M $TESTPOOL $DISK1 +log_must zpool create -f $TESTPOOL mirror $DISK1 $DISK2 -O recordsize=4k +sync_and_rewrite_some_data_a_few_times $TESTPOOL + +log_must zpool trim -r 1 $TESTPOOL $DISK1 log_must zpool offline $TESTPOOL $DISK1 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh index faf134fbbd8d..68e99090077c 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh @@ -39,8 +39,10 @@ DISK2="$(echo $DISKS | cut -d' ' -f2)" DISK3="$(echo $DISKS | cut -d' ' -f3)" log_must zpool list -v -log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 -log_must zpool trim -r 128M $TESTPOOL $DISK1 +log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 -O recordsize=4k +sync_and_rewrite_some_data_a_few_times $TESTPOOL + +log_must zpool trim -r 1 $TESTPOOL $DISK1 [[ -z "$(trim_progress $TESTPOOL $DISK1)" ]] && \ log_fail "Trim did not start" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh index f135de4bc774..a899e9f99f14 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh @@ -41,6 +41,7 @@ function cleanup log_must zpool detach $TESTPOOL $DISK2 get_disklist $TESTPOOL | grep $DISK3 >/dev/null && \ log_must 
zpool detach $TESTPOOL $DISK3 + log_must zpool sync $TESTPOOL } typeset pid diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_001_pos.ksh index 5121f66b78b0..189cf435e88e 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_001_pos.ksh @@ -94,22 +94,22 @@ run_and_verify -p "$MPOOL"\ -e "resource.fs.zfs.statechange" \ -e "sysevent.fs.zfs.config_sync" \ "zpool offline $MPOOL $VDEV1" -run_and_verify -p "$MPOOL" -d 10 \ +run_and_verify -p "$MPOOL" \ -e "resource.fs.zfs.statechange" \ -e "sysevent.fs.zfs.vdev_online" \ - -e "sysevent.fs.zfs.resilver_start" \ - -e "sysevent.fs.zfs.resilver_finish" \ - -e "sysevent.fs.zfs.history_event" \ -e "sysevent.fs.zfs.config_sync" \ + -e "sysevent.fs.zfs.resilver_start" \ + -e "sysevent.fs.zfs.history_event" \ + -e "sysevent.fs.zfs.resilver_finish" \ "zpool online $MPOOL $VDEV1" # Attach then detach a device from the mirror. 
-run_and_verify -p "$MPOOL" -d 10 \ +run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.vdev_attach" \ -e "sysevent.fs.zfs.resilver_start" \ - -e "sysevent.fs.zfs.resilver_finish" \ - -e "sysevent.fs.zfs.history_event" \ -e "sysevent.fs.zfs.config_sync" \ + -e "sysevent.fs.zfs.history_event" \ + -e "sysevent.fs.zfs.resilver_finish" \ "zpool attach $MPOOL $VDEV1 $VDEV4" run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.vdev_remove" \ @@ -117,20 +117,20 @@ run_and_verify -p "$MPOOL" \ "zpool detach $MPOOL $VDEV4" # Replace a device -run_and_verify -p "$MPOOL" -d 10 \ +run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.vdev_attach" \ -e "sysevent.fs.zfs.resilver_start" \ + -e "sysevent.fs.zfs.config_sync" \ + -e "sysevent.fs.zfs.history_event" \ -e "sysevent.fs.zfs.resilver_finish" \ -e "sysevent.fs.zfs.vdev_remove" \ - -e "sysevent.fs.zfs.history_event" \ - -e "sysevent.fs.zfs.config_sync" \ "zpool replace -f $MPOOL $VDEV1 $VDEV4" # Scrub a pool. -run_and_verify -p "$MPOOL" -d 10 \ +run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.scrub_start" \ - -e "sysevent.fs.zfs.scrub_finish" \ -e "sysevent.fs.zfs.history_event" \ + -e "sysevent.fs.zfs.scrub_finish" \ "zpool scrub $MPOOL" # Export then import a pool @@ -139,9 +139,9 @@ run_and_verify -p "$MPOOL" \ -e "sysevent.fs.zfs.config_sync" \ "zpool export $MPOOL" run_and_verify -p "$MPOOL" \ - -e "sysevent.fs.zfs.pool_import" \ - -e "sysevent.fs.zfs.history_event" \ -e "sysevent.fs.zfs.config_sync" \ + -e "sysevent.fs.zfs.history_event" \ + -e "sysevent.fs.zfs.pool_import" \ "zpool import -d $TEST_BASE_DIR $MPOOL" # Destroy the pool diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_002_pos.ksh index 76ad6237fc23..7a78d93a8438 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_002_pos.ksh @@ -92,7 +92,7 @@ 
done # 5. Start the ZED and verify it only handled the new missed events. log_must zed_start -log_must file_wait $ZED_DEBUG_LOG 15 +log_must file_wait $ZED_DEBUG_LOG 35 log_must cp $ZED_DEBUG_LOG $TMP_EVENTS_ZED log_mustnot grep -q "sysevent.fs.zfs.pool_create" $TMP_EVENTS_ZED diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_common.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_common.kshlib index 26afc109174f..9c5879183b15 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_common.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/events/events_common.kshlib @@ -23,10 +23,34 @@ # Copyright (c) 2017 by Lawrence Livermore National Security, LLC. # Use is subject to license terms. # +# Copyright (c) 2020 by Delphix. All rights reserved. +# . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/events/events.cfg +# Wait up to 'timeout' seconds (default 120) for 'event' to appear in 'file'; +# returns 0 once a "^ZEVENT_CLASS=event" line is found, 1 on timeout. +function file_wait_event # file event timeout +{ + typeset file=$1 + typeset event=$2 + typeset timeout=${3:-120} + + SECONDS=0 + + until grep -q "^ZEVENT_CLASS=$event" "$file" ; do + if [[ $SECONDS -gt $timeout ]]; then + echo file_wait_event exceeded $SECONDS seconds + return 1 + fi + + sleep 1 + done + + return 0 +} + # # Wait for up to 'timeout' seconds for the 'file' to settle, i.e. # not be updated for a period of 'delay' seconds. 
@@ -41,6 +65,7 @@ function file_wait # file delay timeout while [ $(( $(date +%s) - $(stat -c %Y $file) )) -lt $delay ]; do if [[ $SECONDS -gt $timeout ]]; then + echo file_wait exceeded $SECONDS seconds return 1 fi @@ -52,30 +77,22 @@ function file_wait # file delay timeout function run_and_verify { - typeset delay event pool zedlog + typeset event pool set -A events - while getopts "d:e:p:z:" opt; do + while getopts "e:p:" opt; do case $opt in - d) - delay=$OPTARG - ;; e) - events[${#events[*]}+1]=$OPTARG + events+=("$OPTARG") ;; p) pool=$OPTARG ;; - z) - zedlog=$OPTARG - ;; esac done shift $(($OPTIND - 1)) pool=${pool:-$TESTPOOL} - delay=${delay:-3} - zedlog=${zedlog:-$ZED_DEBUG_LOG} fullcmd="$1" cmd=$(echo $fullcmd | awk '{print $1}') @@ -87,21 +104,38 @@ function run_and_verify # Remove any previous events from the logs. log_must zpool events -c - log_must truncate -s 0 $zedlog + log_must truncate -s 0 $ZED_DEBUG_LOG # Run the command as provided. log_must eval "$fullcmd" # Collect the new events and verify there are some. log_must zpool sync -f - log_must file_wait $zedlog $delay - log_must cp $zedlog $TMP_EVENTS_ZED log_must eval "zpool events >$TMP_EVENTS 2>/dev/null" log_must eval "zpool events -v > $TMP_EVENTS_FULL 2>/dev/null" log_must test -s $TMP_EVENTS log_must test -s $TMP_EVENTS_FULL - log_must test -s $TMP_EVENTS_ZED + + # If the only event is history then we don't observe zed debug log + if [[ "${events[0]}" != "sysevent.fs.zfs.history_event" ]]; then + # wait for all the non-history events to show up in the + # debug log, all-debug.sh filters history events. 
+ for event in ${events[*]}; do + if [[ "$event" == \ + "sysevent.fs.zfs.history_event" ]]; then + continue + fi + + log_must file_wait_event $ZED_DEBUG_LOG "$event" + done + + log_must cp $ZED_DEBUG_LOG $TMP_EVENTS_ZED + log_must test -s $TMP_EVENTS_ZED + + log_note "Events logged:" + grep "^ZEVENT_CLASS" $TMP_EVENTS_ZED + fi log_note "Events generated:" cat $TMP_EVENTS @@ -118,6 +152,11 @@ function run_and_verify $TMP_EVENTS_FULL >$TMP_EVENT_FULL log_must grep -q "pool = \"$pool\"" $TMP_EVENT_FULL + # all-debug.sh filters history events (seen in ZED_DEBUG_LOG) + if [[ "$event" == "sysevent.fs.zfs.history_event" ]]; then + continue + fi + # Verify the event was received by the ZED and logged. awk -v event="$event" \ 'BEGIN{FS="\n"; RS=""} $0 ~ event { print $0 }' \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh index 1bf54b1a86d4..0abe1e2ce599 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh @@ -54,7 +54,7 @@ if is_linux; then # Add one 512b scsi_debug device (4Kn would generate IO errors) # NOTE: must be larger than other "file" vdevs and minimum SPA devsize: # add 32m of fudge - load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) 1 1 1 '512b' + load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) 1 1 1 '512b' else log_unsupported "scsi debug module unsupported" fi @@ -85,10 +85,10 @@ typeset poolconfs=( "mirror $filedev1 $filedev2 special mirror $filedev3 $removedev" ) -log_must truncate -s $SPA_MINDEVSIZE $filedev1 -log_must truncate -s $SPA_MINDEVSIZE $filedev2 -log_must truncate -s $SPA_MINDEVSIZE $filedev3 -log_must truncate -s $SPA_MINDEVSIZE $sparedev +log_must truncate -s $MINVDEVSIZE $filedev1 +log_must truncate -s $MINVDEVSIZE $filedev2 +log_must truncate -s $MINVDEVSIZE $filedev3 +log_must 
truncate -s $MINVDEVSIZE $sparedev for conf in "${poolconfs[@]}" do diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh index b6af1a3f40f8..a93267185b06 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh @@ -55,36 +55,59 @@ zed_events_drain TESTFILE="/$TESTPOOL/$TESTFS/testfile" -for type in "mirror" "raidz" "raidz2"; do - # 1. Create a pool with hot spares - truncate -s $SPA_MINDEVSIZE $VDEV_FILES $SPARE_FILE - log_must zpool create -f $TESTPOOL $type $VDEV_FILES spare $SPARE_FILE +for type in "mirror" "raidz" "raidz2" "draid:1s"; do + if [ "$type" = "draid:1s" ]; then + # 1. Create a dRAID pool with a distributed hot spare + # + # Corruption is injected in the file-2 instead of file-1 + # vdev since the dRAID permutation at these offsets maps + # to distributed spare space and not data devices. + # + log_must truncate -s $MINVDEVSIZE $VDEV_FILES + log_must zpool create -f $TESTPOOL $type $VDEV_FILES + SPARE="draid1-0-0" + FAULT="$TEST_BASE_DIR/file-2" + else + # 1. Create a pool with hot spares + log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ + spare $SPARE_FILE + SPARE=$SPARE_FILE + FAULT=$FAULT_FILE + fi # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS log_must zfs set recordsize=16k $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=16 + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 # 4. 
Inject IO ERRORS on read with a zinject error handler - log_must zinject -d $FAULT_FILE -e io -T read $TESTPOOL + log_must zinject -d $FAULT -e io -T read $TESTPOOL log_must cp $TESTFILE /dev/null # 5. Verify the ZED kicks in a hot spare and expected pool/device status log_note "Wait for ZED to auto-spare" - log_must wait_vdev_state $TESTPOOL $FAULT_FILE "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_FILE "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "INUSE" + log_must wait_vdev_state $TESTPOOL $FAULT "FAULTED" 60 + log_must wait_vdev_state $TESTPOOL $SPARE "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" + # The ZED will use a sequential resilver for dRAID. Wait for the + # resilver and subsequent scrub to complete before moving on. + if [ "$type" = "draid:1s" ]; then + log_must wait_scrubbed $TESTPOOL + fi + # 6. Clear the fault log_must zinject -c all - log_must zpool clear $TESTPOOL $FAULT_FILE + log_must zpool clear $TESTPOOL $FAULT # 7. Verify the hot spare is available and expected pool/device status - log_must wait_vdev_state $TESTPOOL $FAULT_FILE "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_FILE "AVAIL" + log_must wait_vdev_state $TESTPOOL $FAULT "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE "AVAIL" + log_must is_pool_resilvered $TESTPOOL log_must check_state $TESTPOOL "" "ONLINE" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh index f6d720a01bf6..e9517bad7131 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh @@ -60,15 +60,16 @@ TESTFILE="/$TESTPOOL/$TESTFS/testfile" for type in "mirror" "raidz" "raidz2"; do # 1. 
Create a pool with hot spares - truncate -s $SPA_MINDEVSIZE $VDEV_FILES $SPARE_FILE - log_must zpool create -f $TESTPOOL $type $VDEV_FILES spare $SPARE_FILE + log_must truncate -s $MINVDEVSIZE $VDEV_FILES $SPARE_FILE + log_must zpool create -f $TESTPOOL $type $VDEV_FILES \ + spare $SPARE_FILE # 2. Create a filesystem with the primary cache disable to force reads log_must zfs create -o primarycache=none $TESTPOOL/$TESTFS log_must zfs set recordsize=16k $TESTPOOL/$TESTFS # 3. Write a file to the pool to be read back - log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=16 + log_must dd if=/dev/urandom of=$TESTFILE bs=1M count=64 # 4. Inject CHECKSUM ERRORS on read with a zinject error handler log_must zinject -d $FAULT_FILE -e corrupt -f 50 -T read $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh index e9857518ed8a..f4fd21d0433d 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh @@ -60,7 +60,7 @@ FAIL_DEVICE="$TEST_BASE_DIR/fail-dev" # 1. Create a pool from 512b devices and set "ashift" pool property accordingly for vdev in $SAFE_DEVICE $FAIL_DEVICE; do - truncate -s $SPA_MINDEVSIZE $vdev + truncate -s $MINVDEVSIZE $vdev done log_must zpool create -f $TESTPOOL mirror $SAFE_DEVICE $FAIL_DEVICE # NOTE: file VDEVs should be added as 512b devices, verify this "just in case" @@ -71,7 +71,7 @@ log_must zpool set ashift=9 $TESTPOOL # 2. 
Add one 512e spare device (4Kn would generate IO errors on replace) # NOTE: must be larger than the existing 512b devices, add 32m of fudge -load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) $SDHOSTS $SDTGTS $SDLUNS '512e' +load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) $SDHOSTS $SDTGTS $SDLUNS '512e' SPARE_DEVICE=$(get_debug_device) log_must_busy zpool add $TESTPOOL spare $SPARE_DEVICE diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh index bec41352752b..8a9cf6f5324e 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh @@ -63,15 +63,43 @@ FAULT_DEV1="$TEST_BASE_DIR/fault-dev1" FAULT_DEV2="$TEST_BASE_DIR/fault-dev2" SAFE_DEV1="$TEST_BASE_DIR/safe-dev1" SAFE_DEV2="$TEST_BASE_DIR/safe-dev2" -DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2" +SAFE_DEV3="$TEST_BASE_DIR/safe-dev3" +SAFE_DEV4="$TEST_BASE_DIR/safe-dev4" +DATA_DEVS="$FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 $SAFE_DEV2 $SAFE_DEV3 $SAFE_DEV4" SPARE_DEV1="$TEST_BASE_DIR/spare-dev1" SPARE_DEV2="$TEST_BASE_DIR/spare-dev2" SPARE_DEVS="$SPARE_DEV1 $SPARE_DEV2" -for type in "mirror" "raidz" "raidz2" "raidz3"; do - # 1. Create a pool with two hot spares - truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS - log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS +for type in "mirror" "raidz" "raidz2" "raidz3" "draid2:1s"; do + if [ "$type" = "draid2:1s" ]; then + # 1. Create a dRAID pool with a distributed and traditional + # hot spare to provide test coverage for both configurations. + # + # Corruption is injected in the third and fourth vdevs + # since the dRAID permutation at these offsets maps to + # distributed spare space and not data devices. 
+ # + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1 + log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \ + $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \ + spare $SPARE_DEV1 + SPARE1=$SPARE_DEV1 + SPARE2="draid2-0-0" + elif [ "$type" = "mirror" ]; then + # 1. Create a 3-way mirror pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type \ + $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + else + # 1. Create a raidz pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type $DATA_DEVS \ + spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + fi # 2. Inject IO ERRORS with a zinject error handler on the first device log_must zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL @@ -79,11 +107,11 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 3. Start a scrub log_must zpool scrub $TESTPOOL - # 4. Verify the ZED kicks in a hot spare and expected pool/device status + # 4. Verify the ZED kicks in a hot spare and the pool/device status log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" # 5. Inject IO ERRORS on a second device @@ -98,10 +126,14 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 7. 
Verify the ZED kicks in a second hot spare log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" + while is_pool_scrubbing $TESTPOOL || is_pool_resilvering $TESTPOOL; do + sleep 1 + done + # 8. Clear the fault on both devices log_must zinject -c all log_must zpool clear $TESTPOOL $FAULT_DEV1 @@ -110,8 +142,8 @@ for type in "mirror" "raidz" "raidz2" "raidz3"; do # 9. Verify the hot spares are available and expected pool/device status log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "ONLINE" 60 log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "AVAIL" - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "AVAIL" + log_must wait_hotspare_state $TESTPOOL $SPARE1 "AVAIL" + log_must wait_hotspare_state $TESTPOOL $SPARE2 "AVAIL" log_must check_state $TESTPOOL "" "ONLINE" # Cleanup @@ -120,11 +152,37 @@ done # Rinse and repeat, this time faulting both devices at the same time # NOTE: "raidz" is excluded since it cannot survive 2 faulted devices -# NOTE: "mirror" is a 4-way mirror here and should survive this test -for type in "mirror" "raidz2" "raidz3"; do - # 1. Create a pool with two hot spares - truncate -s $SPA_MINDEVSIZE $DATA_DEVS $SPARE_DEVS - log_must zpool create -f $TESTPOOL $type $DATA_DEVS spare $SPARE_DEVS +# NOTE: "mirror" is a 3-way mirror here and should survive this test +for type in "mirror" "raidz2" "raidz3" "draid2:1s"; do + if [ "$type" = "draid2:1s" ]; then + # 1. Create a dRAID pool with a distributed and traditional + # hot spare to provide test coverage for both configurations. 
+ # + # Corruption is injected in the third and fourth vdevs + # since the dRAID permutation at these offsets maps to + # distributed spare space and not data devices. + # + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEV1 + log_must zpool create -f $TESTPOOL $type $SAFE_DEV1 \ + $SAFE_DEV2 $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV3 $SAFE_DEV4 \ + spare $SPARE_DEV1 + SPARE1=$SPARE_DEV1 + SPARE2="draid2-0-0" + elif [ "$type" = "mirror" ]; then + # 1. Create a 3-way mirror pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type \ + $FAULT_DEV1 $FAULT_DEV2 $SAFE_DEV1 spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + else + # 1. Create a raidz pool with two hot spares + truncate -s $MINVDEVSIZE $DATA_DEVS $SPARE_DEVS + log_must zpool create -f $TESTPOOL $type $DATA_DEVS \ + spare $SPARE_DEVS + SPARE1=$SPARE_DEV1 + SPARE2=$SPARE_DEV2 + fi # 2. Inject IO ERRORS with a zinject error handler on two devices log_must eval "zinject -d $FAULT_DEV1 -e io -T all -f 100 $TESTPOOL &" @@ -133,14 +191,14 @@ for type in "mirror" "raidz2" "raidz3"; do # 3. Start a scrub log_must zpool scrub $TESTPOOL - # 4. Verify the ZED kicks in two hot spares and expected pool/device status + # 4. Verify the ZED kicks in two hot spares and the pool/device status log_note "Wait for ZED to auto-spare" log_must wait_vdev_state $TESTPOOL $FAULT_DEV1 "FAULTED" 60 log_must wait_vdev_state $TESTPOOL $FAULT_DEV2 "FAULTED" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV1 "ONLINE" 60 - log_must wait_vdev_state $TESTPOOL $SPARE_DEV2 "ONLINE" 60 - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV1 "INUSE" - log_must wait_hotspare_state $TESTPOOL $SPARE_DEV2 "INUSE" + log_must wait_vdev_state $TESTPOOL $SPARE1 "ONLINE" 60 + log_must wait_vdev_state $TESTPOOL $SPARE2 "ONLINE" 60 + log_must wait_hotspare_state $TESTPOOL $SPARE1 "INUSE" + log_must wait_hotspare_state $TESTPOOL $SPARE2 "INUSE" log_must check_state $TESTPOOL "" "DEGRADED" # 5. 
Clear the fault on both devices diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh index 467161359df4..4229537b3953 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh @@ -42,7 +42,7 @@ if is_linux; then # Add one 512b spare device (4Kn would generate IO errors on replace) # NOTE: must be larger than other "file" vdevs and minimum SPA devsize: # add 32m of fudge - load_scsi_debug $(($SPA_MINDEVSIZE/1024/1024+32)) 1 1 1 '512b' + load_scsi_debug $(($MINVDEVSIZE/1024/1024+32)) 1 1 1 '512b' else log_unsupported "scsi debug module unsupported" fi @@ -72,7 +72,7 @@ SPARE_DISKDEV="$(get_debug_device)" for vdev in $SAFE_FILEDEVPOOL1 $SAFE_FILEDEVPOOL2 $FAIL_FILEDEVPOOL1 \ $FAIL_FILEDEVPOOL2 $SPARE_FILEDEV; do - log_must truncate -s $SPA_MINDEVSIZE $vdev + log_must truncate -s $MINVDEVSIZE $vdev done for spare in $SPARE_FILEDEV $SPARE_DISKDEV; do diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh new file mode 100644 index 000000000000..24fcefadfd07 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_arcstats_pos.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. 
+# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# L2ARC MFU/MRU arcstats do not leak +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create a random file in that pool, smaller than the cache device +# and random read for 10 sec. +# 3. Read l2arc_mfu_asize and l2arc_mru_asize +# 4. Export pool. +# 5. Verify l2arc_mfu_asize and l2arc_mru_asize are 0. +# 6. Import pool. +# 7. Random read for 10 sec. +# 8. Read l2arc_mfu_asize and l2arc_mru_asize +# 9. Verify that L2ARC MFU increased and MFU+MRU+prefetch = L2_asize. +# + +verify_runnable "global" + +log_assert "L2ARC MFU/MRU arcstats do not leak." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches +typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +log_must set_tunable32 L2ARC_NOPREFETCH 0 + +typeset fill_mb=800 +typeset cache_sz=$(( 1.4 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +arcstat_quiescence_noecho l2_size +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +typeset l2_mfu_init=$(get_arcstat l2_mfu_asize) +typeset l2_mru_init=$(get_arcstat l2_mru_asize) +typeset l2_prefetch_init=$(get_arcstat l2_prefetch_asize) +typeset l2_asize_init=$(get_arcstat l2_asize) + +log_must zpool online $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size +log_must zpool export $TESTPOOL +arcstat_quiescence_noecho l2_feeds + +log_must test $(get_arcstat l2_mfu_asize) -eq 0 +log_must test $(get_arcstat l2_mru_asize) -eq 0 +log_must zpool import -d $VDIR $TESTPOOL +arcstat_quiescence_noecho l2_size + +log_must fio 
$FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +typeset l2_mfu_end=$(get_arcstat l2_mfu_asize) +typeset l2_mru_end=$(get_arcstat l2_mru_asize) +typeset l2_prefetch_end=$(get_arcstat l2_prefetch_asize) +typeset l2_asize_end=$(get_arcstat l2_asize) + +log_must test $(( $l2_mfu_end - $l2_mfu_init )) -gt 0 +log_must test $(( $l2_mru_end + $l2_mfu_end + $l2_prefetch_end - \ + $l2_asize_end )) -eq 0 +log_must test $(( $l2_mru_init + $l2_mfu_init + $l2_prefetch_init - \ + $l2_asize_init )) -eq 0 + +log_must zpool destroy -f $TESTPOOL + +log_pass "L2ARC MFU/MRU arcstats do not leak." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh new file mode 100644 index 000000000000..783484f52c13 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_l2miss_pos.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, Adam Moss. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# l2arc_misses does not increment upon reads from a pool without l2arc +# +# STRATEGY: +# 1. Create pool with a cache device. +# 2. Create pool without a cache device. +# 3. Create a random file in the no-cache-device pool, +# and random read for 10 sec. +# 4. Check that l2arc_misses hasn't risen +# 5. 
Create a random file in the pool with the cache device, +# and random read for 10 sec. +# 6. Check that l2arc_misses has risen +# + +verify_runnable "global" + +log_assert "l2arc_misses does not increment upon reads from a pool without l2arc." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + if poolexists $TESTPOOL1 ; then + destroy_pool $TESTPOOL1 + fi +} +log_onexit cleanup + +typeset fill_mb=800 +typeset cache_sz=$(( 1.4 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +log_must zpool create -O compression=off -f $TESTPOOL $VDEV cache $VDEV_CACHE +log_must zpool create -O compression=off -f $TESTPOOL1 $VDEV1 + +# I/O to pool without l2arc - expect that l2_misses stays constant +export DIRECTORY=/$TESTPOOL1 +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio +# attempt to remove entries for pool from ARC so we would try +# to hit the nonexistent L2ARC for subsequent reads +log_must zpool export $TESTPOOL1 +log_must zpool import $TESTPOOL1 -d $VDEV1 + +typeset starting_miss_count=$(get_arcstat l2_misses) + +log_must fio $FIO_SCRIPTS/random_reads.fio +log_must test $(get_arcstat l2_misses) -eq $starting_miss_count + +# I/O to pool with l2arc - expect that l2_misses rises +export DIRECTORY=/$TESTPOOL +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio +# wait for L2ARC writes to actually happen +arcstat_quiescence_noecho l2_size +# attempt to remove entries for pool from ARC so we would try +# to hit L2ARC for subsequent reads +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL -d $VDEV + +log_must fio $FIO_SCRIPTS/random_reads.fio +log_must test $(get_arcstat l2_misses) -gt $starting_miss_count + +log_must zpool destroy -f $TESTPOOL +log_must zpool destroy -f $TESTPOOL1 + +log_pass "l2arc_misses does not increment upon reads from a pool without l2arc." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh new file mode 100644 index 000000000000..489360d8c523 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/l2arc/l2arc_mfuonly_pos.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg + +# +# DESCRIPTION: +# l2arc_mfuonly does not cache MRU buffers +# +# STRATEGY: +# 1. Set l2arc_mfuonly=yes +# 2. Create pool with a cache device. +# 3. Create a random file in that pool, smaller than the cache device +# and random read for 10 sec. +# 4. Export and re-import the pool. This is necessary as some MFU ghost +# buffers with prefetch status may transition to MRU eventually. +# By re-importing the pool the l2 arcstats reflect the ARC state +# of L2ARC buffers upon their caching in L2ARC. +# 5. Verify l2arc_mru_asize is 0. +# + +verify_runnable "global" + +log_assert "l2arc_mfuonly does not cache MRU buffers." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + + log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch + log_must set_tunable32 L2ARC_MFUONLY $mfuonly + log_must set_tunable32 PREFETCH_DISABLE $zfsprefetch +} +log_onexit cleanup + +# L2ARC_NOPREFETCH is set to 1 as some prefetched buffers may +# transition to MRU. 
+typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH) +log_must set_tunable32 L2ARC_NOPREFETCH 1 + +typeset mfuonly=$(get_tunable L2ARC_MFUONLY) +log_must set_tunable32 L2ARC_MFUONLY 1 + +typeset zfsprefetch=$(get_tunable PREFETCH_DISABLE) +log_must set_tunable32 PREFETCH_DISABLE 1 + +typeset fill_mb=800 +typeset cache_sz=$(( 1.4 * $fill_mb )) +export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M + +log_must truncate -s ${cache_sz}M $VDEV_CACHE + +typeset log_blk_start=$(get_arcstat l2_log_blk_writes) + +log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/random_reads.fio + +log_must zpool export $TESTPOOL +log_must zpool import -d $VDIR $TESTPOOL + +# Regardless of l2arc_noprefetch, some MFU buffers might be evicted +# from ARC, accessed later on as prefetches and transition to MRU as +# prefetches. +# If accessed again they are counted as MRU and the l2arc_mru_asize arcstat +# will not be 0 (mentioned also in zfs-module-parameters.5) +# For the purposes of this test we mitigate this by disabling (predictive) +# ZFS prefetches with zfs_prefetch_disable=1. +log_must test $(get_arcstat l2_mru_asize) -eq 0 + +log_must zpool destroy -f $TESTPOOL + +log_pass "l2arc_mfuonly does not cache MRU buffers." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am index 14a43de9c173..9baf580eeadb 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am @@ -1,7 +1,10 @@ -pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/persist_l2arc +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/l2arc dist_pkgdata_SCRIPTS = \ cleanup.ksh \ setup.ksh \ + l2arc_arcstats_pos.ksh \ + l2arc_l2miss_pos.ksh \ + l2arc_mfuonly_pos.ksh \ persist_l2arc_001_pos.ksh \ persist_l2arc_002_pos.ksh \ persist_l2arc_003_neg.ksh \ @@ -12,4 +15,4 @@ dist_pkgdata_SCRIPTS = \ persist_l2arc_008_pos.ksh dist_pkgdata_DATA = \ - persist_l2arc.cfg + l2arc.cfg diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh index 828de3862508..c3d88e3ffc71 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh @@ -18,12 +18,12 @@ # Copyright (c) 2020, George Amanakis. All rights reserved. # -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. 
$STF_SUITE/tests/functional/l2arc/l2arc.cfg verify_runnable "global" -if datasetexists $TESTPOOL ; then - log_must zpool destroy -f $TESTPOOL +if poolexists $TESTPOOL ; then + log_must destroy_pool $TESTPOOL fi log_must rm -rf $VDIR diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg index 60bb2463760f..0302392f4c7f 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg @@ -21,14 +21,15 @@ . $STF_SUITE/include/libtest.shlib export SIZE=1G -export VDIR=$TESTDIR/disk.persist_l2arc +export VDIR=$TESTDIR/disk.l2arc export VDEV="$VDIR/a" export VDEV_CACHE="$VDIR/b" +export VDEV1="$VDIR/c" # fio options export DIRECTORY=/$TESTPOOL export NUMJOBS=4 -export RUNTIME=30 +export RUNTIME=10 export PERF_RANDSEED=1234 export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh index f313923d1469..0a9049490c71 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh @@ -19,7 +19,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg # # DESCRIPTION: @@ -28,13 +28,13 @@ # STRATEGY: # 1. Create pool with a cache device. # 2. Export and re-import pool without writing any data. -# 3. Create a random file in that pool and random read for 30 sec. +# 3. Create a random file in that pool and random read for 10 sec. # 4. Export pool. # 5. 
Read the amount of log blocks written from the header of the # L2ARC device. # 6. Import pool. # 7. Read the amount of log blocks rebuilt in arcstats and compare to -# (4). +# (5). # 8. Check if the labels of the L2ARC device are intact. # # * We can predict the minimum bytes of L2ARC restored if we subtract @@ -83,7 +83,9 @@ log_must zpool import -d $VDIR $TESTPOOL log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size log_must zpool export $TESTPOOL +arcstat_quiescence_noecho l2_feeds typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ awk '{print $2}') @@ -91,15 +93,18 @@ typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL +arcstat_quiescence_noecho l2_size -sleep 2 +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) -typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) - -log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - + $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk -gt 0 -log_must zdb -lll $VDEV_CACHE +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + +log_must zdb -lllq $VDEV_CACHE log_must zpool destroy -f $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh index 0184f06efa50..93982e6c605b 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh @@ -19,7 +19,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. 
$STF_SUITE/tests/functional/l2arc/l2arc.cfg . $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib # @@ -30,7 +30,7 @@ # 1. Create pool with a cache device. # 2. Create a an encrypted ZFS file system. # 3. Create a random file in the encrypted file system and random -# read for 30 sec. +# read for 10 sec. # 4. Export pool. # 5. Read the amount of log blocks written from the header of the # L2ARC device. @@ -86,9 +86,9 @@ log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size log_must zpool export $TESTPOOL - -sleep 2 +arcstat_quiescence_noecho l2_feeds typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ awk '{print $2}') @@ -97,14 +97,17 @@ typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1" +arcstat_quiescence_noecho l2_size -sleep 2 +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) -typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) - -log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk -gt 0 +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + log_must zdb -lq $VDEV_CACHE log_must zpool destroy -f $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh index 7824dfe8f1b1..fe35c8fc4500 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh @@ 
-19,7 +19,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg # # DESCRIPTION: @@ -28,11 +28,11 @@ # STRATEGY: # 1. Set L2ARC_REBUILD_ENABLED = 0 # 2. Create pool with a cache device. -# 3. Create a random file in that pool and random read for 30 sec. +# 3. Create a random file in that pool and random read for 10 sec. # 4. Export pool. # 5. Import pool. # 6. Check in zpool iostat if the cache device has space allocated. -# 7. Read the file written in (2) and check if l2_hits in +# 7. Read the file written in (3) and check if l2_hits in # /proc/spl/kstat/zfs/arcstats increased. # diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh index 6620131d182d..544e9291de29 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh @@ -19,7 +19,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg # # DESCRIPTION: @@ -28,12 +28,12 @@ # STRATEGY: # 1. Create pool with a cache device. # 2. Create a random file in that pool, smaller than the cache device -# and random read for 30 sec. +# and random read for 10 sec. # 3. Export pool. # 4. Read amount of log blocks written. # 5. Import pool. # 6. Read amount of log blocks built. -# 7. Compare the two amounts +# 7. Compare the two amounts. # 8. Read the file written in (2) and check if l2_hits in # /proc/spl/kstat/zfs/arcstats increased. # 9. Check if the labels of the L2ARC device are intact. 
@@ -70,30 +70,31 @@ log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size log_must zpool export $TESTPOOL - -sleep 2 +arcstat_quiescence_noecho l2_feeds typeset log_blk_end=$(get_arcstat l2_log_blk_writes) - typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL typeset l2_hits_start=$(get_arcstat l2_hits) -export RUNTIME=10 log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size +typeset log_blk_rebuild_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) typeset l2_hits_end=$(get_arcstat l2_hits) -typeset log_blk_rebuild_end=$(get_arcstat l2_rebuild_log_blks) - log_must test $(( $log_blk_rebuild_end - $log_blk_rebuild_start )) -eq \ $(( $log_blk_end - $log_blk_start )) log_must test $l2_hits_end -gt $l2_hits_start +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + log_must zdb -lq $VDEV_CACHE log_must zpool destroy -f $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh index 9fc6a5923864..ee46e7b8cad6 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh @@ -19,7 +19,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg . $STF_SUITE/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib # @@ -30,13 +30,13 @@ # 1. Create pool with a cache device. # 2. Create a an encrypted ZFS file system. # 3. Create a random file in the entrypted file system, -# smaller than the cache device, and random read for 30 sec. 
+# smaller than the cache device, and random read for 10 sec. # 4. Export pool. # 5. Read amount of log blocks written. # 6. Import pool. # 7. Mount the encrypted ZFS file system. # 8. Read amount of log blocks built. -# 9. Compare the two amounts +# 9. Compare the two amounts. # 10. Read the file written in (3) and check if l2_hits in # /proc/spl/kstat/zfs/arcstats increased. # 11. Check if the labels of the L2ARC device are intact. @@ -76,12 +76,11 @@ log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size log_must zpool export $TESTPOOL - -sleep 2 +arcstat_quiescence_noecho l2_feeds typeset log_blk_end=$(get_arcstat l2_log_blk_writes) - typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL @@ -89,18 +88,20 @@ log_must eval "echo $PASSPHRASE | zfs mount -l $TESTPOOL/$TESTFS1" typeset l2_hits_start=$(get_arcstat l2_hits) -export RUNTIME=10 log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size +typeset log_blk_rebuild_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) typeset l2_hits_end=$(get_arcstat l2_hits) -typeset log_blk_rebuild_end=$(get_arcstat l2_rebuild_log_blks) - log_must test $(( $log_blk_rebuild_end - $log_blk_rebuild_start )) -eq \ $(( $log_blk_end - $log_blk_start )) log_must test $l2_hits_end -gt $l2_hits_start +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + log_must zdb -lq $VDEV_CACHE log_must zpool destroy -f $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh index 55e9f9585c0e..051773540233 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh +++ 
b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh @@ -19,7 +19,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg # # DESCRIPTION: @@ -28,7 +28,7 @@ # # STRATEGY: # 1. Create pool with a cache device. -# 2. Create a random file in that pool and random read for 30 sec. +# 2. Create a random file in that pool and random read for 10 sec. # 3. Read the amount of log blocks written from the header of the # L2ARC device. # 4. Offline the L2ARC device and export pool. @@ -71,26 +71,29 @@ log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size log_must zpool export $TESTPOOL - -sleep 5 +arcstat_quiescence_noecho l2_feeds typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) - typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ awk '{print $2}') log_must zpool import -d $VDIR $TESTPOOL log_must zpool online $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size -sleep 5 +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) -typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) - -log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk -gt 0 +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + log_must zdb -lq $VDEV_CACHE log_must zpool destroy -f $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh index 
c79c3927652d..9208b81d4905 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh @@ -19,7 +19,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg # # DESCRIPTION: @@ -27,13 +27,13 @@ # # STRATEGY: # 1. Create pool with a cache device. -# 2. Create a random file in that pool and random read for 30 sec. -# 3. Read the amount of log blocks written from the header of the +# 2. Create a random file in that pool and random read for 10 sec. +# 3. Offline the L2ARC device. +# 4. Read the amount of log blocks written from the header of the # L2ARC device. -# 4. Offline the L2ARC device. # 5. Online the L2ARC device. # 6. Read the amount of log blocks rebuilt in arcstats and compare to -# (3). +# (4). # 7. Check if the labels of the L2ARC device are intact. 
# @@ -70,24 +70,26 @@ log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size log_must zpool offline $TESTPOOL $VDEV_CACHE - -sleep 10 +arcstat_quiescence_noecho l2_size typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) - typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ awk '{print $2}') log_must zpool online $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size -sleep 10 +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) -typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) - -log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk -gt 0 +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + log_must zdb -lq $VDEV_CACHE log_must zpool destroy -f $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh index a64bd94d3169..5a79ff31ba7e 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh @@ -19,7 +19,7 @@ # . $STF_SUITE/include/libtest.shlib -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. $STF_SUITE/tests/functional/l2arc/l2arc.cfg # # DESCRIPTION: @@ -27,20 +27,20 @@ # # STRATEGY: # 1. Create pool with a cache device. -# 2. Create a random file in that pool and random read for 30 sec. +# 2. Create a random file in that pool and random read for 10 sec. # 3. Read the amount of log blocks written from the header of the # L2ARC device. # 4. Offline the L2ARC device. 
# 5. Online the L2ARC device. # 6. Read the amount of log blocks rebuilt in arcstats and compare to # (3). -# 7. Create another random file in that pool and random read for 30 sec. +# 7. Create another random file in that pool and random read for 10 sec. # 8. Read the amount of log blocks written from the header of the # L2ARC device. # 9. Offline the L2ARC device. # 10. Online the L2ARC device. # 11. Read the amount of log blocks rebuilt in arcstats and compare to -# (7). +# (8). # 12. Check if the amount of log blocks on the cache device has # increased. # 13. Export the pool. @@ -80,62 +80,62 @@ log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size log_must zpool offline $TESTPOOL $VDEV_CACHE - -sleep 2 +arcstat_quiescence_noecho l2_size typeset l2_dh_log_blk1=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ awk '{print $2}') - typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool online $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size -sleep 5 +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) -typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) - -log_must test $l2_dh_log_blk1 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk1 -eq $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk1 -gt 0 log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio +arcstat_quiescence_noecho l2_size log_must zpool offline $TESTPOOL $VDEV_CACHE - -sleep 2 +arcstat_quiescence_noecho l2_size typeset l2_dh_log_blk2=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ awk '{print $2}') - typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool online $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size -sleep 5 - -typeset l2_rebuild_log_blk_end=$(get_arcstat 
l2_rebuild_log_blks) - -log_must test $l2_dh_log_blk2 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) +log_must test $l2_dh_log_blk2 -eq $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk2 -gt $l2_dh_log_blk1 log_must zpool export $TESTPOOL +arcstat_quiescence_noecho l2_feeds typeset l2_dh_log_blk3=$(zdb -l $VDEV_CACHE | grep log_blk_count | \ awk '{print $2}') - typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks) log_must zpool import -d $VDIR $TESTPOOL +arcstat_quiescence_noecho l2_size -sleep 5 +typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks) -typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) - -log_must test $l2_dh_log_blk3 -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) +log_must test $l2_dh_log_blk3 -eq $(( $l2_rebuild_log_blk_end - \ + $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk3 -gt 0 +log_must zpool offline $TESTPOOL $VDEV_CACHE +arcstat_quiescence_noecho l2_size + log_must zdb -lq $VDEV_CACHE log_must zpool destroy -f $TESTPOOL diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/setup.ksh index ef95c84cdd6a..0df61a9d2761 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/setup.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/persist_l2arc/setup.ksh @@ -18,12 +18,13 @@ # Copyright (c) 2020, George Amanakis. All rights reserved. # -. $STF_SUITE/tests/functional/persist_l2arc/persist_l2arc.cfg +. 
$STF_SUITE/tests/functional/l2arc/l2arc.cfg verify_runnable "global" log_must rm -rf $VDIR log_must mkdir -p $VDIR log_must mkfile $SIZE $VDEV +log_must mkfile $SIZE $VDEV1 log_pass diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/Makefile.am index 694de18a6cf9..d93eb73cf832 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/Makefile.am @@ -3,4 +3,6 @@ dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ raidz_001_neg.ksh \ - raidz_002_pos.ksh + raidz_002_pos.ksh \ + raidz_003_pos.ksh \ + raidz_004_pos.ksh diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh new file mode 100644 index 000000000000..bf22632c7eff --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/raidz_003_pos.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. 
$STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Call the raidz_test tool with -S and -e to test all supported raidz +# implementations with expanded map and default reflow offset. +# These options will test several raidz block geometries and several zio +# parameters that affect raidz block layout. Data reconstruction performs +# all combinations of failed disks. Wall time is set to 60s, but actual +# runtime might be longer. +# + +log_must raidz_test -S -e -t 60 + +log_pass "raidz_test parameter sweep test with expanded map succeeded." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh new file mode 100644 index 000000000000..6cd2bf7c9f60 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/raidz/raidz_004_pos.ksh @@ -0,0 +1,41 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by vStack. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Call the raidz_test tool with -S and -e to test all supported raidz +# implementations with expanded map and zero reflow offset. 
+# These options will test several raidz block geometries and several zio +# parameters that affect raidz block layout. Data reconstruction performs +# all combinations of failed disks. Wall time is set to 60s, but actual +# runtime might be longer. +# + +log_must raidz_test -S -e -r 0 -t 60 + +log_pass "raidz_test parameter sweep test with expanded map succeeded." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/Makefile.am index 6f6cc405b9bf..b2d4414b2906 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/Makefile.am @@ -2,10 +2,17 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/redundancy dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - redundancy_001_pos.ksh \ - redundancy_002_pos.ksh \ - redundancy_003_pos.ksh \ - redundancy_004_neg.ksh + redundancy_draid1.ksh \ + redundancy_draid2.ksh \ + redundancy_draid3.ksh \ + redundancy_draid_spare1.ksh \ + redundancy_draid_spare2.ksh \ + redundancy_draid_spare3.ksh \ + redundancy_mirror.ksh \ + redundancy_raidz1.ksh \ + redundancy_raidz2.ksh \ + redundancy_raidz3.ksh \ + redundancy_stripe.ksh dist_pkgdata_DATA = \ redundancy.cfg \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index 9bf2df0d1368..26ded8720d10 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -66,6 +66,23 @@ function random echo $value } +# +# Get the number of checksum errors for the pool. 
+# +# $1 Pool +# +function cksum_pool +{ + typeset -i cksum=$(zpool status $1 | awk ' + !NF { isvdev = 0 } + isvdev { errors += $NF } + /CKSUM$/ { isvdev = 1 } + END { print errors } + ') + + echo $cksum +} + # # Record the directories construction and checksum all the files which reside # within the specified pool @@ -81,6 +98,7 @@ function record_data [[ -z $pool ]] && log_fail "No specified pool." [[ -f $recordfile ]] && log_must rm -f $recordfile + sync_pool $pool typeset mntpnt mntpnt=$(get_prop mountpoint $pool) log_must eval "du -a $mntpnt > $recordfile 2>&1" @@ -119,22 +137,43 @@ function setup_test_env destroy_pool $pool fi - log_must mkfile $MINVDEVSIZE $vdevs + log_must truncate -s $MINVDEVSIZE $vdevs - log_must zpool create -m $TESTDIR $pool $keyword $vdevs + log_must zpool create -f -m $TESTDIR $pool $keyword $vdevs log_note "Filling up the filesystem ..." typeset -i ret=0 typeset -i i=0 typeset file=$TESTDIR/file + typeset -i limit + (( limit = $(get_prop available $pool) / 4 )) + while true ; do - file_write -o create -f $file.$i \ - -b $BLOCKSZ -c $NUM_WRITES + [[ $(get_prop available $pool) -lt $limit ]] && break + file_write -o create -f $file.$i -b $BLOCKSZ -c $NUM_WRITES + ret=$? + (( $ret != 0 )) && break + (( i = i + 1 )) + done + + record_data $TESTPOOL $PRE_RECORD_FILE +} + +function refill_test_env +{ + log_note "Re-filling the filesystem ..." + typeset -i ret=0 + typeset -i i=0 + typeset mntpnt + mntpnt=$(get_prop mountpoint $pool) + typeset file=$mntpnt/file + while [[ -e $file.$i ]]; do + log_must rm -f $file.$i + file_write -o create -f $file.$i -b $BLOCKSZ -c $NUM_WRITES ret=$? (( $ret != 0 )) && break (( i = i + 1 )) done - (($ret != 28 )) && log_note "file_write return value($ret) is unexpected." 
record_data $TESTPOOL $PRE_RECORD_FILE } diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh index 90d14f60017b..a73890e4cc05 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh @@ -48,7 +48,7 @@ verify_runnable "global" -log_assert "Verify raidz pool can withstand one device is failing." +log_assert "Verify raidz pool can withstand one device failing." log_onexit cleanup typeset -i cnt=$(random_int_between 2 5) @@ -74,4 +74,4 @@ log_must recover_bad_missing_devs $TESTPOOL 1 remove_devs $TESTPOOL 1 log_must is_data_valid $TESTPOOL -log_pass "Raidz pool can withstand one devices is failing passed." +log_pass "raidz pool can withstand one device failing passed." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh index 74bda19990e3..94b9b8825154 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh @@ -48,7 +48,7 @@ verify_runnable "global" -log_assert "Verify raidz2 pool can withstand two devices are failing." +log_assert "Verify raidz2 pool can withstand two devices failing." log_onexit cleanup typeset -i cnt=$(random_int_between 3 5) @@ -81,4 +81,4 @@ for i in 1 2; do log_must recover_bad_missing_devs $TESTPOOL $i done -log_pass "Raidz2 pool can withstand two devices are failing passed." +log_pass "raidz2 pool can withstand two devices failing passed." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh new file mode 100644 index 000000000000..85d420ab0d3a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid1.ksh @@ -0,0 +1,78 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid pool can withstand at most 1 device failing or missing. +# +# STRATEGY: +# 1. Create N(>3,<6) virtual disk files. +# 2. Create draid pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damaged one of the virtual disk file. +# 6. Verify the data is correct to prove draid can withstand 1 device is +# failing. 
+# + +verify_runnable "global" + +log_assert "Verify draid pool can withstand one device failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 3 6) +setup_test_env $TESTPOOL draid $cnt + +# +# Inject data corruption error for draid pool +# +damage_devs $TESTPOOL 1 "label" +log_must is_data_valid $TESTPOOL +log_must clear_errors $TESTPOOL + +# +# Inject bad device error for draid pool +# +damage_devs $TESTPOOL 1 +log_must is_data_valid $TESTPOOL +log_must recover_bad_missing_devs $TESTPOOL 1 + +# +# Inject missing device error for draid pool +# +remove_devs $TESTPOOL 1 +log_must is_data_valid $TESTPOOL + +log_pass "draid pool can withstand one device failing passed." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh new file mode 100644 index 000000000000..04f1fdfb150d --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid2.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid2 pool can withstand 2 devices are failing or missing. +# +# STRATEGY: +# 1. Create N(>4,<6) virtual disk files. +# 2. Create draid2 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damaged at most two of the virtual disk files. +# 6. Verify the data is correct to prove draid2 can withstand 2 devices +# are failing. +# + +verify_runnable "global" + +log_assert "Verify draid2 pool can withstand two devices failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 4 6) +setup_test_env $TESTPOOL draid2 $cnt + +# +# Inject data corruption errors for draid2 pool +# +for i in 1 2; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad devices errors for draid2 pool +# +for i in 1 2; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for draid2 pool +# +for i in 1 2; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "draid2 pool can withstand two devices failing passed." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh new file mode 100644 index 000000000000..bddd150d0c98 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid3.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A draid3 pool can withstand 3 devices failing or missing. +# +# STRATEGY: +# 1. Create N(>5,<6) virtual disk files. +# 2. Create draid3 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage at most three of the virtual disk files. +# 6. Verify the data is correct to prove draid3 can withstand 3 devices +# failing. 
+# + +verify_runnable "global" + +log_assert "Verify draid3 pool can withstand three devices failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 5 6) +setup_test_env $TESTPOOL draid3 $cnt + +# +# Inject data corruption errors for draid3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad devices errors for draid3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for draid3 pool +# +for i in 1 2 3; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "draid3 pool can withstand three devices failing passed." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh new file mode 100644 index 000000000000..3b7951596dbb --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare1.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, Datto Inc. All rights reserved. +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify resilver to dRAID distributed spares. 
+# +# STRATEGY: +# 1. For resilvers: +# a. Create a semi-random dRAID pool configuration which can: +# - sustain N failures (1-3), and +# - has N distributed spares to replace all faulted vdevs +# b. Fill the pool with data +# c. Systematically fault a vdev, then replace it with a spare +# d. Scrub the pool to verify no data was lost +# e. Verify the contents of files in the pool +# + +log_assert "Verify resilver to dRAID distributed spares" + +log_onexit cleanup + +for replace_mode in "healing" "sequential"; do + + if [[ "$replace_mode" = "sequential" ]]; then + flags="-s" + else + flags="" + fi + + parity=$(random_int_between 1 3) + spares=$(random_int_between $parity 3) + data=$(random_int_between 1 8) + + (( min_children = (data + parity + spares) )) + children=$(random_int_between $min_children 16) + + draid="draid${parity}:${data}d:${children}c:${spares}s" + + setup_test_env $TESTPOOL $draid $children + + i=0 + while [[ $i -lt $spares ]]; do + fault_vdev="$BASEDIR/vdev$i" + spare_vdev="draid${parity}-0-${i}" + + log_must zpool offline -f $TESTPOOL $fault_vdev + log_must check_vdev_state $TESTPOOL $fault_vdev "FAULTED" + log_must zpool replace -w $flags $TESTPOOL \ + $fault_vdev $spare_vdev + log_must check_vdev_state $TESTPOOL spare-$i "DEGRADED" + log_must check_vdev_state $TESTPOOL $spare_vdev "ONLINE" + log_must check_hotspare_state $TESTPOOL $spare_vdev "INUSE" + log_must zpool detach $TESTPOOL $fault_vdev + + resilver_cksum=$(cksum_pool $TESTPOOL) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $TESTPOOL + log_fail "$replace_mode resilver " \ + "cksum errors: $resilver_cksum" + fi + + if [[ "$replace_mode" = "healing" ]]; then + log_must zpool scrub $TESTPOOL + fi + + log_must wait_scrubbed $TESTPOOL + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + scrub_cksum=$(cksum_pool $TESTPOOL) + if [[ $scrub_cksum != 0 ]]; then + log_must zpool status -v $TESTPOOL + log_fail "scrub cksum
errors: $scrub_cksum" + fi + + (( i += 1 )) + done + + log_must is_data_valid $TESTPOOL + + cleanup +done + +log_pass "Verify resilver to dRAID distributed spares" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh new file mode 100644 index 000000000000..08fdd558f929 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare2.ksh @@ -0,0 +1,80 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify multiple dRAID spares can be used. +# +# STRATEGY: +# 1. Create a pool and fill it with data. +# 2. Engage 3 distributed spares and verify the pool +# 3. Refill the filesystem with new data +# 4. Clear the pool to online previous faulted devices and resilver +# 5. 
Verify the pool and its contents +# + +log_assert "Verify multiple dRAID spares" + +log_onexit cleanup + +parity=1 +spares=3 +data=$(random_int_between 1 4) +children=10 +draid="draid${parity}:${data}d:${children}c:${spares}s" + +setup_test_env $TESTPOOL $draid $children + +# Replace vdev7 -> draid1-0-0 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev7 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev7 draid1-0-0 + +# Replace vdev8 -> draid1-0-1 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev8 draid1-0-1 + +# Replace vdev9 -> draid1-0-2 +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev9 +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev9 draid1-0-2 + +# Verify, refill and verify the pool contents. +verify_pool $TESTPOOL +refill_test_env $TESTPOOL +verify_pool $TESTPOOL + +# Bring everything back online and check for errors. +log_must zpool clear $TESTPOOL +log_must zpool wait -t resilver $TESTPOOL + +log_must wait_hotspare_state $TESTPOOL draid1-0-0 "AVAIL" +log_must wait_hotspare_state $TESTPOOL draid1-0-1 "AVAIL" +log_must wait_hotspare_state $TESTPOOL draid1-0-2 "AVAIL" + +log_must zpool scrub -w $TESTPOOL +log_must check_pool_status $TESTPOOL "scan" "repaired 0B" +log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + +log_must is_data_valid $TESTPOOL + +log_pass "Verify multiple dRAID spares" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh new file mode 100644 index 000000000000..587a1be0a66a --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_draid_spare3.ksh @@ -0,0 +1,197 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify dRAID resilver to traditional and distributed spares for +# a variety of pool configurations and pool states. +# +# STRATEGY: +# 1. For resilvers: +# a. Create a semi-random dRAID pool configuration which can +# sustain 1 failure and has 5 distributed spares. +# b. Fill the pool with data +# c. Systematically fault and replace vdevs in the pools with +# spares to test resilvering in common pool states. +# d. Scrub the pool to verify no data was lost +# e. Verify the contents of files in the pool +# + +log_assert "Verify dRAID resilver" + +function cleanup_tunable +{ + log_must set_tunable32 REBUILD_SCRUB_ENABLED 1 + cleanup +} + +log_onexit cleanup_tunable + +if is_kmemleak; then + log_unsupported "Test case runs slowly when kmemleak is enabled" +fi + +# +# Disable scrubbing after a sequential resilver to verify the resilver +# alone is able to reconstruct the data without the help of a scrub. +# +log_must set_tunable32 REBUILD_SCRUB_ENABLED 0 + +for replace_mode in "healing" "sequential"; do + + if [[ "$replace_mode" = "sequential" ]]; then + flags="-s" + else + flags="" + fi + + parity=1 + spares=5 + data=$(random_int_between 1 4) + children=10 + draid="draid${parity}:${data}d:${children}c:${spares}s" + + setup_test_env $TESTPOOL $draid $children + + # + # Perform a variety of replacements to normal and distributed spares + # for a variety of different vdev configurations to exercise different + # resilver code paths.
The final configuration is expected to be: + # + # NAME STATE READ WRITE CKSUM + # testpool DEGRADED 0 0 0 + # draid1:1d:10c:5s-0 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/new_vdev0 ONLINE 0 0 0 + # /var/tmp/basedir.28683/new_vdev1 ONLINE 0 0 0 + # spare-2 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev2 FAULTED 0 0 0 + # draid1-0-3 ONLINE 0 0 0 + # spare-3 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev3 FAULTED 0 0 0 + # draid1-0-4 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev4 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev5 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev6 ONLINE 0 0 0 + # draid1-0-0 ONLINE 0 0 0 + # spare-8 DEGRADED 0 0 0 + # /var/tmp/basedir.28683/vdev8 FAULTED 0 0 0 + # draid1-0-1 ONLINE 0 0 0 + # spare-9 ONLINE 0 0 0 + # /var/tmp/basedir.28683/vdev9 ONLINE 0 0 0 + # draid1-0-2 ONLINE 0 0 0 + # spares + # draid1-0-0 INUSE currently in use + # draid1-0-1 INUSE currently in use + # draid1-0-2 INUSE currently in use + # draid1-0-3 INUSE currently in use + # draid1-0-4 INUSE currently in use + # + + # Distributed spare which replaces original online device + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev7 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev7 draid1-0-0 + log_must zpool detach $TESTPOOL $BASEDIR/vdev7 + log_must check_vdev_state $TESTPOOL draid1-0-0 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-0 "INUSE" + + # Distributed spare in mirror with original device faulted + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev8 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev8 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev8 draid1-0-1 + log_must check_vdev_state $TESTPOOL spare-8 "DEGRADED" + log_must check_vdev_state $TESTPOOL draid1-0-1 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-1 "INUSE" + + # Distributed spare in mirror with original device still online + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev9 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev9 
draid1-0-2 + log_must check_vdev_state $TESTPOOL spare-9 "ONLINE" + log_must check_vdev_state $TESTPOOL draid1-0-2 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-2 "INUSE" + + # Normal faulted device replacement + new_vdev0="$BASEDIR/new_vdev0" + log_must truncate -s $MINVDEVSIZE $new_vdev0 + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev0 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev0 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev0 $new_vdev0 + log_must check_vdev_state $TESTPOOL $new_vdev0 "ONLINE" + + # Distributed spare faulted device replacement + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev2 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev2 "FAULTED" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev2 draid1-0-3 + log_must check_vdev_state $TESTPOOL spare-2 "DEGRADED" + log_must check_vdev_state $TESTPOOL draid1-0-3 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-3 "INUSE" + + # Normal online device replacement + new_vdev1="$BASEDIR/new_vdev1" + log_must truncate -s $MINVDEVSIZE $new_vdev1 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev1 "ONLINE" + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev1 $new_vdev1 + log_must check_vdev_state $TESTPOOL $new_vdev1 "ONLINE" + + # Distributed spare online device replacement (then fault) + log_must zpool replace -w $flags $TESTPOOL $BASEDIR/vdev3 draid1-0-4 + log_must check_vdev_state $TESTPOOL spare-3 "ONLINE" + log_must check_vdev_state $TESTPOOL draid1-0-4 "ONLINE" + log_must check_hotspare_state $TESTPOOL draid1-0-4 "INUSE" + log_must zpool offline -f $TESTPOOL $BASEDIR/vdev3 + log_must check_vdev_state $TESTPOOL $BASEDIR/vdev3 "FAULTED" + log_must check_vdev_state $TESTPOOL spare-3 "DEGRADED" + + resilver_cksum=$(cksum_pool $TESTPOOL) + if [[ $resilver_cksum != 0 ]]; then + log_must zpool status -v $TESTPOOL + log_fail "$replace_mode resilver cksum errors: $resilver_cksum" + fi + + if [[ "$replace_mode" = "healing" ]]; then + 
log_must zpool scrub -w $TESTPOOL + else + if [[ $(get_tunable REBUILD_SCRUB_ENABLED) -eq 0 ]]; then + log_must zpool scrub -w $TESTPOOL + else + log_must wait_scrubbed $TESTPOOL + fi + fi + + log_must is_pool_scrubbed $TESTPOOL + + scrub_cksum=$(cksum_pool $TESTPOOL) + if [[ $scrub_cksum != 0 ]]; then + log_must zpool status -v $TESTPOOL + log_fail "scrub cksum errors: $scrub_cksum" + fi + + log_must check_pool_status $TESTPOOL "scan" "repaired 0B" + log_must check_pool_status $TESTPOOL "scan" "with 0 errors" + + log_must is_data_valid $TESTPOOL + + cleanup +done + +log_pass "Verify resilver to dRAID distributed spares" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh new file mode 100644 index 000000000000..0a01c47106b3 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/redundancy/redundancy_raidz3.ksh @@ -0,0 +1,84 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. 
+# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# A raidz3 pool can withstand 3 devices failing or missing. +# +# STRATEGY: +# 1. Create N (between 4 and 5) virtual disk files. +# 2. Create raidz3 pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. Record all the files and directories checksum information. +# 5. Damage at most three of the virtual disk files. +# 6. Verify the data is correct to prove raidz3 can withstand 3 devices +# failing. +# + +verify_runnable "global" + +log_assert "Verify raidz3 pool can withstand three devices failing." +log_onexit cleanup + +typeset -i cnt=$(random_int_between 4 5) +setup_test_env $TESTPOOL raidz3 $cnt + +# +# Inject data corruption errors for raidz3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i "label" + log_must is_data_valid $TESTPOOL + log_must clear_errors $TESTPOOL +done + +# +# Inject bad devices errors for raidz3 pool +# +for i in 1 2 3; do + damage_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +# +# Inject missing device errors for raidz3 pool +# +for i in 1 2 3; do + remove_devs $TESTPOOL $i + log_must is_data_valid $TESTPOOL + log_must recover_bad_missing_devs $TESTPOOL $i +done + +log_pass "raidz3 pool can withstand three devices failing passed."
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/Makefile.am index 4cc773463356..878935b96d3c 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/Makefile.am @@ -29,7 +29,8 @@ dist_pkgdata_SCRIPTS = \ removal_with_send.ksh removal_with_send_recv.ksh \ removal_with_snapshot.ksh removal_with_write.ksh \ removal_with_zdb.ksh remove_mirror.ksh remove_mirror_sanity.ksh \ - remove_raidz.ksh remove_expanded.ksh remove_indirect.ksh + remove_raidz.ksh remove_expanded.ksh remove_indirect.ksh \ + remove_attach_mirror.ksh dist_pkgdata_DATA = \ removal.kshlib diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/remove_attach_mirror.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/remove_attach_mirror.ksh new file mode 100644 index 000000000000..9bbb07cd9419 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/removal/remove_attach_mirror.ksh @@ -0,0 +1,73 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# Resilvering results in no CKSUM errors in pools with indirect vdevs. +# +# STRATEGY: +# 1. Create a pool with two top-vdevs +# 2. Write some files +# 3. Remove one of the top-vdevs +# 4. 
Reattach it to make a mirror +# + +TMPDIR=${TMPDIR:-$TEST_BASE_DIR} + +DISK1="$TMPDIR/dsk1" +DISK2="$TMPDIR/dsk2" +DISKS="$DISK1 $DISK2" + +# fio options +export DIRECTORY=/$TESTPOOL +export NUMJOBS=16 +export RUNTIME=10 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=4K +export SYNC_TYPE=0 +export DIRECT=1 +export FILE_SIZE=128M + +function cleanup +{ + default_cleanup_noexit + log_must rm -f $DISKS +} + +# Register cleanup first so a failed mkfile/zpool create cannot leak the disk files. +log_onexit cleanup +log_must mkfile 4g $DISK1 +log_must mkfile 4g $DISK2 +log_must zpool create -O recordsize=4k $TESTPOOL $DISK1 $DISK2 + +log_must fio $FIO_SCRIPTS/mkfiles.fio +log_must fio $FIO_SCRIPTS/sequential_reads.fio + +log_must zpool remove -w $TESTPOOL $DISK2 +log_must zpool attach -w $TESTPOOL $DISK1 $DISK2 + +verify_pool $TESTPOOL + +log_pass "Resilvering results in no CKSUM errors with indirect vdevs" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh index e9427c7adc9d..998d3eec7c71 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh @@ -38,7 +38,7 @@ # Attaching disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Attach a disk to the pool. # 3. Verify the integrity of the file system and the resilvering. @@ -152,7 +152,7 @@ done log_note "Verify 'zpool attach' fails with non-mirrors."
-for type in "" "raidz" "raidz1"; do +for type in "" "raidz" "raidz1" "draid" "draid1"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh index 4261d4d67cc0..e99d681bb21d 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh @@ -37,7 +37,7 @@ # Attaching disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Attach a disk to the pool. # 3. Verify the integrity of the file system and the resilvering. @@ -151,7 +151,7 @@ done log_note "Verify 'zpool attach' fails with non-mirrors." -for type in "" "raidz" "raidz1"; do +for type in "" "raidz" "raidz1" "draid"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/detach.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/detach.ksh index aa3ec4f7a75d..f049c639d8a6 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/detach.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/detach.ksh @@ -37,7 +37,7 @@ # Detaching disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Detach a disk from the pool. # 3. Verify the integrity of the file system and the resilvering. 
@@ -143,7 +143,7 @@ destroy_pool $TESTPOOL1 log_note "Verify 'zpool detach' fails with non-mirrors." -for type in "" "raidz" "raidz1"; do +for type in "" "raidz" "raidz1" "draid"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh index c919b44b21cc..26dc6f87b26e 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh @@ -26,7 +26,7 @@ # # DESCRIPTION: # Executing 'zpool replace -s' for raidz vdevs failed. Sequential -# resilvers are only allowed for stripe/mirror pools. +# resilvers are only allowed for stripe/mirror/dRAID pools. # # STRATEGY: # 1. Create a raidz pool, verify 'zpool replace -s' fails @@ -67,4 +67,9 @@ log_must zpool create $TESTPOOL1 mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE destroy_pool $TESTPOOL1 +# draid +log_must zpool create $TESTPOOL1 draid ${VDEV_FILES[@]} +log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE +destroy_pool $TESTPOOL1 + log_pass "Sequential resilver is not allowed for raidz vdevs" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh index 5997352284b4..b3c7995fd62a 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh @@ -38,7 +38,7 @@ # Replacing disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror) and +# 1. 
Create multidisk pools (stripe/mirror/draid) and # start some random I/O # 2. Replace a disk in the pool with another disk. # 3. Verify the integrity of the file system and the rebuilding. @@ -137,7 +137,7 @@ done # log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE -for type in "" "mirror"; do +for type in "" "mirror" "draid"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh index 253cf65e452b..2585397bba88 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh @@ -37,7 +37,7 @@ # Replacing disks during I/O should pass for supported pools. # # STRATEGY: -# 1. Create multidisk pools (stripe/mirror/raidz) and +# 1. Create multidisk pools (stripe/mirror/raidz/draid) and # start some random I/O # 2. Replace a disk in the pool with another disk. # 3. Verify the integrity of the file system and the resilvering. 
@@ -134,7 +134,7 @@ done # log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE -for type in "" "raidz" "mirror"; do +for type in "" "raidz" "mirror" "draid"; do for op in "" "-f"; do create_pool $TESTPOOL1 $type $specials_list log_must zfs create $TESTPOOL1/$TESTFS1 diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh index d48ee45d03b1..924b56935def 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_config.ksh @@ -70,14 +70,20 @@ log_must set_tunable64 VDEV_MIN_MS_COUNT 32 typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) -for type in "" "mirror" "raidz2"; do +for type in "" "mirror" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" - else + elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" + elif [[ "$type" = "draid" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + + # The per-vdev utilization is lower due to the capacity + # reserved for the distributed spare.
+ VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) fi log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh index 6af877241d0d..78fe18fa6946 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh @@ -60,7 +60,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh index a0dd1c88496d..13c9b95e0661 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh @@ -61,7 +61,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "raidz2" "draid" "draid2"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_config.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_config.ksh index 44f187cc646c..9a6e19e1c042 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_config.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_config.ksh 
@@ -70,14 +70,20 @@ log_must set_tunable64 VDEV_MIN_MS_COUNT 32 typeset VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.75 / 1024 / 1024) )) typeset VDEV_MIN_MB=$(( floor(4 * MINVDEVSIZE * 0.30 / 1024 / 1024) )) -for type in "" "mirror" "raidz2"; do +for type in "" "mirror" "raidz2" "draid"; do if [[ "$type" = "" ]]; then VDEVS="$TRIM_VDEV1" elif [[ "$type" = "mirror" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" - else + elif [[ "$type" = "raidz2" ]]; then VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3" + elif [[ "$type" = "draid" ]]; then + VDEVS="$TRIM_VDEV1 $TRIM_VDEV2 $TRIM_VDEV3 $TRIM_VDEV4" + + # The per-vdev utilization is lower due to the capacity + # reserved for the distributed spare. + VDEV_MAX_MB=$(( floor(4 * MINVDEVSIZE * 0.50 / 1024 / 1024) )) fi log_must truncate -s $((4 * MINVDEVSIZE)) $VDEVS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh index e25b52747c69..38f226d7f8e7 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/trim/trim_integrity.ksh @@ -60,7 +60,7 @@ log_must set_tunable64 TRIM_EXTENT_BYTES_MIN 4096 typeset trim_txg_batch=$(get_tunable TRIM_TXG_BATCH) log_must set_tunable64 TRIM_TXG_BATCH 8 -for type in "" "mirror" "raidz" "raidz2" "raidz3"; do +for type in "" "mirror" "raidz" "draid"; do log_must truncate -s 1G $TRIM_VDEVS log_must zpool create -f $TESTPOOL $type $TRIM_VDEVS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/Makefile.am index 8f0287bc17b2..9100e4adadca 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/Makefile.am @@ -20,7 +20,8 @@ dist_pkgdata_SCRIPTS = \ userquota_013_pos.ksh \ userspace_001_pos.ksh \ 
userspace_002_pos.ksh \ - userspace_003_pos.ksh + userspace_003_pos.ksh \ + userspace_encrypted.ksh dist_pkgdata_DATA = \ userquota.cfg \ diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/userspace_encrypted.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/userspace_encrypted.ksh new file mode 100644 index 000000000000..429b16e04e44 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/userspace_encrypted.ksh @@ -0,0 +1,85 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019, loli10K . All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/userquota/userquota_common.kshlib + +# +# DESCRIPTION: +# 'zfs userspace' and 'zfs groupspace' can be used on encrypted datasets +# +# +# STRATEGY: +# 1. Create both un-encrypted and encrypted datasets +# 2. Receive un-encrypted dataset in encrypted hierarchy +# 3. Verify encrypted datasets support 'zfs userspace' and 'zfs groupspace' +# + +function cleanup +{ + destroy_pool $POOLNAME + rm -f $FILEDEV +} + +function log_must_unsupported +{ + log_must_retry "unsupported" 3 "$@" + (( $? 
!= 0 )) && log_fail +} + +log_onexit cleanup + +FILEDEV="$TEST_BASE_DIR/userspace_encrypted" +POOLNAME="testpool$$" +typeset -a POOL_OPTS=('' # all pool features enabled + '-d' # all pool features disabled + '-d -o feature@userobj_accounting=enabled' # only userobj_accounting enabled + '-d -o feature@project_quota=enabled') # only project_quota enabled +DATASET_ENCROOT="$POOLNAME/encroot" +DATASET_SENDFS="$POOLNAME/sendfs" + +log_assert "'zfs user/groupspace' should work on encrypted datasets" + +for opts in "${POOL_OPTS[@]}"; do + # Setup + truncate -s $SPA_MINDEVSIZE $FILEDEV + log_must zpool create $opts -o feature@encryption=enabled $POOLNAME \ + $FILEDEV + + # 1. Create both un-encrypted and encrypted datasets + log_must zfs create $DATASET_SENDFS + log_must eval "echo 'password' | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt " \ + "$DATASET_ENCROOT" + log_must zfs create $DATASET_ENCROOT/fs + + # 2. Receive un-encrypted dataset in encrypted hierarchy + log_must zfs snap $DATASET_SENDFS@snap + log_must eval "zfs send $DATASET_SENDFS@snap | zfs recv " \ + "$DATASET_ENCROOT/recvfs" + + # 3. 
Verify encrypted datasets support 'zfs userspace' and + # 'zfs groupspace' + log_must zfs userspace $DATASET_ENCROOT/fs + log_must zfs groupspace $DATASET_ENCROOT/fs + log_must_unsupported zfs userspace $DATASET_ENCROOT/recvfs + log_must_unsupported zfs groupspace $DATASET_ENCROOT/recvfs + + # Cleanup + cleanup +done + +log_pass "'zfs user/groupspace' works on encrypted datasets" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/userspace_send_encrypted.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/userspace_send_encrypted.ksh new file mode 100644 index 000000000000..fbd2cc99b55b --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/userquota/userspace_send_encrypted.ksh @@ -0,0 +1,108 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2020, George Amanakis . All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/userquota/userquota_common.kshlib + +# +# DESCRIPTION: +# Sending raw encrypted datasets back to the source dataset succeeds. +# +# +# STRATEGY: +# 1. Create encrypted source dataset, set userquota and write a file +# 2. Create base and an additional snapshot (s1) +# 3. Unmount the source dataset +# 4. Raw send the base snapshot to a new target dataset +# 5. Raw send incrementally the s1 snapshot to the new target dataset +# 6. Mount both source and target datasets +# 7. 
Verify encrypted datasets support 'zfs userspace' and 'zfs groupspace' +# and the accounting is done correctly +# + +function cleanup +{ + destroy_pool $POOLNAME + rm -f $FILEDEV +} + +function log_must_unsupported +{ + log_must_retry "unsupported" 3 "$@" + (( $? != 0 )) && log_fail +} + +log_onexit cleanup + +FILEDEV="$TEST_BASE_DIR/userspace_encrypted" +POOLNAME="testpool$$" +ENC_SOURCE="$POOLNAME/source" +ENC_TARGET="$POOLNAME/target" + +log_assert "Sending raw encrypted datasets back to the source dataset succeeds." + +# Setup +truncate -s 200m $FILEDEV +log_must zpool create -o feature@encryption=enabled $POOLNAME \ + $FILEDEV + +# Create encrypted source dataset +log_must eval "echo 'password' | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt " \ + "$ENC_SOURCE" + +# Set user quota and write file +log_must zfs set userquota@$QUSER1=50m $ENC_SOURCE +mkmount_writable $ENC_SOURCE +mntpnt=$(get_prop mountpoint $ENC_SOURCE) +log_must user_run $QUSER1 mkfile 20m /$mntpnt/file +sync + +# Snapshot, raw send to new dataset +log_must zfs snap $ENC_SOURCE@base +log_must zfs snap $ENC_SOURCE@s1 +log_must zfs umount $ENC_SOURCE +log_must eval "zfs send -w $ENC_SOURCE@base | zfs recv " \ + "$ENC_TARGET" + +log_must eval "zfs send -w -i @base $ENC_SOURCE@s1 | zfs recv " \ + "$ENC_TARGET" + +log_must zfs destroy $ENC_SOURCE@s1 +log_must eval "zfs send -w -i @base $ENC_TARGET@s1 | zfs recv " \ + "$ENC_SOURCE" + +# Mount encrypted datasets and verify they support 'zfs userspace' and +# 'zfs groupspace' and the accounting is done correctly +log_must zfs mount $ENC_SOURCE +log_must eval "echo password | zfs load-key $ENC_TARGET" +log_must zfs mount $ENC_TARGET +sync + +src_uspace=$(( $(zfs userspace -Hp $ENC_SOURCE | grep $QUSER1 | \ + awk '{print $4}')/1024/1024)) +tgt_uspace=$(( $(zfs userspace -Hp $ENC_TARGET | grep $QUSER1 | \ + awk '{print $4}')/1024/1024)) +log_must test "$src_uspace" -eq "$tgt_uspace" + +src_uquota=$(zfs userspace -Hp 
$ENC_SOURCE | grep $QUSER1 | awk '{print $5}') +tgt_uquota=$(zfs userspace -Hp $ENC_TARGET | grep $QUSER1 | awk '{print $5}') +log_must test "$src_uquota" -eq "$tgt_uquota" + +# Cleanup +cleanup + +log_pass "Sending raw encrypted datasets back to the source dataset succeeds." diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_004_pos.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_004_pos.ksh index 0a25d7ac507d..786322b30a97 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_004_pos.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/xattr/xattr_004_pos.ksh @@ -33,11 +33,11 @@ # # DESCRIPTION: # -# Creating files on ufs|ext and tmpfs, and copying those files to ZFS with -# appropriate cp flags, the xattrs will still be readable. +# Create files on ufs|ext, copy those files to ZFS with appropriate cp flags, +# and verify the xattrs will still be readable. # # STRATEGY: -# 1. Create files in ufs|ext and tmpfs with xattrs +# 1. Create files in ufs|ext with xattrs # 2. Copy those files to zfs # 3. Ensure the xattrs can be read and written # 4. Do the same in reverse. @@ -54,7 +54,7 @@ function cleanup { fi } -log_assert "Files from $NEWFS_DEFAULT_FS,tmpfs with xattrs copied to zfs retain xattr info." +log_assert "Files from $NEWFS_DEFAULT_FS with xattrs copied to zfs retain xattr info." log_onexit cleanup # Create a ufs|ext file system that we can work in @@ -63,28 +63,23 @@ block_device_wait log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTFS/zvol > /dev/null 2>&1" log_must mkdir /tmp/$NEWFS_DEFAULT_FS.$$ -log_must mkdir /tmp/tmpfs.$$ if is_illumos; then log_must mount $ZVOL_DEVDIR/$TESTPOOL/$TESTFS/zvol \ /tmp/$NEWFS_DEFAULT_FS.$$ - # Create files in ufs and tmpfs, and set some xattrs on them. + # Create files in ufs, and set some xattrs on them. 
log_must touch /tmp/$NEWFS_DEFAULT_FS.$$/$NEWFS_DEFAULT_FS-file.$$ - log_must touch /tmp/tmpfs-file.$$ log_must runat /tmp/$NEWFS_DEFAULT_FS.$$/$NEWFS_DEFAULT_FS-file.$$ \ cp /etc/passwd . - log_must runat /tmp/tmpfs-file.$$ cp /etc/group . # copy those files to ZFS log_must cp -@ /tmp/$NEWFS_DEFAULT_FS.$$/$NEWFS_DEFAULT_FS-file.$$ \ $TESTDIR - log_must cp -@ /tmp/tmpfs-file.$$ $TESTDIR # ensure the xattr information has been copied correctly log_must runat $TESTDIR/$NEWFS_DEFAULT_FS-file.$$ \ diff passwd /etc/passwd - log_must runat $TESTDIR/tmpfs-file.$$ diff group /etc/group log_must umount /tmp/$NEWFS_DEFAULT_FS.$$ else @@ -94,21 +89,15 @@ else log_must mount ${options:+""} \ $ZVOL_DEVDIR/$TESTPOOL/$TESTFS/zvol /tmp/$NEWFS_DEFAULT_FS.$$ - # Create files in ext and tmpfs, and set some xattrs on them. + # Create files in ext, and set some xattrs on them. # Use small values for xattrs for ext compatibility. log_must touch /tmp/$NEWFS_DEFAULT_FS.$$/$NEWFS_DEFAULT_FS-file.$$ echo "TEST XATTR" >/tmp/xattr1 - echo "1234567890" >/tmp/xattr2 log_must set_xattr_stdin xattr1 \ /tmp/$NEWFS_DEFAULT_FS.$$/$NEWFS_DEFAULT_FS-file.$$ /tmp/xattr2.$$" - log_must diff /tmp/xattr2.$$ /tmp/xattr2 - log_must rm /tmp/tmpfs-file.$$ - log_must rm /tmp/xattr2 /tmp/xattr2.$$ - fi - log_must umount /tmp/$NEWFS_DEFAULT_FS.$$ fi -log_pass "Files from $NEWFS_DEFAULT_FS,tmpfs with xattrs copied to zfs retain xattr info." +log_pass "Files from $NEWFS_DEFAULT_FS with xattrs copied to zfs retain xattr info." 
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/Makefile.am new file mode 100644 index 000000000000..36d08a41a91c --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/Makefile.am @@ -0,0 +1,5 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/zpool_influxdb +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + zpool_influxdb.ksh diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/cleanup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/cleanup.ksh new file mode 100644 index 000000000000..a8cd2e4b611f --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/cleanup.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at +# https://opensource.org/licenses/CDDL-1.0 +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Richard Elling +# + +. 
$STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/setup.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/setup.ksh new file mode 100644 index 000000000000..43f2c8c20b29 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/setup.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at +# https://opensource.org/licenses/CDDL-1.0 +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Richard Elling +# + +. $STF_SUITE/include/libtest.shlib + +default_raidz_setup $DISKS diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/zpool_influxdb.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/zpool_influxdb.ksh new file mode 100644 index 000000000000..495a4a38b7f5 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/zpool_influxdb/zpool_influxdb.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at +# https://opensource.org/licenses/CDDL-1.0 +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2020 Richard Elling +# + +. $STF_SUITE/include/libtest.shlib + +typeset tmpfile=$TEST_BASE_DIR/zpool_influxdb.out.$$ +function cleanup +{ + if [[ -f $tmpfile ]]; then + rm -f $tmpfile + fi +} +log_onexit cleanup + +log_assert "zpool_influxdb gathers statistics" + +if ! is_global_zone ; then + TESTPOOL=${TESTPOOL%%/*} +fi + +function check_for +{ + grep "^${1}," $tmpfile >/dev/null 2>/dev/null + if [ $? -ne 0 ]; then + log_fail "cannot find stats for $1" + fi +} + +# by default, all stats and histograms for all pools +log_must zpool_influxdb > $tmpfile + +STATS=" +zpool_io_size +zpool_latency +zpool_stats +zpool_vdev_queue +zpool_vdev_stats +" +for stat in $STATS; do + check_for $stat +done + +# scan stats aren't expected to be there until after a scan has started +zpool scrub $TESTPOOL +zpool_influxdb > $tmpfile +check_for zpool_scan_stats + +log_pass "zpool_influxdb gathers statistics"