From 00b46022c676e402e3f33ce93ee2983bbad2c46f Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Thu, 26 Aug 2010 11:46:09 -0700
Subject: [PATCH] Add linux kernel memory support

Required kmem/vmem changes

Signed-off-by: Brian Behlendorf
---
 lib/libzpool/include/sys/zfs_context.h |  5 +++++
 lib/libzpool/kernel.c                  | 28 ++++++++++++++------------
 module/zfs/arc.c                       | 19 +++++++++++++++--
 module/zfs/dbuf.c                      | 12 +++++++++++
 module/zfs/ddt.c                       |  5 ++++-
 module/zfs/dmu_send.c                  |  4 ++--
 module/zfs/spa.c                       |  6 +++---
 module/zfs/spa_config.c                |  4 ++--
 module/zfs/spa_history.c               |  9 ++++-----
 module/zfs/spa_misc.c                  |  2 +-
 module/zfs/txg.c                       |  4 ++--
 module/zfs/zap_micro.c                 |  6 +++---
 module/zfs/zfs_ioctl.c                 |  8 ++++----
 module/zfs/zil.c                       |  6 +++---
 14 files changed, 77 insertions(+), 41 deletions(-)

diff --git a/lib/libzpool/include/sys/zfs_context.h b/lib/libzpool/include/sys/zfs_context.h
index 34c351bd0dbc..338c871e0df5 100644
--- a/lib/libzpool/include/sys/zfs_context.h
+++ b/lib/libzpool/include/sys/zfs_context.h
@@ -349,10 +349,14 @@ extern void kstat_delete(kstat_t *);
 #define KM_SLEEP UMEM_NOFAIL
 #define KM_PUSHPAGE KM_SLEEP
 #define KM_NOSLEEP UMEM_DEFAULT
+#define KM_NODEBUG 0x0
 #define KMC_NODEBUG UMC_NODEBUG
 #define kmem_alloc(_s, _f) umem_alloc(_s, _f)
 #define kmem_zalloc(_s, _f) umem_zalloc(_s, _f)
 #define kmem_free(_b, _s) umem_free(_b, _s)
+#define vmem_alloc(_s, _f) kmem_alloc(_s, _f)
+#define vmem_zalloc(_s, _f) kmem_zalloc(_s, _f)
+#define vmem_free(_b, _s) kmem_free(_b, _s)
 #define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \
 	umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i)
 #define kmem_cache_destroy(_c) umem_cache_destroy(_c)
@@ -575,6 +579,7 @@ typedef struct callb_cpr {
 #define zone_dataset_visible(x, y) (1)
 #define INGLOBALZONE(z) (1)
 
+extern char *kmem_vasprintf(const char *fmt, va_list adx);
 extern char *kmem_asprintf(const char *fmt, ...);
 #define strfree(str) kmem_free((str), strlen(str)+1)
 
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index 0559347e96b9..494e544ea7f8 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -1107,25 +1107,27 @@ ksiddomain_rele(ksiddomain_t *ksid)
 	umem_free(ksid, sizeof (ksiddomain_t));
 }
 
-/*
- * Do not change the length of the returned string; it must be freed
- * with strfree().
- */
+char *
+kmem_vasprintf(const char *fmt, va_list adx)
+{
+	char *buf = NULL;
+	va_list adx_copy;
+
+	va_copy(adx_copy, adx);
+	VERIFY(vasprintf(&buf, fmt, adx_copy) != -1);
+	va_end(adx_copy);
+
+	return (buf);
+}
+
 char *
 kmem_asprintf(const char *fmt, ...)
 {
-	int size;
+	char *buf = NULL;
 	va_list adx;
-	char *buf;
 
 	va_start(adx, fmt);
-	size = vsnprintf(NULL, 0, fmt, adx) + 1;
-	va_end(adx);
-
-	buf = kmem_alloc(size, KM_SLEEP);
-
-	va_start(adx, fmt);
-	size = vsnprintf(buf, size, fmt, adx);
+	VERIFY(vasprintf(&buf, fmt, adx) != -1);
 	va_end(adx);
 
 	return (buf);
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 5ac73e1158ca..f1d51805b19f 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -523,12 +523,13 @@ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
  * Hash table routines
  */
 
-#define HT_LOCK_PAD 64
+#define HT_LOCK_ALIGN 64
+#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
 
 struct ht_lock {
 	kmutex_t ht_lock;
 #ifdef _KERNEL
-	unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
+	unsigned char pad[HT_LOCK_PAD];
 #endif
 };
 
@@ -772,8 +773,15 @@ buf_fini(void)
 {
 	int i;
 
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/* Large allocations which do not require contiguous pages
+	 * should be using vmem_free() in the linux kernel */
+	vmem_free(buf_hash_table.ht_table,
+	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
 	kmem_free(buf_hash_table.ht_table,
 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
 	kmem_cache_destroy(hdr_cache);
@@ -875,8 +883,15 @@ buf_init(void)
 		hsize <<= 1;
 retry:
 	buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/* Large allocations which do not require contiguous pages
+	 * should be using vmem_alloc() in the linux kernel */
+	buf_hash_table.ht_table =
+	    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
 	buf_hash_table.ht_table =
 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
 	if (buf_hash_table.ht_table == NULL) {
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index c1b27d4ef338..fb7d0ac86eca 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -293,7 +293,13 @@ dbuf_init(void)
 
 retry:
 	h->hash_table_mask = hsize - 1;
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/* Large allocations which do not require contiguous pages
+	 * should be using vmem_alloc() in the linux kernel */
+	h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+#else
 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+#endif
 	if (h->hash_table == NULL) {
 		/* XXX - we should really return an error instead of assert */
 		ASSERT(hsize > (1ULL << 10));
@@ -317,7 +323,13 @@ dbuf_fini(void)
 
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_destroy(&h->hash_mutexes[i]);
+#if defined(_KERNEL) && defined(HAVE_SPL)
+	/* Large allocations which do not require contiguous pages
+	 * should be using vmem_free() in the linux kernel */
+	vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#else
 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#endif
 	kmem_cache_destroy(dbuf_cache);
 }
 
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index ae9d2a5e139c..c7db3d7580bc 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -501,6 +501,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 {
 	ddt_histogram_t *ddh_total;
 
+	/* XXX: Move to a slab */
 	ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
 	ddt_get_dedup_histogram(spa, ddh_total);
 	ddt_histogram_stat(dds_total, ddh_total);
@@ -649,6 +650,7 @@ ddt_alloc(const ddt_key_t *ddk)
 {
 	ddt_entry_t *dde;
 
+	/* XXX: Move to a slab */
 	dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
 	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 
@@ -797,7 +799,8 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 {
 	ddt_t *ddt;
 
-	ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
+	/* XXX: Move to a slab */
+	ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP | KM_NODEBUG);
 
 	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&ddt->ddt_tree, ddt_entry_compare,
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index f13cfd316f90..ae0b36fc6380 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -1337,7 +1337,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
 	ra.vp = vp;
 	ra.voff = *voffp;
 	ra.bufsize = 1<<20;
-	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+	ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
@@ -1486,7 +1486,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
 		}
 	}
 
-	kmem_free(ra.buf, ra.bufsize);
+	vmem_free(ra.buf, ra.bufsize);
 	*voffp = ra.voff;
 	return (ra.err);
 }
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index e037f4133ff5..afdfa123221e 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1270,7 +1270,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 	nvsize = *(uint64_t *)db->db_data;
 	dmu_buf_rele(db, FTAG);
 
-	packed = kmem_alloc(nvsize, KM_SLEEP);
+	packed = kmem_alloc(nvsize, KM_SLEEP | KM_NODEBUG);
 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
 	    DMU_READ_PREFETCH);
 	if (error == 0)
@@ -5217,7 +5217,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 	 * saves us a pre-read to get data we don't actually care about.
 	 */
 	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
-	packed = kmem_alloc(bufsize, KM_SLEEP);
+	packed = vmem_alloc(bufsize, KM_SLEEP);
 
 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 	    KM_SLEEP) == 0);
@@ -5225,7 +5225,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 
 	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
 
-	kmem_free(packed, bufsize);
+	vmem_free(packed, bufsize);
 
 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c
index 1cf3950d450d..b7ef12a8fa38 100644
--- a/module/zfs/spa_config.c
+++ b/module/zfs/spa_config.c
@@ -96,7 +96,7 @@ spa_config_load(void)
 	if (kobj_get_filesize(file, &fsize) != 0)
 		goto out;
 
-	buf = kmem_alloc(fsize, KM_SLEEP);
+	buf = kmem_alloc(fsize, KM_SLEEP | KM_NODEBUG);
 
 	/*
 	 * Read the nvlist from the file.
@@ -159,7 +159,7 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
 	 */
 	VERIFY(nvlist_size(nvl, &buflen, NV_ENCODE_XDR) == 0);
 
-	buf = kmem_alloc(buflen, KM_SLEEP);
+	buf = kmem_alloc(buflen, KM_SLEEP | KM_NODEBUG);
 	temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 
 	VERIFY(nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_XDR,
diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c
index a65f16bccdde..ce7d378c6ff9 100644
--- a/module/zfs/spa_history.c
+++ b/module/zfs/spa_history.c
@@ -428,6 +428,7 @@ log_internal(history_internal_events_t event, spa_t *spa,
     dmu_tx_t *tx, const char *fmt, va_list adx)
 {
 	history_arg_t *ha;
+	va_list adx_copy;
 
 	/*
 	 * If this is part of creating a pool, not everything is
@@ -437,11 +438,9 @@ log_internal(history_internal_events_t event, spa_t *spa,
 		return;
 
 	ha = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
-	ha->ha_history_str = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1,
-	    KM_SLEEP);
-
-	(void) vsprintf(ha->ha_history_str, fmt, adx);
-
+	va_copy(adx_copy, adx);
+	ha->ha_history_str = kmem_vasprintf(fmt, adx_copy);
+	va_end(adx_copy);
 	ha->ha_log_type = LOG_INTERNAL;
 	ha->ha_event = event;
 	ha->ha_zone = NULL;
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 6fb3f90e327a..31e82d879adf 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -438,7 +438,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
-	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP | KM_NODEBUG);
 
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index f9f24dd0a945..5fc086e5de69 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -49,7 +49,7 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
 	int c;
 	bzero(tx, sizeof (tx_state_t));
 
-	tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+	tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
 
 	for (c = 0; c < max_ncpus; c++) {
 		int i;
@@ -107,7 +107,7 @@ txg_fini(dsl_pool_t *dp)
 	if (tx->tx_commit_cb_taskq != NULL)
 		taskq_destroy(tx->tx_commit_cb_taskq);
 
-	kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+	vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
 
 	bzero(tx, sizeof (tx_state_t));
 }
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index aa86bdecb11e..04369bbc50b3 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -532,7 +532,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
 
 	sz = zap->zap_dbuf->db_size;
-	mzp = kmem_alloc(sz, KM_SLEEP);
+	mzp = vmem_alloc(sz, KM_SLEEP);
 	bcopy(zap->zap_dbuf->db_data, mzp, sz);
 	nchunks = zap->zap_m.zap_num_chunks;
 
@@ -540,7 +540,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
 		    1ULL << fzap_default_block_shift, 0, tx);
 		if (err) {
-			kmem_free(mzp, sz);
+			vmem_free(mzp, sz);
 			return (err);
 		}
 	}
@@ -566,7 +566,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
 		if (err)
 			break;
 	}
-	kmem_free(mzp, sz);
+	vmem_free(mzp, sz);
 	*zapp = zap;
 	return (err);
 }
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 902c2342a718..221b1e335925 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -170,7 +170,7 @@ history_str_get(zfs_cmd_t *zc)
 	if (zc->zc_history == 0)
 		return (NULL);
 
-	buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
+	buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP | KM_NODEBUG);
 	if (copyinstr((void *)(uintptr_t)zc->zc_history,
 	    buf, HIS_MAX_RECORD_LEN, NULL) != 0) {
 		history_str_free(buf);
@@ -1027,7 +1027,7 @@ get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
 	if (size == 0)
 		return (EINVAL);
 
-	packed = kmem_alloc(size, KM_SLEEP);
+	packed = kmem_alloc(size, KM_SLEEP | KM_NODEBUG);
 
 	if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
 	    iflag)) != 0) {
@@ -1093,7 +1093,7 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
 	if (size > zc->zc_nvlist_dst_size) {
 		error = ENOMEM;
 	} else {
-		packed = kmem_alloc(size, KM_SLEEP);
+		packed = kmem_alloc(size, KM_SLEEP | KM_NODEBUG);
 		VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE,
 		    KM_SLEEP) == 0);
 		if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
@@ -5081,7 +5081,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
 	if (vec >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
 		return (-EINVAL);
 
-	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP | KM_NODEBUG);
 
 	error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
 	if (error != 0)
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 3c18d43fa5c2..ad11fd6c6357 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1075,7 +1075,7 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
 
 	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
 
-	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
+	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP|KM_NODEBUG);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
 	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
@@ -1939,7 +1939,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
 	zr.zr_replay = replay_func;
 	zr.zr_arg = arg;
 	zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
-	zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+	zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 
 	/*
 	 * Wait for in-progress removes to sync before starting replay.
@@ -1951,7 +1951,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg);
-	kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
+	vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
 	zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);