blobfs: don't flush a partial buffer with no sync pending

We flush a cache buffer once it's filled. When the write for that cache buffer has completed, we look to see if there's more data to flush. Currently if there's *any* more data to flush, we will flush it, even if it's not a full buffer. That can hurt performance though. Ideally we only want to flush a partial buffer if there's been an explicit sync operation that requires that partial buffer to be flushed. Otherwise we will end up writing the partial buffer to disk, and then come back and write that data again later when the buffer is full. Add a new unit test to test for this condition. This patch breaks one of the existing unit tests which was designed specifically around a RocksDB failure condition. Change that file_length unit test to now write exactly one CACHE_BUFFER, which still tests the general logic making sure that we don't confuse the amount of data flushed with the value written to the file's length xattr. Signed-off-by: Jim Harris <james.r.harris@intel.com> Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/455698 (master) (cherry picked from commit cdd089a8c5) Change-Id: I83795fb45afe854b38648d0e0c1a7928219307a2 Signed-off-by: Tomasz Zawadzki <tomasz.zawadzki@intel.com> Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/457276 Tested-by: SPDK CI Jenkins <sys_sgci@intel.com> Reviewed-by: Darek Stojaczyk <dariusz.stojaczyk@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Reviewed-by: Shuhei Matsumoto <shuhei.matsumoto.xt@hitachi.com>
2019-05-24 10:31:17 -07:00 · 2019-05-24 10:31:17 -07:00 · b85a64c878
commit b85a64c878
parent 31f18569bd
2 changed files with 78 additions and 10 deletions
--- a/lib/blobfs/blobfs.c
+++ b/lib/blobfs/blobfs.c
@ -2053,11 +2053,15 @@ __file_flush(void *ctx)

 	pthread_spin_lock(&file->lock);
 	next = spdk_tree_find_buffer(file->tree, file->length_flushed);
-	if (next == NULL || next->in_progress) {
+	if (next == NULL || next->in_progress ||
+	    ((next->bytes_filled < next->buf_size) && TAILQ_EMPTY(&file->sync_requests))) {
 		/*
-		 * There is either no data to flush, or a flush I/O is already in
-		 *  progress.  So return immediately - if a flush I/O is in
-		 *  progress we will flush more data after that is completed.
+		 * There is either no data to flush, a flush I/O is already in
+		 *  progress, or the next buffer is partially filled but there's no
+		 *  outstanding request to sync it.
+		 * So return immediately - if a flush I/O is in progress we will flush
+		 *  more data after that is completed, or a partial buffer will get flushed
+		 *  when it is either filled or the file is synced.
 		 */
 		free_fs_request(req);
 		if (next == NULL) {
--- a/test/unit/lib/blobfs/blobfs_sync_ut/blobfs_sync_ut.c
+++ b/test/unit/lib/blobfs/blobfs_sync_ut/blobfs_sync_ut.c
@ -163,6 +163,11 @@ _fs_unload(void *arg)
 	g_fs = NULL;
 }

+static void
+_nop(void *arg)
+{
+}
+
 static void
 cache_write(void)
 {
@ -220,13 +225,10 @@ file_length(void)
 	CU_ASSERT(rc == 0);
 	SPDK_CU_ASSERT_FATAL(g_file != NULL);

-	/* Write slightly more than one CACHE_BUFFER.  Filling at least one cache
-	 * buffer triggers a flush to disk.  Currently when that cache buffer is written,
-	 * it will proceed to write the final byte, even though there's been no explicit
-	 * sync for it yet.  We will fix that eventually, but for now test with this
-	 * behavior since it matches a subtle RocksDB failure scenario.
+	/* Write one CACHE_BUFFER.  Filling at least one cache buffer triggers
+	 * a flush to disk.
 	 */
-	buf_length = CACHE_BUFFER_SIZE + 1;
+	buf_length = CACHE_BUFFER_SIZE;
 	buf = calloc(1, buf_length);
 	spdk_file_write(g_file, channel, buf, 0, buf_length);
 	free(buf);
@ -277,6 +279,67 @@ file_length(void)
 	ut_send_request(_fs_unload, NULL);
 }

+static void
+partial_buffer(void)
+{
+	int rc;
+	char *buf;
+	uint64_t buf_length;
+	struct spdk_fs_thread_ctx *channel;
+	struct spdk_file_stat stat = {0};
+
+	ut_send_request(_fs_init, NULL);
+
+	channel = spdk_fs_alloc_thread_ctx(g_fs);
+
+	g_file = NULL;
+	rc = spdk_fs_open_file(g_fs, channel, "testfile", SPDK_BLOBFS_OPEN_CREATE, &g_file);
+	CU_ASSERT(rc == 0);
+	SPDK_CU_ASSERT_FATAL(g_file != NULL);
+
+	/* Write one CACHE_BUFFER plus one byte.  Filling at least one cache buffer triggers
+	 * a flush to disk.  We want to make sure the extra byte is not implicitly flushed.
+	 * It should only get flushed once we sync or close the file.
+	 */
+	buf_length = CACHE_BUFFER_SIZE + 1;
+	buf = calloc(1, buf_length);
+	spdk_file_write(g_file, channel, buf, 0, buf_length);
+	free(buf);
+
+	/* Send some nop messages to the dispatch thread.  This will ensure any of the
+	 * pending write operations are completed.  A well-functioning blobfs should only
+	 * issue one write for the filled CACHE_BUFFER - a buggy one might try to write
+	 * the extra byte.  So do a bunch of _nops to make sure all of them (even the buggy
+	 * ones) get a chance to run.  Note that we can't just send a message to the
+	 * dispatch thread to call spdk_thread_poll() because the messages are themselves
+	 * run in the context of spdk_thread_poll().
+	 */
+	ut_send_request(_nop, NULL);
+	ut_send_request(_nop, NULL);
+	ut_send_request(_nop, NULL);
+	ut_send_request(_nop, NULL);
+	ut_send_request(_nop, NULL);
+	ut_send_request(_nop, NULL);
+
+	CU_ASSERT(g_file->length_flushed == CACHE_BUFFER_SIZE);
+
+	/* Close the file.  This causes an implicit sync which should write the
+	 * length_flushed value as the "length" xattr on the file.
+	 */
+	spdk_file_close(g_file, channel);
+
+	rc = spdk_fs_file_stat(g_fs, channel, "testfile", &stat);
+	CU_ASSERT(rc == 0);
+	CU_ASSERT(buf_length == stat.size);
+
+	rc = spdk_fs_delete_file(g_fs, channel, "testfile");
+	CU_ASSERT(rc == 0);
+
+	spdk_fs_free_thread_ctx(channel);
+
+	ut_send_request(_fs_unload, NULL);
+}
+
 static void
 cache_write_null_buffer(void)
 {
@ -463,6 +526,7 @@ int main(int argc, char **argv)
 	if (
 		CU_add_test(suite, "write", cache_write) == NULL ||
 		CU_add_test(suite, "file length", file_length) == NULL ||
+		CU_add_test(suite, "partial buffer", partial_buffer) == NULL ||
 		CU_add_test(suite, "write_null_buffer", cache_write_null_buffer) == NULL ||
 		CU_add_test(suite, "create_sync", fs_create_sync) == NULL ||
 		CU_add_test(suite, "append_no_cache", cache_append_no_cache) == NULL ||