sort(1): Memoize MD5 computation to reduce repeated computation

Experimentally, reduces sort -R time of a 148160 line corpus from about 3.15s to about 0.93s on this particular system. There's probably room for improvement using some digest other than md5, but I don't want to look at sort(1) anymore. Some discussion of other possible improvements in the Test Plan section of the Differential. PR: 230792 Reviewed by: jhb (earlier version) Differential Revision: https://reviews.freebsd.org/D19885
2019-04-13 04:42:17 +00:00 · 2019-04-13 04:42:17 +00:00 · 7474baafb6
commit 7474baafb6
parent 7061ad58c2
3 changed files with 36 additions and 0 deletions
--- a/usr.bin/sort/coll.c
+++ b/usr.bin/sort/coll.c
@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset)
 	return (numcoll_impl(kv1, kv2, offset, true));
 }

+/* Use hint space to memoize md5 computations, at least. */
+static void
+randomcoll_init_hint(struct key_value *kv, void *hash)
+{
+
+	memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached));
+	kv->hint->status = HS_INITIALIZED;
+}
+
 /*
 * Implements random sort (-R).
 */
@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
 	struct bwstring *s1, *s2;
 	MD5_CTX ctx1, ctx2;
 	unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
+	int cmp;

 	s1 = kv1->k;
 	s2 = kv2->k;
@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
 	if (s1 == s2)
 		return (0);

+	if (kv1->hint->status == HS_INITIALIZED &&
+	    kv2->hint->status == HS_INITIALIZED) {
+		cmp = memcmp(kv1->hint->v.Rh.cached,
+		    kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached));
+		if (cmp != 0)
+			return (cmp);
+	}
+
 	memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
 	memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));

@ -1012,6 +1030,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
 	MD5Final(hash1, &ctx1);
 	MD5Final(hash2, &ctx2);

+	if (kv1->hint->status == HS_UNINITIALIZED)
+		randomcoll_init_hint(kv1, hash1);
+	if (kv2->hint->status == HS_UNINITIALIZED)
+		randomcoll_init_hint(kv2, hash2);
+
 	return (memcmp(hash1, hash2, sizeof(hash1)));
 }

--- a/usr.bin/sort/coll.h
+++ b/usr.bin/sort/coll.h
@ -64,6 +64,17 @@ struct M_hint
 	int			 m;
 };

+/*
+ * Sort hint data for -R
+ *
+ * This stores the first 12 bytes of the digest rather than the full output to
+ * avoid increasing the size of the 'key_hint' object via the 'v' union.
+ */
+struct R_hint
+{
+	unsigned char		 cached[12];
+};
+
 /*
 * Status of a sort hint object
 */
@ -83,6 +94,7 @@ struct key_hint
 		struct n_hint		nh;
 		struct g_hint		gh;
 		struct M_hint		Mh;
+		struct R_hint		Rh;
 	}			v;
 };

--- a/usr.bin/sort/sort.c
+++ b/usr.bin/sort/sort.c
@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c)
 		break;
 	case 'R':
 		sm->Rflag = true;
+		need_hint = true;
 		need_random = true;
 		break;
 	case 'M':