sort(1): Memoize MD5 computation to reduce repeated computation

Experimentally, reduces sort -R time of a 148160 line corpus from about
3.15s to about 0.93s on this particular system.

There's probably room for improvement using some digest other than md5, but
I don't want to look at sort(1) anymore.  Some discussion of other possible
improvements in the Test Plan section of the Differential.

PR:		230792
Reviewed by:	jhb (earlier version)
Differential Revision:	https://reviews.freebsd.org/D19885
This commit is contained in:
cem 2019-04-13 04:42:17 +00:00
parent 7061ad58c2
commit 7474baafb6
3 changed files with 36 additions and 0 deletions

View File

@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset)
return (numcoll_impl(kv1, kv2, offset, true));
}
/* Use hint space to memoize md5 computations, at least. */
static void
randomcoll_init_hint(struct key_value *kv, void *hash)
{
memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached));
kv->hint->status = HS_INITIALIZED;
}
/*
* Implements random sort (-R).
*/
@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
struct bwstring *s1, *s2;
MD5_CTX ctx1, ctx2;
unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
int cmp;
s1 = kv1->k;
s2 = kv2->k;
@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
if (s1 == s2)
return (0);
if (kv1->hint->status == HS_INITIALIZED &&
kv2->hint->status == HS_INITIALIZED) {
cmp = memcmp(kv1->hint->v.Rh.cached,
kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached));
if (cmp != 0)
return (cmp);
}
memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));
@ -1012,6 +1030,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
MD5Final(hash1, &ctx1);
MD5Final(hash2, &ctx2);
if (kv1->hint->status == HS_UNINITIALIZED)
randomcoll_init_hint(kv1, hash1);
if (kv2->hint->status == HS_UNINITIALIZED)
randomcoll_init_hint(kv2, hash2);
return (memcmp(hash1, hash2, sizeof(hash1)));
}

View File

@ -64,6 +64,17 @@ struct M_hint
int m;
};
/*
* Sort hint data for -R
*
* This stores the first 12 bytes of the digest rather than the full output to
* avoid increasing the size of the 'key_hint' object via the 'v' union.
*/
struct R_hint
{
unsigned char cached[12];
};
/*
* Status of a sort hint object
*/
@ -83,6 +94,7 @@ struct key_hint
struct n_hint nh;
struct g_hint gh;
struct M_hint Mh;
struct R_hint Rh;
} v;
};

View File

@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c)
break;
case 'R':
sm->Rflag = true;
need_hint = true;
need_random = true;
break;
case 'M':