sort(1): Memoize MD5 computation to reduce repeated computation
Experimentally, this reduces sort -R time on a 148160-line corpus from about 3.15s to about 0.93s on this particular system. There is probably room for further improvement using some digest other than MD5, but I don't want to look at sort(1) anymore. Some other possible improvements are discussed in the Test Plan section of the Differential. PR: 230792 Reviewed by: jhb (earlier version) Differential Revision: https://reviews.freebsd.org/D19885
This commit is contained in:
parent
49d9a59783
commit
f20b149b45
@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset)
|
||||
return (numcoll_impl(kv1, kv2, offset, true));
|
||||
}
|
||||
|
||||
/*
 * Use hint space to memoize md5 computations, at least.
 *
 * Copies the leading sizeof(kv->hint->v.Rh.cached) bytes from 'hash' into
 * the key's hint and marks the hint HS_INITIALIZED, so that randomcoll()
 * can compare the cached bytes before recomputing a digest.
 *
 * kv:   key whose hint is filled in; kv->hint is dereferenced without a
 *       NULL check.
 * hash: source buffer; must hold at least sizeof(kv->hint->v.Rh.cached)
 *       readable bytes (randomcoll() passes a finalized MD5 digest).
 */
|
||||
static void
|
||||
randomcoll_init_hint(struct key_value *kv, void *hash)
|
||||
{
|
||||
|
||||
/* Only a prefix of the digest is kept; the hint is smaller than the hash. */
memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached));
|
||||
/* Mark the cached bytes as valid for later comparisons. */
kv->hint->status = HS_INITIALIZED;
|
||||
}
|
||||
|
||||
/*
|
||||
* Implements random sort (-R).
|
||||
*/
|
||||
@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
|
||||
struct bwstring *s1, *s2;
|
||||
MD5_CTX ctx1, ctx2;
|
||||
unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
|
||||
int cmp;
|
||||
|
||||
s1 = kv1->k;
|
||||
s2 = kv2->k;
|
||||
@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
|
||||
if (s1 == s2)
|
||||
return (0);
|
||||
|
||||
if (kv1->hint->status == HS_INITIALIZED &&
|
||||
kv2->hint->status == HS_INITIALIZED) {
|
||||
cmp = memcmp(kv1->hint->v.Rh.cached,
|
||||
kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached));
|
||||
if (cmp != 0)
|
||||
return (cmp);
|
||||
}
|
||||
|
||||
memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
|
||||
memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));
|
||||
|
||||
@ -1012,6 +1030,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
|
||||
MD5Final(hash1, &ctx1);
|
||||
MD5Final(hash2, &ctx2);
|
||||
|
||||
if (kv1->hint->status == HS_UNINITIALIZED)
|
||||
randomcoll_init_hint(kv1, hash1);
|
||||
if (kv2->hint->status == HS_UNINITIALIZED)
|
||||
randomcoll_init_hint(kv2, hash2);
|
||||
|
||||
return (memcmp(hash1, hash2, sizeof(hash1)));
|
||||
}
|
||||
|
||||
|
@ -64,6 +64,17 @@ struct M_hint
|
||||
int m;
|
||||
};
|
||||
|
||||
/*
|
||||
* Sort hint data for -R
|
||||
*
|
||||
* This stores the first 12 bytes of the digest rather than the full output to
|
||||
* avoid increasing the size of the 'key_hint' object via the 'v' union.
|
||||
*/
|
||||
struct R_hint
|
||||
{
|
||||
/*
 * Leading bytes of the key's digest; written by randomcoll_init_hint()
 * and compared with memcmp() in randomcoll().
 */
unsigned char cached[12];
|
||||
};
|
||||
|
||||
/*
|
||||
* Status of a sort hint object
|
||||
*/
|
||||
@ -83,6 +94,7 @@ struct key_hint
|
||||
struct n_hint nh;
|
||||
struct g_hint gh;
|
||||
struct M_hint Mh;
|
||||
struct R_hint Rh;
|
||||
} v;
|
||||
};
|
||||
|
||||
|
@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c)
|
||||
break;
|
||||
case 'R':
|
||||
sm->Rflag = true;
|
||||
need_hint = true;
|
||||
need_random = true;
|
||||
break;
|
||||
case 'M':
|
||||
|
Loading…
Reference in New Issue
Block a user