sort(1): Memoize MD5 computation to reduce repeated computation
Experimentally, this reduces sort -R time on a 148160-line corpus from about 3.15s to about 0.93s on this particular system. There's probably room for improvement using some digest other than MD5, but I don't want to look at sort(1) anymore. Some discussion of other possible improvements is in the Test Plan section of the Differential.

PR: 230792
Reviewed by: jhb (earlier version)
Differential Revision: https://reviews.freebsd.org/D19885
This commit is contained in:
parent
49d9a59783
commit
f20b149b45
@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset)
|
|||||||
return (numcoll_impl(kv1, kv2, offset, true));
|
return (numcoll_impl(kv1, kv2, offset, true));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Use hint space to memoize md5 computations, at least. */
|
||||||
|
static void
|
||||||
|
randomcoll_init_hint(struct key_value *kv, void *hash)
|
||||||
|
{
|
||||||
|
|
||||||
|
memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached));
|
||||||
|
kv->hint->status = HS_INITIALIZED;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Implements random sort (-R).
|
* Implements random sort (-R).
|
||||||
*/
|
*/
|
||||||
@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
|
|||||||
struct bwstring *s1, *s2;
|
struct bwstring *s1, *s2;
|
||||||
MD5_CTX ctx1, ctx2;
|
MD5_CTX ctx1, ctx2;
|
||||||
unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
|
unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
|
||||||
|
int cmp;
|
||||||
|
|
||||||
s1 = kv1->k;
|
s1 = kv1->k;
|
||||||
s2 = kv2->k;
|
s2 = kv2->k;
|
||||||
@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
|
|||||||
if (s1 == s2)
|
if (s1 == s2)
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
if (kv1->hint->status == HS_INITIALIZED &&
|
||||||
|
kv2->hint->status == HS_INITIALIZED) {
|
||||||
|
cmp = memcmp(kv1->hint->v.Rh.cached,
|
||||||
|
kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached));
|
||||||
|
if (cmp != 0)
|
||||||
|
return (cmp);
|
||||||
|
}
|
||||||
|
|
||||||
memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
|
memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
|
||||||
memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));
|
memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));
|
||||||
|
|
||||||
@ -1012,6 +1030,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv2,
|
|||||||
MD5Final(hash1, &ctx1);
|
MD5Final(hash1, &ctx1);
|
||||||
MD5Final(hash2, &ctx2);
|
MD5Final(hash2, &ctx2);
|
||||||
|
|
||||||
|
if (kv1->hint->status == HS_UNINITIALIZED)
|
||||||
|
randomcoll_init_hint(kv1, hash1);
|
||||||
|
if (kv2->hint->status == HS_UNINITIALIZED)
|
||||||
|
randomcoll_init_hint(kv2, hash2);
|
||||||
|
|
||||||
return (memcmp(hash1, hash2, sizeof(hash1)));
|
return (memcmp(hash1, hash2, sizeof(hash1)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -64,6 +64,17 @@ struct M_hint
|
|||||||
int m;
|
int m;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 * Sort hint data for -R
 *
 * Only the leading 12 bytes of the digest are kept here, not the full
 * output, so that the 'v' union (and with it the 'key_hint' object) does
 * not grow.
 */
struct R_hint {
	unsigned char cached[12];	/* truncated digest prefix */
};
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Status of a sort hint object
|
* Status of a sort hint object
|
||||||
*/
|
*/
|
||||||
@ -83,6 +94,7 @@ struct key_hint
|
|||||||
struct n_hint nh;
|
struct n_hint nh;
|
||||||
struct g_hint gh;
|
struct g_hint gh;
|
||||||
struct M_hint Mh;
|
struct M_hint Mh;
|
||||||
|
struct R_hint Rh;
|
||||||
} v;
|
} v;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c)
|
|||||||
break;
|
break;
|
||||||
case 'R':
|
case 'R':
|
||||||
sm->Rflag = true;
|
sm->Rflag = true;
|
||||||
|
need_hint = true;
|
||||||
need_random = true;
|
need_random = true;
|
||||||
break;
|
break;
|
||||||
case 'M':
|
case 'M':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user