Bigram does not remove the newline at the end of a filename. This
	particularly breaks the bigram algorithm and makes
	/var/db/locate.database grow by 15 %.

	Bigram does not check for characters outside 32-127.

	The bigram output is silly and needs ~1/2 of the CPU time of
	database rebuilding.

	old:
	locate.bigram < $filelist | sort | uniq -c | sort -nr
                                    ^^^^^^^^^^^^^^
				    this can easily be done by bigram itself

	new:
        bigram < $filelist | sort -nr

code
	Code does not check for char 31.
	Use a lookup array instead of a function. 3 x faster.

updatedb
	rewritten
	sync with bigram changes

	read config file /etc/locate.rc if it exists
	submitted by: guido@gvr.win.tue.nl (Guido van Rooij)

concatdb - concatenate locate databases
mklocatedb - build locate database
This commit is contained in:
Wolfram Schneider 1996-08-14 00:22:31 +00:00
parent aa648cf84b
commit 370021810a
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=17592
12 changed files with 307 additions and 98 deletions

View File

@ -1,4 +1,5 @@
# @(#)Makefile 8.1 (Berkeley) 6/6/93
# $Id$
SUBDIR= bigram code locate

View File

@ -0,0 +1,3 @@
# $Id$
LIBEXECDIR?= /usr/libexec

View File

@ -2,6 +2,8 @@
PROG= locate.bigram
NOMAN= noman
BINDIR= /usr/libexec
BINDIR= ${LIBEXECDIR}
CFLAGS+= -I${.CURDIR}/../locate
.include "../Makefile.inc"
.include <bsd.prog.mk>

View File

@ -53,32 +53,65 @@ static char sccsid[] = "@(#)locate.bigram.c 8.1 (Berkeley) 6/6/93";
#include <stdio.h>
#include <sys/param.h> /* for MAXPATHLEN */
#include <string.h> /* memchr */
#include "locate.h"
char buf1[MAXPATHLEN] = " ";
char buf2[MAXPATHLEN];
u_char buf1[MAXPATHLEN] = " ";
u_char buf2[MAXPATHLEN];
unsigned int bigram[UCHAR_MAX][UCHAR_MAX];
main ( )
void main ( )
{
register char *cp;
register char *oldpath = buf1, *path = buf2;
register u_char *cp;
register u_char *oldpath = buf1, *path = buf2;
register int i, j;
/* init bigram buffer */
for (i = 0; i < UCHAR_MAX; i++)
for (j = 0; j < UCHAR_MAX; j++)
bigram[i][j] = 0;
while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) {
/* skip empty lines */
if (*path == '\n')
continue;
/* Squelch characters that would botch the decoding. */
for (cp = path; *cp != NULL; cp++) {
/* chop newline */
if (*cp == '\n')
*cp = NULL;
/* range */
else if (*cp < ASCII_MIN || *cp > ASCII_MAX)
*cp = '?';
}
/* skip longest common prefix */
for ( cp = path; *cp == *oldpath; cp++, oldpath++ )
if ( *oldpath == NULL )
break;
for (cp = path; *cp == *oldpath && *cp; cp++, oldpath++);
/*
* output post-residue bigrams only
*/
/* check later for boundary */
while ( *cp != NULL && *(cp + 1) != NULL ) {
putchar ( *cp++ );
putchar ( *cp++ );
putchar ( '\n' );
bigram[*cp][*(cp+1)]++;
cp += 2;
}
if ( path == buf1 ) /* swap pointers */
path = buf2, oldpath = buf1;
else
path = buf1, oldpath = buf2;
}
/* output, boundary check */
for (i = ASCII_MIN; i <= ASCII_MAX; i++)
for (j = ASCII_MIN; j <= ASCII_MAX; j++)
if (bigram[i][j] != 0)
fprintf(stdout, "%4d %c%c\n",
bigram[i][j], i, j);
}

View File

@ -1,8 +1,9 @@
# @(#)Makefile 8.1 (Berkeley) 6/6/93
PROG= locate.code
CFLAGS+=-I${.CURDIR}/../locate
CFLAGS+=-I${.CURDIR}/../locate
NOMAN= noman
BINDIR= /usr/libexec
BINDIR= ${LIBEXECDIR}
.include "../Makefile.inc"
.include <bsd.prog.mk>

View File

@ -89,25 +89,38 @@ static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93";
#define BGBUFSIZE (NBG * 2) /* size of bigram buffer */
char buf1[MAXPATHLEN + 1] = " ";
char buf2[MAXPATHLEN + 1];
u_char buf1[MAXPATHLEN] = " ";
u_char buf2[MAXPATHLEN];
char bigrams[BGBUFSIZE + 1] = { 0 };
#define LOOKUP 1
#ifdef LOOKUP
#define BGINDEX(x) (big[(u_int)*x][(u_int)*(x+1)])
typedef u_char bg_t;
bg_t big[UCHAR_MAX][UCHAR_MAX];
#else
#define BGINDEX(x) bgindex(x)
typedef int bg_t;
#endif
int bgindex __P((char *));
void usage __P((void));
extern int optind;
extern int optopt;
int
main(argc, argv)
int argc;
char *argv[];
{
register char *cp, *oldpath, *path;
register u_char *cp, *oldpath, *path;
int ch, code, count, diffcount, oldcount;
FILE *fp;
register int i, j;
while ((ch = getopt(argc, argv, "")) != EOF)
switch(ch) {
case '?':
default:
usage();
}
@ -126,27 +139,38 @@ main(argc, argv)
err(1, "stdout");
(void)fclose(fp);
#ifdef LOOKUP
/* init lookup table */
for (i = 0; i < UCHAR_MAX; i++)
for (j = 0; j < UCHAR_MAX; j++)
big[i][j] = (bg_t)-1;
for (cp = bigrams, i = 0; *cp != NULL; i += 2, cp += 2)
big[(int)*cp][(int)*(cp + 1)] = (bg_t)i;
#endif
oldpath = buf1;
path = buf2;
oldcount = 0;
while (fgets(path, sizeof(buf2) - 1, stdin) != NULL) {
/* Truncate newline. */
cp = path + strlen(path) - 1;
if (cp > path && *cp == '\n')
*cp = '\0';
while (fgets(path, sizeof(buf2), stdin) != NULL) {
/* skip empty lines */
if (*path == '\n')
continue;
/* Squelch characters that would botch the decoding. */
for (cp = path; *cp != NULL; cp++) {
if ((u_char)*cp >= PARITY)
*cp &= PARITY-1;
if (*cp <= SWITCH)
/* chop newline */
if (*cp == '\n')
*cp = NULL;
/* range */
else if (*cp < ASCII_MIN || *cp > ASCII_MAX)
*cp = '?';
}
/* Skip longest common prefix. */
for (cp = path; *cp == *oldpath; cp++, oldpath++)
if (*oldpath == NULL)
break;
for (cp = path; *cp == *oldpath && *cp; cp++, oldpath++);
count = cp - path;
diffcount = count - oldcount + OFFSET;
oldcount = count;
@ -164,7 +188,7 @@ main(argc, argv)
err(1, "stdout");
break;
}
if ((code = bgindex(cp)) < 0) {
if ((code = BGINDEX(cp)) == (bg_t)-1) {
if (putchar(*cp++) == EOF ||
putchar(*cp++) == EOF)
err(1, "stdout");
@ -189,6 +213,7 @@ main(argc, argv)
exit(0);
}
#ifndef LOOKUP
int
bgindex(bg) /* Return location of bg in bigrams or -1. */
char *bg;
@ -202,6 +227,7 @@ bgindex(bg) /* Return location of bg in bigrams or -1. */
break;
return (*p == NULL ? -1 : --p - bigrams);
}
#endif /* !LOOKUP */
void
usage()

View File

@ -1,12 +1,22 @@
# @(#)Makefile 8.1 (Berkeley) 6/6/93
# $Id: Makefile,v 1.3 1996/04/25 15:54:22 wosch Exp wosch $
PROG= locate
MAN1= locate.1
MAN8= locate.updatedb.8
SCRIPTS= updatedb mklocatedb concatdb
MLINKS+= locate.updatedb.8 updatedb.8
beforeinstall:
.for script in ${SCRIPTS}
${INSTALL} -c -o ${BINOWN} -g ${BINGRP} -m ${BINMODE} \
${.CURDIR}/updatedb.sh ${DESTDIR}/usr/libexec/locate.updatedb
${.CURDIR}/${script}.sh ${DESTDIR}${LIBEXECDIR}/locate.${script}
.endfor
# only /usr/src/etc/Makefile install files in /etc
# ${INSTALL} -c -o root -g wheel -m 644 \
# ${.CURDIR}/locate.rc ${DESTDIR}/etc
.include "../../Makefile.inc"
.include "../Makefile.inc"
.include <bsd.prog.mk>

View File

@ -0,0 +1,49 @@
#!/bin/sh
#
# (c) Wolfram Schneider, Berlin. September 1995. Public domain.
#
# concatdb - concatenate locate databases
#
# usage: concatdb database1 ... databaseN > newdb
#
# Sequence of databases is important.
#
# The script makes two passes over the listed databases: the first pass
# dumps every path and computes the 128 most frequent bigrams, the second
# pass dumps the paths again and encodes them with that bigram table.
# The new database is written to stdout.
#
# $Id: concatdb.sh,v 1.2 1996/04/20 21:55:21 wosch Exp wosch $

# The directory containing locate subprograms
: ${LIBEXECDIR=/usr/libexec}; export LIBEXECDIR

PATH=$LIBEXECDIR:/bin:/usr/bin:$PATH; export PATH

umask 077			# protect temp files

: ${TMPDIR=/tmp}; export TMPDIR;
if test X"$TMPDIR" = X -o ! -d "$TMPDIR"; then
	TMPDIR=/tmp; export TMPDIR
fi

# utilities to build the locate database
: ${bigram=locate.bigram}
: ${code=locate.code}
: ${sort=sort}
# $locate is used by both loops below; without a default it expands to
# nothing and the shell would try to run "-d" as a command.
: ${locate=locate}

case $# in
	[01])	# stdout is the new database, so the usage text goes to stderr
		echo 'usage: concatdb databases1 ... databaseN > newdb' >&2
		exit 1
		;;
esac

bigrams=$TMPDIR/_concatdb$$.bigrams
trap 'rm -f "$bigrams"' 0 1 2 3 5 10 15

# Pass 1: list all paths (pattern "/" matches every entry) and keep the
# 128 most frequent bigrams.  The count prefix is stripped with sub()
# instead of taking $2, because a bigram may itself contain a space and
# field splitting would then shorten it to a single character.  The
# explicit "%s" keeps a '%' inside a bigram from being read as a format.
for db
do
	$locate -d "$db" /
done | $bigram | $sort -nr |
	awk 'NR <= 128 { sub(/^ *[0-9]+ /, ""); printf "%s", $0 }' > "$bigrams"

# Pass 2: list all paths again and encode them with the bigram table.
for db
do
	$locate -d "$db" /
done | $code "$bigrams"

View File

@ -39,3 +39,29 @@
#define OFFSET 14 /* abs value of max likely diff */
#define PARITY 0200 /* parity bit */
#define SWITCH 30 /* switch code */
/* 0-28 likeliest differential counts + offset to make nonnegative */
#define LDC_MIN 0
#define LDC_MAX 28
/*
 * 128-255 bigram codes (128 most common, as determined by 'updatedb').
 * SCHAR_MAX is used instead of CHAR_MAX: where plain char is unsigned,
 * CHAR_MAX equals UCHAR_MAX, which would make BIGRAM_MIN 0 and
 * ASCII_MAX 255 and break the ranges described here.
 */
#define BIGRAM_MIN (UCHAR_MAX - SCHAR_MAX)
#define BIGRAM_MAX UCHAR_MAX
/* 32-127 single character (printable) ascii residue (ie, literal) */
#define ASCII_MIN 32
#define ASCII_MAX SCHAR_MAX
/* #define TO7BIT(x) (x = ( ((u_char)x) & SCHAR_MAX )) */
/* Clear everything above the low 7 bits of lvalue x, in place (x is evaluated twice). */
#define TO7BIT(x) ((x) = (x) & SCHAR_MAX)
/*
 * TOLOWER(ch): map a character to lower case.
 * When u_char is small enough, a table indexed by the character value is
 * used; otherwise the macro falls back to tolower().
 * NOTE(review): nothing in this header fills myctype[]; presumably the
 * including program initializes it before the first TOLOWER() -- confirm.
 */
#if UCHAR_MAX >= 4096
#define TOLOWER(ch) tolower(ch)
#else
u_char myctype[UCHAR_MAX + 1];
#define TOLOWER(ch) (myctype[ch])
#endif
#define INTSIZE (sizeof(int))

View File

@ -0,0 +1,23 @@
#
# /etc/locate.rc - command script for updatedb(8)
#
# $Id: locate.rc,v 1.1 1996/04/26 15:25:23 wosch Exp wosch $
# temp directory
#TMPDIR="/tmp"
# the actual database
#FCODES="/var/db/locate.database"
# directories to be put in the database
#SEARCHPATHS="/"
# directories unwanted in output
#PRUNEPATHS="/tmp /usr/tmp /var/tmp"
# filesystems allowed. Beware: a non-listed filesystem will be pruned,
# and if SEARCHPATHS starts in such a filesystem, locate will build
# an empty database
#
# be careful if you add 'nfs'
#FILESYSTEMS="ufs"

View File

@ -0,0 +1,52 @@
#!/bin/sh
#
# (c) Wolfram Schneider, September 1995. Public domain.
#
# mklocatedb - build locate database
#
# usage: mklocatedb [-presort] < filelist > database
#
# Reads a list of path names from stdin, sorts it (unless -presort or
# -nosort is given), computes the 128 most frequent bigrams and writes
# the encoded database to stdout.
#
# $Id: mklocatedb.sh,v 1.2 1996/04/20 21:55:21 wosch Exp wosch $

# The directory containing locate subprograms
: ${LIBEXECDIR=/usr/libexec}; export LIBEXECDIR

PATH=$LIBEXECDIR:/bin:/usr/bin:$PATH; export PATH

umask 077			# protect temp files

: ${TMPDIR=/tmp}; export TMPDIR;
if test X"$TMPDIR" = X -o ! -d "$TMPDIR"; then
	TMPDIR=/tmp; export TMPDIR
fi

# utilities to build the locate database
: ${bigram=locate.bigram}
: ${code=locate.code}
: ${sort=sort}

sortopt="-u -T $TMPDIR"
sortcmd=$sort

# Input already sorted
case X"$1" in
	X-nosort|X-presort) sortcmd=cat; sortopt=;shift;;
esac

bigrams=$TMPDIR/_mklocatedb$$.bigrams
filelist=$TMPDIR/_mklocatedb$$.list

trap 'rm -f "$bigrams" "$filelist"' 0 1 2 3 5 10 15

# The file list is needed twice (bigram statistics, then encoding), so
# stdin is first saved to a temp file.
if $sortcmd $sortopt > "$filelist"; then
	# Keep the 128 most frequent bigrams.  The count prefix is removed
	# with sub() instead of taking $2, because a bigram may contain a
	# space and field splitting would then shorten it to one character.
	# The explicit "%s" keeps a '%' inside a bigram from being read as
	# a format.
	$bigram < "$filelist" | $sort -nr |
		awk 'NR <= 128 { sub(/^ *[0-9]+ /, ""); printf "%s", $0 }' > "$bigrams" &&
	$code "$bigrams" < "$filelist"
else
	echo "`basename $0`: cannot build locate database" >&2
	exit 1
fi

View File

@ -1,79 +1,62 @@
#!/bin/sh
#
# Copyright (c) 1989, 1993
# The Regents of the University of California. All rights reserved.
# (c) Wolfram Schneider, Berlin. September 1995. Public domain.
#
# This code is derived from software contributed to Berkeley by
# James A. Woods.
#
# Modified to be a /bin/sh script by Nate Williams
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. All advertising materials mentioning features or use of this software
# must display the following acknowledgement:
# This product includes software developed by the University of
# California, Berkeley and its contributors.
# 4. Neither the name of the University nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# @(#)updatedb.csh 8.3 (Berkeley) 3/19/94
# updatedb - update locate database for local mounted filesystems
#
# $Id: updatedb.sh,v 1.3 1996/04/20 21:55:21 wosch Exp wosch $
SRCHPATHS="/" # directories to be put in the database
LIBDIR="/usr/libexec" # for subprograms
FCODES="/var/db/locate.database" # the database
if [ "$TMPDIR" = "" ]; then
TMPDIR="/var/tmp" # for temp files
LOCATE_CONFIG="/etc/locate.rc"
if [ -f "$LOCATE_CONFIG" -a -r "$LOCATE_CONFIG" ]; then
. $LOCATE_CONFIG
fi
PATH=/bin:/usr/bin
BIGRAMS="$TMPDIR/locate.bigrams.$$"
FILELIST="$TMPDIR/locate.list.$$"
ERRS="$TMPDIR/locate.errs.$$"
# The directory containing locate subprograms
: ${LIBEXECDIR=/usr/libexec}; export LIBEXECDIR
# Make a file list and compute common bigrams.
# Alphabetize '/' before any other char with 'tr'.
# If the system is very short of sort space, 'bigram' can be made
# smarter to accumulate common bigrams directly without sorting
# ('awk', with its associative memory capacity, can do this in several
# lines, but is too slow, and runs out of string space on small machines).
PATH=$LIBEXECDIR:/bin:/usr/bin:$PATH; export PATH
# search locally or everything
# find ${SRCHPATHS} -print | \
find ${SRCHPATHS} ! -fstype ufs -prune -or -print | \
tr '/' '\001' | \
(sort -T $TMPDIR -f; echo $? > $ERRS) | tr '\001' '/' > $FILELIST
$LIBDIR/locate.bigram < $FILELIST | \
(sort -T $TMPDIR ; echo $? >> $ERRS) | \
uniq -c | sort -T $TMPDIR -nr | \
awk '{ if (NR <= 128) print $2 }' | tr -d '\012' > $BIGRAMS
: ${mklocatedb=locate.mklocatedb} # make locate database program
: ${FCODES=/var/db/locate.database} # the database
: ${SEARCHPATHS="/"} # directories to be put in the database
: ${PRUNEPATHS="/tmp /usr/tmp /var/tmp"} # unwanted directories
: ${FILESYSTEMS="ufs"} # allowed filesystems
: ${find=find}
# code the file list
if [ `sort -u $ERRS | grep -s -v 0` ]; then
printf 'locate: updatedb failed\n\n'
else
$LIBDIR/locate.code $BIGRAMS < $FILELIST > $FCODES
chmod 644 $FCODES
rm $BIGRAMS $FILELIST $ERRS
case X"$SEARCHPATHS" in
X) echo "$0: empty variable SEARCHPATHS"; exit 1;; esac
case X"$FILESYSTEMS" in
X) echo "$0: empty variable FILESYSTEMS"; exit 1;; esac
# Make a list of paths to exclude from the locate run
excludes="! (" or=""
for fstype in $FILESYSTEMS
do
excludes="$excludes $or -fstype $fstype"
or="-or"
done
excludes="$excludes ) -prune"
case X"$PRUNEPATHS" in
X) ;;
*) for path in $PRUNEPATHS
do
excludes="$excludes -or -path $path -prune"
done;;
esac
tmp=${TMPDIR=/tmp}/_updatedb$$
trap 'rm -f $tmp' 0 1 2 3 5 10 15
# search locally
# echo $find $SEARCHPATHS $excludes -or -print && exit
if $find $SEARCHPATHS $excludes -or -print 2>/dev/null |
$mklocatedb > $tmp
then
case X"`$find $tmp -size -257c -print`" in
X) cat $tmp > $FCODES;;
*) echo "updatedb: locate database $tmp is empty"
exit 1
esac
fi