Garrett Wollman 2494e810b8 Some fixes for HTTP:
1) Implement redirects (or try to, at least).
2) Implement automatic retry after 503 errors when Retry-After is given.
3) Implement a -a flag to enable both of these behaviors.
4) Recognize Transfer-Encoding headers and emit a warning that the file
  is likely to be damaged.
5) Bug fix: only write the amount of data we read.
6) Actually document some of these.
7) Fix the usage message to display flags in semi-alphabetical order.
1997-01-31 19:55:51 +00:00

1204 lines
31 KiB
C

/*-
* Copyright 1997 Massachusetts Institute of Technology
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose and without fee is hereby
* granted, provided that both the above copyright notice and this
* permission notice appear in all copies, that both the above
* copyright notice and this permission notice appear in all
* supporting documentation, and that the name of M.I.T. not be used
* in advertising or publicity pertaining to distribution of the
* software without specific, written prior permission. M.I.T. makes
* no representations about the suitability of this software for any
* purpose. It is provided "as is" without express or implied
* warranty.
*
* THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS
* ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
* SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: http.c,v 1.1 1997/01/30 21:43:41 wollman Exp $
*/
#include <sys/types.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <md5.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <time.h>
#include <unistd.h>
#include <sys/param.h> /* for MAXHOSTNAMELEN */
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include "fetch.h"
/* Entry points of the HTTP scheme; all are defined later in this file. */
static int http_parse(struct fetch_state *fs, const char *uri);
static int http_proxy_parse(struct fetch_state *fs, const char *uri);
static int http_close(struct fetch_state *fs);
static int http_retrieve(struct fetch_state *fs);
/*
 * Scheme descriptor for `http' URIs.  It names the direct parser, the
 * proxy parser and the HTTP_PROXY environment variable; the meaning of
 * each member is fixed by struct uri_scheme, which is declared outside
 * this file (presumably in fetch.h).
 */
struct uri_scheme http_scheme =
{ "http", http_parse, http_proxy_parse, "HTTP_PROXY", "http" };
struct http_state {
char *http_hostname;
char *http_remote_request;
char *http_decoded_file;
char *http_host_header;
char *http_authentication;
char *http_proxy_authentication;
unsigned http_port;
int http_redirected;
};
/*
 * Response headers recognized by http_parse_header().
 * We are only concerned with headers we might receive.
 */
enum http_header {
ht_accept_ranges, ht_age, ht_allow, ht_cache_control, ht_connection,
ht_content_base, ht_content_encoding, ht_content_language,
ht_content_length, ht_content_location, ht_content_md5,
ht_content_range, ht_content_type, ht_date, ht_etag, ht_expires,
ht_last_modified, ht_location, ht_pragma, ht_proxy_authenticate,
ht_public, ht_retry_after, ht_server, ht_transfer_encoding,
ht_upgrade, ht_vary, ht_via, ht_www_authenticate, ht_warning,
/*
 * Unusual cases: a line without a colon, a header name that is not in
 * the list above, and the blank line that terminates the header block.
 */
ht_syntax_error, ht_unknown, ht_end_of_header
};
/* Private helpers; all are defined later in this file. */
static char *format_http_date(time_t when);
static char *format_http_user_agent(void);
static enum http_header http_parse_header(char *line, char **valuep);
static int check_md5(FILE *fp, char *base64ofmd5);
static int http_first_line(const char *line);
static int parse_http_content_range(char *orig, off_t *first, off_t *total);
static time_t parse_http_date(char *datestring);
/*
 * Parse a direct (non-proxy) URI of the form `http://host[:port]/path'
 * and attach a newly allocated struct http_state to `fs'.
 *
 * `uri' is assumed to start with "http:" (the first five characters are
 * skipped without being examined).  On success fs_proto, fs_close and
 * fs_retrieve are filled in and, unless the caller already chose an
 * output file, fs_outputfile is pointed at the last component of the
 * percent-decoded path (it aliases http_decoded_file).
 *
 * Returns 0 on success or EX_USAGE for a malformed URI.
 */
static int
http_parse(struct fetch_state *fs, const char *uri)
{
	const char *p, *colon, *slash, *ques, *q;
	char *hostname, *hosthdr, *trimmed_name;
	size_t hosthdr_size;
	unsigned port;
	struct http_state *https;

	p = uri + 5;		/* skip "http:" */
	port = 0;
	if (p[0] != '/' || p[1] != '/') {
		warnx("`%s': malformed `http' URL", uri);
		return EX_USAGE;
	}
	p += 2;

	/* The authority part ends at the first slash, which is mandatory. */
	slash = strchr(p, '/');
	if (slash == 0) {
		warnx("`%s': malformed `http' URL", uri);
		return EX_USAGE;
	}
	/*
	 * Only a colon inside the authority part introduces a port; one
	 * that appears later belongs to the path and must be ignored.
	 */
	colon = strchr(p, ':');
	if (colon != 0 && colon > slash)
		colon = 0;
	q = colon ? colon : slash;

	hostname = alloca(q - p + 1);
	memcpy(hostname, p, q - p);
	hostname[q - p] = '\0';

	if (colon && colon + 1 != slash) {
		unsigned long ul;
		char *ep;

		errno = 0;
		ul = strtoul(colon + 1, &ep, 10);
		if (ep != slash || ep == colon + 1 || errno != 0
		    || ul < 1 || ul > 65534) {
			/* errno is usually 0 here, so warn() would mislead. */
			warnx("`%s': invalid port in URL", uri);
			return EX_USAGE;
		}
		port = ul;
	} else {
		port = 80;	/* no port, or an empty one ("host:/") */
	}
	p = slash + 1;

	https = malloc(sizeof *https);
	if (https == 0)
		err(EX_OSERR, "malloc");
	/*
	 * Now, we have a copy of the hostname in hostname, the specified port
	 * (or the default value) in port, and p points to the filename part
	 * of the URI.
	 */
	https->http_hostname = safe_strdup(hostname);
	https->http_port = port;
	https->http_authentication = 0;
	https->http_proxy_authentication = 0;

	/* Room for the fixed text, up to 10 port digits and the host. */
	hosthdr_size = sizeof("Host: :\r\n") + 10 + strlen(hostname);
	hosthdr = alloca(hosthdr_size);
	snprintf(hosthdr, hosthdr_size, "Host: %s:%u\r\n", hostname, port);
	https->http_host_header = safe_strdup(hosthdr);

	/* The request keeps the query part; the local file name does not. */
	https->http_remote_request = safe_strdup(p);
	ques = strpbrk(p, "?#");
	if (ques) {
		trimmed_name = safe_strndup(p, ques - p);
	} else {
		trimmed_name = safe_strdup(p);
	}
	https->http_decoded_file = percent_decode(trimmed_name);
	free(trimmed_name);
	p = https->http_decoded_file;

	/* now p is the decoded version, so we can extract the basename */
	if (fs->fs_outputfile == 0) {
		slash = strrchr(p, '/');
		if (slash)
			fs->fs_outputfile = slash + 1;
		else
			fs->fs_outputfile = p;
	}
	https->http_redirected = 0;
	fs->fs_proto = https;
	fs->fs_close = http_close;
	fs->fs_retrieve = http_retrieve;
	return 0;
}
/*
 * An HTTP proxy works by accepting a complete URI in a GET request,
 * retrieving that object, and then forwarding it back to us. Because
 * it can conceivably handle any URI, we have to do a bit more work
 * in the parsing of it.
 *
 * The connection target is taken from the HTTP_PROXY environment
 * variable via parse_host_port(); the whole of `uri' becomes the remote
 * request.  For `http://' URIs a Host: header is built from the URI's
 * authority part; for every other scheme the header is left empty.
 *
 * Returns 0 on success, the error from parse_host_port(), or EX_USAGE
 * for an `http://' URI that has no path.
 *
 * NOTE(review): getenv() may return NULL; this relies on the caller
 * only selecting the proxy parser when HTTP_PROXY is set, or on
 * parse_host_port() coping with NULL -- confirm.
 */
static int
http_proxy_parse(struct fetch_state *fs, const char *uri)
{
	struct http_state *https;
	const char *env, *slash, *ques;
	char *file;
	int rv;

	https = malloc(sizeof *https);
	if (https == 0)
		err(EX_OSERR, "malloc");
	https->http_authentication = 0;
	https->http_proxy_authentication = 0;
	https->http_remote_request = safe_strdup(uri);

	env = getenv("HTTP_PROXY");
	rv = parse_host_port(env, &https->http_hostname, &https->http_port);
	if (rv)
		goto fail;

	if (strncmp(uri, "http://", 7) == 0) {
		char *hosthdr;
		size_t hostlen;

		slash = strchr(uri + 7, '/');
		if (slash == 0) {
			warnx("`%s': malformed `http' URL", uri);
			rv = EX_USAGE;
			free(https->http_hostname);
			goto fail;
		}
		/* The local file name excludes any query or fragment. */
		ques = strpbrk(slash, "?#");
		if (ques == 0)
			file = safe_strdup(slash);
		else
			file = safe_strndup(slash, ques - slash);

		/* "Host: " + authority + "\r\n" + NUL */
		hostlen = slash - (uri + 7);
		hosthdr = alloca(sizeof("Host: \r\n") + hostlen);
		memcpy(hosthdr, "Host: ", 6);
		memcpy(hosthdr + 6, uri + 7, hostlen);
		memcpy(hosthdr + 6 + hostlen, "\r\n", 3);
		https->http_host_header = safe_strdup(hosthdr);
	} else {
		/* Skip "scheme:" and, if present, "//authority". */
		slash = uri;
		while (*slash && *slash != ':')
			slash++;
		if (*slash)
			slash++;
		if (slash[0] == '/' && slash[1] == '/') {
			slash += 2;
			while (*slash && *slash != '/')
				slash++;
		}
		file = safe_strdup(slash);
		https->http_host_header = safe_strdup("");
	}

	https->http_decoded_file = percent_decode(file);
	https->http_redirected = 0;
	free(file);
	if (fs->fs_outputfile == 0) {
		slash = strrchr(https->http_decoded_file, '/');
		/* NB: we are not guaranteed to find one... */
		fs->fs_outputfile = slash ? slash + 1
		    : https->http_decoded_file;
	}
	fs->fs_proto = https;
	fs->fs_close = http_close;
	fs->fs_retrieve = http_retrieve;
	return 0;

fail:
	free(https->http_remote_request);
	free(https);
	return rv;
}
/*
 * Release the struct http_state attached to `fs' together with the four
 * strings it owns, and forget the output file name.  Always returns 0.
 */
static int
http_close(struct fetch_state *fs)
{
	struct http_state *state;

	state = fs->fs_proto;
	free(state->http_host_header);
	free(state->http_decoded_file);
	free(state->http_remote_request);
	free(state->http_hostname);
	free(state);

	fs->fs_outputfile = 0;
	return 0;
}
/*
 * Close handler that does nothing and reports success; installed by
 * http_redirect() when the state has already been torn down.
 */
static int
nullclose(struct fetch_state *fs)
{
	(void)fs;	/* intentionally unused */

	return 0;
}
/*
 * Process a redirection to the URI `new'; `permanent' only selects the
 * wording of the diagnostic.  The current HTTP state is discarded, the
 * new URI is parsed with http_parse() and the retrieval is restarted.
 * At most five redirections are followed.
 *
 * Returns the result of the new retrieval, EX_PROTOCOL when the
 * redirection limit is exceeded, or the error from http_parse().
 *
 * This has a small memory leak: the copy of the output file name is
 * never freed (http_close() just clears the pointer).
 */
static int
http_redirect(struct fetch_state *fs, char *new, int permanent)
{
	struct http_state *https = fs->fs_proto;
	int num_redirects = https->http_redirected + 1;
	char *out;
	int rv;

	if (num_redirects > 5) {
		/* State is left untouched so the normal close still works. */
		warnx("%s: HTTP redirection limit exceeded",
		    fs->fs_outputfile);
		return EX_PROTOCOL;
	}

	/*
	 * fs_outputfile may point into http_decoded_file, which is about
	 * to be freed, so take a private copy first and switch to it
	 * immediately.  This also ensures that the output file name
	 * doesn't suddenly change under the user's feet: http_parse()
	 * leaves a non-null fs_outputfile alone.
	 */
	out = safe_strdup(fs->fs_outputfile);
	free(https->http_hostname);
	free(https->http_remote_request);
	free(https->http_decoded_file);
	free(https->http_host_header);
	free(https);
	fs->fs_outputfile = out;

	warnx("%s: resource has moved %s to `%s'", out,
	    permanent ? "permanently" : "temporarily", new);
	rv = http_parse(fs, new);
	if (rv != 0) {
		/* fs_proto is stale now; make sure nobody frees it again. */
		fs->fs_close = nullclose;	/* XXX rethink interface? */
		return rv;
	}
	https = fs->fs_proto;
	https->http_redirected = num_redirects;
	rv = http_retrieve(fs);
	return rv;
}
/*
 * Get a file using HTTP. We will try to implement HTTP/1.1 eventually.
 * This subroutine makes heavy use of the 4.4-Lite standard I/O library,
 * in particular the `fgetln' which allows us to slurp an entire `line'
 * (an arbitrary string of non-NUL characters ending in a newline) directly
 * out of the stdio buffer. This makes interpreting the HTTP headers much
 * easier, since they are all guaranteed to end in `\r\n' and we can just
 * ignore the `\r'.
 *
 * The request is assembled as an iovec and sent with a single
 * sendmsg(MSG_EOF) on an unconnected socket.  The reply's status line and
 * headers are then interpreted: 301/302 are handed to http_redirect(),
 * 503 with a usable Retry-After is retried (when fs_auto_retry is set),
 * and 200/203/206 bodies are copied to the output file.
 *
 * Returns 0 on success or one of the EX_* codes from <sysexits.h>.
 */
static int
http_retrieve(struct fetch_state *fs)
{
	struct http_state *https;
	FILE *remote, *local;
	int s;
	struct sockaddr_in sin;
	struct msghdr msg;
#define NIOV	16	/* the request below uses at most 15 entries */
	struct iovec iov[NIOV];
	int n, status;
	const char *env;
	const char *mode;
	int timo;
	char *line, *new_location, *date;
	size_t linelen, readresult, writeresult;
	off_t total_length, restart_from;
	time_t last_modified, when_to_retry;
	char *base64ofmd5;
	static char buf[BUFFER_SIZE];
	int to_stdout, restarting, redirection, retrying;
	char rangebuf[sizeof("Range: bytes=18446744073709551616-\r\n")];

	https = fs->fs_proto;
	to_stdout = (strcmp(fs->fs_outputfile, "-") == 0);
	restarting = fs->fs_restart;

	/*
	 * Figure out the timeout. Prefer the -T command-line value,
	 * otherwise the HTTP_TIMEOUT envar, or else don't time out at all.
	 */
	if (fs->fs_timeout) {
		timo = fs->fs_timeout;
	} else if ((env = getenv("HTTP_TIMEOUT")) != 0) {
		char *ep;
		unsigned long ul;

		errno = 0;
		ul = strtoul(env, &ep, 0);
		if (*ep != '\0' || *env == '\0' || errno != 0
		    || ul > INT_MAX) {
			warnx("`%s': invalid timeout", env);
			return EX_USAGE;
		}
		timo = ul;
	} else {
		timo = 0;
	}

	/* Resolve the peer: dotted quad first, then the resolver. */
	memset(&sin, 0, sizeof sin);
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof sin;
	sin.sin_port = htons(https->http_port);
	if (inet_aton(https->http_hostname, &sin.sin_addr) == 0) {
		struct hostent *hp;

		/* XXX - do timeouts for name resolution? */
		hp = gethostbyname2(https->http_hostname, AF_INET);
		if (hp == 0) {
			warnx("`%s': cannot resolve: %s", https->http_hostname,
			    hstrerror(h_errno));
			return EX_NOHOST;
		}
		memcpy(&sin.sin_addr, hp->h_addr_list[0], sizeof sin.sin_addr);
	}

	msg.msg_name = (caddr_t)&sin;
	msg.msg_namelen = sizeof sin;
	msg.msg_iov = iov;
	n = 0;
	msg.msg_control = 0;
	msg.msg_controllen = 0;
	msg.msg_flags = MSG_EOF;

	/* Append one string to the request vector, refusing to overrun it. */
#define addstr(Iov, N, Str)						\
	do {								\
		if ((N) >= NIOV)					\
			errx(EX_SOFTWARE,				\
			    "request vector length exceeded: %d", (N));	\
		Iov[N].iov_base = (void *)Str;				\
		Iov[N].iov_len = strlen(Iov[N].iov_base);		\
		N++;							\
	} while(0)

retry:
	/* Each attempt must classify its own response from scratch. */
	redirection = 0;
	retrying = 0;

	/*
	 * NOTE(review): http_proxy_parse() stores the complete URI in
	 * http_remote_request, so in proxy mode this yields
	 * "GET /scheme://..." -- confirm that this is what proxies expect.
	 */
	addstr(iov, n, "GET /");
	addstr(iov, n, https->http_remote_request);
	addstr(iov, n, " HTTP/1.1\r\n");
	/*
	 * The choice of HTTP/1.1 may be a bit controversial. The
	 * specification says that implementations which are not at
	 * least conditionally compliant MUST NOT call themselves
	 * HTTP/1.1. We choose not to comply with that requirement.
	 * (Eventually we will support the full HTTP/1.1, at which
	 * time this comment will not apply. But it's amusing how
	 * specifications attempt to define behavior for implementations
	 * which aren't obeying the spec in the first place...)
	 */
	addstr(iov, n, format_http_user_agent());
	/* do content negotiation here */
	addstr(iov, n, "Accept: */*\r\n");
	addstr(iov, n, https->http_host_header);
	addstr(iov, n, "Connection: close\r\n");
	if (fs->fs_mirror) {
		struct stat stab;

		errno = 0;
		if (((!to_stdout && stat(fs->fs_outputfile, &stab) == 0)
		    || (to_stdout && fstat(STDOUT_FILENO, &stab) == 0))
		    && S_ISREG(stab.st_mode)
		    && (date = format_http_date(stab.st_mtime)) != 0) {
			addstr(iov, n, "If-Modified-Since: ");
			addstr(iov, n, date);
			addstr(iov, n, "\r\n");
		} else if (errno != 0) {
			warn("%s: cannot mirror; will retrieve anew",
			    fs->fs_outputfile);
		}
	}
	if (restarting) {
		struct stat stab;

		errno = 0;
		if (((!to_stdout && stat(fs->fs_outputfile, &stab) == 0)
		    || (to_stdout && fstat(STDOUT_FILENO, &stab) == 0))
		    && S_ISREG(stab.st_mode)
		    && (date = format_http_date(stab.st_mtime)) != 0) {
			addstr(iov, n, "If-Range: ");
			addstr(iov, n, date);
			addstr(iov, n, "\r\n");
			snprintf(rangebuf, sizeof rangebuf,
			    "Range: bytes=%qd-\r\n", (quad_t)stab.st_size);
			addstr(iov, n, rangebuf);
		} else if (errno != 0) {
			warn("%s: cannot restart; will retrieve anew",
			    fs->fs_outputfile);
			restarting = 0;
		} else {
			warnx("%s: cannot restart; will retrieve anew",
			    fs->fs_outputfile);
			restarting = 0;
		}
	}
	addstr(iov, n, "\r\n");
	msg.msg_iovlen = n;

	s = socket(PF_INET, SOCK_STREAM, 0);
	if (s < 0) {
		warn("socket");
		return EX_OSERR;
	}
	remote = fdopen(s, "r");
	if (remote == 0) {
		warn("fdopen");
		close(s);
		return EX_OSERR;
	}

	setup_sigalrm();
	alarm(timo);
	if (sendmsg(s, &msg, MSG_EOF) < 0) {
		warn("%s", https->http_hostname);
		alarm(0);
		fclose(remote);
		unsetup_sigalrm();
		return EX_OSERR;
	}

got100reply:
	alarm(timo);
	line = fgetln(remote, &linelen);
	alarm(0);
	if (line == 0) {
		if (ferror(remote)) {
			warn("reading reply from %s", https->http_hostname);
			fclose(remote);
			unsetup_sigalrm();
			return EX_OSERR;
		} else {
			warnx("empty reply from %s", https->http_hostname);
			fclose(remote);
			unsetup_sigalrm();
			return EX_PROTOCOL;
		}
	}

	/*
	 * If the other end is HTTP 0.9, then we just suck their
	 * response over; can't do anything fancy. We assume that
	 * the file is a text file, so it is safe to use fgetln()
	 * to suck the entire file. (It had better be, since
	 * we used it to grab the first line.)
	 */
	if (linelen < 5 || strncasecmp(line, "http/", 5) != 0) {
		if (to_stdout)
			local = fopen("/dev/stdout", "w");
		else
			local = fopen(fs->fs_outputfile, "w");
		if (local == 0) {
			warn("%s: fopen", fs->fs_outputfile);
			fclose(remote);
			unsetup_sigalrm();
			return EX_OSERR;
		}
		display(fs, -1, 0);
		do {
			writeresult = fwrite(line, 1, linelen, local);
			display(fs, -1, writeresult);
			if (writeresult != linelen)
				break;
			alarm(timo);
			line = fgetln(remote, &linelen);
			alarm(0);
		} while(line != 0);
		unsetup_sigalrm();
		if (ferror(local)) {
			warn("%s", fs->fs_outputfile);
			fclose(local);
			fclose(remote);
			rm(fs);
			return EX_OSERR;
		} else if(ferror(remote)) {
			warn("%s", https->http_hostname);
			fclose(local);
			fclose(remote);
			rm(fs);
			return EX_OSERR;
		}
		fclose(local);
		fclose(remote);
		display(fs, -1, -1);
		return 0;
	}

	/*
	 * OK. The other end is doing HTTP 1.0 at the very least.
	 * This means that some of the fancy stuff is at least possible.
	 */
	line[linelen - 1] = '\0';	/* turn line into a string */
	status = http_first_line(line);

	switch(status) {
	case 100:	/* Continue */
		/*
		 * An interim reply has its own (normally empty) header
		 * block; swallow it so that the next line we look at is
		 * the real status line.
		 */
		for (;;) {
			alarm(timo);
			line = fgetln(remote, &linelen);
			alarm(0);
			if (line == 0 || line[0] == '\n'
			    || (linelen >= 2 && line[0] == '\r'
				&& line[1] == '\n'))
				break;
		}
		goto got100reply;
	case 200:	/* Here come results */
	case 203:	/* Non-Authoritative Information */
		restarting = 0;
		break;
	case 206:	/* Here come partial results */
		/* can only happen when restarting */
		break;
	case 301:	/* Resource has moved permanently */
		if (!fs->fs_auto_retry)
			goto spewerror;
		redirection = 301;
		break;
	case 302:	/* Resource has moved temporarily */
		/*
		 * We don't test fs->fs_auto_retry here so that this
		 * sort of redirection is transparent to the user.
		 */
		redirection = 302;
		break;
	case 304:	/* Object is unmodified */
		if (fs->fs_mirror) {
			fclose(remote);
			unsetup_sigalrm();
			return 0;
		}
		goto spewerror;
	case 401:	/* Unauthorized */
	case 407:	/* Proxy Authentication Required */
		/*
		 * XXX implement authentication.  Until then these are
		 * plain errors; they must not fall into the 503 retry
		 * logic below.
		 */
		goto spewerror;
	case 503:	/* Service Unavailable */
		if (!fs->fs_auto_retry)
			goto spewerror;
		retrying = 503;
		break;
	default:
	spewerror:
		warnx("%s: %s: HTTP server returned error code %d",
		    fs->fs_outputfile, https->http_hostname, status);
		if (fs->fs_verbose > 1) {
			fputs(line, stderr);
			fputc('\n', stderr);
			while ((line = fgetln(remote, &linelen)) != 0)
				fwrite(line, 1, linelen, stderr);
		}
		fclose(remote);
		unsetup_sigalrm();
		return EX_UNAVAILABLE;
	}

	total_length = -1;	/* -1 means ``don't know'' */
	last_modified = when_to_retry = -1;
	base64ofmd5 = 0;
	new_location = 0;
	restart_from = 0;

	while((line = fgetln(remote, &linelen)) != 0) {
		char *value, *ep;
		enum http_header header;
		unsigned long ul;

		line[linelen - 1] = '\0';
		header = http_parse_header(line, &value);
		if (header == ht_end_of_header)
			break;

		switch(header) {
		case ht_content_length:
			errno = 0;
			ul = strtoul(value, &ep, 10);
			if (errno != 0 || *ep)
				warnx("invalid Content-Length: `%s'", value);
			else if (!restarting)
				total_length = ul;
			break;

		case ht_last_modified:
			last_modified = parse_http_date(value);
			if (last_modified == -1 && fs->fs_verbose > 0)
				warnx("invalid Last-Modified: `%s'", value);
			break;

		case ht_content_md5:
			free(base64ofmd5);	/* in case it is repeated */
			base64ofmd5 = safe_strdup(value);
			break;

		case ht_content_range:
			if (!restarting)	/* XXX protocol error */
				break;
			/* NB: we might have to restart from farther back
			   than we asked. */
			status = parse_http_content_range(value, &restart_from,
			    &total_length);
			/* If we couldn't understand the reply, get the whole
			   thing. */
			if (status) {
				restarting = 0;
		doretry:
				/* Also entered from the 503 handling below. */
				fclose(remote);
				unsetup_sigalrm();
				if (base64ofmd5)
					free(base64ofmd5);
				if (new_location)
					free(new_location);
				restart_from = 0;
				n = 0;
				goto retry;
			}
			break;

		case ht_location:
			if (redirection) {
				char *end = value;

				/* The new URI ends at the first whitespace. */
				while (*end && !isspace((unsigned char)*end))
					end++;
				free(new_location);
				new_location = safe_strndup(value, end - value);
			}
			break;

		case ht_transfer_encoding:
			warnx("%s: %s specified a Transfer-Encoding: %s",
			    fs->fs_outputfile, https->http_hostname,
			    value);
			warnx("%s: output file may be uninterpretable",
			    fs->fs_outputfile);
			break;

		case ht_retry_after:
			if (!retrying)
				break;
			/* Either a number of seconds or an HTTP date. */
			errno = 0;
			ul = strtoul(value, &ep, 10);
			if (errno != 0
			    || (*ep && !isspace((unsigned char)*ep))) {
				time_t when;

				when = parse_http_date(value);
				if (when == -1)
					break;
				when_to_retry = when;
			} else {
				when_to_retry = time(0) + ul;
			}
			break;

		default:
			break;
		}
	}

	if (retrying) {
		int howlong;

		if (when_to_retry == -1) {
			/* No usable Retry-After: report it as an error. */
			free(base64ofmd5);
			free(new_location);
			/* This assignment is OK because all we do is print. */
			line = (char *)"HTTP/1.1 503 Service Unavailable";
			goto spewerror;
		}
		howlong = when_to_retry - time(0);
		if (howlong < 30)
			howlong = 30;
		warnx("%s: service unavailable; retrying in %d seconds",
		    https->http_hostname, howlong);
		sleep(howlong);
		goto doretry;
	}

	if (redirection && new_location) {
		fclose(remote);
		unsetup_sigalrm();
		if (base64ofmd5)
			free(base64ofmd5);
		status = http_redirect(fs, new_location, redirection == 301);
		free(new_location);
		return status;
	} else if (redirection) {
		warnx("%s: redirection but no new location",
		    fs->fs_outputfile);
		fclose(remote);
		unsetup_sigalrm();
		if (base64ofmd5)
			free(base64ofmd5);
		return EX_PROTOCOL;
	}

	/*
	 * OK, if we got here, then we have finished parsing the header
	 * and have read the `\r\n' line which denotes the end of same.
	 * We may or may not have a good idea of the length of the file
	 * or its modtime.  All we have left to do is open up the output
	 * file and copy data from input to output until EOF.
	 *
	 * When resuming, the existing data must survive, so the file is
	 * opened for update instead of being truncated.  When a digest
	 * has to be verified the file must be readable as well.  Standard
	 * output is always opened write-only.
	 */
	if (to_stdout) {
		local = fopen("/dev/stdout", "w");
	} else {
		if (restarting && restart_from > 0)
			mode = "r+";
		else if (base64ofmd5)
			mode = "w+";
		else
			mode = "w";
		local = fopen(fs->fs_outputfile, mode);
	}
	if (local == 0) {
		warn("%s: fopen", fs->fs_outputfile);
		fclose(remote);
		unsetup_sigalrm();
		free(base64ofmd5);
		return EX_OSERR;
	}

	fs->fs_modtime = last_modified;
	fseek(local, restart_from, SEEK_SET); /* XXX truncation off_t->long */
	display(fs, total_length, restart_from); /* XXX truncation */

	do {
		alarm(timo);
		readresult = fread(buf, 1, sizeof buf, remote);
		alarm(0);
		if (readresult == 0)
			break;
		display(fs, total_length, readresult);
		/* only write the amount of data we read */
		writeresult = fwrite(buf, 1, readresult, local);
	} while (writeresult == readresult);

	status = errno;	/* save errno for warn(), below, if needed */
	display(fs, total_length, -1); /* do here in case we have to warn */
	errno = status;

	if (ferror(remote)) {
		warn("reading remote file from %s", https->http_hostname);
		status = EX_OSERR;
	} else if(ferror(local)) {
		warn("`%s': fwrite", fs->fs_outputfile);
		status = EX_OSERR;
	} else {
		status = 0;
	}

	if (base64ofmd5) {
		/*
		 * Ack. When restarting, the MD5 only covers the parts
		 * we are getting, not the whole thing.
		 *
		 * The digest is only checked after a clean transfer (so
		 * an I/O error is not masked) and only for a real file,
		 * since standard output cannot be read back.
		 */
		if (status == 0 && !to_stdout) {
			fseek(local, restart_from, SEEK_SET);
			status = check_md5(local, base64ofmd5);
		}
		free(base64ofmd5);
	}

	unsetup_sigalrm();
	/* fclose() flushes; a failure there means lost data. */
	if (fclose(local) != 0 && status == 0) {
		warn("`%s': fclose", fs->fs_outputfile);
		status = EX_OSERR;
	}
	fclose(remote);

	if (status != 0)
		rm(fs);
	else
		adjmodtime(fs);
	return status;
#undef addstr
}
/*
 * The format of the response line for an HTTP request is:
 *	HTTP/V.vv{WS}999{WS}Explanatory text for humans to read\r\n
 * Where {WS} represents whitespace (spaces and/or tabs) and 999
 * is a machine-interpretable result code. We return the integer value
 * of that result code, or the impossible value `0' if we are unable to
 * parse the result.
 *
 * The explanatory text is optional: a line that ends right after the
 * three-digit code is accepted too.
 */
static int
http_first_line(const char *line)
{
	char *ep;
	unsigned long ul;

	if (strncasecmp(line, "http/", 5) != 0)
		return 0;
	line += 5;

	/*
	 * The <ctype.h> functions need an unsigned char value; a plain
	 * (possibly negative) char is undefined behavior.
	 */
	while (isdigit((unsigned char)*line))	/* skip major version number */
		line++;
	if (*line++ != '.')			/* skip period */
		return 0;
	while (isdigit((unsigned char)*line))	/* skip minor version number */
		line++;
	while (isspace((unsigned char)*line))	/* skip first whitespace */
		line++;

	/* strtoul() would happily accept a sign; the status code has none. */
	if (!isdigit((unsigned char)*line))
		return 0;
	errno = 0;
	ul = strtoul(line, &ep, 10);
	if (errno != 0 || ul > 999 || ul < 100)
		return 0;
	if (*ep != '\0' && !isspace((unsigned char)*ep))
		return 0;
	return ul;
}
/*
* The format of a header line for an HTTP request is:
* Header-Name: header-value (with comments in parens)\r\n
* This would be a nice application for gperf(1), except that the
* names are case-insensitive and gperf can't handle that.
*/
static enum http_header
http_parse_header(char *line, char **valuep)
{
char *colon, *value;
if (*line == '\0' /* protocol error! */
|| (line[0] == '\r' && line[1] == '\0'))
return ht_end_of_header;
colon = strchr(line, ':');
if (colon == 0)
return ht_syntax_error;
*colon = '\0';
for (value = colon + 1; *value && isspace(*value); value++)
; /* do nothing */
/* Trim trailing whitespace (including \r). */
*valuep = value;
value += strlen(value) - 1;
while (value > *valuep && isspace(*value))
value--;
*++value = '\0';
#define cmp(name, num) do { if (!strcasecmp(line, name)) return num; } while(0)
cmp("Accept-Ranges", ht_accept_ranges);
cmp("Age", ht_age);
cmp("Allow", ht_allow);
cmp("Cache-Control", ht_cache_control);
cmp("Connection", ht_connection);
cmp("Content-Base", ht_content_base);
cmp("Content-Encoding", ht_content_encoding);
cmp("Content-Language", ht_content_language);
cmp("Content-Length", ht_content_length);
cmp("Content-Location", ht_content_location);
cmp("Content-MD5", ht_content_md5);
cmp("Content-Range", ht_content_range);
cmp("Content-Type", ht_content_type);
cmp("Date", ht_date);
cmp("ETag", ht_etag);
cmp("Expires", ht_expires);
cmp("Last-Modified", ht_last_modified);
cmp("Location", ht_location);
cmp("Pragma", ht_pragma);
cmp("Proxy-Authenticate", ht_proxy_authenticate);
cmp("Public", ht_public);
cmp("Retry-After", ht_retry_after);
cmp("Server", ht_server);
cmp("Transfer-Encoding", ht_transfer_encoding);
cmp("Upgrade", ht_upgrade);
cmp("Vary", ht_vary);
cmp("Via", ht_via);
cmp("WWW-Authenticate", ht_www_authenticate);
cmp("Warning", ht_warning);
#undef cmp
return ht_unknown;
}
/*
 * Compute the RSA Data Security, Inc., MD5 Message Digest of the file
 * given in `fp', see if it matches the one given in base64 encoding by
 * `base64ofmd5'. Warn and return an error if it doesn't.
 *
 * The digest covers everything from the current position of `fp' to
 * end of file, so `fp' must be readable.  Returns 0 on a match,
 * EX_DATAERR on a mismatch and EX_OSERR if the data cannot be read back.
 */
static int
check_md5(FILE *fp, char *base64ofmd5) {
	MD5_CTX ctx;
	unsigned char digest[16];
	unsigned char buf[512];
	size_t len;
	char *ourval;
	int rv;

	MD5Init(&ctx);
	while ((len = fread(buf, 1, sizeof buf, fp)) != 0) {
		MD5Update(&ctx, buf, len);
	}
	/*
	 * fread() returns 0 for errors as well as for end of file; do not
	 * report a failed read as a corrupted transfer.
	 */
	if (ferror(fp)) {
		warn("cannot read back data to verify MD5 digest");
		return EX_OSERR;
	}
	MD5Final(digest, &ctx);

	ourval = to_base64(digest, 16);
	rv = 0;
	if (strcmp(ourval, base64ofmd5) != 0) {
		warnx("MD5 digest mismatch: %s, should be %s", ourval,
		    base64ofmd5);
		rv = EX_DATAERR;
	}
	free(ourval);
	return rv;
}
/* Day and month abbreviations as they appear in HTTP dates. */
static const char *wkdays[] = {
	"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
};
static const char *months[] = {
	"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
	"Nov", "Dec"
};

/*
 * Number of days from 1970-01-01 to the given proleptic Gregorian date.
 * `month' is 1..12.  The result is linear in `day', so a day number
 * slightly outside the month (0, or 31 in a short month) simply spills
 * into the neighboring month.
 */
static long long
http_days_from_civil(long long year, int month, int day)
{
	long long era, yoe, mp, doy, doe;

	/* Count years from March so the leap day falls at the very end. */
	if (month <= 2)
		year--;
	era = (year >= 0 ? year : year - 399) / 400;
	yoe = year - era * 400;				/* [0, 399] */
	mp = month > 2 ? month - 3 : month + 9;		/* March == 0 */
	doy = (153 * mp + 2) / 5 + day - 1;
	doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
	return era * 146097 + doe - 719468;
}

/*
 * Interpret one of the three possible formats for an HTTP date.
 * All of them are really bogus; HTTP should use either ISO 8601
 * or NTP timestamps. We make some attempt to accept a subset of 8601
 * format. The three standard formats are all fixed-length subsets of their
 * respective standards (except 8601, which puts all of the stuff we
 * care about up front).
 *
 * The date is always taken to be UTC; whatever follows the seconds
 * (normally "GMT") is ignored.  The conversion is done arithmetically,
 * so the process environment and time zone state are not touched.
 *
 * Returns the time in seconds since the Epoch, or -1 if the string
 * cannot be parsed.
 */
static time_t
parse_http_date(char *string)
{
	int year, mon, mday, hour, min, sec;
	int i;

	/* 8601 has the shortest minimum length */
	if (strlen(string) < 15)
		return -1;

#define digit(x) (string[x] - '0')
	if (isdigit((unsigned char)*string)) {
		/* ISO 8601: 19970127T134551stuffwedon'tcareabout */
		for (i = 0; i < 15; i++) {
			if (i != 8 && !isdigit((unsigned char)string[i]))
				return -1;
		}
		if (string[8] != 'T' && string[8] != 't' && string[8] != ' ')
			return -1;
		year = digit(0) * 1000 + digit(1) * 100
		    + digit(2) * 10 + digit(3);
		mon = digit(4) * 10 + digit(5) - 1;
		mday = digit(6) * 10 + digit(7);
		hour = digit(9) * 10 + digit(10);
		min = digit(11) * 10 + digit(12);
		sec = digit(13) * 10 + digit(14);
		/* We don't care about the rest of the stuff after the secs. */
	} else if (string[3] == ',') {
		/* Mon, 27 Jan 1997 14:24:35 stuffwedon'tcareabout */
		if (strlen(string) < 25)
			return -1;
		string += 5;		/* skip over day-of-week */
		if (!(isdigit((unsigned char)string[0])
		    && isdigit((unsigned char)string[1])))
			return -1;
		mday = digit(0) * 10 + digit(1);
		for (mon = 0; mon < 12; mon++) {
			if (strncasecmp(months[mon], &string[3], 3) == 0)
				break;
		}
		if (mon >= 12)
			return -1;
		if (sscanf(&string[7], "%d %d:%d:%d", &year, &hour,
		    &min, &sec) != 4)
			return -1;
	} else if (string[3] == ' ') {
		/* Mon Jan 27 14:25:20 1997 -- exactly 24 characters */
		if (strlen(string) < 24)
			return -1;
		string += 4;
		for (mon = 0; mon < 12; mon++) {
			if (strncasecmp(string, months[mon], 3) == 0)
				break;
		}
		if (mon >= 12)
			return -1;
		if (sscanf(&string[4], "%d %d:%d:%d %d", &mday,
		    &hour, &min, &sec, &year) != 5)
			return -1;
	} else {
		/* Monday, 27-Jan-97 14:31:09 stuffwedon'tcareabout */
		char *comma = strchr(string, ',');
		char mname[4];

		if (comma == 0)
			return -1;
		string = comma + 1;
		if (strlen(string) < 19)
			return -1;
		string++;		/* skip the blank after the comma */
		if (sscanf(string, "%d-%c%c%c-%d %d:%d:%d", &mday,
		    &mname[0], &mname[1], &mname[2], &year,
		    &hour, &min, &sec) != 8)
			return -1;
		mname[3] = '\0';
		for (mon = 0; mon < 12; mon++) {
			if (strcasecmp(months[mon], mname) == 0)
				break;
		}
		if (mon >= 12)
			return -1;
		/*
		 * This format normally carries a two-digit year.  Take
		 * 70..99 as 19xx and 00..69 as 20xx; anything larger is
		 * assumed to be a full year already.
		 */
		if (year >= 0 && year < 70)
			year += 2000;
		else if (year >= 70 && year < 100)
			year += 1900;
	}
#undef digit

	if (sec > 60 || min > 59 || hour > 23 || mday > 31 || mon > 11)
		return -1;
	if (sec < 0 || min < 0 || hour < 0 || mday < 0 || mon < 0)
		return -1;
	if (year < 1900 || year > 9999)
		return -1;

	return (time_t)(http_days_from_civil(year, mon + 1, mday) * 86400
	    + (long long)hour * 3600 + min * 60 + sec);
}
/*
 * Format `when' as an HTTP date in UTC: the RFC 1123 style
 * "Sun, 06 Nov 1994 08:49:37 GMT" by default, or a basic ISO 8601
 * timestamp when HTTP_DATE_ISO_8601 is defined.
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call, or a null pointer if gmtime() cannot convert `when'.
 */
static char *
format_http_date(time_t when)
{
	struct tm *tm;
	/* Larger than the usual 29 characters so that odd years still fit. */
	static char buf[48];

	tm = gmtime(&when);
	if (tm == 0)
		return 0;
#ifndef HTTP_DATE_ISO_8601
	snprintf(buf, sizeof buf, "%s, %02d %s %04d %02d:%02d:%02d GMT",
	    wkdays[tm->tm_wday], tm->tm_mday, months[tm->tm_mon],
	    tm->tm_year + 1900, tm->tm_hour, tm->tm_min, tm->tm_sec);
#else /* ISO 8601 */
	snprintf(buf, sizeof buf, "%04d%02d%02dT%02d%02d%02d+0000",
	    tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
	    tm->tm_hour, tm->tm_min, tm->tm_sec);
#endif
	return buf;
}
/*
 * Build the "User-Agent: ...\r\n" request header from FETCH_VERSION and
 * the kern.ostype, kern.osrelease and hw.machine sysctl values.  A value
 * that cannot be read is replaced by an empty string after a warning.
 *
 * The header is built once and cached; the returned pointer refers to a
 * static buffer.
 */
static char *
format_http_user_agent(void)
{
	static char buf[128];
	static int inited;

	if (!inited) {
		int mib[2];
		char ostype[128], osrelease[128], machine[128];
		size_t len;

		mib[0] = CTL_KERN;
		mib[1] = KERN_OSTYPE;
		len = sizeof ostype;
		if (sysctl(mib, 2, ostype, &len, 0, 0) < 0) {
			warn("sysctl");
			ostype[0] = '\0';
		}
		mib[1] = KERN_OSRELEASE;
		len = sizeof osrelease;
		if (sysctl(mib, 2, osrelease, &len, 0, 0) < 0) {
			warn("sysctl");
			osrelease[0] = '\0';
		}
		mib[0] = CTL_HW;
		mib[1] = HW_MACHINE;
		len = sizeof machine;
		if (sysctl(mib, 2, machine, &len, 0, 0) < 0) {
			warn("sysctl");
			machine[0] = '\0';
		}
		snprintf(buf, sizeof buf,
		    "User-Agent: " FETCH_VERSION " %s/%s (%s)\r\n",
		    ostype, osrelease, machine);
		/* Remember that the work is done; it never changes. */
		inited = 1;
	}
	return buf;
}
/*
 * Parse a Content-Range return header from the server. RFC 2068 defines
 * this header to have the format:
 *	Content-Range: bytes 12345-67890/123456
 * Since we always ask for the whole rest of the file, we consider it an
 * error if the reply doesn't claim to give it to us.
 *
 * `orig' is the header value (everything after the colon).  On success
 * *restart_from receives the offset of the first byte being sent and
 * *total_length the total size of the entity, and 0 is returned.
 * Otherwise a warning is printed, EX_PROTOCOL is returned and the output
 * parameters are left untouched.
 */
static int
parse_http_content_range(char *orig, off_t *restart_from, off_t *total_length)
{
	unsigned long long first, last, total;
	char *ep;

	/* Only the unit prefix is compared; the numbers follow it. */
	if (strncasecmp(orig, "bytes", 5) != 0) {
		warnx("unknown Content-Range unit: `%s'", orig);
		return EX_PROTOCOL;
	}
	orig += 5;
	while (isspace((unsigned char)*orig))
		orig++;

	/*
	 * Each number must start with a digit: strtoull() on its own
	 * would also accept leading blanks and a sign.
	 */
	errno = 0;
	if (!isdigit((unsigned char)*orig))
		goto invalid;
	first = strtoull(orig, &ep, 10);
	if (errno != 0 || *ep != '-' || !isdigit((unsigned char)ep[1]))
		goto invalid;
	last = strtoull(ep + 1, &ep, 10);
	if (errno != 0 || *ep != '/' || last < first
	    || !isdigit((unsigned char)ep[1]))
		goto invalid;
	total = strtoull(ep + 1, &ep, 10);
	if (errno != 0 || !(*ep == '\0' || isspace((unsigned char)*ep)))
		goto invalid;

	/* We asked for everything up to the end of the file. */
	if (last + 1 != total) {
		warnx("HTTP server did not return requested Content-Range");
		return EX_PROTOCOL;
	}
	*restart_from = first;
	*total_length = total;	/* the full size, not the last position */
	return 0;

invalid:
	warnx("invalid Content-Range: `%s'", orig);
	return EX_PROTOCOL;
}