From d1b143ee9a5c249312ffa20988d3d91830fab7eb Mon Sep 17 00:00:00 2001 From: Eric van Gyzen Date: Wed, 2 Mar 2022 14:00:38 -0600 Subject: [PATCH] Vendor import of expat 2.4.6 Sponsored by: Dell EMC Isilon --- Changes | 95 ++++++++++++++ Makefile.in | 1 + README.md | 2 +- configure.ac | 12 +- doc/Makefile.am | 3 +- doc/Makefile.in | 4 +- doc/reference.html | 2 +- doc/xmlwf.1 | 2 +- doc/xmlwf.xml | 4 +- examples/Makefile.in | 1 + examples/elements.c | 2 +- examples/outline.c | 2 +- lib/Makefile.in | 1 + lib/expat.h | 2 +- lib/xmlparse.c | 195 ++++++++++++++++++++-------- lib/xmlrole.c | 2 +- lib/xmltok.c | 9 +- lib/xmltok_impl.c | 20 +-- tests/Makefile.in | 1 + tests/benchmark/Makefile.in | 1 + tests/benchmark/benchmark.c | 2 +- tests/runtests.c | 251 +++++++++++++++++++++++++++++++++++- xmlwf/Makefile.in | 1 + xmlwf/xmlfile.c | 2 +- xmlwf/xmlwf.c | 8 +- 25 files changed, 529 insertions(+), 96 deletions(-) diff --git a/Changes b/Changes index 7540d38ca822..40127e1b76f7 100644 --- a/Changes +++ b/Changes @@ -2,6 +2,101 @@ NOTE: We are looking for help with a few things: https://github.com/libexpat/libexpat/labels/help%20wanted If you can help, please get in touch. Thanks! +Release 2.4.6 Sun February 20 2022 + Bug fixes: + #566 Fix a regression introduced by the fix for CVE-2022-25313 + in release 2.4.5 that affects applications that (1) + call function XML_SetElementDeclHandler and (2) are + parsing XML that contains nested element declarations + (e.g. ""). + + Other changes: + #567 #568 Version info bumped from 9:5:8 to 9:6:8; + see https://verbump.de/ for what these numbers do + + Special thanks to: + Matt Sergeant + Samanta Navarro + Sergei Trofimovich + and + NixOS + Perl XML::Parser + +Release 2.4.5 Fri February 18 2022 + Security fixes: + #562 CVE-2022-25235 -- Passing malformed 2- and 3-byte UTF-8 + sequences (e.g. from start tag names) to the XML + processing application on top of Expat can cause + arbitrary damage (e.g. 
code execution) depending + on how invalid UTF-8 is handled inside the XML + processor; validation was not their job but Expat's. + Exploits with code execution are known to exist. + #561 CVE-2022-25236 -- Passing (one or more) namespace separator + characters in "xmlns[:prefix]" attribute values + made Expat send malformed tag names to the XML + processor on top of Expat which can cause + arbitrary damage (e.g. code execution) depending + on how such unexpectable cases are handled inside the XML + processor; validation was not their job but Expat's. + Exploits with code execution are known to exist. + #558 CVE-2022-25313 -- Fix stack exhaustion in doctype parsing + that could be triggered by e.g. a 2 megabyte + file with a large number of opening braces. + Expected impact is denial of service or potentially + arbitrary code execution. + #560 CVE-2022-25314 -- Fix integer overflow in function copyString; + only affects the encoding name parameter at parser creation + time which is often hardcoded (rather than user input), + takes a value in the gigabytes to trigger, and a 64-bit + machine. Expected impact is denial of service. + #559 CVE-2022-25315 -- Fix integer overflow in function storeRawNames; + needs input in the gigabytes and a 64-bit machine. + Expected impact is denial of service or potentially + arbitrary code execution. + + Other changes: + #557 #564 Version info bumped from 9:4:8 to 9:5:8; + see https://verbump.de/ for what these numbers do + + Special thanks to: + Ivan Fratric + Samanta Navarro + and + Google Project Zero + JetBrains + +Release 2.4.4 Sun January 30 2022 + Security fixes: + #550 CVE-2022-23852 -- Fix signed integer overflow + (undefined behavior) in function XML_GetBuffer + (that is also called by function XML_Parse internally) + for when XML_CONTEXT_BYTES is defined to >0 (which is both + common and default). + Impact is denial of service or more.
+ #551 CVE-2022-23990 -- Fix unsigned integer overflow in function + doProlog triggered by large content in element type + declarations when there is an element declaration handler + present (from a prior call to XML_SetElementDeclHandler). + Impact is denial of service or more. + + Bug fixes: + #544 #545 xmlwf: Fix a memory leak on output file opening error + + Other changes: + #546 Autotools: Fix broken CMake support under Cygwin + #554 Windows: Add missing files to the installer to fix + compilation with CMake from installed sources + #552 #554 Version info bumped from 9:3:8 to 9:4:8; + see https://verbump.de/ for what these numbers do + + Special thanks to: + Carlo Bramini + hwt0415 + Roland Illig + Samanta Navarro + and + Clang LeakSan and the Clang team + Release 2.4.3 Sun January 16 2022 Security fixes: #531 #534 CVE-2021-45960 -- Fix issues with left shifts by >=29 places diff --git a/Makefile.in b/Makefile.in index 34f900a61630..7c6551fca2cf 100644 --- a/Makefile.in +++ b/Makefile.in @@ -306,6 +306,7 @@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ +CMAKE_SHARED_LIBRARY_PREFIX = @CMAKE_SHARED_LIBRARY_PREFIX@ CPPFLAGS = @CPPFLAGS@ CSCOPE = @CSCOPE@ CTAGS = @CTAGS@ diff --git a/README.md b/README.md index 6fdd6148714b..959c4a6e94a7 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![Downloads GitHub](https://img.shields.io/github/downloads/libexpat/libexpat/total?label=Downloads%20GitHub)](https://github.com/libexpat/libexpat/releases) -# Expat, Release 2.4.3 +# Expat, Release 2.4.6 This is Expat, a C library for parsing XML, started by [James Clark](https://en.wikipedia.org/wiki/James_Clark_%28programmer%29) in 1997. diff --git a/configure.ac b/configure.ac index c285a2b60ed7..5175487bb4b3 100644 --- a/configure.ac +++ b/configure.ac @@ -82,7 +82,7 @@ dnl If the API changes incompatibly set LIBAGE back to 0 dnl LIBCURRENT=9 # sync -LIBREVISION=3 # with +LIBREVISION=6 # with LIBAGE=8 # CMakeLists.txt! 
AC_CONFIG_HEADERS([expat_config.h]) @@ -395,9 +395,17 @@ AC_SUBST([AM_CFLAGS]) AC_SUBST([AM_CXXFLAGS]) AC_SUBST([AM_LDFLAGS]) +dnl Emulate the use of CMAKE_SHARED_LIBRARY_PREFIX under CMake +AC_MSG_CHECKING([for shared library name prefix]) +AS_CASE("${host_os}", + [cygwin*], [CMAKE_SHARED_LIBRARY_PREFIX=cyg], + [CMAKE_SHARED_LIBRARY_PREFIX=lib]) +AC_MSG_RESULT([${CMAKE_SHARED_LIBRARY_PREFIX}]) +AC_SUBST([CMAKE_SHARED_LIBRARY_PREFIX]) + AS_CASE("${host_os}", [darwin*], [CMAKE_NOCONFIG_SOURCE=cmake/autotools/expat-noconfig__macos.cmake.in], - [mingw*], [CMAKE_NOCONFIG_SOURCE=cmake/autotools/expat-noconfig__windows.cmake.in], + [mingw*|cygwin*], [CMAKE_NOCONFIG_SOURCE=cmake/autotools/expat-noconfig__windows.cmake.in], [CMAKE_NOCONFIG_SOURCE=cmake/autotools/expat-noconfig__linux.cmake.in]) AC_CONFIG_FILES([Makefile] [expat.pc] diff --git a/doc/Makefile.am b/doc/Makefile.am index 16987e8dec40..c3a3ce59c1b9 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -6,7 +6,7 @@ # \___/_/\_\ .__/ \__,_|\__| # |_| XML parser # -# Copyright (c) 2017-2021 Sebastian Pipping +# Copyright (c) 2017-2022 Sebastian Pipping # Copyright (c) 2017 Stephen Groat # Copyright (c) 2017 Joe Orton # Licensed under the MIT license: @@ -57,5 +57,4 @@ EXTRA_DIST = \ ok.min.css \ reference.html \ style.css \ - valid-xhtml10.png \ xmlwf.xml diff --git a/doc/Makefile.in b/doc/Makefile.in index c48834052935..9c7d76da2ce0 100644 --- a/doc/Makefile.in +++ b/doc/Makefile.in @@ -22,7 +22,7 @@ # \___/_/\_\ .__/ \__,_|\__| # |_| XML parser # -# Copyright (c) 2017-2021 Sebastian Pipping +# Copyright (c) 2017-2022 Sebastian Pipping # Copyright (c) 2017 Stephen Groat # Copyright (c) 2017 Joe Orton # Licensed under the MIT license: @@ -209,6 +209,7 @@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ +CMAKE_SHARED_LIBRARY_PREFIX = @CMAKE_SHARED_LIBRARY_PREFIX@ CPPFLAGS = @CPPFLAGS@ CSCOPE = @CSCOPE@ CTAGS = @CTAGS@ @@ -344,7 +345,6 @@ EXTRA_DIST = \ ok.min.css \ reference.html \ style.css \ - 
valid-xhtml10.png \ xmlwf.xml all: all-am diff --git a/doc/reference.html b/doc/reference.html index 1629413e541e..26db5a635479 100644 --- a/doc/reference.html +++ b/doc/reference.html @@ -49,7 +49,7 @@

The Expat XML Parser - Release 2.4.3 + Release 2.4.6

diff --git a/doc/xmlwf.1 b/doc/xmlwf.1 index f76353173f62..f931d63d4e1d 100644 --- a/doc/xmlwf.1 +++ b/doc/xmlwf.1 @@ -5,7 +5,7 @@ \\$2 \(la\\$1\(ra\\$3 .. .if \n(.g .mso www.tmac -.TH XMLWF 1 "January 16, 2022" "" "" +.TH XMLWF 1 "February 20, 2022" "" "" .SH NAME xmlwf \- Determines if an XML document is well-formed .SH SYNOPSIS diff --git a/doc/xmlwf.xml b/doc/xmlwf.xml index 804fc5620eda..79ed58569ea9 100644 --- a/doc/xmlwf.xml +++ b/doc/xmlwf.xml @@ -21,8 +21,8 @@ "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd" [ Scott"> Bronson"> - - January 16, 2022"> + February 20, 2022"> + 1"> bronson@rinspin.com"> diff --git a/examples/Makefile.in b/examples/Makefile.in index d476e53526bd..8528d439290b 100644 --- a/examples/Makefile.in +++ b/examples/Makefile.in @@ -230,6 +230,7 @@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ +CMAKE_SHARED_LIBRARY_PREFIX = @CMAKE_SHARED_LIBRARY_PREFIX@ CPPFLAGS = @CPPFLAGS@ CSCOPE = @CSCOPE@ CTAGS = @CTAGS@ diff --git a/examples/elements.c b/examples/elements.c index 2b6645610771..481d44472686 100644 --- a/examples/elements.c +++ b/examples/elements.c @@ -13,7 +13,7 @@ Copyright (c) 1997-2000 Thai Open Source Software Center Ltd Copyright (c) 2001-2003 Fred L. Drake, Jr. Copyright (c) 2004-2006 Karl Waclawek - Copyright (c) 2005-2007 Steven Solie + Copyright (c) 2005-2007 Steven Solie Copyright (c) 2016-2019 Sebastian Pipping Copyright (c) 2017 Rhodri James Copyright (c) 2019 Zhongyuan Zhou diff --git a/examples/outline.c b/examples/outline.c index 4ed041febb37..936f0e09053f 100644 --- a/examples/outline.c +++ b/examples/outline.c @@ -10,7 +10,7 @@ Copyright (c) 2000 Clark Cooper Copyright (c) 2001-2003 Fred L. Drake, Jr. 
- Copyright (c) 2005-2007 Steven Solie + Copyright (c) 2005-2007 Steven Solie Copyright (c) 2005-2006 Karl Waclawek Copyright (c) 2016-2019 Sebastian Pipping Copyright (c) 2017 Rhodri James diff --git a/lib/Makefile.in b/lib/Makefile.in index d4069ebe983d..3581b6bf66b0 100644 --- a/lib/Makefile.in +++ b/lib/Makefile.in @@ -268,6 +268,7 @@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ +CMAKE_SHARED_LIBRARY_PREFIX = @CMAKE_SHARED_LIBRARY_PREFIX@ CPPFLAGS = @CPPFLAGS@ CSCOPE = @CSCOPE@ CTAGS = @CTAGS@ diff --git a/lib/expat.h b/lib/expat.h index 0f021e25def1..46a0e1bcd22d 100644 --- a/lib/expat.h +++ b/lib/expat.h @@ -1041,7 +1041,7 @@ XML_SetBillionLaughsAttackProtectionActivationThreshold( */ #define XML_MAJOR_VERSION 2 #define XML_MINOR_VERSION 4 -#define XML_MICRO_VERSION 3 +#define XML_MICRO_VERSION 6 #ifdef __cplusplus } diff --git a/lib/xmlparse.c b/lib/xmlparse.c index b2f5fc6bee77..7db28d07acbc 100644 --- a/lib/xmlparse.c +++ b/lib/xmlparse.c @@ -1,4 +1,4 @@ -/* 9ca2a2fedc35bcb13ba9a134ba5e173020bc2ff5f5a311abf742cec7da1ff26a (2.4.3+) +/* a30d2613dcfdef81475a9d1a349134d2d42722172fdaa7d5bb12ed2aa74b9596 (2.4.6+) __ __ _ ___\ \/ /_ __ __ _| |_ / _ \\ /| '_ \ / _` | __| @@ -11,7 +11,7 @@ Copyright (c) 2000-2006 Fred L. Drake, Jr. 
Copyright (c) 2001-2002 Greg Stein Copyright (c) 2002-2016 Karl Waclawek - Copyright (c) 2005-2009 Steven Solie + Copyright (c) 2005-2009 Steven Solie Copyright (c) 2016 Eric Rahm Copyright (c) 2016-2022 Sebastian Pipping Copyright (c) 2016 Gaurav @@ -33,6 +33,7 @@ Copyright (c) 2019-2020 Ben Wagner Copyright (c) 2019 Vadim Zeitlin Copyright (c) 2021 Dong-hee Na + Copyright (c) 2022 Samanta Navarro Licensed under the MIT license: Permission is hereby granted, free of charge, to any person obtaining @@ -717,8 +718,7 @@ XML_ParserCreate(const XML_Char *encodingName) { XML_Parser XMLCALL XML_ParserCreateNS(const XML_Char *encodingName, XML_Char nsSep) { - XML_Char tmp[2]; - *tmp = nsSep; + XML_Char tmp[2] = {nsSep, 0}; return XML_ParserCreate_MM(encodingName, NULL, tmp); } @@ -974,7 +974,7 @@ parserCreate(const XML_Char *encodingName, if (memsuite) { XML_Memory_Handling_Suite *mtemp; - parser = (XML_Parser)memsuite->malloc_fcn(sizeof(struct XML_ParserStruct)); + parser = memsuite->malloc_fcn(sizeof(struct XML_ParserStruct)); if (parser != NULL) { mtemp = (XML_Memory_Handling_Suite *)&(parser->m_mem); mtemp->malloc_fcn = memsuite->malloc_fcn; @@ -1343,8 +1343,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context, would be otherwise. 
*/ if (parser->m_ns) { - XML_Char tmp[2]; - *tmp = parser->m_namespaceSeparator; + XML_Char tmp[2] = {parser->m_namespaceSeparator, 0}; parser = parserCreate(encodingName, &parser->m_mem, tmp, newDtd); } else { parser = parserCreate(encodingName, &parser->m_mem, NULL, newDtd); @@ -2067,6 +2066,11 @@ XML_GetBuffer(XML_Parser parser, int len) { keep = (int)EXPAT_SAFE_PTR_DIFF(parser->m_bufferPtr, parser->m_buffer); if (keep > XML_CONTEXT_BYTES) keep = XML_CONTEXT_BYTES; + /* Detect and prevent integer overflow */ + if (keep > INT_MAX - neededSize) { + parser->m_errorCode = XML_ERROR_NO_MEMORY; + return NULL; + } neededSize += keep; #endif /* defined XML_CONTEXT_BYTES */ if (neededSize @@ -2557,6 +2561,7 @@ storeRawNames(XML_Parser parser) { while (tag) { int bufSize; int nameLen = sizeof(XML_Char) * (tag->name.strLen + 1); + size_t rawNameLen; char *rawNameBuf = tag->buf + nameLen; /* Stop if already stored. Since m_tagStack is a stack, we can stop at the first entry that has already been copied; everything @@ -2568,7 +2573,11 @@ storeRawNames(XML_Parser parser) { /* For re-use purposes we need to ensure that the size of tag->buf is a multiple of sizeof(XML_Char). */ - bufSize = nameLen + ROUND_UP(tag->rawNameLength, sizeof(XML_Char)); + rawNameLen = ROUND_UP(tag->rawNameLength, sizeof(XML_Char)); + /* Detect and prevent integer overflow. */ + if (rawNameLen > (size_t)INT_MAX - nameLen) + return XML_FALSE; + bufSize = nameLen + (int)rawNameLen; if (bufSize > tag->bufEnd - tag->buf) { char *temp = (char *)REALLOC(parser, tag->buf, bufSize); if (temp == NULL) @@ -3750,6 +3759,17 @@ addBinding(XML_Parser parser, PREFIX *prefix, const ATTRIBUTE_ID *attId, if (! 
mustBeXML && isXMLNS && (len > xmlnsLen || uri[len] != xmlnsNamespace[len])) isXMLNS = XML_FALSE; + + // NOTE: While Expat does not validate namespace URIs against RFC 3986, + // we have to at least make sure that the XML processor on top of + // Expat (that is splitting tag names by namespace separator into + // 2- or 3-tuples (uri-local or uri-local-prefix)) cannot be confused + // by an attacker putting additional namespace separator characters + // into namespace declarations. That would be ambiguous and not to + // be expected. + if (parser->m_ns && (uri[len] == parser->m_namespaceSeparator)) { + return XML_ERROR_SYNTAX; + } } isXML = isXML && len == xmlLen; isXMLNS = isXMLNS && len == xmlnsLen; @@ -4092,7 +4112,7 @@ initializeEncoding(XML_Parser parser) { const char *s; #ifdef XML_UNICODE char encodingBuf[128]; - /* See comments abount `protoclEncodingName` in parserInit() */ + /* See comments about `protocolEncodingName` in parserInit() */ if (! parser->m_protocolEncodingName) s = NULL; else { @@ -5367,7 +5387,7 @@ doProlog(XML_Parser parser, const ENCODING *enc, const char *s, const char *end, if (dtd->in_eldecl) { ELEMENT_TYPE *el; const XML_Char *name; - int nameLen; + size_t nameLen; const char *nxt = (quant == XML_CQUANT_NONE ? 
next : next - enc->minBytesPerChar); int myindex = nextScaffoldPart(parser); @@ -5383,7 +5403,13 @@ doProlog(XML_Parser parser, const ENCODING *enc, const char *s, const char *end, nameLen = 0; for (; name[nameLen++];) ; - dtd->contentStringLen += nameLen; + + /* Detect and prevent integer overflow */ + if (nameLen > UINT_MAX - dtd->contentStringLen) { + return XML_ERROR_NO_MEMORY; + } + + dtd->contentStringLen += (unsigned)nameLen; if (parser->m_elementDeclHandler) handleDefault = XML_FALSE; } @@ -6536,7 +6562,7 @@ normalizePublicId(XML_Char *publicId) { static DTD * dtdCreate(const XML_Memory_Handling_Suite *ms) { - DTD *p = (DTD *)ms->malloc_fcn(sizeof(DTD)); + DTD *p = ms->malloc_fcn(sizeof(DTD)); if (p == NULL) return p; poolInit(&(p->pool), ms); @@ -6709,8 +6735,8 @@ dtdCopy(XML_Parser oldParser, DTD *newDtd, const DTD *oldDtd, if (! newE) return 0; if (oldE->nDefaultAtts) { - newE->defaultAtts = (DEFAULT_ATTRIBUTE *)ms->malloc_fcn( - oldE->nDefaultAtts * sizeof(DEFAULT_ATTRIBUTE)); + newE->defaultAtts + = ms->malloc_fcn(oldE->nDefaultAtts * sizeof(DEFAULT_ATTRIBUTE)); if (! newE->defaultAtts) { return 0; } @@ -6872,7 +6898,7 @@ lookup(XML_Parser parser, HASH_TABLE *table, KEY name, size_t createSize) { /* table->size is a power of 2 */ table->size = (size_t)1 << INIT_POWER; tsize = table->size * sizeof(NAMED *); - table->v = (NAMED **)table->mem->malloc_fcn(tsize); + table->v = table->mem->malloc_fcn(tsize); if (! table->v) { table->size = 0; return NULL; @@ -6912,7 +6938,7 @@ lookup(XML_Parser parser, HASH_TABLE *table, KEY name, size_t createSize) { } size_t tsize = newSize * sizeof(NAMED *); - NAMED **newV = (NAMED **)table->mem->malloc_fcn(tsize); + NAMED **newV = table->mem->malloc_fcn(tsize); if (! 
newV) return NULL; memset(newV, 0, tsize); @@ -6941,7 +6967,7 @@ lookup(XML_Parser parser, HASH_TABLE *table, KEY name, size_t createSize) { } } } - table->v[i] = (NAMED *)table->mem->malloc_fcn(createSize); + table->v[i] = table->mem->malloc_fcn(createSize); if (! table->v[i]) return NULL; memset(table->v[i], 0, createSize); @@ -7229,7 +7255,7 @@ poolGrow(STRING_POOL *pool) { if (bytesToAllocate == 0) return XML_FALSE; - tem = (BLOCK *)pool->mem->malloc_fcn(bytesToAllocate); + tem = pool->mem->malloc_fcn(bytesToAllocate); if (! tem) return XML_FALSE; tem->size = blockSize; @@ -7305,44 +7331,15 @@ nextScaffoldPart(XML_Parser parser) { return next; } -static void -build_node(XML_Parser parser, int src_node, XML_Content *dest, - XML_Content **contpos, XML_Char **strpos) { - DTD *const dtd = parser->m_dtd; /* save one level of indirection */ - dest->type = dtd->scaffold[src_node].type; - dest->quant = dtd->scaffold[src_node].quant; - if (dest->type == XML_CTYPE_NAME) { - const XML_Char *src; - dest->name = *strpos; - src = dtd->scaffold[src_node].name; - for (;;) { - *(*strpos)++ = *src; - if (! *src) - break; - src++; - } - dest->numchildren = 0; - dest->children = NULL; - } else { - unsigned int i; - int cn; - dest->numchildren = dtd->scaffold[src_node].childcnt; - dest->children = *contpos; - *contpos += dest->numchildren; - for (i = 0, cn = dtd->scaffold[src_node].firstchild; i < dest->numchildren; - i++, cn = dtd->scaffold[cn].nextsib) { - build_node(parser, cn, &(dest->children[i]), contpos, strpos); - } - dest->name = NULL; - } -} - static XML_Content * build_model(XML_Parser parser) { + /* Function build_model transforms the existing parser->m_dtd->scaffold + * array of CONTENT_SCAFFOLD tree nodes into a new array of + * XML_Content tree nodes followed by a gapless list of zero-terminated + * strings. 
*/ DTD *const dtd = parser->m_dtd; /* save one level of indirection */ XML_Content *ret; - XML_Content *cpos; - XML_Char *str; + XML_Char *str; /* the current string writing location */ /* Detect and prevent integer overflow. * The preprocessor guard addresses the "always false" warning @@ -7368,10 +7365,96 @@ build_model(XML_Parser parser) { if (! ret) return NULL; - str = (XML_Char *)(&ret[dtd->scaffCount]); - cpos = &ret[1]; + /* What follows is an iterative implementation (of what was previously done + * recursively in a dedicated function called "build_node". The old recursive + * build_node could be forced into stack exhaustion from input as small as a + * few megabytes, and so that was a security issue. Hence, a function call + * stack is avoided now by resolving recursion.) + * + * The iterative approach works as follows: + * + * - We have two writing pointers, both walking up the result array; one does + * the work, the other creates "jobs" for its colleague to do, and leads + * the way: + * + * - The faster one, pointer jobDest, always leads and writes "what job + * to do" by the other, once they reach that place in the + * array: leader "jobDest" stores the source node array index (relative + * to array dtd->scaffold) in field "numchildren". + * + * - The slower one, pointer dest, looks at the value stored in the + * "numchildren" field (which actually holds a source node array index + * at that time) and puts the real data from dtd->scaffold in. + * + * - Before the loop starts, jobDest writes source array index 0 + * (where the root node is located) so that dest will have something to do + * when it starts operation. + * + * - Whenever nodes with children are encountered, jobDest appends + * them as new jobs, in order.
As a result, tree node siblings are + * adjacent in the resulting array, for example: + * + * [0] root, has two children + * [1] first child of 0, has three children + * [3] first child of 1, does not have children + * [4] second child of 1, does not have children + * [5] third child of 1, does not have children + * [2] second child of 0, does not have children + * + * Or (the same data) presented in flat array view: + * + * [0] root, has two children + * + * [1] first child of 0, has three children + * [2] second child of 0, does not have children + * + * [3] first child of 1, does not have children + * [4] second child of 1, does not have children + * [5] third child of 1, does not have children + * + * - The algorithm repeats until all target array indices have been processed. + */ + XML_Content *dest = ret; /* tree node writing location, moves upwards */ + XML_Content *const destLimit = &ret[dtd->scaffCount]; + XML_Content *jobDest = ret; /* next free writing location in target array */ + str = (XML_Char *)&ret[dtd->scaffCount]; + + /* Add the starting job, the root node (index 0) of the source tree */ + (jobDest++)->numchildren = 0; + + for (; dest < destLimit; dest++) { + /* Retrieve source tree array index from job storage */ + const int src_node = (int)dest->numchildren; + + /* Convert item */ + dest->type = dtd->scaffold[src_node].type; + dest->quant = dtd->scaffold[src_node].quant; + if (dest->type == XML_CTYPE_NAME) { + const XML_Char *src; + dest->name = str; + src = dtd->scaffold[src_node].name; + for (;;) { + *str++ = *src; + if (! 
*src) + break; + src++; + } + dest->numchildren = 0; + dest->children = NULL; + } else { + unsigned int i; + int cn; + dest->name = NULL; + dest->numchildren = dtd->scaffold[src_node].childcnt; + dest->children = jobDest; + + /* Append scaffold indices of children to array */ + for (i = 0, cn = dtd->scaffold[src_node].firstchild; + i < dest->numchildren; i++, cn = dtd->scaffold[cn].nextsib) + (jobDest++)->numchildren = (unsigned int)cn; + } + } - build_node(parser, 0, ret, &cpos, &str); return ret; } @@ -7400,7 +7483,7 @@ getElementType(XML_Parser parser, const ENCODING *enc, const char *ptr, static XML_Char * copyString(const XML_Char *s, const XML_Memory_Handling_Suite *memsuite) { - int charsRequired = 0; + size_t charsRequired = 0; XML_Char *result; /* First determine how long the string is */ diff --git a/lib/xmlrole.c b/lib/xmlrole.c index 77746ee42d10..3f0f5c150c62 100644 --- a/lib/xmlrole.c +++ b/lib/xmlrole.c @@ -11,7 +11,7 @@ Copyright (c) 2002 Greg Stein Copyright (c) 2002-2006 Karl Waclawek Copyright (c) 2002-2003 Fred L. Drake, Jr. - Copyright (c) 2005-2009 Steven Solie + Copyright (c) 2005-2009 Steven Solie Copyright (c) 2016-2021 Sebastian Pipping Copyright (c) 2017 Rhodri James Copyright (c) 2019 David Loffredo diff --git a/lib/xmltok.c b/lib/xmltok.c index 502ca1adc33b..c659983b4008 100644 --- a/lib/xmltok.c +++ b/lib/xmltok.c @@ -11,8 +11,8 @@ Copyright (c) 2001-2003 Fred L. Drake, Jr. Copyright (c) 2002 Greg Stein Copyright (c) 2002-2016 Karl Waclawek - Copyright (c) 2005-2009 Steven Solie - Copyright (c) 2016-2021 Sebastian Pipping + Copyright (c) 2005-2009 Steven Solie + Copyright (c) 2016-2022 Sebastian Pipping Copyright (c) 2016 Pascal Cuoq Copyright (c) 2016 Don Lewis Copyright (c) 2017 Rhodri James @@ -98,11 +98,6 @@ + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \ & (1u << (((byte)[2]) & 0x1F))) -#define UTF8_GET_NAMING(pages, p, n) \ - ((n) == 2 \ - ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ - : ((n) == 3 ? 
UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0)) - /* Detection of invalid UTF-8 sequences is based on Table 3.1B of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ with the additional restriction of not allowing the Unicode diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c index 0430591b4263..4072b06497d1 100644 --- a/lib/xmltok_impl.c +++ b/lib/xmltok_impl.c @@ -10,7 +10,7 @@ Copyright (c) 2000 Clark Cooper Copyright (c) 2002 Fred L. Drake, Jr. Copyright (c) 2002-2016 Karl Waclawek - Copyright (c) 2016-2021 Sebastian Pipping + Copyright (c) 2016-2022 Sebastian Pipping Copyright (c) 2017 Rhodri James Copyright (c) 2018 Benjamin Peterson Copyright (c) 2018 Anton Maklakov @@ -69,7 +69,7 @@ case BT_LEAD##n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ - if (! IS_NAME_CHAR(enc, ptr, n)) { \ + if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -98,7 +98,7 @@ case BT_LEAD##n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ - if (! IS_NMSTRT_CHAR(enc, ptr, n)) { \ + if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -1142,6 +1142,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, case BT_LEAD##n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ + if (IS_INVALID_CHAR(enc, ptr, n)) { \ + *nextTokPtr = ptr; \ + return XML_TOK_INVALID; \ + } \ if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ ptr += n; \ tok = XML_TOK_NAME; \ @@ -1270,7 +1274,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end, switch (BYTE_TYPE(enc, ptr)) { # define LEAD_CASE(n) \ case BT_LEAD##n: \ - ptr += n; \ + ptr += n; /* NOTE: The encoding has already been validated. 
*/ \ break; LEAD_CASE(2) LEAD_CASE(3) @@ -1339,7 +1343,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end, switch (BYTE_TYPE(enc, ptr)) { # define LEAD_CASE(n) \ case BT_LEAD##n: \ - ptr += n; \ + ptr += n; /* NOTE: The encoding has already been validated. */ \ break; LEAD_CASE(2) LEAD_CASE(3) @@ -1518,7 +1522,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax, state = inName; \ } # define LEAD_CASE(n) \ - case BT_LEAD##n: \ + case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \ START_NAME ptr += (n - MINBPC(enc)); \ break; LEAD_CASE(2) @@ -1730,7 +1734,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) { switch (BYTE_TYPE(enc, ptr)) { # define LEAD_CASE(n) \ case BT_LEAD##n: \ - ptr += n; \ + ptr += n; /* NOTE: The encoding has already been validated. */ \ break; LEAD_CASE(2) LEAD_CASE(3) @@ -1775,7 +1779,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end, switch (BYTE_TYPE(enc, ptr)) { # define LEAD_CASE(n) \ case BT_LEAD##n: \ - ptr += n; \ + ptr += n; /* NOTE: The encoding has already been validated. 
*/ \ pos->columnNumber++; \ break; LEAD_CASE(2) diff --git a/tests/Makefile.in b/tests/Makefile.in index 1d382c34b5bb..024ddd98cf29 100644 --- a/tests/Makefile.in +++ b/tests/Makefile.in @@ -516,6 +516,7 @@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ +CMAKE_SHARED_LIBRARY_PREFIX = @CMAKE_SHARED_LIBRARY_PREFIX@ CPPFLAGS = @CPPFLAGS@ CSCOPE = @CSCOPE@ CTAGS = @CTAGS@ diff --git a/tests/benchmark/Makefile.in b/tests/benchmark/Makefile.in index 7bf8068b1942..64238f1da99a 100644 --- a/tests/benchmark/Makefile.in +++ b/tests/benchmark/Makefile.in @@ -227,6 +227,7 @@ AWK = @AWK@ CC = @CC@ CCDEPMODE = @CCDEPMODE@ CFLAGS = @CFLAGS@ +CMAKE_SHARED_LIBRARY_PREFIX = @CMAKE_SHARED_LIBRARY_PREFIX@ CPPFLAGS = @CPPFLAGS@ CSCOPE = @CSCOPE@ CTAGS = @CTAGS@ diff --git a/tests/benchmark/benchmark.c b/tests/benchmark/benchmark.c index ba2bc6e1fa22..2c4eb78e3615 100644 --- a/tests/benchmark/benchmark.c +++ b/tests/benchmark/benchmark.c @@ -7,7 +7,7 @@ |_| XML parser Copyright (c) 2003-2006 Karl Waclawek - Copyright (c) 2005-2007 Steven Solie + Copyright (c) 2005-2007 Steven Solie Copyright (c) 2017 Sebastian Pipping Copyright (c) 2017 Rhodri James Licensed under the MIT license: diff --git a/tests/runtests.c b/tests/runtests.c index d2923ac1fd58..6d6f66909a11 100644 --- a/tests/runtests.c +++ b/tests/runtests.c @@ -8,7 +8,7 @@ Copyright (c) 2001-2006 Fred L. Drake, Jr. 
Copyright (c) 2003 Greg Stein - Copyright (c) 2005-2007 Steven Solie + Copyright (c) 2005-2007 Steven Solie Copyright (c) 2005-2012 Karl Waclawek Copyright (c) 2016-2022 Sebastian Pipping Copyright (c) 2017-2018 Rhodri James
@@ -2664,6 +2664,82 @@ START_TEST(test_dtd_elements) {
 }
 END_TEST
 
+static void XMLCALL
+element_decl_check_model(void *userData, const XML_Char *name,
+                         XML_Content *model) {
+  UNUSED_P(userData);
+  uint32_t errorFlags = 0;
+
+  /* Expected model array structure is this:
+   * [0] (type 6, quant 0)
+   *   [1] (type 5, quant 0)
+   *     [3] (type 4, quant 0, name "bar")
+   *     [4] (type 4, quant 0, name "foo")
+   *     [5] (type 4, quant 3, name "xyz")
+   *   [2] (type 4, quant 2, name "zebra")
+   */
+  errorFlags |= ((xcstrcmp(name, XCS("junk")) == 0) ? 0 : (1u << 0));
+  errorFlags |= ((model != NULL) ? 0 : (1u << 1));
+
+  errorFlags |= ((model[0].type == XML_CTYPE_SEQ) ? 0 : (1u << 2));
+  errorFlags |= ((model[0].quant == XML_CQUANT_NONE) ? 0 : (1u << 3));
+  errorFlags |= ((model[0].numchildren == 2) ? 0 : (1u << 4));
+  errorFlags |= ((model[0].children == &model[1]) ? 0 : (1u << 5));
+  errorFlags |= ((model[0].name == NULL) ? 0 : (1u << 6));
+
+  errorFlags |= ((model[1].type == XML_CTYPE_CHOICE) ? 0 : (1u << 7));
+  errorFlags |= ((model[1].quant == XML_CQUANT_NONE) ? 0 : (1u << 8));
+  errorFlags |= ((model[1].numchildren == 3) ? 0 : (1u << 9));
+  errorFlags |= ((model[1].children == &model[3]) ? 0 : (1u << 10));
+  errorFlags |= ((model[1].name == NULL) ? 0 : (1u << 11));
+
+  errorFlags |= ((model[2].type == XML_CTYPE_NAME) ? 0 : (1u << 12));
+  errorFlags |= ((model[2].quant == XML_CQUANT_REP) ? 0 : (1u << 13));
+  errorFlags |= ((model[2].numchildren == 0) ? 0 : (1u << 14));
+  errorFlags |= ((model[2].children == NULL) ? 0 : (1u << 15));
+  errorFlags |= ((xcstrcmp(model[2].name, XCS("zebra")) == 0) ? 0 : (1u << 16));
+
+  errorFlags |= ((model[3].type == XML_CTYPE_NAME) ? 0 : (1u << 17));
+  errorFlags |= ((model[3].quant == XML_CQUANT_NONE) ? 0 : (1u << 18));
+  errorFlags |= ((model[3].numchildren == 0) ? 0 : (1u << 19));
+  errorFlags |= ((model[3].children == NULL) ? 0 : (1u << 20));
+  errorFlags |= ((xcstrcmp(model[3].name, XCS("bar")) == 0) ? 0 : (1u << 21));
+
+  errorFlags |= ((model[4].type == XML_CTYPE_NAME) ? 0 : (1u << 22));
+  errorFlags |= ((model[4].quant == XML_CQUANT_NONE) ? 0 : (1u << 23));
+  errorFlags |= ((model[4].numchildren == 0) ? 0 : (1u << 24));
+  errorFlags |= ((model[4].children == NULL) ? 0 : (1u << 25));
+  errorFlags |= ((xcstrcmp(model[4].name, XCS("foo")) == 0) ? 0 : (1u << 26));
+
+  errorFlags |= ((model[5].type == XML_CTYPE_NAME) ? 0 : (1u << 27));
+  errorFlags |= ((model[5].quant == XML_CQUANT_PLUS) ? 0 : (1u << 28));
+  errorFlags |= ((model[5].numchildren == 0) ? 0 : (1u << 29));
+  errorFlags |= ((model[5].children == NULL) ? 0 : (1u << 30));
+  errorFlags |= ((xcstrcmp(model[5].name, XCS("xyz")) == 0) ? 0 : (1u << 31));
+
+  XML_SetUserData(g_parser, (void *)(uintptr_t)errorFlags);
+  XML_FreeContentModel(g_parser, model);
+}
+
+START_TEST(test_dtd_elements_nesting) {
+  // Payload inspired by a test in Perl's XML::Parser
+  const char *text = "<!DOCTYPE foo [\n"
+                     "<!ELEMENT junk ((bar|foo|xyz+), zebra*)>\n"
+                     "]>\n"
+                     "<foo/>";
+
+  XML_SetUserData(g_parser, (void *)(uintptr_t)-1);
+
+  XML_SetElementDeclHandler(g_parser, element_decl_check_model);
+  if (XML_Parse(g_parser, text, (int)strlen(text), XML_TRUE)
+      == XML_STATUS_ERROR)
+    xml_failure(g_parser);
+
+  if ((uint32_t)(uintptr_t)XML_GetUserData(g_parser) != 0)
+    fail("Element declaration model regression detected");
+}
+END_TEST
+
 /* Test foreign DTD handling */
 START_TEST(test_set_foreign_dtd) {
   const char *text1 = "\n";
@@ -3847,6 +3923,30 @@ START_TEST(test_get_buffer_2) {
 }
 END_TEST
 
+/* Test for signed integer overflow CVE-2022-23852 */
+#if defined(XML_CONTEXT_BYTES)
+START_TEST(test_get_buffer_3_overflow) {
+  XML_Parser parser = XML_ParserCreate(NULL);
+  assert(parser != NULL);
+
+  const char *const text = "\n";
+  const int expectedKeepValue = (int)strlen(text);
+
+  // After this call, variable "keep" in XML_GetBuffer will
+  // have value expectedKeepValue
+  if (XML_Parse(parser, text, (int)strlen(text), XML_FALSE /* isFinal */)
+      == XML_STATUS_ERROR)
+    xml_failure(parser);
+
+  assert(expectedKeepValue > 0);
+  if (XML_GetBuffer(parser, INT_MAX - expectedKeepValue + 1) != NULL)
+    fail("enlarging buffer not failed");
+
+  XML_ParserFree(parser);
+}
+END_TEST
+#endif // defined(XML_CONTEXT_BYTES)
+
 /* Test position information macros */
 START_TEST(test_byte_info_at_end) {
   const char *text = "";
@@ -5974,6 +6074,105 @@ START_TEST(test_utf8_in_cdata_section_2) { } END_TEST +START_TEST(test_utf8_in_start_tags) { + struct test_case { + bool goodName; + bool goodNameStart; + const char *tagName; + }; + + // The idea with the tests below is this: + // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences + // go to isNever and are hence not a concern. + // + // We start with a character that is a valid name character + // (or even name-start character, see XML 1.0r4 spec) and then we flip + // single bits at places where (1) the result leaves the UTF-8 encoding space + // and (2) we stay in the same n-byte sequence family. + // + // The flipped bits are highlighted in angle brackets in comments, + // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped + // the most significant bit to 1 to leave UTF-8 encoding space.
+ struct test_case cases[] = { + // 1-byte UTF-8: [0xxx xxxx] + {true, true, "\x3A"}, // [0011 1010] = ASCII colon ':' + {false, false, "\xBA"}, // [<1>011 1010] + {true, false, "\x39"}, // [0011 1001] = ASCII nine '9' + {false, false, "\xB9"}, // [<1>011 1001] + + // 2-byte UTF-8: [110x xxxx] [10xx xxxx] + {true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] = + // Arabic small waw U+06E5 + {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101] + {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101] + {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101] + {true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] = + // combining char U+0301 + {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001] + {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001] + {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001] + + // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx] + {true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] = + // Devanagari Letter A U+0905 + {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101] + {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101] + {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101] + {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101] + {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101] + {true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] = + // combining char U+0901 + {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001] + {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001] + {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001] + {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001] + {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001] + }; + const bool atNameStart[] = {true, false}; + + size_t i = 0; + char doc[1024]; + size_t failCount = 0; + + for (; i < sizeof(cases) / 
sizeof(cases[0]); i++) { + size_t j = 0; + for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) { + const bool expectedSuccess + = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName; + sprintf(doc, "<%s%s>