From 8c1de5fb0010ae712568f1706b737270c3609bd8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 21 Dec 2006 16:05:16 +0000 Subject: Initial SQL/XML support: xml data type and initial set of functions. --- src/backend/utils/adt/xml.c | 942 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 942 insertions(+) create mode 100644 src/backend/utils/adt/xml.c (limited to 'src/backend/utils/adt/xml.c') diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c new file mode 100644 index 00000000000..8997730fc8d --- /dev/null +++ b/src/backend/utils/adt/xml.c @@ -0,0 +1,942 @@ +/*------------------------------------------------------------------------- + * + * xml.c + * XML data type support. + * + * + * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL: pgsql/src/backend/utils/adt/xml.c,v 1.1 2006/12/21 16:05:15 petere Exp $ + * + *------------------------------------------------------------------------- + */ + +/* + * Generally, XML type support is only available when libxml use was + * configured during the build. But even if that is not done, the + * type and all the functions are available, but most of them will + * fail. For one thing, this avoids having to manage variant catalog + * installations. But it also has nice effects such as that you can + * dump a database containing XML type data even if the server is not + * linked with libxml. + */ + +#include "postgres.h" + +#ifdef USE_LIBXML +#include +#include +#include +#include +#include +#endif /* USE_LIBXML */ + +#include "fmgr.h" +#include "mb/pg_wchar.h" +#include "nodes/execnodes.h" +#include "utils/builtins.h" +#include "utils/xml.h" + + +#ifdef USE_LIBXML + +/* + * A couple of useful macros (similar to ones from libxml/parse.c) + */ +#define CMP4( s, c1, c2, c3, c4 ) \ + ( ((unsigned char *) s)[ 0 ] == c1 && ((unsigned char *) s)[ 1 ] == c2 && \ + ((unsigned char *) s)[ 2 ] == c3 && ((unsigned char *) s)[ 3 ] == c4 ) +#define CMP5( s, c1, c2, c3, c4, c5 ) \ + ( CMP4( s, c1, c2, c3, c4 ) && ((unsigned char *) s)[ 4 ] == c5 ) + +#define PG_XML_DEFAULT_URI "dummy.xml" +#define XML_ERRBUF_SIZE 200 + + +static void xml_init(void); +static void *xml_palloc(size_t size); +static void *xml_repalloc(void *ptr, size_t size); +static void xml_pfree(void *ptr); +static char *xml_pstrdup(const char *string); +static void xml_ereport(int level, char *msg, void *ctxt); +static void xml_errorHandler(void *ctxt, const char *msg, ...); +static void xml_ereport_by_code(int level, char *msg, int errcode); +static xmlChar *xml_text2xmlChar(text *in); +static xmlDocPtr xml_parse(text *data, int opts, bool is_document); + + +/* Global variables */ +/* taken from contrib/xml2 */ +/* FIXME: DO NOT USE global vars !!! */ +char *xml_errbuf; /* per line error buffer */ +char *xml_errmsg = NULL; /* overall error message */ + +#endif /* USE_LIBXML */ + + +#define NO_XML_SUPPORT() ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("no XML support in this installation"))) + + +Datum +xml_in(PG_FUNCTION_ARGS) +{ +#ifdef USE_LIBXML + char *s = PG_GETARG_CSTRING(0); + size_t len; + xmltype *vardata; + + len = strlen(s); + vardata = palloc(len + VARHDRSZ); + VARATT_SIZEP(vardata) = len + VARHDRSZ; + memcpy(VARDATA(vardata), s, len); + + /* + * Parse the data to check if it is well-formed XML data. Assume + * that ERROR occurred if parsing failed. Do we need DTD + * validation (if DTD exists)? + */ + xml_parse(vardata, XML_PARSE_DTDATTR | XML_PARSE_DTDVALID, false); + + PG_RETURN_XML_P(vardata); +#else + NO_XML_SUPPORT(); + return 0; +#endif +} + + +Datum +xml_out(PG_FUNCTION_ARGS) +{ + xmltype *s = PG_GETARG_XML_P(0); + char *result; + int32 len; + + len = VARSIZE(s) - VARHDRSZ; + result = palloc(len + 1); + memcpy(result, VARDATA(s), len); + result[len] = '\0'; + + PG_RETURN_CSTRING(result); +} + + +#ifdef USE_LIBXML +static void +appendStringInfoText(StringInfo str, const text *t) +{ + appendBinaryStringInfo(str, VARDATA(t), VARSIZE(t) - VARHDRSZ); +} + + +static xmltype * +stringinfo_to_xmltype(StringInfo buf) +{ + int32 len; + xmltype *result; + + len = buf->len + VARHDRSZ; + result = palloc(len); + VARATT_SIZEP(result) = len; + memcpy(VARDATA(result), buf->data, buf->len); + + return result; +} +#endif + + +Datum +xmlcomment(PG_FUNCTION_ARGS) +{ +#ifdef USE_LIBXML + text *arg = PG_GETARG_TEXT_P(0); + int len = VARATT_SIZEP(arg) - VARHDRSZ; + StringInfoData buf; + int i; + + /* check for "--" in string or "-" at the end */ + for (i = 1; i < len; i++) + if ((VARDATA(arg)[i] == '-' && VARDATA(arg)[i - 1] == '-') + || (VARDATA(arg)[i] == '-' && i == len - 1)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_XML_COMMENT), + errmsg("invalid XML comment"))); + + initStringInfo(&buf); + appendStringInfo(&buf, ""); + + PG_RETURN_XML_P(stringinfo_to_xmltype(&buf)); +#else + NO_XML_SUPPORT(); + return 0; +#endif +} + + +Datum +xmlparse(PG_FUNCTION_ARGS) +{ +#ifdef USE_LIBXML + text *data; + bool is_document; + bool preserve_whitespace; + + data = PG_GETARG_TEXT_P(0); + + if (PG_NARGS() >= 2) + is_document = PG_GETARG_BOOL(1); + else + is_document = false; + + if (PG_NARGS() >= 3) + preserve_whitespace = PG_GETARG_BOOL(2); + else + /* + * Since the XMLPARSE grammar makes STRIP WHITESPACE the + * default, this argument should really default to false. But + * until we have actually implemented whitespace stripping, + * this would be annoying. + */ + preserve_whitespace = true; + + if (!preserve_whitespace) + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("XMLPARSE with STRIP WHITESPACE is not implemented"))); + + /* + * Note, that here we try to apply DTD defaults + * (XML_PARSE_DTDATTR) according to SQL/XML:10.16.7.d: 'Default + * valies defined by internal DTD are applied'. As for external + * DTDs, we try to support them too, (see SQL/XML:10.16.7.e) + */ + xml_parse(data, XML_PARSE_DTDATTR, is_document); /* assume that ERROR occurred if parsing failed */ + + PG_RETURN_XML_P(data); +#else + NO_XML_SUPPORT(); + return 0; +#endif +} + + +Datum +xmlpi(PG_FUNCTION_ARGS) +{ +#ifdef USE_LIBXML + char *target = NameStr(*PG_GETARG_NAME(0)); + StringInfoData buf; + + if (strlen(target) >= 3 + && (target[0] == 'x' || target[0] == 'X') + && (target[1] == 'm' || target[1] == 'M') + && (target[2] == 'l' || target[2] == 'L')) + { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("invalid XML processing instruction"), + errdetail("XML processing instruction target name cannot start with \"xml\"."))); + } + + initStringInfo(&buf); + + appendStringInfo(&buf, " 1) + { + text *arg = PG_GETARG_TEXT_P(1); + char *string; + + string = DatumGetCString(DirectFunctionCall1(textout, PointerGetDatum(arg))); + if (strstr(string, "?>")) + ereport(ERROR, + (errcode(ERRCODE_INVALID_XML_PROCESSING_INSTRUCTION), + errmsg("invalid XML processing instruction"), + errdetail("XML processing instruction cannot contain \"?>\"."))); + + appendStringInfoString(&buf, " "); + appendStringInfoString(&buf, string); + } + appendStringInfoString(&buf, "?>"); + + PG_RETURN_XML_P(stringinfo_to_xmltype(&buf)); +#else + NO_XML_SUPPORT(); + return 0; +#endif +} + + +Datum +xmlroot(PG_FUNCTION_ARGS) +{ +#ifdef USE_LIBXML + xmltype *data; + text *version; + int standalone; + StringInfoData buf; + + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + else + data = PG_GETARG_XML_P(0); + + if (PG_ARGISNULL(1)) + version = NULL; + else + version = PG_GETARG_TEXT_P(1); + + if (PG_ARGISNULL(2)) + standalone = 0; + else + { + bool tmp = PG_GETARG_BOOL(2); + standalone = (tmp ? 1 : -1); + } + + /* + * FIXME: This is probably supposed to be cleverer if there + * already is an XML preamble. + */ + initStringInfo(&buf); + + appendStringInfo(&buf,""); + appendStringInfoText(&buf, (text *) data); + + PG_RETURN_XML_P(stringinfo_to_xmltype(&buf)); +#else + NO_XML_SUPPORT(); + return 0; +#endif +} + + +/* + * Validate document (given as string) against DTD (given as external link) + * TODO !!! use text instead of cstring for second arg + * TODO allow passing DTD as a string value (not only as an URI) + * TODO redesign (see comment with '!!!' below) + */ +Datum +xmlvalidate(PG_FUNCTION_ARGS) +{ +#ifdef USE_LIBXML + text *data = PG_GETARG_TEXT_P(0); + text *dtdOrUri = PG_GETARG_TEXT_P(1); + bool result = FALSE; + xmlParserCtxtPtr ctxt; /* the parser context */ + xmlDocPtr doc; /* the resulting document tree */ + xmlDtdPtr dtd; + + xml_init(); + + ctxt = xmlNewParserCtxt(); + if (ctxt == NULL) + xml_ereport(ERROR, "could not allocate parser context", ctxt); + doc = xmlCtxtReadMemory(ctxt, (char *) VARDATA(data), + VARSIZE(data) - VARHDRSZ, PG_XML_DEFAULT_URI, NULL, 0); + if (doc == NULL) + xml_ereport(ERROR, "could not parse XML data", ctxt); + +#if 0 + uri = xmlCreateURI(); + ereport(NOTICE, (errcode(0),errmsg(" dtd - %s", dtdOrUri))); + dtd = palloc(sizeof(xmlDtdPtr)); + uri = xmlParseURI(dtdOrUri); + if (uri == NULL) + xml_ereport(ERROR, "not implemented yet... (TODO)", ctxt); + else +#endif + dtd = xmlParseDTD(NULL, xml_text2xmlChar(dtdOrUri)); + + if (dtd == NULL) + { +#if 0 + xmlFreeDoc(doc); + xmlFreeParserCtxt(ctxt); +#endif + xml_ereport(ERROR, "could not load DTD", ctxt); + } + + if (xmlValidateDtd(xmlNewValidCtxt(), doc, dtd) == 1) + result = TRUE; + +#if 0 + xmlFreeURI(uri); + xmlFreeDtd(dtd); + xmlFreeDoc(doc); + xmlFreeParserCtxt(ctxt); + xmlCleanupParser(); +#endif + + if (!result) + xml_ereport(NOTICE, "validation against DTD failed", ctxt); + + PG_RETURN_BOOL(result); +#else /* not USE_LIBXML */ + NO_XML_SUPPORT(); + return 0; +#endif /* not USE_LIBXML */ +} + + +#ifdef USE_LIBXML + +/* + * Container for some init stuff (not good design!) + * TODO xmlChar is utf8-char, make proper tuning (initdb with enc!=utf8 and check) + */ +static void +xml_init(void) +{ + /* + * Currently, we have no pure UTF-8 support for internals -- check + * if we can work. + */ + if (sizeof (char) != sizeof (xmlChar)) + ereport(ERROR, + (errmsg("cannot initialize XML library"), + errdetail("libxml2 has incompatible char type: sizeof(char)=%u, sizeof(xmlChar)=%u.", + sizeof(char), sizeof(xmlChar)))); + + xmlMemSetup(xml_pfree, xml_palloc, xml_repalloc, xml_pstrdup); + xmlInitParser(); + LIBXML_TEST_VERSION; + /* do not flood PG's logfile with libxml error messages - reset error handler*/ + xmlSetGenericErrorFunc(NULL, xml_errorHandler); + xml_errmsg = NULL; + xml_errbuf = palloc(XML_ERRBUF_SIZE); + memset(xml_errbuf, 0, XML_ERRBUF_SIZE); +} + + +/* + * Convert a C string to XML internal representation + * (same things as for TEXT, but with checking the data for well-formedness + * and, moreover, validation against DTD, if needed). + * NOTICE: We use TEXT type as internal storage type. In the future, + * we plan to create own storage type (maybe several types/strategies) + * TODO predefined DTDs / XSDs and validation + * TODO validation against XML Schema + * TODO maybe, libxml2's xmlreader is better? (do not construct DOM, yet do not use SAX - see xml_reader.c) + * TODO what about internal URI for docs? (see PG_XML_DEFAULT_URI below) + */ +static xmlDocPtr +xml_parse(text *data, int opts, bool is_document) +{ + bool validationFailed = FALSE; + xmlParserCtxtPtr ctxt; /* the parser context */ + xmlDocPtr doc; /* the resulting document tree */ + int res_code; + int32 len; + xmlChar *string; +#ifdef XML_DEBUG_DTD_CONST + xmlDtdPtr dtd; /* pointer to DTD */ +#endif + + xml_init(); + + len = VARSIZE(data) - VARHDRSZ; /* will be useful later */ + string = xml_text2xmlChar(data); + + ctxt = xmlNewParserCtxt(); + if (ctxt == NULL) + xml_ereport(ERROR, "could not allocate parser context", ctxt); + + /* first, we try to parse the string as it is XML doc, then, as XML chunk */ + ereport(DEBUG3, (errmsg("string to parse: %s", string))); + if (len > 4 && CMP5(string, '<', '?', 'x', 'm', 'l')) + { + /* consider it as DOCUMENT */ + doc = xmlCtxtReadMemory(ctxt, string, len, PG_XML_DEFAULT_URI, NULL, opts); + if (doc == NULL) + { + xml_ereport(ERROR, "could not parse XML data", ctxt); +#if 0 + xmlFreeParserCtxt(ctxt); + xmlCleanupParser(); + ereport(ERROR, (errmsg("could not parse XML data"))); +#endif + } + } + else + { + /* attempt to parse the string as if it is an XML fragment */ + ereport(DEBUG3, (errmsg("the string is not an XML doc, trying to parse as a CHUNK"))); + doc = xmlNewDoc(NULL); + /* TODO resolve: xmlParseBalancedChunkMemory assumes that string is UTF8 encoded! */ + res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, string, NULL); + if (res_code != 0) + { + xmlFreeParserCtxt(ctxt); + xmlCleanupParser(); + xml_ereport_by_code(ERROR, "could not parse XML data", res_code); + } + } + +#ifdef XML_DEBUG_DTD_CONST + dtd = xmlParseDTD(NULL, (xmlChar *) XML_DEBUG_DTD_CONST); + xml_ereport(DEBUG3, "solid path to DTD was defined for debugging purposes", ctxt); + if (dtd == NULL) + { + xml_ereport(ERROR, "could not parse DTD data", ctxt); + } + else +#else + /* if dtd for our xml data is detected... */ + if ((doc->intSubset != NULL) || (doc->extSubset != NULL)) +#endif + { + /* assume that inline DTD exists - validation should be performed */ +#ifdef XML_DEBUG_DTD_CONST + if (xmlValidateDtd(xmlNewValidCtxt(), doc, dtd) != 1) +#else + if (ctxt->valid == 0) +#endif + { + /* DTD exists, but validator reported 'validation failed' */ + validationFailed = TRUE; + } + } + + if (validationFailed) + xml_ereport(WARNING, "validation against DTD failed", ctxt); + + /* TODO encoding issues + * (thoughts: + * CASE: + * - XML data has explicit encoding attribute in its prolog + * - if not, assume that enc. of XML data is the same as client's one + * + * The common rule is to accept the XML data only if its encoding + * is the same as encoding of the storage (server's). The other possible + * option is to accept all the docs, but DO TRANSFORMATION and, if needed, + * change the prolog. + * + * I think I'd stick the first way (for the 1st version), + * it's much simplier (less errors...) + * ) */ + /* ... */ + + xmlFreeParserCtxt(ctxt); + xmlCleanupParser(); + + ereport(DEBUG3, (errmsg("XML data successfully parsed, encoding: %s", + (char *) doc->encoding))); + + return doc; +} + + +/* + * xmlChar<->text convertions + */ +static xmlChar * +xml_text2xmlChar(text *in) +{ + int32 len = VARSIZE(in) - VARHDRSZ; + xmlChar *res; + + res = palloc(len + 1); + memcpy(res, VARDATA(in), len); + res[len] = '\0'; + + return(res); +} + + +/* + * Wrappers for memory management functions + */ +static void * +xml_palloc(size_t size) +{ + return palloc(size); +} + + +static void * +xml_repalloc(void *ptr, size_t size) +{ + return repalloc(ptr, size); +} + + +static void +xml_pfree(void *ptr) +{ + pfree(ptr); +} + + +static char * +xml_pstrdup(const char *string) +{ + return pstrdup(string); +} + + +/* + * Wrapper for "ereport" function. + * Adds detail - libxml's native error message, if any. + */ +static void +xml_ereport(int level, char *msg, void *ctxt) +{ + char *xmlErrDetail; + int xmlErrLen, i; + xmlErrorPtr libxmlErr = NULL; + + if (xml_errmsg != NULL) + { + ereport(DEBUG1, (errmsg("%s", xml_errmsg))); + pfree(xml_errmsg); + } + + if (ctxt != NULL) + libxmlErr = xmlCtxtGetLastError(ctxt); + + if (libxmlErr == NULL) + { + if (level == ERROR) + { + xmlFreeParserCtxt(ctxt); + xmlCleanupParser(); + } + ereport(level, (errmsg(msg))); + } + else + { + /* as usual, libxml error message contains '\n'; get rid of it */ + xmlErrLen = strlen(libxmlErr->message); /* - 1; */ + xmlErrDetail = (char *) palloc(xmlErrLen); + for (i = 0; i < xmlErrLen; i++) + { + if (libxmlErr->message[i] == '\n') + xmlErrDetail[i] = '.'; + else + xmlErrDetail[i] = libxmlErr->message[i]; + } + if (level == ERROR) + { + xmlFreeParserCtxt(ctxt); + xmlCleanupParser(); + } + ereport(level, (errmsg(msg), errdetail("%s", xmlErrDetail))); + } +} + + +/* + * Error handler for libxml error messages + */ +static void +xml_errorHandler(void *ctxt, const char *msg,...) +{ + va_list args; + + va_start(args, msg); + vsnprintf(xml_errbuf, XML_ERRBUF_SIZE, msg, args); + va_end(args); + /* Now copy the argument across */ + if (xml_errmsg == NULL) + xml_errmsg = pstrdup(xml_errbuf); + else + { + int32 xsize = strlen(xml_errmsg); + + xml_errmsg = repalloc(xml_errmsg, (size_t) (xsize + strlen(xml_errbuf) + 1)); + strncpy(&xml_errmsg[xsize - 1], xml_errbuf, strlen(xml_errbuf)); + xml_errmsg[xsize + strlen(xml_errbuf) - 1] = '\0'; + } + memset(xml_errbuf, 0, XML_ERRBUF_SIZE); +} + + +/* + * Return error message by libxml error code + * TODO make them closer to recommendations from Postgres manual + */ +static void +xml_ereport_by_code(int level, char *msg, int code) +{ + const char *det; + + if (code < 0) + { + ereport(level, (errmsg(msg))); + return; + } + + switch (code) { + case XML_ERR_INTERNAL_ERROR: + det = "libxml internal error"; + break; + case XML_ERR_ENTITY_LOOP: + det = "Detected an entity reference loop"; + break; + case XML_ERR_ENTITY_NOT_STARTED: + det = "EntityValue: \" or ' expected"; + break; + case XML_ERR_ENTITY_NOT_FINISHED: + det = "EntityValue: \" or ' expected"; + break; + case XML_ERR_ATTRIBUTE_NOT_STARTED: + det = "AttValue: \" or ' expected"; + break; + case XML_ERR_LT_IN_ATTRIBUTE: + det = "Unescaped '<' not allowed in attributes values"; + break; + case XML_ERR_LITERAL_NOT_STARTED: + det = "SystemLiteral \" or ' expected"; + break; + case XML_ERR_LITERAL_NOT_FINISHED: + det = "Unfinished System or Public ID \" or ' expected"; + break; + case XML_ERR_MISPLACED_CDATA_END: + det = "Sequence ']]>' not allowed in content"; + break; + case XML_ERR_URI_REQUIRED: + det = "SYSTEM or PUBLIC, the URI is missing"; + break; + case XML_ERR_PUBID_REQUIRED: + det = "PUBLIC, the Public Identifier is missing"; + break; + case XML_ERR_HYPHEN_IN_COMMENT: + det = "Comment must not contain '--' (double-hyphen)"; + break; + case XML_ERR_PI_NOT_STARTED: + det = "xmlParsePI : no target name"; + break; + case XML_ERR_RESERVED_XML_NAME: + det = "Invalid PI name"; + break; + case XML_ERR_NOTATION_NOT_STARTED: + det = "NOTATION: Name expected here"; + break; + case XML_ERR_NOTATION_NOT_FINISHED: + det = "'>' required to close NOTATION declaration"; + break; + case XML_ERR_VALUE_REQUIRED: + det = "Entity value required"; + break; + case XML_ERR_URI_FRAGMENT: + det = "Fragment not allowed"; + break; + case XML_ERR_ATTLIST_NOT_STARTED: + det = "'(' required to start ATTLIST enumeration"; + break; + case XML_ERR_NMTOKEN_REQUIRED: + det = "NmToken expected in ATTLIST enumeration"; + break; + case XML_ERR_ATTLIST_NOT_FINISHED: + det = "')' required to finish ATTLIST enumeration"; + break; + case XML_ERR_MIXED_NOT_STARTED: + det = "MixedContentDecl : '|' or ')*' expected"; + break; + case XML_ERR_PCDATA_REQUIRED: + det = "MixedContentDecl : '#PCDATA' expected"; + break; + case XML_ERR_ELEMCONTENT_NOT_STARTED: + det = "ContentDecl : Name or '(' expected"; + break; + case XML_ERR_ELEMCONTENT_NOT_FINISHED: + det = "ContentDecl : ',' '|' or ')' expected"; + break; + case XML_ERR_PEREF_IN_INT_SUBSET: + det = "PEReference: forbidden within markup decl in internal subset"; + break; + case XML_ERR_GT_REQUIRED: + det = "Expected '>'"; + break; + case XML_ERR_CONDSEC_INVALID: + det = "XML conditional section '[' expected"; + break; + case XML_ERR_EXT_SUBSET_NOT_FINISHED: + det = "Content error in the external subset"; + break; + case XML_ERR_CONDSEC_INVALID_KEYWORD: + det = "conditional section INCLUDE or IGNORE keyword expected"; + break; + case XML_ERR_CONDSEC_NOT_FINISHED: + det = "XML conditional section not closed"; + break; + case XML_ERR_XMLDECL_NOT_STARTED: + det = "Text declaration '' expected"; + break; + case XML_ERR_EXT_ENTITY_STANDALONE: + det = "external parsed entities cannot be standalone"; + break; + case XML_ERR_ENTITYREF_SEMICOL_MISSING: + det = "EntityRef: expecting ';'"; + break; + case XML_ERR_DOCTYPE_NOT_FINISHED: + det = "DOCTYPE improperly terminated"; + break; + case XML_ERR_LTSLASH_REQUIRED: + det = "EndTag: '