XML conversion utility, requires expat library.

John Gray

XML conversion utility, requires expat library.
John Gray
113bb9b5 · Bruce Momjian · d4cafeba · 113bb9b5 · 113bb9b5 · 113bb9b5
8 changed file
--- a/contrib/README
+++ b/contrib/README

 The PostgreSQL contrib tree
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+---------------------------

 This subtree contains tools, modules, and examples that are not
 maintained as part of the core PostgreSQL system, mainly because
@@ -177,3 +177,7 @@ userlock -
 vacuumlo -
 	Remove orphaned large objects
 	by Peter T Mount <peter@retep.org.uk>
+
+xml -
+	Storing XML in PostgreSQL
+	by John Gray <jgray@beansindustry.co.uk>
--- a/contrib/xml/Makefile
+++ b/contrib/xml/Makefile
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Adapted from tutorial makefile
+#-------------------------------------------------------------------------
+
+# Location of this directory relative to the top of the build tree, and the
+# way back up to that top; both are set before the shared settings are read.
+subdir = contrib/xml
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+
+# Append CFLAGS_SL even when CFLAGS is given on the command line.
+# NOTE(review): CFLAGS_SL is not defined in this file; presumably it comes
+# from Makefile.global and holds the shared-library compile flags - confirm.
+override CFLAGS+= $(CFLAGS_SL)
+
+
+#
+# DLOBJS is the dynamically-loaded object files.  The "funcs" queries
+# include CREATE FUNCTIONs that load routines from these files.
+#
+DLOBJS= pgxml$(DLSUFFIX)
+
+
+# SQL scripts generated from the matching .source templates.
+QUERIES= pgxml.sql
+
+# Default goal: build the loadable module and the SQL script.
+all: $(DLOBJS) $(QUERIES)
+
+# Requires the expat library
+
+# Link a loadable module from its object file.  The target suffix follows
+# DLSUFFIX so that this rule matches whatever DLOBJS asks for, and -lexpat
+# is placed after the object because linkers resolve libraries against the
+# objects that precede them on the command line.
+%$(DLSUFFIX): %.o
+	$(CC) -shared -o $@ $< -lexpat
+
+
+# Generate an SQL script from its .source template.  sed replaces the
+# _CWD_ and _OBJWD_ placeholders with the current directory, _DLSUFFIX_
+# with the value of DLSUFFIX and _USER_ with the invoking user's name.
+# The user name is taken from USER, then LOGNAME, then `whoami`; the
+# recipe exits with an error when all three come up empty.
+%.sql: %.source
+	if [ -z "$$USER" ]; then USER=$$LOGNAME; fi; \
+	if [ -z "$$USER" ]; then USER=`whoami`; fi; \
+	if [ -z "$$USER" ]; then echo 'Cannot deduce $$USER.'; exit 1; fi; \
+	rm -f $@; \
+	C=`pwd`; \
+	sed -e "s:_CWD_:$$C:g" \
+	    -e "s:_OBJWD_:$$C:g" \
+	    -e "s:_DLSUFFIX_:$(DLSUFFIX):g" \
+	    -e "s/_USER_/$$USER/g" < $< > $@
+
+# all and clean name actions, not files; declaring them phony keeps a stray
+# file called "all" or "clean" from silencing them.
+.PHONY: all clean
+
+# Remove everything this Makefile builds: the loadable module, the object
+# file it was linked from, and the generated SQL script.
+clean:
+	rm -f $(DLOBJS) $(DLOBJS:$(DLSUFFIX)=.o) $(QUERIES)
--- a/contrib/xml/README
+++ b/contrib/xml/README
+This package contains a couple of simple routines for hooking the
+expat XML parser up to PostgreSQL. This is a work-in-progress and all
+very basic at the moment (see the file TODO for some outline of what
+remains to be done).
+
+At present, two functions are defined, one which checks
+well-formedness, and the other which performs very simple XPath-type
+queries.
+
+Prerequisite:
+
+expat parser 1.95.0 or newer (http://expat.sourceforge.net)
+
+I used a shared library version - I'm sure you could use a static
+library if you wished though. I had no problems compiling from source.
+
+Function documentation and usage:
+---------------------------------
+
+pgxml_parse(text) returns bool
+  parses the provided text and returns true if it is well-formed and
+false otherwise. It returns NULL if the parser couldn't be
+created for any reason.
+
+pgxml_xpath(text doc, text xpath, int n) returns text
+  parses doc and returns the cdata of the nth occurrence of
+the "XPath" listed. See below for details on the syntax.
+
+
+Example:
+
+Given a  table docstore:
+
+ Attribute |  Type   | Modifier 
+-----------+---------+----------
+ docid     | integer | 
+ document  | text    | 
+
+containing documents such as (these are archaeological site
+descriptions, in case anyone is wondering):
+
+<?xml version="1.0"?>
+<site provider="Foundations" sitecode="ak97" version="1">
+   <name>Church Farm, Ashton Keynes</name>
+   <invtype>watching brief</invtype>
+   <location scheme="osgb">SU04209424</location>
+</site>
+
+one can type:
+
+select docid, 
+pgxml_xpath(document,'/site/name',1) as sitename,
+pgxml_xpath(document,'/site/location',1) as location
+ from docstore;
+ 
+and get as output:
+
+ docid |          sitename           |  location  
+-------+-----------------------------+------------
+     1 | Church Farm, Ashton Keynes  | SU04209424
+     2 | Glebe Farm, Long Itchington | SP41506500
+(2 rows)
+
+
+"XPath" syntax supported
+------------------------
+
+At present it only supports paths of the form:
+'tag1/tag2' or '/tag1/tag2'
+
+The first case will find any <tag2> within a <tag1>, the second will
+find any <tag2> within a <tag1> at the top level of the document.
+
+The real XPath is much more complex (see TODO file).
+
+
+John Gray <jgray@azuli.co.uk>  26 July 2001
+
--- a/contrib/xml/TODO
+++ b/contrib/xml/TODO
+PGXML TODO List
+===============
+
+Some of these items still require much more thought! The data model
+for XML documents and the parsing model of expat don't really fit so
+well with a standard SQL model.
+
+1. Generalised XML parsing support
+
+Allow a user to specify handlers (in any PL) to be used by the parser.
+This must permit distinct sets of parser settings - a user may want some
+documents in a database to be parsed with one set of handlers, others
+with a different set.
+
+i.e. the pgxml_parse function would take as parameters (document,
+parsername) where parsername was the identifier for a collection of
+handler etc. settings.
+
+"Stub" handlers in the pgxml code would invoke the functions through
+the standard fmgr interface. The parser interface would define the
+prototype for these functions. How does the handler function know
+which document/context has resulted in it being called?
+
+Mechanism for defining collection of parser settings (in a table? -but
+maybe copied for efficiency into a structure when first required by a
+query?)
+
+2. Support for other parsers
+
+Expat may not be the best choice as a parser because a new parser
+instance is needed for each document i.e. all the handlers must be set
+again for each document. Another parser may have a more efficient way
+of parsing a set of documents identically.
+
+3. XPath support
+
+Proper XPath support. I really need to sit down and plough
+through the specification...
+
+The very simple text comparison system currently used is too
+basic. Need to convert the path to an ordered list of nodes. Each node
+is an element qualifier, and may have a list of attribute
+qualifications attached. This probably requires a lex/yacc combination.
+(James Clark has written a yacc grammar for XPath). Not all the
+features of XPath are necessarily relevant.
+
+An option to return subdocuments (i.e. subelements AND cdata, not just
+cdata). This should maybe be the default.
+
+4. Multiple occurrences of elements.
+
+This section is all very sketchy, and has various weaknesses.
+ 
+Is there a good way to optimise/index the results of certain XPath
+operations to make them faster?:
+
+select docid, pgxml_xpath(document,'/site/location',1) as location 
+from docstore
+where pgxml_xpath(document,'/site/name',1) = 'Church Farm';
+
+and with multiple element occurrences in a document?
+
+select d.docid, pgxml_xpath(d.document,'/site/location',1) 
+from docstore d, 
+pgxml_xpaths('docstore','document','feature/type','docid') ft 
+where ft.key = d.docid and ft.value ='Limekiln';
+
+pgxml_xpaths params are relname, attrname, xpath, returnkey. It would
+return a set of two-element tuples (key,value) consisting of the value of
+returnkey, and the cdata value of the xpath. The XML document would be
+defined by relname and attrname.
+
+The pgxml_xpaths function could be the basis of a functional index,
+which could speed up the above query very substantially, working
+through the normal query planner mechanism. Syntax above is fragile
+through using names rather than OID.
+ 
+John Gray <jgray@azuli.co.uk>
+
+
+
+
+
+
--- a/contrib/xml/pgxml.c
+++ b/contrib/xml/pgxml.c
+/********************************************************
+ * Interface code to parse an XML document using expat
+ ********************************************************/
+
+#include "postgres.h"
+#include "fmgr.h"
+
+#include "expat.h"
+#include "pgxml.h"
+
+/* Memory management - we make expat use standard pg MM */
+
+XML_Memory_Handling_Suite mhs;
+
+/* passthrough functions (palloc is a macro) */
+
+/* Allocation hook handed to expat: forwards the request to palloc(). */
+static void *
+pgxml_palloc(size_t size)
+{
+  void *chunk = palloc(size);
+
+  return chunk;
+}
+
+/*
+ * Reallocation hook handed to expat.
+ *
+ * expat treats the hook like realloc(), which acts as an allocation when
+ * given a NULL pointer, so that case is routed to palloc() instead of
+ * being passed on to repalloc().
+ */
+static void *pgxml_repalloc(void *ptr, size_t size)
+{
+  if (ptr == NULL)
+    return palloc(size);
+
+  return repalloc(ptr,size);
+}
+
+/*
+ * Release hook handed to expat.
+ *
+ * expat treats the hook like free(), for which a NULL argument is a
+ * no-op, so NULL is filtered out here rather than passed on to pfree().
+ */
+static void pgxml_pfree(void *ptr)
+{
+  if (ptr == NULL)
+    return;
+
+  /* a void function must not "return" an expression, so just call it */
+  pfree(ptr);
+}
+
+/* Point the file-level expat memory suite at the three wrappers above. */
+static void pgxml_mhs_init() 
+{
+  mhs.free_fcn = pgxml_pfree;
+  mhs.realloc_fcn = pgxml_repalloc;
+  mhs.malloc_fcn = pgxml_palloc;
+}
+
+/*
+ * Placeholder for installing parser handlers chosen by the user.  The
+ * body is empty for now: how such settings would be supplied has not
+ * been worked out yet.
+ */
+static void pgxml_handler_init()
+{
+  /* This code should set up the relevant handlers from  user-supplied
+     settings. Quite how these settings are made is another matter :) */
+}
+
+/* Returns true if document is well-formed */
+
+PG_FUNCTION_INFO_V1(pgxml_parse);
+
+/*
+ * SQL-callable as pgxml_parse(document).
+ *
+ * Runs the whole document through a fresh expat parser and reports
+ * whether the parse succeeded.
+ */
+Datum
+pgxml_parse(PG_FUNCTION_ARGS)
+{
+  text *doc = PG_GETARG_TEXT_P(0);
+  int32 doclen = VARSIZE(doc) - VARHDRSZ;
+  XML_Parser parser;
+  bool wellformed;
+
+  pgxml_mhs_init();
+  pgxml_handler_init();
+
+  parser = XML_ParserCreate_MM(NULL,&mhs,NULL);
+  if (parser == NULL) {
+    elog(ERROR, "pgxml: Could not create expat parser");
+    PG_RETURN_NULL(); /* seems appropriate if we couldn't parse */
+  }
+
+  /* the document is handed over in a single, final chunk */
+  wellformed = (XML_Parse(parser, (char *)VARDATA(doc), doclen, 1) != 0);
+
+  /* the parser is released on both outcomes before the verdict is returned */
+  XML_ParserFree(parser);
+
+  PG_RETURN_BOOL(wellformed);
+}
+
+/* XPath handling functions */
+
+/* XPath support here is for a very skeletal kind of XPath!
+   It was easy to program though... */
+
+/* This first is the core function that builds a result set. The 
+   actual functions called by the user manipulate that result set
+   in various ways.
+*/
+
+/*
+ * Parse doc with expat and collect the character data of every element
+ * whose path matches pathstr.
+ *
+ * Returns a palloc'd XPath_Results whose results[] entries point into
+ * its resbuf; the caller is expected to pfree resbuf and the struct.
+ * Returns NULL when the document fails to parse, in which case every
+ * allocation made here has already been released.
+ */
+static XPath_Results *build_xpath_results(text *doc, text *pathstr)
+{
+  XPath_Results *xpr;
+  char *res;
+  pgxml_udata *udata;
+  XML_Parser p;
+  int32 docsize;
+
+  xpr = (XPath_Results *) palloc((sizeof(XPath_Results)));
+  memset((void *)xpr, 0, sizeof(XPath_Results));
+  xpr->rescount=0;
+
+  docsize=VARSIZE(doc)-VARHDRSZ;
+
+  /* res isn't going to be the real return type, it is just a buffer
+     that the character-data handler copies grabbed text into */
+
+  res = (char *) palloc(docsize);
+  memset((void *)res, 0, docsize);
+
+  xpr->resbuf = res;
+
+  /* per-parse state shared with the expat handlers through user data */
+  udata = (pgxml_udata *) palloc((sizeof(pgxml_udata)));
+  memset((void *)udata,0,sizeof(pgxml_udata));
+
+  udata->currentpath[0]='\0';
+  udata->textgrab=0;
+
+  /* NUL-terminated C-string copy of the requested path */
+  udata->path= (char *) palloc(VARSIZE(pathstr));
+  memcpy(udata->path, VARDATA(pathstr), VARSIZE(pathstr)-VARHDRSZ);
+
+  udata->path[VARSIZE(pathstr)-VARHDRSZ]='\0';
+
+  udata->resptr = res;
+  udata->reslen = 0;
+
+  udata->xpres = xpr;
+
+  /* Now fire up the parser */
+  pgxml_mhs_init();
+
+  p = XML_ParserCreate_MM(NULL,&mhs,NULL);
+  if (! p) {
+    elog(ERROR, "pgxml: Could not create expat parser");
+    pfree(xpr);
+    pfree(udata->path);
+    pfree(udata);
+    pfree(res);
+    return NULL;
+  }
+  XML_SetUserData(p, (void *)udata);
+
+  /* Set the handlers */
+
+  XML_SetElementHandler(p, pgxml_starthandler, pgxml_endhandler);
+  XML_SetCharacterDataHandler(p, pgxml_charhandler);
+
+  if (! XML_Parse(p, (char *)VARDATA(doc) , docsize, 1)) {
+    /*     elog(NOTICE, "Parse error at line %d:%s",
+	    XML_GetCurrentLineNumber(p),
+	    XML_ErrorString(XML_GetErrorCode(p))); */
+    XML_ParserFree(p);
+    pfree(xpr);
+    pfree(udata->path);
+    pfree(udata);
+    /* the scratch buffer has no owner once xpr is gone, so release it
+       here just as the parser-creation failure path does */
+    pfree(res);
+
+    return NULL;
+  }
+
+  pfree(udata->path);
+  pfree(udata);
+  XML_ParserFree(p);
+  return xpr;
+}
+
+
+PG_FUNCTION_INFO_V1(pgxml_xpath);
+
+/*
+ * SQL-callable as pgxml_xpath(document, pathstr, n).
+ *
+ * Returns the character data of the n-th (counting from 1) element that
+ * matches pathstr as a new text value.  Returns NULL when the document
+ * cannot be parsed or when n does not name an existing match, which
+ * includes n less than 1.
+ */
+Datum
+pgxml_xpath(PG_FUNCTION_ARGS)
+{
+  /* called as pgxml_xpath(document,pathstr, index) for the moment*/
+
+  XPath_Results *xpresults;
+  text *restext;
+
+  text *t = PG_GETARG_TEXT_P(0); /*document buffer */
+  text *t2= PG_GETARG_TEXT_P(1);
+  int32 ind = PG_GETARG_INT32(2) - 1; /* convert to a 0-based index */
+
+  xpresults = build_xpath_results(t,t2);
+
+  /* This needs to be changed depending on the mechanism for returning
+     our set of results. */
+
+  if (xpresults==NULL)   /*parse error (not WF or parser failure) */
+    {
+      PG_RETURN_NULL();
+    }
+
+  /* reject indexes on either side of the collected results; a negative
+     index would otherwise read in front of the result arrays */
+  if (ind < 0 || ind >= (xpresults->rescount)) 
+    {
+      pfree(xpresults->resbuf);
+      pfree(xpresults);
+      PG_RETURN_NULL();
+    }
+  
+  /* copy the chosen slice out of the shared buffer into its own varlena */
+  restext = (text *) palloc(xpresults->reslens[ind]+VARHDRSZ);
+  memcpy(VARDATA(restext),xpresults->results[ind],xpresults->reslens[ind]);
+
+  VARATT_SIZEP(restext) = xpresults->reslens[ind]+VARHDRSZ;
+
+  pfree(xpresults->resbuf);
+  pfree(xpresults);
+
+  PG_RETURN_TEXT_P(restext);
+}
+
+
+/*
+ * Compare the path of the currently open elements with the requested
+ * path and update the text-grabbing state accordingly.
+ *
+ * If the requested path does not occur in currentpath at all, a grab in
+ * progress is ended and its text is recorded as a result.  If it occurs
+ * as the tail of currentpath, grabbing is switched on provided the match
+ * is anchored: at the very start for a path beginning with '/',
+ * otherwise directly after a '/' separator.
+ */
+static void pgxml_pathcompare(void *userData)
+{
+  char  *matchpos;
+
+  matchpos=strstr(UD->currentpath, UD->path);
+
+  if (matchpos == NULL) { /* Should we have more logic here ? */
+    if (UD->textgrab) {
+      UD->textgrab=0;
+      pgxml_finalisegrabbedtext(userData);
+    }
+    return;
+  }
+  /* OK, we have a match of some sort. Now we need to check that
+     our match is anchored to the *end* of the string AND
+     that it is immediately preceded by a '/'*/
+  /* This test wouldn't work if strlen (UD->path) overran the length
+     of the currentpath, but that's not possible because we got a match! */
+
+  if ((matchpos + strlen(UD->path))[0]=='\0') 
+    {
+      if ((UD->path)[0]=='/') {
+	if (matchpos == UD->currentpath) {
+	  UD->textgrab=1;
+	}
+      } else {
+	/* Only inspect the preceding character when there is one: a match
+	   at the very start of currentpath has nothing in front of it, and
+	   reading there would step outside the buffer. */
+	if (matchpos > UD->currentpath && (matchpos-1)[0]=='/') {
+	  UD->textgrab=1;
+	}
+      }
+    }
+}
+
+/*
+ * expat start-tag callback: push the element name onto currentpath and,
+ * unless text is already being grabbed, test the new path for a match.
+ * The attribute list is not examined.
+ */
+static void pgxml_starthandler(void *userData, const XML_Char *name,
+			const XML_Char **atts)
+{
+  if ((strlen(name)+strlen(UD->currentpath)) <= MAXPATHLENGTH-2) {
+    /* room for the separator, the name and the terminating NUL */
+    strcat(UD->currentpath, "/");
+    strcat(UD->currentpath, name);
+  } else {
+    elog(NOTICE,"Path too long");
+  }
+
+  /* While grabbing, nested elements are not compared again.  Depending
+     on user preference, should we "reconstitute" the element into the
+     result text instead? */
+  if (!UD->textgrab) {
+    pgxml_pathcompare(userData);
+  }
+}
+
+/*
+ * expat end-tag callback: pop the closing element off currentpath and,
+ * if text is being grabbed, re-check whether the grab should continue.
+ */
+static void pgxml_endhandler(void *userData, const XML_Char *name)
+{
+  /* the last separator marks where the innermost element name begins */
+  char *lastsep = strrchr(UD->currentpath,'/');
+
+  if (lastsep == NULL) {
+    elog(ERROR,"There's a problem...");
+    lastsep = UD->currentpath;
+  }
+
+  if (strcmp(name, lastsep+1) == 0) {
+    *lastsep = '\0'; /* drop that element from the end of the path */
+  } else {
+    /* unmatched entry: report it and leave the path untouched */
+    elog(NOTICE,"Wanted [%s], got [%s]",lastsep,name);
+  }
+
+  if (UD->textgrab) {
+    pgxml_pathcompare(userData);
+  }
+}
+
+/*
+ * expat character-data callback: while grabbing is on, append the chunk
+ * to the result buffer and advance the write position and length.
+ * NOTE(review): no capacity check is made here; the buffer's size is not
+ * available in pgxml_udata - confirm that callers size it sufficiently.
+ */
+static void pgxml_charhandler(void *userData, const XML_Char *s, int len)
+{
+  if (!UD->textgrab || len <= 0) {
+    return;
+  }
+
+  memcpy(UD->resptr,s,len);
+  UD->resptr += len;
+  UD->reslen += len;
+}
+/* Should I be using PG list types here? */
+
+/*
+ * Record the text grabbed so far as one finished result.
+ *
+ * The text sits in the reslen bytes just below resptr.  Its start and
+ * length are stored in the next free slot of the result set and the
+ * running length is reset for the next grab.  Once all MAXRESULTS slots
+ * are taken, further results are dropped with a notice instead of being
+ * written past the end of the arrays.
+ */
+static void pgxml_finalisegrabbedtext(void *userData)
+{
+  XPath_Results *xpr = UD->xpres;
+
+  if (xpr->rescount >= MAXRESULTS) {
+    elog(NOTICE,"pgxml: more than %d results, ignoring the rest",MAXRESULTS);
+    UD->reslen=0;
+    return;
+  }
+
+  /* In res/reslen, we have a single result. */
+  xpr->results[xpr->rescount]= UD->resptr - UD->reslen;
+  xpr->reslens[xpr->rescount]= UD->reslen;
+  UD->reslen=0;
+  xpr->rescount++;
+
+  /* This effectively concatenates all the results together but we
+     do know where one ends and the next begins */
+}
+
+
+
--- a/contrib/xml/pgxml.h
+++ b/contrib/xml/pgxml.h
+/* Header for pg xml parser interface */
+
+/* Memory-management wrappers installed into the expat memory suite */
+static void *pgxml_palloc(size_t size);
+static void *pgxml_repalloc(void *ptr, size_t size);
+static void pgxml_pfree(void *ptr);
+/* One-time setup helpers called before a parser is created */
+static void pgxml_mhs_init();
+static void pgxml_handler_init();
+/* SQL-callable entry points */
+Datum pgxml_parse(PG_FUNCTION_ARGS);
+Datum pgxml_xpath(PG_FUNCTION_ARGS);
+/* expat callbacks for start tags, end tags and character data */
+static void pgxml_starthandler(void *userData, const XML_Char *name,
+			const XML_Char **atts);
+static void pgxml_endhandler(void *userData, const XML_Char *name);
+static void pgxml_charhandler(void *userData, const XML_Char *s, int len);
+/* Path matching and result bookkeeping used by the callbacks */
+static void pgxml_pathcompare(void *userData);
+static void pgxml_finalisegrabbedtext(void *userData);
+
+/* Size in bytes of the currentpath buffer in pgxml_udata */
+#define MAXPATHLENGTH 512
+/* Number of slots in the results[] and reslens[] arrays of XPath_Results */
+#define MAXRESULTS 100
+
+
+/* Set of matches produced by one path query over one document */
+typedef struct {
+  int rescount;              /* number of slots filled in the two arrays */
+  char *results[MAXRESULTS]; /* start of each result's text */
+  int32 reslens[MAXRESULTS]; /* length in bytes of each result's text */
+  char *resbuf; /* pointer to the result buffer for pfree */
+} XPath_Results;
+
+
+
+/* Per-parse state handed to the expat callbacks as their user data */
+typedef struct {
+  char currentpath[MAXPATHLENGTH]; /* '/'-separated names of the open elements */
+  char *path;           /* NUL-terminated path being searched for */
+  int textgrab;         /* nonzero while character data is being collected */
+  char *resptr;         /* next write position in the result buffer */
+  int32 reslen;         /* bytes collected for the result in progress */
+  XPath_Results *xpres; /* result set being filled in */
+} pgxml_udata;
+
+
+/* Shorthand for the callbacks: views their "userData" argument as the
+   per-parse state.  Only usable where a variable of that name is in scope. */
+#define UD ((pgxml_udata *) userData)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/contrib/xml/pgxml.source
+++ b/contrib/xml/pgxml.source
+--SQL for XML parser
+
+-- Well-formedness check, called as pgxml_parse(document).
+-- The underscore-delimited placeholders in the AS clause are filled in
+-- when this template is turned into the final SQL script.
+CREATE FUNCTION pgxml_parse(text) RETURNS bool
+	AS '_OBJWD_/pgxml_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict);
+
+-- Simple path query, called as pgxml_xpath(document, path, n); loaded
+-- from the same shared object as the function above.
+CREATE FUNCTION pgxml_xpath(text,text,int) RETURNS text
+	AS '_OBJWD_/pgxml_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict);
\ No newline at end of file
--- a/contrib/xml/xpath-yacc
+++ b/contrib/xml/xpath-yacc
+
+/* Terminal symbols of the XPath grammar.  This file contains only the
+   grammar; a lexer delivering these tokens has to be supplied separately. */
+%token QNAME
+%token NAME_COLON_STAR
+%token DOT
+%token DOT_DOT
+%token AT
+%token AXIS_NAME
+%token FUNCTION_NAME
+%token COMMENT
+%token PI
+%token TEXT
+%token NODE
+%token STAR
+%token LPAR
+%token RPAR
+%token LSQB
+%token RSQB
+%token LITERAL
+%token NUMBER
+%token COLON_COLON
+%token DOLLAR_QNAME
+%token SLASH
+%token SLASH_SLASH
+%token VBAR
+%token COMMA
+%token PLUS
+%token MINUS
+%token EQUALS
+%token GT
+%token LT
+%token GTE
+%token LTE
+%token MULTIPLY
+%token AND
+%token OR
+%token MOD
+%token DIV
+%token QUO
+
+%%
+
+/* Start symbol.  Operator precedence is expressed by rule nesting, from
+   loosest to tightest binding: or, and, equality, relational, additive,
+   multiplicative.  Every binary rule is left-recursive. */
+expr :
+  or_expr
+  ;
+
+or_expr :
+  and_expr
+  | or_expr OR and_expr
+  ;
+
+and_expr :
+  equality_expr
+  | and_expr AND equality_expr
+  ;
+
+/* Only EQUALS is handled here; there is no alternative for inequality. */
+equality_expr :
+  relational_expr
+  | equality_expr EQUALS relational_expr
+  ;
+
+relational_expr :
+  additive_expr
+  | relational_expr LT additive_expr
+  | relational_expr GT additive_expr
+  | relational_expr LTE additive_expr
+  | relational_expr GTE additive_expr
+  ;
+
+additive_expr :
+  multiplicative_expr
+  | additive_expr PLUS multiplicative_expr
+  | additive_expr MINUS multiplicative_expr
+  ;
+
+multiplicative_expr :
+  unary_expr
+  | multiplicative_expr MULTIPLY unary_expr
+  | multiplicative_expr DIV unary_expr
+  | multiplicative_expr MOD unary_expr
+  ;
+
+/* Unary minus, which may be stacked.  It uses the MINUS token, the same
+   one additive_expr uses for subtraction, so that a lexer reporting
+   every '-' as MINUS can reach this rule. */
+unary_expr :
+  union_expr
+  | MINUS unary_expr
+  ;
+
+/* One or more path expressions joined by VBAR. */
+union_expr :
+  path_expr
+  | union_expr VBAR path_expr
+  ;
+
+/* Either a plain location path, or a primary expression with optional
+   predicates that may be continued by a relative path. */
+path_expr :
+  location_path
+  | primary_expr predicates segment
+  ;
+
+/* Optional continuation of a filtered expression. */
+segment :
+  /* empty */
+  | SLASH relative_location_path
+  | SLASH_SLASH relative_location_path
+  ;
+
+location_path :
+  relative_location_path
+  | absolute_location_path
+  ;
+
+/* A lone SLASH is accepted as a complete absolute path. */
+absolute_location_path :
+  SLASH
+  | SLASH relative_location_path
+  | SLASH_SLASH relative_location_path
+  ;
+
+/* Steps separated by SLASH or SLASH_SLASH, built up left-recursively. */
+relative_location_path :
+  step
+  | relative_location_path SLASH step
+  | relative_location_path SLASH_SLASH step
+  ;
+
+/* A single step: optional axis, a node test and any number of
+   predicates, or one of the DOT / DOT_DOT abbreviations. */
+step :
+  axis node_test predicates
+  | DOT
+  | DOT_DOT
+  ;
+
+/* The axis may be omitted, named explicitly, or abbreviated with AT. */
+axis:
+  /* empty */
+  | AXIS_NAME COLON_COLON
+  | AT
+  ;
+
+/* Zero or more bracketed expressions. */
+predicates :
+  /* empty */
+  | predicates LSQB expr RSQB
+  ;
+
+/* Atoms of an expression: variable reference, parenthesised expression,
+   literal, number or function call. */
+primary_expr :
+  DOLLAR_QNAME
+  | LPAR expr RPAR
+  | LITERAL
+  | NUMBER
+  | function_call
+  ;
+
+function_call :
+  FUNCTION_NAME LPAR opt_args RPAR
+  ;
+
+/* The argument list may be empty. */
+opt_args :
+  /* empty */
+  | args
+  ;
+
+/* One or more expressions separated by COMMA. */
+args :
+  expr
+  | args COMMA expr
+  ;
+  
+/* Name tests (QNAME, STAR, NAME_COLON_STAR) and the node-type tests
+   written with parentheses; only the PI test takes an optional literal. */
+node_test :
+  QNAME
+  | STAR
+  | NAME_COLON_STAR
+  | PI LPAR opt_literal RPAR
+  | COMMENT LPAR RPAR
+  | TEXT LPAR RPAR
+  | NODE LPAR RPAR
+  ;
+
+opt_literal :
+  /* empty */
+  | LITERAL
+  ;
+
+
+
+
+
+