From 7646b18d64b6c739d04ca453493070e88c4aab13 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Sat, 20 Apr 2002 06:41:40 +0000 Subject: [PATCH] another entity processing update from Markus Henke Daniel * tree.c: another entity processing update from Markus Henke Daniel --- ChangeLog | 4 + include/libxml/xmlautomata.h | 10 + tree.c | 417 ++++++++++++++++++++--------------- xmlregexp.c | 113 ++++++++++ xmlschemas.c | 22 +- 5 files changed, 393 insertions(+), 173 deletions(-) diff --git a/ChangeLog b/ChangeLog index 87789fcc..2569fb50 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Fri Apr 19 18:26:04 CEST 2002 Daniel Veillard + + * tree.c: another entity processing update from Markus Henke + Fri Apr 19 17:14:24 CEST 2002 Bjorn Reese * trionan.c: fixed crash on OSF/1 diff --git a/include/libxml/xmlautomata.h b/include/libxml/xmlautomata.h index 853acc4f..c7d5b057 100644 --- a/include/libxml/xmlautomata.h +++ b/include/libxml/xmlautomata.h @@ -60,6 +60,16 @@ xmlAutomataStatePtr xmlAutomataNewCountTrans(xmlAutomataPtr am, int min, int max, void *data); +xmlAutomataStatePtr xmlAutomataNewOnceTrans (xmlAutomataPtr am, + xmlAutomataStatePtr from, + xmlAutomataStatePtr to, + const xmlChar *token, + int min, + int max, + void *data); +xmlAutomataStatePtr xmlAutomataNewAllTrans (xmlAutomataPtr am, + xmlAutomataStatePtr from, + xmlAutomataStatePtr to); xmlAutomataStatePtr xmlAutomataNewEpsilon (xmlAutomataPtr am, xmlAutomataStatePtr from, xmlAutomataStatePtr to); diff --git a/tree.c b/tree.c index 086cb162..e5a4958c 100644 --- a/tree.c +++ b/tree.c @@ -902,56 +902,73 @@ xmlStringGetNodeList(xmlDocPtr doc, const xmlChar *value) { * Returns a pointer to the string copy, the caller must free it. 
*/ xmlChar * -xmlNodeListGetString(xmlDocPtr doc, xmlNodePtr list, int inLine) { +xmlNodeListGetString(xmlDocPtr doc, xmlNodePtr list, int inLine) +{ xmlNodePtr node = list; xmlChar *ret = NULL; xmlEntityPtr ent; - if (list == NULL) return(NULL); + if (list == NULL) + return (NULL); while (node != NULL) { if ((node->type == XML_TEXT_NODE) || - (node->type == XML_CDATA_SECTION_NODE)) { - if (inLine) { - ret = xmlStrcat(ret, node->content); - } else { - xmlChar *buffer; - - buffer = xmlEncodeEntitiesReentrant(doc, node->content); - if (buffer != NULL) { - ret = xmlStrcat(ret, buffer); - xmlFree(buffer); - } - } - } else if (node->type == XML_ENTITY_REF_NODE) { - if (inLine) { - ent = xmlGetDocEntity(doc, node->name); - if (ent != NULL) - ret = xmlStrcat(ret, ent->content); - else { - ret = xmlStrcat(ret, node->content); - } + (node->type == XML_CDATA_SECTION_NODE)) { + if (inLine) { + ret = xmlStrcat(ret, node->content); } else { - xmlChar buf[2]; - buf[0] = '&'; buf[1] = 0; - ret = xmlStrncat(ret, buf, 1); - ret = xmlStrcat(ret, node->name); - buf[0] = ';'; buf[1] = 0; - ret = xmlStrncat(ret, buf, 1); - } - } -#if 0 - else { - xmlGenericError(xmlGenericErrorContext, - "xmlGetNodeListString : invalid node type %d\n", - node->type); - } -#endif - node = node->next; - } - return(ret); -} + xmlChar *buffer; + buffer = xmlEncodeEntitiesReentrant(doc, node->content); + if (buffer != NULL) { + ret = xmlStrcat(ret, buffer); + xmlFree(buffer); + } + } + } else if (node->type == XML_ENTITY_REF_NODE) { + if (inLine) { + ent = xmlGetDocEntity(doc, node->name); + if (ent != NULL) { + xmlChar *buffer; + + /* an entity content can be any "well balanced chunk", + * i.e. the result of the content [43] production: + * http://www.w3.org/TR/REC-xml#NT-content. + * So it can contain text, CDATA section or nested + * entity reference nodes (among others). 
+ * -> we recursive call xmlNodeListGetString() + * which handles these types */ + buffer = xmlNodeListGetString(doc, ent->children, 1); + if (buffer != NULL) { + ret = xmlStrcat(ret, buffer); + xmlFree(buffer); + } + } else { + ret = xmlStrcat(ret, node->content); + } + } else { + xmlChar buf[2]; + + buf[0] = '&'; + buf[1] = 0; + ret = xmlStrncat(ret, buf, 1); + ret = xmlStrcat(ret, node->name); + buf[0] = ';'; + buf[1] = 0; + ret = xmlStrncat(ret, buf, 1); + } + } +#if 0 + else { + xmlGenericError(xmlGenericErrorContext, + "xmlGetNodeListString : invalid node type %d\n", + node->type); + } +#endif + node = node->next; + } + return (ret); +} /** * xmlNodeListGetRawString: * @doc: the document @@ -965,54 +982,73 @@ xmlNodeListGetString(xmlDocPtr doc, xmlNodePtr list, int inLine) { * Returns a pointer to the string copy, the caller must free it. */ xmlChar * -xmlNodeListGetRawString(xmlDocPtr doc, xmlNodePtr list, int inLine) { +xmlNodeListGetRawString(xmlDocPtr doc, xmlNodePtr list, int inLine) +{ xmlNodePtr node = list; xmlChar *ret = NULL; xmlEntityPtr ent; - if (list == NULL) return(NULL); + if (list == NULL) + return (NULL); while (node != NULL) { if ((node->type == XML_TEXT_NODE) || - (node->type == XML_CDATA_SECTION_NODE)) { - if (inLine) { - ret = xmlStrcat(ret, node->content); - } else { - xmlChar *buffer; - - buffer = xmlEncodeSpecialChars(doc, node->content); - if (buffer != NULL) { - ret = xmlStrcat(ret, buffer); - xmlFree(buffer); - } - } - } else if (node->type == XML_ENTITY_REF_NODE) { - if (inLine) { - ent = xmlGetDocEntity(doc, node->name); - if (ent != NULL) - ret = xmlStrcat(ret, ent->content); - else { - ret = xmlStrcat(ret, node->content); - } + (node->type == XML_CDATA_SECTION_NODE)) { + if (inLine) { + ret = xmlStrcat(ret, node->content); } else { - xmlChar buf[2]; - buf[0] = '&'; buf[1] = 0; - ret = xmlStrncat(ret, buf, 1); - ret = xmlStrcat(ret, node->name); - buf[0] = ';'; buf[1] = 0; - ret = xmlStrncat(ret, buf, 1); - } - } + xmlChar 
*buffer; + + buffer = xmlEncodeSpecialChars(doc, node->content); + if (buffer != NULL) { + ret = xmlStrcat(ret, buffer); + xmlFree(buffer); + } + } + } else if (node->type == XML_ENTITY_REF_NODE) { + if (inLine) { + ent = xmlGetDocEntity(doc, node->name); + if (ent != NULL) { + xmlChar *buffer; + + /* an entity content can be any "well balanced chunk", + * i.e. the result of the content [43] production: + * http://www.w3.org/TR/REC-xml#NT-content. + * So it can contain text, CDATA section or nested + * entity reference nodes (among others). + * -> we recursive call xmlNodeListGetRawString() + * which handles these types */ + buffer = + xmlNodeListGetRawString(doc, ent->children, 1); + if (buffer != NULL) { + ret = xmlStrcat(ret, buffer); + xmlFree(buffer); + } + } else { + ret = xmlStrcat(ret, node->content); + } + } else { + xmlChar buf[2]; + + buf[0] = '&'; + buf[1] = 0; + ret = xmlStrncat(ret, buf, 1); + ret = xmlStrcat(ret, node->name); + buf[0] = ';'; + buf[1] = 0; + ret = xmlStrncat(ret, buf, 1); + } + } #if 0 - else { - xmlGenericError(xmlGenericErrorContext, - "xmlGetNodeListString : invalid node type %d\n", - node->type); - } + else { + xmlGenericError(xmlGenericErrorContext, + "xmlGetNodeListString : invalid node type %d\n", + node->type); + } #endif - node = node->next; + node = node->next; } - return(ret); + return (ret); } /** @@ -3763,122 +3799,159 @@ xmlNodeGetBase(xmlDocPtr doc, xmlNodePtr cur) { * It's up to the caller to free the memory. 
*/ xmlChar * -xmlNodeGetContent(xmlNodePtr cur) { - if (cur == NULL) return(NULL); +xmlNodeGetContent(xmlNodePtr cur) +{ + if (cur == NULL) + return (NULL); switch (cur->type) { case XML_DOCUMENT_FRAG_NODE: - case XML_ELEMENT_NODE: { - xmlNodePtr tmp = cur; - xmlBufferPtr buffer; - xmlChar *ret; + case XML_ELEMENT_NODE:{ + xmlNodePtr tmp = cur; + xmlBufferPtr buffer; + xmlChar *ret; - buffer = xmlBufferCreate(); - if (buffer == NULL) - return(NULL); - while (tmp != NULL) { - switch (tmp->type) { - case XML_CDATA_SECTION_NODE: - case XML_TEXT_NODE: - if (tmp->content != NULL) - xmlBufferCat(buffer, tmp->content); - break; - case XML_ENTITY_REF_NODE: { - xmlEntityPtr ent; + buffer = xmlBufferCreate(); + if (buffer == NULL) + return (NULL); + while (tmp != NULL) { + switch (tmp->type) { + case XML_CDATA_SECTION_NODE: + case XML_TEXT_NODE: + if (tmp->content != NULL) + xmlBufferCat(buffer, tmp->content); + break; + case XML_ENTITY_REF_NODE:{ + /* recursive substitution of entity references */ + xmlChar *cont = xmlNodeGetContent(tmp); - ent = xmlGetDocEntity(cur->doc, tmp->name); - if (ent != NULL) - xmlBufferCat(buffer, ent->content); - } - default: - break; - } - /* - * Skip to next node - */ - if (tmp->children != NULL) { - if (tmp->children->type != XML_ENTITY_DECL) { - tmp = tmp->children; - continue; - } - } - if (tmp == cur) - break; + if (cont) { + xmlBufferCat(buffer, + (const xmlChar *) cont); + xmlFree(cont); + } + break; + } + default: + break; + } + /* + * Skip to next node + */ + if (tmp->children != NULL) { + if (tmp->children->type != XML_ENTITY_DECL) { + tmp = tmp->children; + continue; + } + } + if (tmp == cur) + break; - if (tmp->next != NULL) { - tmp = tmp->next; - continue; - } - - do { - tmp = tmp->parent; - if (tmp == NULL) - break; - if (tmp == cur) { - tmp = NULL; - break; - } - if (tmp->next != NULL) { - tmp = tmp->next; - break; - } - } while (tmp != NULL); - } - ret = buffer->content; - buffer->content = NULL; - xmlBufferFree(buffer); - 
return(ret); - } - case XML_ATTRIBUTE_NODE: { - xmlAttrPtr attr = (xmlAttrPtr) cur; - if (attr->parent != NULL) - return(xmlNodeListGetString(attr->parent->doc, attr->children, 1)); - else - return(xmlNodeListGetString(NULL, attr->children, 1)); - break; - } + if (tmp->next != NULL) { + tmp = tmp->next; + continue; + } + + do { + tmp = tmp->parent; + if (tmp == NULL) + break; + if (tmp == cur) { + tmp = NULL; + break; + } + if (tmp->next != NULL) { + tmp = tmp->next; + break; + } + } while (tmp != NULL); + } + ret = buffer->content; + buffer->content = NULL; + xmlBufferFree(buffer); + return (ret); + } + case XML_ATTRIBUTE_NODE:{ + xmlAttrPtr attr = (xmlAttrPtr) cur; + + if (attr->parent != NULL) + return (xmlNodeListGetString + (attr->parent->doc, attr->children, 1)); + else + return (xmlNodeListGetString(NULL, attr->children, 1)); + break; + } case XML_COMMENT_NODE: case XML_PI_NODE: - if (cur->content != NULL) - return(xmlStrdup(cur->content)); - return(NULL); - case XML_ENTITY_REF_NODE: - /* - * Locate the entity, and get it's content - * @@@ - */ - return(NULL); + if (cur->content != NULL) + return (xmlStrdup(cur->content)); + return (NULL); + case XML_ENTITY_REF_NODE:{ + xmlEntityPtr ent; + xmlNodePtr tmp; + xmlBufferPtr buffer; + xmlChar *ret; + + /* lookup entity declaration */ + ent = xmlGetDocEntity(cur->doc, cur->name); + if (ent == NULL) + return (NULL); + + buffer = xmlBufferCreate(); + if (buffer == NULL) + return (NULL); + + /* an entity content can be any "well balanced chunk", + * i.e. 
the result of the content [43] production: + * http://www.w3.org/TR/REC-xml#NT-content + * -> we iterate through child nodes and recursive call + * xmlNodeGetContent() which handles all possible node types */ + tmp = ent->children; + while (tmp) { + xmlChar *cont = xmlNodeGetContent(tmp); + + if (cont) { + xmlBufferCat(buffer, (const xmlChar *) cont); + xmlFree(cont); + } + tmp = tmp->next; + } + + ret = buffer->content; + buffer->content = NULL; + xmlBufferFree(buffer); + return (ret); + } case XML_ENTITY_NODE: case XML_DOCUMENT_NODE: case XML_HTML_DOCUMENT_NODE: case XML_DOCUMENT_TYPE_NODE: case XML_NOTATION_NODE: case XML_DTD_NODE: - case XML_XINCLUDE_START: - case XML_XINCLUDE_END: + case XML_XINCLUDE_START: + case XML_XINCLUDE_END: #ifdef LIBXML_DOCB_ENABLED - case XML_DOCB_DOCUMENT_NODE: + case XML_DOCB_DOCUMENT_NODE: #endif - return(NULL); - case XML_NAMESPACE_DECL: - return(xmlStrdup(((xmlNsPtr)cur)->href)); + return (NULL); + case XML_NAMESPACE_DECL: + return (xmlStrdup(((xmlNsPtr) cur)->href)); case XML_ELEMENT_DECL: - /* TODO !!! */ - return(NULL); + /* TODO !!! */ + return (NULL); case XML_ATTRIBUTE_DECL: - /* TODO !!! */ - return(NULL); + /* TODO !!! */ + return (NULL); case XML_ENTITY_DECL: - /* TODO !!! */ - return(NULL); + /* TODO !!! 
*/ + return (NULL); case XML_CDATA_SECTION_NODE: case XML_TEXT_NODE: - if (cur->content != NULL) - return(xmlStrdup(cur->content)); - return(NULL); + if (cur->content != NULL) + return (xmlStrdup(cur->content)); + return (NULL); } - return(NULL); + return (NULL); } - /** * xmlNodeSetContent: * @cur: the node being modified diff --git a/xmlregexp.c b/xmlregexp.c index 31657ea8..5d63c90b 100644 --- a/xmlregexp.c +++ b/xmlregexp.c @@ -109,6 +109,8 @@ typedef enum { XML_REGEXP_QUANT_OPT, XML_REGEXP_QUANT_MULT, XML_REGEXP_QUANT_PLUS, + XML_REGEXP_QUANT_ONCEONLY, + XML_REGEXP_QUANT_ALL, XML_REGEXP_QUANT_RANGE } xmlRegQuantType; @@ -279,6 +281,8 @@ struct _xmlRegExecCtxt { }; +#define REGEXP_ALL_COUNTER 0x123456 + static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); /************************************************************************ @@ -630,6 +634,10 @@ xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) { fprintf(output, "+ "); break; case XML_REGEXP_QUANT_RANGE: fprintf(output, "range "); break; + case XML_REGEXP_QUANT_ONCEONLY: + fprintf(output, "onceonly "); break; + case XML_REGEXP_QUANT_ALL: + fprintf(output, "all "); break; } } static void @@ -942,6 +950,24 @@ xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) { ctxt->states[ctxt->nbStates++] = state; } +/** + * xmlFAGenerateAllTransition: + * ctxt: a regexp parser context + * from: the from state + * to: the target state or NULL for building a new one + * + */ +static void +xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt, + xmlRegStatePtr from, xmlRegStatePtr to) { + if (to == NULL) { + to = xmlRegNewState(ctxt); + xmlRegStatePush(ctxt, to); + ctxt->state = to; + } + xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER); +} + /** * xmlFAGenerateEpsilonTransition: * ctxt: a regexp parser context @@ -3423,6 +3449,69 @@ xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, return(to); } +/** + * xmlAutomataNewOnceTrans: + * @am: an automata + * 
@from: the starting point of the transition + * @to: the target point of the transition or NULL + * @token: the input string associated to that transition + * @min: the minimum successive occurrences of token + * @max: the maximum successive occurrences of token + * + * If @to is NULL, this first creates a new target state in the automata + * and then adds a transition from the @from state to the target state + * activated by a succession of input of value @token and whose number + * is between @min and @max, moreover that transition can only be crossed + * once. + * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, + xmlAutomataStatePtr to, const xmlChar *token, + int min, int max, void *data) { + xmlRegAtomPtr atom; + int counter; + + if ((am == NULL) || (from == NULL) || (token == NULL)) + return(NULL); + if (min < 1) + return(NULL); + if ((max < min) || (max < 1)) + return(NULL); + atom = xmlRegNewAtom(am, XML_REGEXP_STRING); + if (atom == NULL) + return(NULL); + atom->valuep = xmlStrdup(token); + atom->data = data; + atom->quant = XML_REGEXP_QUANT_ONCEONLY; + if (min == 0) + atom->min = 1; + else + atom->min = min; + atom->max = max; + /* + * associate a counter to the transition. 
+ */ + counter = xmlRegGetCounter(am); + am->counters[counter].min = 1; + am->counters[counter].max = 1; + + /* xmlFAGenerateTransitions(am, from, to, atom); */ + if (to == NULL) { + to = xmlRegNewState(am); + xmlRegStatePush(am, to); + } + xmlRegStateAddTrans(am, from, atom, to, counter, -1); + xmlRegAtomPush(am, atom); + am->state = to; + if (to == NULL) + to = am->state; + if (to == NULL) + return(NULL); + return(to); +} + /** * xmlAutomataNewState: * @am: an automata @@ -3465,6 +3554,30 @@ xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from, return(to); } +/** + * xmlAutomataNewAllTrans: + * @am: an automata + * @from: the starting point of the transition + * @to: the target point of the transition or NULL + * + * If @to is NULL, this first creates a new target state in the automata + * and then adds an ALL transition from the @from state to the + * target state. That transition is an epsilon transition allowed only when + * all transitions from the @from node have been activated. 
+ * + * Returns the target state or NULL in case of error + */ +xmlAutomataStatePtr +xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from, + xmlAutomataStatePtr to) { + if ((am == NULL) || (from == NULL)) + return(NULL); + xmlFAGenerateAllTransition(am, from, to); + if (to == NULL) + return(am->state); + return(to); +} + /** * xmlAutomataNewCounter: * @am: an automata diff --git a/xmlschemas.c b/xmlschemas.c index 8cb9400b..19260637 100644 --- a/xmlschemas.c +++ b/xmlschemas.c @@ -2119,7 +2119,7 @@ xmlSchemaParseAll(xmlSchemaParserCtxtPtr ctxt, xmlSchemaPtr schema, if (type == NULL) return (NULL); type->node = node; - type->type = XML_SCHEMA_TYPE_SEQUENCE; + type->type = XML_SCHEMA_TYPE_ALL; type->id = xmlGetProp(node, BAD_CAST "id"); type->minOccurs = xmlGetMinOccurs(ctxt, node); type->maxOccurs = xmlGetMaxOccurs(ctxt, node); @@ -3037,6 +3037,26 @@ xmlSchemaBuildAContentModel(xmlSchemaTypePtr type, break; } case XML_SCHEMA_TYPE_ALL: { + xmlAutomataStatePtr end; + xmlAutomataStatePtr start; + xmlSchemaTypePtr subtypes; + xmlSchemaElementPtr elem = (xmlSchemaElementPtr) type; + + subtypes = type->subtypes; + if (subtypes == NULL) + break; + start = ctxt->state; + while (subtypes != NULL) { + ctxt->state = start; + elem = (xmlSchemaElementPtr) subtypes; + + /* TODO : handle the namespace too */ + xmlAutomataNewOnceTrans(ctxt->am, ctxt->state, ctxt->state, + elem->name, elem->minOccurs, elem->maxOccurs, + subtypes); + subtypes = subtypes->next; + } + ctxt->state = xmlAutomataNewAllTrans(ctxt->am, ctxt->state, NULL); TODO break; }