/* * parser.c : an XML 1.0 non-verifying parser * * See Copyright for the status of this software. * * Daniel.Veillard@w3.org */ #ifdef WIN32 #define HAVE_FCNTL_H #include #else #include #endif #include #include #include /* for memset() only */ #include #include #ifdef HAVE_FCNTL_H #include #endif #ifdef HAVE_UNISTD_H #include #endif #ifdef HAVE_ZLIB_H #include #endif #include "tree.h" #include "parser.h" #include "entities.h" #include "encoding.h" #include "valid.h" #include "parserInternals.h" const char *xmlParserVersion = LIBXML_VERSION; /************************************************************************ * * * Parser stacks related functions and macros * * * ************************************************************************/ int xmlSubstituteEntitiesDefaultValue = 0; /* * Generic function for accessing stacks in the Parser Context */ #define PUSH_AND_POP(type, name) \ extern int name##Push(xmlParserCtxtPtr ctxt, type value) { \ if (ctxt->name##Nr >= ctxt->name##Max) { \ ctxt->name##Max *= 2; \ ctxt->name##Tab = (void *) realloc(ctxt->name##Tab, \ ctxt->name##Max * sizeof(ctxt->name##Tab[0])); \ if (ctxt->name##Tab == NULL) { \ fprintf(stderr, "realloc failed !\n"); \ exit(1); \ } \ } \ ctxt->name##Tab[ctxt->name##Nr] = value; \ ctxt->name = value; \ return(ctxt->name##Nr++); \ } \ extern type name##Pop(xmlParserCtxtPtr ctxt) { \ type ret; \ if (ctxt->name##Nr <= 0) return(0); \ ctxt->name##Nr--; \ if (ctxt->name##Nr > 0) \ ctxt->name = ctxt->name##Tab[ctxt->name##Nr - 1]; \ else \ ctxt->name = NULL; \ ret = ctxt->name##Tab[ctxt->name##Nr]; \ ctxt->name##Tab[ctxt->name##Nr] = 0; \ return(ret); \ } \ PUSH_AND_POP(xmlParserInputPtr, input) PUSH_AND_POP(xmlNodePtr, node) /* * Macros for accessing the content. Those should be used only by the parser, * and not exported. * * Dirty macros, i.e. one need to make assumption on the context to use them * * CUR_PTR return the current pointer to the CHAR to be parsed. * CUR returns the current CHAR value, i.e. 
a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next CHAR. Same as CUR is should be used only
 *           to compare on ASCII based substring.
 *   SKIP(n) Skip n CHAR, and must also be used only to skip ASCII defined
 *           strings within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *           It returns the pointer to the current CHAR.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */

/*
 * Every macro below expands to an expression on a variable named `ctxt`,
 * so they can only be used where such a parser context is in scope.
 * SKIP() moves the cursor without touching the line/column counters.
 */
#define CUR (*ctxt->input->cur)
#define SKIP(val) ctxt->input->cur += (val)
#define NXT(val) ctxt->input->cur[(val)]
#define CUR_PTR ctxt->input->cur

#define SKIP_BLANKS 							\
    while (IS_BLANK(*(ctxt->input->cur))) NEXT

#ifndef USE_UTF_8
#define CURRENT (*ctxt->input->cur)
/*
 * NEXT, non UTF-8 flavour:
 *  - on a non-zero char: update line/col ('\n' bumps line and resets col
 *    to 1, anything else bumps col), then post-increment the cursor, so
 *    the expression evaluates to the position of the char just skipped;
 *  - on a zero char: call xmlPopInput() and evaluate to the cursor of
 *    whatever input is current afterwards.
 */
#define NEXT ((*ctxt->input->cur) ?					\
                (((*(ctxt->input->cur) == '\n') ?			\
		    (ctxt->input->line++, ctxt->input->col = 1) :	\
		    (ctxt->input->col++)), ctxt->input->cur++) :	\
		(xmlPopInput(ctxt), ctxt->input->cur))
#else
/* No UTF-8 specific definitions of CURRENT / NEXT are provided here. */
#endif


/**
 * xmlPopInput:
 * @ctxt:  an XML parser context
 *
 * xmlPopInput: the current input pointed by ctxt->input came to an end
 *          pop it and return the next char.
 *
 * When only one input is left on the stack (ctxt->inputNr == 1) nothing
 * is popped and 0 is returned. Otherwise the top input is removed with
 * inputPop() and released with xmlFreeInputStream().
 *
 * Returns the current CHAR in the parser context
 */
CHAR
xmlPopInput(xmlParserCtxtPtr ctxt) {
    if (ctxt->inputNr == 1) return(0); /* End of main Input */
    /* the popped stream is freed here, it must not be referenced anymore */
    xmlFreeInputStream(inputPop(ctxt));
    return(CUR);
}

/**
 * xmlPushInput:
 * @ctxt:  an XML parser context
 * @input:  an XML parser input fragment (entity, XML fragment ...).
* * xmlPushInput: switch to a new input stream which is stacked on top * of the previous one(s). */ void xmlPushInput(xmlParserCtxtPtr ctxt, xmlParserInputPtr input) { if (input == NULL) return; inputPush(ctxt, input); } /** * xmlFreeInputStream: * @input: an xmlParserInputPtr * * Free up an input stream. */ void xmlFreeInputStream(xmlParserInputPtr input) { if (input == NULL) return; if (input->filename != NULL) free((char *) input->filename); if ((input->free != NULL) && (input->base != NULL)) input->free((char *) input->base); memset(input, -1, sizeof(xmlParserInput)); free(input); } /** * xmlNewEntityInputStream: * @ctxt: an XML parser context * @entity: an Entity pointer * * Create a new input stream based on an xmlEntityPtr * Returns the new input stream */ xmlParserInputPtr xmlNewEntityInputStream(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) { xmlParserInputPtr input; if (entity == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "internal: xmlNewEntityInputStream entity = NULL\n"); return(NULL); } if (entity->content == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "internal: xmlNewEntityInputStream entity->input = NULL\n"); return(NULL); } input = (xmlParserInputPtr) malloc(sizeof(xmlParserInput)); if (input == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "malloc: couldn't allocate a new input stream\n"); return(NULL); } input->filename = entity->SystemID; /* TODO !!! char <- CHAR */ input->base = entity->content; input->cur = entity->content; input->line = 1; input->col = 1; input->free = NULL; return(input); } /** * xmlNewStringInputStream: * @ctxt: an XML parser context * @entity: an Entity memory buffer * * Create a new input stream based on a memory buffer. 
* Returns the new input stream */ xmlParserInputPtr xmlNewStringInputStream(xmlParserCtxtPtr ctxt, CHAR *entity) { xmlParserInputPtr input; if (entity == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "internal: xmlNewStringInputStream string = NULL\n"); return(NULL); } input = (xmlParserInputPtr) malloc(sizeof(xmlParserInput)); if (input == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "malloc: couldn't allocate a new input stream\n"); return(NULL); } input->filename = NULL; input->base = entity; input->cur = entity; input->line = 1; input->col = 1; input->free = NULL; return(input); } /** * xmlNewInputFromFile: * @ctxt: an XML parser context * @filename: the filename to use as entity * * Create a new input stream based on a file. * * Returns the new input stream or NULL in case of error */ xmlParserInputPtr xmlNewInputFromFile(xmlParserCtxtPtr ctxt, const char *filename) { #ifdef HAVE_ZLIB_H gzFile input; #else int input; #endif int res; int len; int cnt; struct stat buf; char *buffer, *nbuf; xmlParserInputPtr inputStream; /* xmlCharEncoding enc; */ #define MINLEN 40000 if (strcmp(filename,"-") == 0) { #ifdef HAVE_ZLIB_H input = gzdopen (fileno(stdin), "r"); if (input == NULL) { fprintf (stderr, "Cannot read from stdin\n"); perror ("gzdopen failed"); return(NULL); } #else #ifdef WIN32 input = -1; #else input = fileno(stdin); #endif if (input < 0) { fprintf (stderr, "Cannot read from stdin\n"); perror ("open failed"); return(NULL); } #endif len = MINLEN; } else { #ifdef HAVE_ZLIB_H input = gzopen (filename, "r"); if (input == NULL) { fprintf (stderr, "Cannot read file %s :\n", filename); perror ("gzopen failed"); return(NULL); } #else #ifdef WIN32 input = _open (filename, O_RDONLY | _O_BINARY); #else input = open (filename, O_RDONLY); #endif if (input < 0) { fprintf (stderr, "Cannot read file %s :\n", filename); perror ("open failed"); return(NULL); } #endif res = 
stat(filename, &buf); if (res < 0) return(NULL); len = buf.st_size; if (len < MINLEN) len = MINLEN; } buffer = (char *)malloc((len+1)*sizeof(char)); if (buffer == NULL) { fprintf (stderr, "Cannot malloc\n"); perror ("malloc failed"); return(NULL); } cnt = 0; #ifdef HAVE_ZLIB_H while(!gzeof(input)) { #else while(1) { #endif if (cnt >= len) { len *= 2; nbuf = (char *)realloc(buffer,(len+1)*sizeof(char)); if (nbuf == NULL) { fprintf(stderr,"Cannot realloc\n"); free(buffer); perror ("realloc failed"); return(NULL); } buffer = nbuf; } #ifdef HAVE_ZLIB_H res = gzread(input, &buffer[cnt], len-cnt); #else res = read(input, &buffer[cnt], len-cnt); #endif if (res < 0) { fprintf (stderr, "Cannot read file %s :\n", filename); #ifdef HAVE_ZLIB_H perror ("gzread failed"); #else perror ("read failed"); #endif return(NULL); } if (res == 0) break; cnt += res; } #ifdef HAVE_ZLIB_H gzclose(input); #else close(input); #endif buffer[cnt] = '\0'; inputStream = (xmlParserInputPtr) malloc(sizeof(xmlParserInput)); if (inputStream == NULL) { perror("malloc"); free(ctxt); return(NULL); } inputStream->filename = strdup(filename); inputStream->line = 1; inputStream->col = 1; /* * plug some encoding conversion routines here. !!! enc = xmlDetectCharEncoding(buffer); xmlSwitchEncoding(ctxt, enc); */ inputStream->base = buffer; inputStream->cur = buffer; inputStream->free = (xmlParserInputDeallocate) free; return(inputStream); } /************************************************************************ * * * Commodity functions to handle entities * * * ************************************************************************/ /* * Macro used to grow the current buffer. 
*/ #define growBuffer(buffer) { \ buffer##_size *= 2; \ buffer = (CHAR *) realloc(buffer, buffer##_size * sizeof(CHAR)); \ if (buffer == NULL) { \ perror("realloc failed"); \ exit(1); \ } \ } /** * xmlDecodeEntities: * @ctxt: the parser context * @what: combination of XML_SUBSTITUTE_REF and XML_SUBSTITUTE_PEREF * @len: the len to decode (in bytes !), -1 for no size limit * @end: an end marker CHAR, 0 if none * @end2: an end marker CHAR, 0 if none * @end3: an end marker CHAR, 0 if none * * [67] Reference ::= EntityRef | CharRef * * [69] PEReference ::= '%' Name ';' * * Returns A newly allocated string with the substitution done. The caller * must deallocate it ! */ CHAR * xmlDecodeEntities(xmlParserCtxtPtr ctxt, int len, int what, CHAR end, CHAR end2, CHAR end3) { CHAR *buffer = NULL; int buffer_size = 0; CHAR *out = NULL; CHAR *cur = NULL; xmlEntityPtr ent; const CHAR *start = CUR_PTR; unsigned int max = (unsigned int) len; /* * allocate a translation buffer. */ buffer_size = 1000; buffer = (CHAR *) malloc(buffer_size * sizeof(CHAR)); if (buffer == NULL) { perror("xmlDecodeEntities: malloc failed"); return(NULL); } out = buffer; /* * Ok loop until we reach one of the ending char or a size limit. */ while ((CUR_PTR - start < max) && (CUR != end) && (CUR != end2) && (CUR != end3)) { if (CUR == '&' && (what & XML_SUBSTITUTE_REF)) { if (NXT(1) == '#') { int val = xmlParseCharRef(ctxt); /* TODO: invalid for UTF-8 variable encoding !!! */ *out++ = val; } else { ent = xmlParseEntityRef(ctxt); if (ent != NULL) { cur = ent->content; while (*cur != 0) { *out++ = *cur++; if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } } } } } else if (CUR == '%' && (what & XML_SUBSTITUTE_PEREF)) { /* * a PEReference induce to switch the entity flow, * we break here to flush the current set of chars * parsed if any. We will be called back later. 
*/ if (CUR_PTR != start) break; xmlParsePEReference(ctxt); /* * Pop-up of finished entities. */ while ((CUR == 0) && (ctxt->inputNr > 1)) xmlPopInput(ctxt); break; } else { /* TODO: invalid for UTF-8 , use COPY(out); */ *out++ = CUR; if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } NEXT; } } *out++ = 0; return(buffer); } /************************************************************************ * * * Commodity functions to handle encodings * * * ************************************************************************/ /** * xmlSwitchEncoding: * @ctxt: the parser context * @len: the len of @cur * * change the input functions when discovering the character encoding * of a given entity. * */ void xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) { switch (enc) { case XML_CHAR_ENCODING_ERROR: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "encoding unknown\n"); ctxt->wellFormed = 0; break; case XML_CHAR_ENCODING_NONE: /* let's assume it's UTF-8 without the XML decl */ return; case XML_CHAR_ENCODING_UTF8: /* default encoding, no conversion should be needed */ return; case XML_CHAR_ENCODING_UTF16LE: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UTF16 little endian not supported\n"); break; case XML_CHAR_ENCODING_UTF16BE: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UTF16 big endian not supported\n"); break; case XML_CHAR_ENCODING_UCS4LE: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding USC4 little endian not supported\n"); break; case XML_CHAR_ENCODING_UCS4BE: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding USC4 big endian not supported\n"); break; case XML_CHAR_ENCODING_EBCDIC: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) 
ctxt->sax->error(ctxt->userData, "char encoding EBCDIC not supported\n"); break; case XML_CHAR_ENCODING_UCS4_2143: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UCS4 2143 not supported\n"); break; case XML_CHAR_ENCODING_UCS4_3412: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UCS4 3412 not supported\n"); break; case XML_CHAR_ENCODING_UCS2: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding UCS2 not supported\n"); break; case XML_CHAR_ENCODING_8859_1: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_1 ISO Latin 1 not supported\n"); break; case XML_CHAR_ENCODING_8859_2: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_2 ISO Latin 2 not supported\n"); break; case XML_CHAR_ENCODING_8859_3: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_3 not supported\n"); break; case XML_CHAR_ENCODING_8859_4: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_4 not supported\n"); break; case XML_CHAR_ENCODING_8859_5: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_5 not supported\n"); break; case XML_CHAR_ENCODING_8859_6: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_6 not supported\n"); break; case XML_CHAR_ENCODING_8859_7: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_7 not supported\n"); break; case XML_CHAR_ENCODING_8859_8: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_8 not supported\n"); break; case 
XML_CHAR_ENCODING_8859_9: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO_8859_9 not supported\n"); break; case XML_CHAR_ENCODING_2022_JP: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding ISO-2022-JPnot supported\n"); break; case XML_CHAR_ENCODING_SHIFT_JIS: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding Shift_JISnot supported\n"); break; case XML_CHAR_ENCODING_EUC_JP: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "char encoding EUC-JPnot supported\n"); break; } } /************************************************************************ * * * Commodity functions to handle CHARs * * * ************************************************************************/ /** * xmlStrndup: * @cur: the input CHAR * * @len: the len of @cur * * a strndup for array of CHAR's * * Returns a new CHAR * or NULL */ CHAR * xmlStrndup(const CHAR *cur, int len) { CHAR *ret = malloc((len + 1) * sizeof(CHAR)); if (ret == NULL) { fprintf(stderr, "malloc of %ld byte failed\n", (len + 1) * (long)sizeof(CHAR)); return(NULL); } memcpy(ret, cur, len * sizeof(CHAR)); ret[len] = 0; return(ret); } /** * xmlStrdup: * @cur: the input CHAR * * * a strdup for array of CHAR's * * Returns a new CHAR * or NULL */ CHAR * xmlStrdup(const CHAR *cur) { const CHAR *p = cur; while (IS_CHAR(*p)) p++; return(xmlStrndup(cur, p - cur)); } /** * xmlCharStrndup: * @cur: the input char * * @len: the len of @cur * * a strndup for char's to CHAR's * * Returns a new CHAR * or NULL */ CHAR * xmlCharStrndup(const char *cur, int len) { int i; CHAR *ret = malloc((len + 1) * sizeof(CHAR)); if (ret == NULL) { fprintf(stderr, "malloc of %ld byte failed\n", (len + 1) * (long)sizeof(CHAR)); return(NULL); } for (i = 0;i < len;i++) ret[i] = (CHAR) cur[i]; ret[len] = 0; return(ret); } /** * xmlCharStrdup: * @cur: the input char * 
* @len: the len of @cur * * a strdup for char's to CHAR's * * Returns a new CHAR * or NULL */ CHAR * xmlCharStrdup(const char *cur) { const char *p = cur; while (*p != '\0') p++; return(xmlCharStrndup(cur, p - cur)); } /** * xmlStrcmp: * @str1: the first CHAR * * @str2: the second CHAR * * * a strcmp for CHAR's * * Returns the integer result of the comparison */ int xmlStrcmp(const CHAR *str1, const CHAR *str2) { register int tmp; do { tmp = *str1++ - *str2++; if (tmp != 0) return(tmp); } while ((*str1 != 0) && (*str2 != 0)); return (*str1 - *str2); } /** * xmlStrncmp: * @str1: the first CHAR * * @str2: the second CHAR * * @len: the max comparison length * * a strncmp for CHAR's * * Returns the integer result of the comparison */ int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) { register int tmp; if (len <= 0) return(0); do { tmp = *str1++ - *str2++; if (tmp != 0) return(tmp); len--; if (len <= 0) return(0); } while ((*str1 != 0) && (*str2 != 0)); return (*str1 - *str2); } /** * xmlStrchr: * @str: the CHAR * array * @val: the CHAR to search * * a strchr for CHAR's * * Returns the CHAR * for the first occurence or NULL. */ CHAR * xmlStrchr(const CHAR *str, CHAR val) { while (*str != 0) { if (*str == val) return((CHAR *) str); str++; } return(NULL); } /** * xmlStrlen: * @str: the CHAR * array * * lenght of a CHAR's string * * Returns the number of CHAR contained in the ARRAY. */ int xmlStrlen(const CHAR *str) { int len = 0; if (str == NULL) return(0); while (*str != 0) { str++; len++; } return(len); } /** * xmlStrncat: * @cur: the original CHAR * array * @add: the CHAR * array added * @len: the length of @add * * a strncat for array of CHAR's * * Returns a new CHAR * containing the concatenated string. 
*/ CHAR * xmlStrncat(CHAR *cur, const CHAR *add, int len) { int size; CHAR *ret; if ((add == NULL) || (len == 0)) return(cur); if (cur == NULL) return(xmlStrndup(add, len)); size = xmlStrlen(cur); ret = realloc(cur, (size + len + 1) * sizeof(CHAR)); if (ret == NULL) { fprintf(stderr, "xmlStrncat: realloc of %ld byte failed\n", (size + len + 1) * (long)sizeof(CHAR)); return(cur); } memcpy(&ret[size], add, len * sizeof(CHAR)); ret[size + len] = 0; return(ret); } /** * xmlStrcat: * @cur: the original CHAR * array * @add: the CHAR * array added * * a strcat for array of CHAR's * * Returns a new CHAR * containing the concatenated string. */ CHAR * xmlStrcat(CHAR *cur, const CHAR *add) { const CHAR *p = add; if (add == NULL) return(cur); if (cur == NULL) return(xmlStrdup(add)); while (IS_CHAR(*p)) p++; return(xmlStrncat(cur, add, p - add)); } /************************************************************************ * * * Commodity functions, cleanup needed ? * * * ************************************************************************/ /** * areBlanks: * @ctxt: an XML parser context * @str: a CHAR * * @len: the size of @str * * Is this a sequence of blank chars that one can ignore ? * * TODO: to be corrected accodingly to DTD information if available * * Returns 1 if ignorable 0 otherwise. */ static int areBlanks(xmlParserCtxtPtr ctxt, const CHAR *str, int len) { int i; xmlNodePtr lastChild; for (i = 0;i < len;i++) if (!(IS_BLANK(str[i]))) return(0); if (CUR != '<') return(0); if (ctxt->node == NULL) return(0); lastChild = xmlGetLastChild(ctxt->node); if (lastChild == NULL) { if (ctxt->node->content != NULL) return(0); } else if (xmlNodeIsText(lastChild)) return(0); return(1); } /** * xmlHandleEntity: * @ctxt: an XML parser context * @entity: an XML entity pointer. * * Default handling of defined entities, when should we define a new input * stream ? When do we just handle that as a set of chars ? 
* TODO: we should call the SAX handler here and have it resolve the issue */ void xmlHandleEntity(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) { int len; xmlParserInputPtr input; if (entity->content == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "xmlHandleEntity %s: content == NULL\n", entity->name); ctxt->wellFormed = 0; return; } len = xmlStrlen(entity->content); if (len <= 2) goto handle_as_char; /* * Redefine its content as an input stream. */ input = xmlNewEntityInputStream(ctxt, entity); xmlPushInput(ctxt, input); return; handle_as_char: /* * Just handle the content as a set of chars. */ if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) ctxt->sax->characters(ctxt->userData, entity->content, len); } /* * Forward definition for recusive behaviour. */ void xmlParsePEReference(xmlParserCtxtPtr ctxt); void xmlParseReference(xmlParserCtxtPtr ctxt); /************************************************************************ * * * Extra stuff for namespace support * * Relates to http://www.w3.org/TR/WD-xml-names * * * ************************************************************************/ /** * xmlNamespaceParseNCName: * @ctxt: an XML parser context * * parse an XML namespace name. * * [NS 3] NCName ::= (Letter | '_') (NCNameChar)* * * [NS 4] NCNameChar ::= Letter | Digit | '.' | '-' | '_' | * CombiningChar | Extender * * Returns the namespace name or NULL */ CHAR * xmlNamespaceParseNCName(xmlParserCtxtPtr ctxt) { const CHAR *q; CHAR *ret = NULL; if (!IS_LETTER(CUR) && (CUR != '_')) return(NULL); q = NEXT; while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || (CUR == '.') || (CUR == '-') || (CUR == '_') || (IS_COMBINING(CUR)) || (IS_EXTENDER(CUR))) NEXT; ret = xmlStrndup(q, CUR_PTR - q); return(ret); } /** * xmlNamespaceParseQName: * @ctxt: an XML parser context * @prefix: a CHAR ** * * parse an XML qualified name * * [NS 5] QName ::= (Prefix ':')? 
LocalPart * * [NS 6] Prefix ::= NCName * * [NS 7] LocalPart ::= NCName * * Returns the function returns the local part, and prefix is updated * to get the Prefix if any. */ CHAR * xmlNamespaceParseQName(xmlParserCtxtPtr ctxt, CHAR **prefix) { CHAR *ret = NULL; *prefix = NULL; ret = xmlNamespaceParseNCName(ctxt); if (CUR == ':') { *prefix = ret; NEXT; ret = xmlNamespaceParseNCName(ctxt); } return(ret); } /** * xmlSplitQName: * @name: an XML parser context * @prefix: a CHAR ** * * parse an XML qualified name string * * [NS 5] QName ::= (Prefix ':')? LocalPart * * [NS 6] Prefix ::= NCName * * [NS 7] LocalPart ::= NCName * * Returns the function returns the local part, and prefix is updated * to get the Prefix if any. */ CHAR * xmlSplitQName(const CHAR *name, CHAR **prefix) { CHAR *ret = NULL; const CHAR *q; const CHAR *cur = name; *prefix = NULL; if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL); q = cur++; while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) || (*cur == '.') || (*cur == '-') || (*cur == '_') || (IS_COMBINING(*cur)) || (IS_EXTENDER(*cur))) cur++; ret = xmlStrndup(q, cur - q); if (*cur == ':') { cur++; if (!IS_LETTER(*cur) && (*cur != '_')) return(ret); *prefix = ret; q = cur++; while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) || (*cur == '.') || (*cur == '-') || (*cur == '_') || (IS_COMBINING(*cur)) || (IS_EXTENDER(*cur))) cur++; ret = xmlStrndup(q, cur - q); } return(ret); } /** * xmlNamespaceParseNSDef: * @ctxt: an XML parser context * * parse a namespace prefix declaration * * [NS 1] NSDef ::= PrefixDef Eq SystemLiteral * * [NS 2] PrefixDef ::= 'xmlns' (':' NCName)? 
* * Returns the namespace name */ CHAR * xmlNamespaceParseNSDef(xmlParserCtxtPtr ctxt) { CHAR *name = NULL; if ((CUR == 'x') && (NXT(1) == 'm') && (NXT(2) == 'l') && (NXT(3) == 'n') && (NXT(4) == 's')) { SKIP(5); if (CUR == ':') { NEXT; name = xmlNamespaceParseNCName(ctxt); } } return(name); } /** * xmlParseQuotedString: * @ctxt: an XML parser context * * [OLD] Parse and return a string between quotes or doublequotes * * Returns the string parser or NULL. */ CHAR * xmlParseQuotedString(xmlParserCtxtPtr ctxt) { CHAR *ret = NULL; const CHAR *q; if (CUR == '"') { NEXT; q = CUR_PTR; while (IS_CHAR(CUR) && (CUR != '"')) NEXT; if (CUR != '"') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "String not closed \"%.50s\"\n", q); ctxt->wellFormed = 0; } else { ret = xmlStrndup(q, CUR_PTR - q); NEXT; } } else if (CUR == '\''){ NEXT; q = CUR_PTR; while (IS_CHAR(CUR) && (CUR != '\'')) NEXT; if (CUR != '\'') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "String not closed \"%.50s\"\n", q); ctxt->wellFormed = 0; } else { ret = xmlStrndup(q, CUR_PTR - q); NEXT; } } return(ret); } /** * xmlParseNamespace: * @ctxt: an XML parser context * * [OLD] xmlParseNamespace: parse specific PI '')) { /* * We can have "ns" or "prefix" attributes * Old encoding as 'href' or 'AS' attributes is still supported */ if ((CUR == 'n') && (NXT(1) == 's')) { garbage = 0; SKIP(2); SKIP_BLANKS; if (CUR != '=') continue; NEXT; SKIP_BLANKS; href = xmlParseQuotedString(ctxt); SKIP_BLANKS; } else if ((CUR == 'h') && (NXT(1) == 'r') && (NXT(2) == 'e') && (NXT(3) == 'f')) { garbage = 0; SKIP(4); SKIP_BLANKS; if (CUR != '=') continue; NEXT; SKIP_BLANKS; href = xmlParseQuotedString(ctxt); SKIP_BLANKS; } else if ((CUR == 'p') && (NXT(1) == 'r') && (NXT(2) == 'e') && (NXT(3) == 'f') && (NXT(4) == 'i') && (NXT(5) == 'x')) { garbage = 0; SKIP(6); SKIP_BLANKS; if (CUR != '=') continue; NEXT; SKIP_BLANKS; prefix = 
xmlParseQuotedString(ctxt); SKIP_BLANKS; } else if ((CUR == 'A') && (NXT(1) == 'S')) { garbage = 0; SKIP(2); SKIP_BLANKS; if (CUR != '=') continue; NEXT; SKIP_BLANKS; prefix = xmlParseQuotedString(ctxt); SKIP_BLANKS; } else if ((CUR == '?') && (NXT(1) == '>')) { garbage = 0; CUR_PTR ++; } else { /* * Found garbage when parsing the namespace */ if (!garbage) if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "xmlParseNamespace found garbage\n"); ctxt->wellFormed = 0; NEXT; } } MOVETO_ENDTAG(CUR_PTR); NEXT; /* * Register the DTD. if (href != NULL) if ((ctxt->sax != NULL) && (ctxt->sax->globalNamespace != NULL)) ctxt->sax->globalNamespace(ctxt->userData, href, prefix); */ if (prefix != NULL) free(prefix); if (href != NULL) free(href); } /************************************************************************ * * * The parser itself * * Relates to http://www.w3.org/TR/REC-xml * * * ************************************************************************/ /** * xmlParseName: * @ctxt: an XML parser context * * parse an XML name. * * [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | * CombiningChar | Extender * * [5] Name ::= (Letter | '_' | ':') (NameChar)* * * [6] Names ::= Name (S Name)* * * Returns the Name parsed or NULL */ CHAR * xmlParseName(xmlParserCtxtPtr ctxt) { const CHAR *q; CHAR *ret = NULL; if (!IS_LETTER(CUR) && (CUR != '_') && (CUR != ':')) return(NULL); q = NEXT; while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || (CUR == '.') || (CUR == '-') || (CUR == '_') || (CUR == ':') || (IS_COMBINING(CUR)) || (IS_EXTENDER(CUR))) NEXT; ret = xmlStrndup(q, CUR_PTR - q); return(ret); } /** * xmlParseNmtoken: * @ctxt: an XML parser context * * parse an XML Nmtoken. 
* * [7] Nmtoken ::= (NameChar)+ * * [8] Nmtokens ::= Nmtoken (S Nmtoken)* * * Returns the Nmtoken parsed or NULL */ CHAR * xmlParseNmtoken(xmlParserCtxtPtr ctxt) { const CHAR *q; CHAR *ret = NULL; q = NEXT; while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || (CUR == '.') || (CUR == '-') || (CUR == '_') || (CUR == ':') || (IS_COMBINING(CUR)) || (IS_EXTENDER(CUR))) NEXT; ret = xmlStrndup(q, CUR_PTR - q); return(ret); } /** * xmlParseEntityValue: * @ctxt: an XML parser context * @orig: if non-NULL store a copy of the original entity value * * parse a value for ENTITY decl. * * [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | * "'" ([^%&'] | PEReference | Reference)* "'" * * Returns the EntityValue parsed with reference substitued or NULL */ CHAR * xmlParseEntityValue(xmlParserCtxtPtr ctxt, CHAR **orig) { CHAR *ret = NULL; const CHAR *org = NULL; const CHAR *tst = NULL; const CHAR *temp = NULL; if (CUR == '"') { NEXT; org = CUR_PTR; while (CUR != '"') { tst = CUR_PTR; temp = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_BOTH, '"', 0, 0); if ((temp == NULL) && (tst == CUR_PTR)) break; ret = xmlStrcat(ret, temp); if (temp != NULL) free((char *)temp); } if (CUR != '"') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "EntityValue: \" expected\n"); ctxt->wellFormed = 0; } else { if (orig != NULL) *orig = xmlStrndup(org, CUR_PTR - org); NEXT; } } else if (CUR == '\'') { NEXT; org = CUR_PTR; while (CUR != '\'') { tst = CUR_PTR; temp = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_BOTH, '\'', 0, 0); if ((temp == NULL) && (tst == CUR_PTR)) break; ret = xmlStrcat(ret, temp); if (temp != NULL) free((char *)temp); } if (CUR != '\'') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "EntityValue: ' expected\n"); ctxt->wellFormed = 0; } else { if (orig != NULL) *orig = xmlStrndup(org, CUR_PTR - org); NEXT; } } else { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) 
ctxt->sax->error(ctxt->userData, "EntityValue: \" or ' expected\n"); ctxt->wellFormed = 0; } return(ret); } /** * xmlParseAttValue: * @ctxt: an XML parser context * * parse a value for an attribute * Note: the parser won't do substitution of entities here, this * will be handled later in xmlStringGetNodeList, unless it was * asked for ctxt->replaceEntities != 0 * * [10] AttValue ::= '"' ([^<&"] | Reference)* '"' | * "'" ([^<&'] | Reference)* "'" * * Returns the AttValue parsed or NULL. */ CHAR * xmlParseAttValue(xmlParserCtxtPtr ctxt) { CHAR *ret = NULL; if (CUR == '"') { NEXT; if (ctxt->replaceEntities != 0) ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_REF, '"', '<', 0); else ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_NONE, '"', '<', 0); if (CUR == '<') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Unescaped '<' not allowed in attributes values\n"); ctxt->wellFormed = 0; } if (CUR != '"') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n"); ctxt->wellFormed = 0; } else NEXT; } else if (CUR == '\'') { NEXT; if (ctxt->replaceEntities != 0) ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_REF, '\'', '<', 0); else ret = xmlDecodeEntities(ctxt, -1, XML_SUBSTITUTE_NONE, '\'', '<', 0); if (CUR == '<') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Unescaped '<' not allowed in attributes values\n"); ctxt->wellFormed = 0; } if (CUR != '\'') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "AttValue: ' expected\n"); ctxt->wellFormed = 0; } else NEXT; } else { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "AttValue: \" or ' expected\n"); ctxt->wellFormed = 0; } return(ret); } /** * xmlParseSystemLiteral: * @ctxt: an XML parser context * * parse an XML Literal * * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") * * Returns 
the SystemLiteral parsed or NULL
 */

CHAR *
xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) {
    const CHAR *first;
    CHAR *literal = NULL;
    CHAR delim;

    /* A system literal has to open with either kind of quote. */
    if ((CUR != '"') && (CUR != '\'')) {
        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
            ctxt->sax->error(ctxt->userData,
                             "SystemLiteral \" or ' expected\n");
        ctxt->wellFormed = 0;
        return(literal);
    }

    /* Whatever quote opened the literal is the one that must close it. */
    delim = CUR;
    NEXT;
    first = CUR_PTR;

    /* Advance over every legal character up to the closing delimiter. */
    while ((IS_CHAR(CUR)) && (CUR != delim))
        NEXT;

    if (IS_CHAR(CUR)) {
        /* Stopped on the delimiter: copy the content, then step over it. */
        literal = xmlStrndup(first, CUR_PTR - first);
        NEXT;
    } else {
        /* Stopped on a non-Char before any closing delimiter was seen. */
        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
            ctxt->sax->error(ctxt->userData, "Unfinished SystemLiteral\n");
        ctxt->wellFormed = 0;
    }

    return(literal);
}

/**
 * xmlParsePubidLiteral:
 * @ctxt:  an XML parser context
 *
 * parse an XML public literal
 *
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
 *
 * Returns the PubidLiteral parsed or NULL.
*/ CHAR * xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) { const CHAR *q; CHAR *ret = NULL; /* * Name ::= (Letter | '_') (NameChar)* */ if (CUR == '"') { NEXT; q = CUR_PTR; while (IS_PUBIDCHAR(CUR)) NEXT; if (CUR != '"') { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n"); ctxt->wellFormed = 0; } else { ret = xmlStrndup(q, CUR_PTR - q); NEXT; } } else if (CUR == '\'') { NEXT; q = CUR_PTR; while ((IS_LETTER(CUR)) && (CUR != '\'')) NEXT; if (!IS_LETTER(CUR)) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Unfinished PubidLiteral\n"); ctxt->wellFormed = 0; } else { ret = xmlStrndup(q, CUR_PTR - q); NEXT; } } else { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "SystemLiteral \" or ' expected\n"); ctxt->wellFormed = 0; } return(ret); } /** * xmlParseCharData: * @ctxt: an XML parser context * @cdata: int indicating whether we are within a CDATA section * * parse a CharData section. * if we are within a CDATA section ']]>' marks an end of section. * * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) */ void xmlParseCharData(xmlParserCtxtPtr ctxt, int cdata) { const CHAR *q; q = CUR_PTR; while ((IS_CHAR(CUR)) && (CUR != '<') && (CUR != '&')) { if ((CUR == ']') && (NXT(1) == ']') && (NXT(2) == '>')) { if (cdata) break; else { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Sequence ']]>' not allowed in content\n"); ctxt->wellFormed = 0; } } NEXT; } if (q == CUR_PTR) return; /* * Ok the segment [q CUR_PTR] is to be consumed as chars. 
*/ if (ctxt->sax != NULL) { if (areBlanks(ctxt, q, CUR_PTR - q)) { if (ctxt->sax->ignorableWhitespace != NULL) ctxt->sax->ignorableWhitespace(ctxt->userData, q, CUR_PTR - q); } else { if (ctxt->sax->characters != NULL) ctxt->sax->characters(ctxt->userData, q, CUR_PTR - q); } } } /** * xmlParseExternalID: * @ctxt: an XML parser context * @publicID: a CHAR** receiving PubidLiteral * @strict: indicate whether we should restrict parsing to only * production [75], see NOTE below * * Parse an External ID or a Public ID * * NOTE: Productions [75] and [83] interract badly since [75] can generate * 'PUBLIC' S PubidLiteral S SystemLiteral * * [75] ExternalID ::= 'SYSTEM' S SystemLiteral * | 'PUBLIC' S PubidLiteral S SystemLiteral * * [83] PublicID ::= 'PUBLIC' S PubidLiteral * * Returns the function returns SystemLiteral and in the second * case publicID receives PubidLiteral, is strict is off * it is possible to return NULL and have publicID set. */ CHAR * xmlParseExternalID(xmlParserCtxtPtr ctxt, CHAR **publicID, int strict) { CHAR *URI = NULL; if ((CUR == 'S') && (NXT(1) == 'Y') && (NXT(2) == 'S') && (NXT(3) == 'T') && (NXT(4) == 'E') && (NXT(5) == 'M')) { SKIP(6); if (!IS_BLANK(CUR)) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Space required after 'SYSTEM'\n"); ctxt->wellFormed = 0; } SKIP_BLANKS; URI = xmlParseSystemLiteral(ctxt); if (URI == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "xmlParseExternalID: SYSTEM, no URI\n"); ctxt->wellFormed = 0; } } else if ((CUR == 'P') && (NXT(1) == 'U') && (NXT(2) == 'B') && (NXT(3) == 'L') && (NXT(4) == 'I') && (NXT(5) == 'C')) { SKIP(6); if (!IS_BLANK(CUR)) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Space required after 'PUBLIC'\n"); ctxt->wellFormed = 0; } SKIP_BLANKS; *publicID = xmlParsePubidLiteral(ctxt); if (*publicID == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != 
NULL)) ctxt->sax->error(ctxt->userData, "xmlParseExternalID: PUBLIC, no Public Identifier\n"); ctxt->wellFormed = 0; } if (strict) { /* * We don't handle [83] so "S SystemLiteral" is required. */ if (!IS_BLANK(CUR)) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Space required after the Public Identifier\n"); ctxt->wellFormed = 0; } } else { /* * We handle [83] so we return immediately, if * "S SystemLiteral" is not detected. From a purely parsing * point of view that's a nice mess. */ const CHAR *ptr = CUR_PTR; if (!IS_BLANK(*ptr)) return(NULL); while (IS_BLANK(*ptr)) ptr++; if ((*ptr != '\'') || (*ptr != '"')) return(NULL); } SKIP_BLANKS; URI = xmlParseSystemLiteral(ctxt); if (URI == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "xmlParseExternalID: PUBLIC, no URI\n"); ctxt->wellFormed = 0; } } return(URI); } /** * xmlParseComment: * @ctxt: an XML parser context * @create: should we create a node, or just skip the content * * Skip an XML (SGML) comment * This may or may not create a node (depending on the context) * The spec says that "For compatibility, the string "--" (double-hyphen) * must not occur within comments. " * * [15] Comment ::= '' */ void xmlParseComment(xmlParserCtxtPtr ctxt, int create) { const CHAR *q, *start; const CHAR *r; CHAR *val; /* * Check that there is a comment right here. */ if ((CUR != '<') || (NXT(1) != '!') || (NXT(2) != '-') || (NXT(3) != '-')) return; SKIP(4); start = q = CUR_PTR; NEXT; r = CUR_PTR; NEXT; while (IS_CHAR(CUR) && ((CUR == ':') || (CUR != '>') || (*r != '-') || (*q != '-'))) { if ((*r == '-') && (*q == '-')) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Comment must not contain '--' (double-hyphen)`\n"); ctxt->wellFormed = 0; } NEXT;r++;q++; } if (!IS_CHAR(CUR)) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Comment not terminated \n