From 0ba4d5377c6c5bb904b73586a1468da58b31a911 Mon Sep 17 00:00:00 2001
From: Daniel Veillard <veillard@src.gnome.org>
Date: Sun, 1 Nov 1998 19:34:31 +0000
Subject: [PATCH] CharRef handling, comments, formatting, pre UTF-8 handling,
 Daniel.

---
 ChangeLog             |   6 ++
 encoding.c            |  11 ++
 entities.c            |  15 ++-
 include/libxml/tree.h |  10 +-
 parser.c              | 242 +++++++++++++++++++++++++++++-------------
 result/ent5           |   5 +
 test/ent5             |   5 +
 tree.h                |  10 +-
 8 files changed, 218 insertions(+), 86 deletions(-)
 create mode 100644 result/ent5
 create mode 100644 test/ent5

diff --git a/ChangeLog b/ChangeLog
index 09a9e910..272508a3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+Sun Nov  1 14:31:06 EST 1998 Daniel Veillard <Daniel.Veillard@w3.org>
+
+	* entities.c, parser.c: debug and cleanup of CharRef handling/saving.
+	  added ent5 test for this purpose.
+	* parser.c, parser.h: formatting, comments and UTF-8 planning.
+
 Fri Oct 30 01:36:52 EST 1998 Daniel Veillard <Daniel.Veillard@w3.org>
 
 	* parser.c: fixed? a strange error due to compression on a GWP
diff --git a/encoding.c b/encoding.c
index 77534193..ce86fff7 100644
--- a/encoding.c
+++ b/encoding.c
@@ -23,6 +23,17 @@
 
 #include "encoding.h"
 
+/*
+ * From rfc2044: encoding of the Unicode values on UTF-8:
+ *
+ * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
+ * 0000 0000-0000 007F   0xxxxxxx
+ * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
+ *
+ * I hope we won't use values > 0xFFFF anytime soon !
+ */
+
 /**
  * isolat1ToUTF8:
  * @out:  a pointer ot an array of bytes to store the result
diff --git a/entities.c b/entities.c
index 5d9d88a4..2875cb59 100644
--- a/entities.c
+++ b/entities.c
@@ -250,8 +250,10 @@ xmlEntityPtr xmlGetDocEntity(xmlDocPtr doc, const CHAR *name) {
 
 /*
  * xmlEncodeEntities : do a global encoding of a string, replacing the
- *                     basic content with their entities form.
- * TODO !!!! rewite !!!
+ *                     predefined entities and non ASCII values with their
+ *                     entities and CharRef counterparts.
+ * TODO !!!! Once moved to UTF-8 internal encoding, the encoding of non-ASCII
+ *           values will become erroneous.
  */
 CHAR *xmlEncodeEntities(xmlDocPtr doc, const CHAR *input) {
     const CHAR *cur = input;
@@ -276,8 +278,6 @@ CHAR *xmlEncodeEntities(xmlDocPtr doc, const CHAR *input) {
 
 	/*
 	 * By default one have to encode at least '<', '>', '"' and '&' !
-	 * One could try a better encoding using the entities defined and
-	 * used as a compression code !!!.
 	 */
 	if (*cur == '<') {
 	    *out++ = '&';
@@ -309,6 +309,13 @@ CHAR *xmlEncodeEntities(xmlDocPtr doc, const CHAR *input) {
 	    *out++ = 'o';
 	    *out++ = 's';
 	    *out++ = ';';
+#ifndef USE_UTF_8
+	} else if ((sizeof(CHAR) == 1) && (*cur >= 0x80)) {
+	    char buf[10], *ptr;
+	    snprintf(buf, 9, "&#%d;", *cur);
+            ptr = buf;
+	    while (*ptr != 0) *out++ = *ptr++;
+#endif
 	} else {
 	    /*
 	     * default case, just copy !
diff --git a/include/libxml/tree.h b/include/libxml/tree.h
index 66793bbc..e3764d8b 100644
--- a/include/libxml/tree.h
+++ b/include/libxml/tree.h
@@ -37,9 +37,13 @@ typedef enum {
 } xmlElementType;
 
 /*
- * Currently we use only 8bit chars internal representation, but
- * the parser is not tied to that, just define UNICODE to switch to
- * a 16 bits representation.
+ * Size of an internal character representation.
+ *
+ * Currently we use 8bit chars internal representation for memory efficiency,
+ * but the parser is not tied to that, just define UNICODE to switch to
+ * a 16 bits internal representation. Note that with 8 bits wide
+ * CHARs one can still use UTF-8 to handle correctly non ISO-Latin
+ * input.
  */
 #ifdef UNICODE
 typedef unsigned short CHAR;
diff --git a/parser.c b/parser.c
index 441b5d15..aa76cc66 100644
--- a/parser.c
+++ b/parser.c
@@ -68,27 +68,49 @@ type name##Pop(xmlParserCtxtPtr ctxt) {					\
 PUSH_AND_POP(xmlParserInputPtr, input)
 PUSH_AND_POP(xmlNodePtr, node)
 
-/*************
-#define CUR (*(ctxt->input->cur) ? *(ctxt->input->cur) : xmlPopInput(ctxt))
-#define NEXT (((*(ctxt->input->cur) == '\n') ?				\
-		(ctxt->input->line++, ctxt->input->col = 1) :		\
-		(ctxt->input->col++)), ctxt->input->cur++)
- *************/
+/*
+ * Macros for accessing the content. Those should be used only by the parser,
+ * and not exported.
+ *
+ * Dirty macros, i.e. one needs to make assumptions about the context to use them
+ *
+ *   CUR_PTR return the current pointer to the CHAR to be parsed.
+ *   CUR     returns the current CHAR value, i.e. a 8 bit value if compiled
+ *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
+ *           in UNICODE mode. This should be used internally by the parser
+ *           only to compare to ASCII values otherwise it would break when
+ *           running with UTF-8 encoding.
+ *   NXT(n)  returns the n'th next CHAR. Same as CUR, it should be used only
+ *           to compare against ASCII based substrings.
+ *   SKIP(n) Skip n CHAR, and must also be used only to skip ASCII defined
+ *           strings within the parser.
+ *
+ * Clean macros, not dependent on an ASCII context.
+ *
+ *   CURRENT Returns the current char value, with the full decoding of
+ *           UTF-8 if we are using this mode. It returns an int.
+ *   NEXT    Skip to the next character, this does the proper decoding
+ *           in UTF-8 mode. It also pops exhausted entity inputs on the fly.
+ *           It returns the pointer to the current CHAR.
+ */
 
 #define CUR (*ctxt->input->cur)
+#define SKIP(val) ctxt->input->cur += (val)
+#define NXT(val) ctxt->input->cur[(val)]
+#define CUR_PTR ctxt->input->cur
+
+#define SKIP_BLANKS 							\
+    while (IS_BLANK(*(ctxt->input->cur))) NEXT
+
+#ifndef USE_UTF_8
+#define CURRENT (*ctxt->input->cur)
 #define NEXT ((*ctxt->input->cur) ?					\
                 (((*(ctxt->input->cur) == '\n') ?			\
 		    (ctxt->input->line++, ctxt->input->col = 1) :	\
 		    (ctxt->input->col++)), ctxt->input->cur++) :	\
 		(xmlPopInput(ctxt), ctxt->input->cur))
-
-#define CUR_PTR ctxt->input->cur
-
-#define NXT(val) ctxt->input->cur[(val)]
-
-#define SKIP(val) ctxt->input->cur += (val)
-#define SKIP_BLANKS 							\
-    while (IS_BLANK(*(ctxt->input->cur))) NEXT
+#else
+#endif
 
 
 /**
@@ -101,7 +123,8 @@ PUSH_AND_POP(xmlNodePtr, node)
  * TODO A deallocation of the popped Input structure is needed
  * return values: the current CHAR in the parser context
  */
-CHAR xmlPopInput(xmlParserCtxtPtr ctxt) {
+CHAR
+xmlPopInput(xmlParserCtxtPtr ctxt) {
     if (ctxt->inputNr == 1) return(0); /* End of main Input */
     inputPop(ctxt);
     return(CUR);
@@ -115,7 +138,8 @@ CHAR xmlPopInput(xmlParserCtxtPtr ctxt) {
  * xmlPushInput: switch to a new input stream which is stacked on top
  *               of the previous one(s).
  */
-void xmlPushInput(xmlParserCtxtPtr ctxt, xmlParserInputPtr input) {
+void
+xmlPushInput(xmlParserCtxtPtr ctxt, xmlParserInputPtr input) {
     if (input == NULL) return;
     inputPush(ctxt, input);
 }
@@ -540,9 +564,10 @@ xmlNewEntityInputStream(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
 #define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))
 
 #else
+#ifndef USE_UTF_8
 /************************************************************************
  *									*
- * 8bits / ASCII version of the macros.					*
+ * 8bits / ISO-Latin version of the macros.				*
  *									*
  ************************************************************************/
 /*
@@ -589,6 +614,15 @@ xmlNewEntityInputStream(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
  */
 #define IS_EXTENDER(c) ((c) == 0xb7)
 
+#else /* USE_UTF_8 */
+/************************************************************************
+ *									*
+ * 8bits / UTF-8 version of the macros.					*
+ *									*
+ ************************************************************************/
+
+TODO !!!
+#endif /* USE_UTF_8 */
 #endif /* !UNICODE */
 
 /*
@@ -638,7 +672,8 @@ xmlNewEntityInputStream(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
  * return values: a new CHAR * or NULL
  */
 
-CHAR *xmlStrndup(const CHAR *cur, int len) {
+CHAR *
+xmlStrndup(const CHAR *cur, int len) {
     CHAR *ret = malloc((len + 1) * sizeof(CHAR));
 
     if (ret == NULL) {
@@ -659,7 +694,8 @@ CHAR *xmlStrndup(const CHAR *cur, int len) {
  * return values: a new CHAR * or NULL
  */
 
-CHAR *xmlStrdup(const CHAR *cur) {
+CHAR *
+xmlStrdup(const CHAR *cur) {
     const CHAR *p = cur;
 
     while (IS_CHAR(*p)) p++;
@@ -675,7 +711,8 @@ CHAR *xmlStrdup(const CHAR *cur) {
  * return values: a new CHAR * or NULL
  */
 
-CHAR *xmlCharStrndup(const char *cur, int len) {
+CHAR *
+xmlCharStrndup(const char *cur, int len) {
     int i;
     CHAR *ret = malloc((len + 1) * sizeof(CHAR));
 
@@ -699,7 +736,8 @@ CHAR *xmlCharStrndup(const char *cur, int len) {
  * return values: a new CHAR * or NULL
  */
 
-CHAR *xmlCharStrdup(const char *cur) {
+CHAR *
+xmlCharStrdup(const char *cur) {
     const char *p = cur;
 
     while (*p != '\0') p++;
@@ -715,7 +753,8 @@ CHAR *xmlCharStrdup(const char *cur) {
  * return values: the integer result of the comparison
  */
 
-int xmlStrcmp(const CHAR *str1, const CHAR *str2) {
+int
+xmlStrcmp(const CHAR *str1, const CHAR *str2) {
     register int tmp;
 
     do {
@@ -735,7 +774,8 @@ int xmlStrcmp(const CHAR *str1, const CHAR *str2) {
  * return values: the integer result of the comparison
  */
 
-int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
+int
+xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
     register int tmp;
 
     if (len <= 0) return(0);
@@ -757,7 +797,8 @@ int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
  * return values: the CHAR * for the first occurence or NULL.
  */
 
-CHAR *xmlStrchr(const CHAR *str, CHAR val) {
+CHAR *
+xmlStrchr(const CHAR *str, CHAR val) {
     while (*str != 0) {
         if (*str == val) return((CHAR *) str);
 	str++;
@@ -773,7 +814,8 @@ CHAR *xmlStrchr(const CHAR *str, CHAR val) {
  * return values: the number of CHAR contained in the ARRAY.
  */
 
-int xmlStrlen(const CHAR *str) {
+int
+xmlStrlen(const CHAR *str) {
     int len = 0;
 
     if (str == NULL) return(0);
@@ -794,7 +836,8 @@ int xmlStrlen(const CHAR *str) {
  * return values: a new CHAR * containing the concatenated string.
  */
 
-CHAR *xmlStrncat(CHAR *cur, const CHAR *add, int len) {
+CHAR *
+xmlStrncat(CHAR *cur, const CHAR *add, int len) {
     int size;
     CHAR *ret;
 
@@ -824,7 +867,8 @@ CHAR *xmlStrncat(CHAR *cur, const CHAR *add, int len) {
  * return values: a new CHAR * containing the concatenated string.
  */
 
-CHAR *xmlStrcat(CHAR *cur, const CHAR *add) {
+CHAR *
+xmlStrcat(CHAR *cur, const CHAR *add) {
     const CHAR *p = add;
 
     if (add == NULL) return(cur);
@@ -879,7 +923,8 @@ static int areBlanks(xmlParserCtxtPtr ctxt, const CHAR *str, int len) {
  * TODO: we should call the SAX handler here and have it resolve the issue
  */
 
-void xmlHandleEntity(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
+void
+xmlHandleEntity(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
     int len;
     xmlParserInputPtr input;
 
@@ -934,7 +979,8 @@ CHAR *xmlParseReference(xmlParserCtxtPtr ctxt);
  * return values: the namespace name or NULL
  */
 
-CHAR *xmlNamespaceParseNCName(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlNamespaceParseNCName(xmlParserCtxtPtr ctxt) {
     const CHAR *q;
     CHAR *ret = NULL;
 
@@ -969,7 +1015,8 @@ CHAR *xmlNamespaceParseNCName(xmlParserCtxtPtr ctxt) {
  *   to get the Prefix if any.
  */
 
-CHAR *xmlNamespaceParseQName(xmlParserCtxtPtr ctxt, CHAR **prefix) {
+CHAR *
+xmlNamespaceParseQName(xmlParserCtxtPtr ctxt, CHAR **prefix) {
     CHAR *ret = NULL;
 
     *prefix = NULL;
@@ -995,7 +1042,8 @@ CHAR *xmlNamespaceParseQName(xmlParserCtxtPtr ctxt, CHAR **prefix) {
  * return values: the namespace name
  */
 
-CHAR *xmlNamespaceParseNSDef(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlNamespaceParseNSDef(xmlParserCtxtPtr ctxt) {
     CHAR *name = NULL;
 
     if ((CUR == 'x') && (NXT(1) == 'm') &&
@@ -1017,7 +1065,8 @@ CHAR *xmlNamespaceParseNSDef(xmlParserCtxtPtr ctxt) {
  * [OLD] Parse and return a string between quotes or doublequotes
  * return values: the string parser or NULL.
  */
-CHAR *xmlParseQuotedString(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseQuotedString(xmlParserCtxtPtr ctxt) {
     CHAR *ret = NULL;
     const CHAR *q;
 
@@ -1056,7 +1105,8 @@ CHAR *xmlParseQuotedString(xmlParserCtxtPtr ctxt) {
  * if ot was declared on the root of the Tree:-(
  */
 
-void xmlParseNamespace(xmlParserCtxtPtr ctxt) {
+void
+xmlParseNamespace(xmlParserCtxtPtr ctxt) {
     CHAR *href = NULL;
     CHAR *prefix = NULL;
     int garbage = 0;
@@ -1166,7 +1216,8 @@ void xmlParseNamespace(xmlParserCtxtPtr ctxt) {
  * return values: the Name parsed or NULL
  */
 
-CHAR *xmlParseName(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseName(xmlParserCtxtPtr ctxt) {
     const CHAR *q;
     CHAR *ret = NULL;
 
@@ -1198,7 +1249,8 @@ CHAR *xmlParseName(xmlParserCtxtPtr ctxt) {
  * return values: the Nmtoken parsed or NULL
  */
 
-CHAR *xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
     const CHAR *q;
     CHAR *ret = NULL;
 
@@ -1227,7 +1279,8 @@ CHAR *xmlParseNmtoken(xmlParserCtxtPtr ctxt) {
  * return values: the EntityValue parsed or NULL
  */
 
-CHAR *xmlParseEntityValue(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseEntityValue(xmlParserCtxtPtr ctxt) {
     CHAR *ret = NULL, *cur;
     const CHAR *q;
 
@@ -1314,7 +1367,8 @@ CHAR *xmlParseEntityValue(xmlParserCtxtPtr ctxt) {
  * return values: the AttValue parsed or NULL.
  */
 
-CHAR *xmlParseAttValue(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseAttValue(xmlParserCtxtPtr ctxt) {
     CHAR *ret = NULL, *cur;
     const CHAR *q;
 
@@ -1414,7 +1468,8 @@ CHAR *xmlParseAttValue(xmlParserCtxtPtr ctxt) {
  * return values: the SystemLiteral parsed or NULL
  */
 
-CHAR *xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) {
     const CHAR *q;
     CHAR *ret = NULL;
 
@@ -1455,7 +1510,8 @@ CHAR *xmlParseSystemLiteral(xmlParserCtxtPtr ctxt) {
  * return values: the PubidLiteral parsed or NULL.
  */
 
-CHAR *xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) {
     const CHAR *q;
     CHAR *ret = NULL;
     /*
@@ -1501,7 +1557,8 @@ CHAR *xmlParsePubidLiteral(xmlParserCtxtPtr ctxt) {
  * return values: 
  */
 
-void xmlParseCharData(xmlParserCtxtPtr ctxt, int cdata) {
+void
+xmlParseCharData(xmlParserCtxtPtr ctxt, int cdata) {
     const CHAR *q;
 
     q = CUR_PTR;
@@ -1537,7 +1594,8 @@ void xmlParseCharData(xmlParserCtxtPtr ctxt, int cdata) {
  *                case publicID receives PubidLiteral
  */
 
-CHAR *xmlParseExternalID(xmlParserCtxtPtr ctxt, CHAR **publicID) {
+CHAR *
+xmlParseExternalID(xmlParserCtxtPtr ctxt, CHAR **publicID) {
     CHAR *URI = NULL;
 
     if ((CUR == 'S') && (NXT(1) == 'Y') &&
@@ -1630,7 +1688,8 @@ xmlNodePtr xmlParseComment(xmlParserCtxtPtr ctxt, int create) {
  * return values: the PITarget name or NULL
  */
 
-CHAR *xmlParsePITarget(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParsePITarget(xmlParserCtxtPtr ctxt) {
     CHAR *name;
 
     name = xmlParseName(ctxt);
@@ -1654,7 +1713,8 @@ CHAR *xmlParsePITarget(xmlParserCtxtPtr ctxt) {
  * return values: the PI name or NULL
  */
 
-void xmlParsePI(xmlParserCtxtPtr ctxt) {
+void
+xmlParsePI(xmlParserCtxtPtr ctxt) {
     CHAR *target;
 
     if ((CUR == '<') && (NXT(1) == '?')) {
@@ -1756,7 +1816,8 @@ void xmlParsePI(xmlParserCtxtPtr ctxt) {
  * TODO: no handling of the values parsed !
  */
 
-void xmlParseNotationDecl(xmlParserCtxtPtr ctxt) {
+void
+xmlParseNotationDecl(xmlParserCtxtPtr ctxt) {
     CHAR *name;
     
     if ((CUR == '<') && (NXT(1) == '!') &&
@@ -1803,7 +1864,8 @@ void xmlParseNotationDecl(xmlParserCtxtPtr ctxt) {
  * [76] NDataDecl ::= S 'NDATA' S Name
  */
 
-void xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
+void
+xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
     CHAR *name = NULL;
     CHAR *value = NULL;
     CHAR *URI = NULL, *literal = NULL;
@@ -1906,7 +1968,8 @@ void xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
  * TODO: not implemented !!!
  */
 
-void xmlParseEnumeratedType(xmlParserCtxtPtr ctxt, CHAR *name) {
+void
+xmlParseEnumeratedType(xmlParserCtxtPtr ctxt, CHAR *name) {
     /*
      * TODO !!!
      */
@@ -1930,7 +1993,8 @@ void xmlParseEnumeratedType(xmlParserCtxtPtr ctxt, CHAR *name) {
  *
  * TODO: not implemented !!!
  */
-void xmlParseAttributeType(xmlParserCtxtPtr ctxt, CHAR *name) {
+void
+xmlParseAttributeType(xmlParserCtxtPtr ctxt, CHAR *name) {
     /* TODO !!! */
     if ((CUR == 'C') && (NXT(1) == 'D') &&
         (NXT(2) == 'A') && (NXT(3) == 'T') &&
@@ -1981,7 +2045,8 @@ void xmlParseAttributeType(xmlParserCtxtPtr ctxt, CHAR *name) {
  *
  * TODO: not implemented !!!
  */
-void xmlParseAttributeListDecl(xmlParserCtxtPtr ctxt) {
+void
+xmlParseAttributeListDecl(xmlParserCtxtPtr ctxt) {
     CHAR *name;
 
     /* TODO !!! */
@@ -2041,7 +2106,8 @@ void xmlParseAttributeListDecl(xmlParserCtxtPtr ctxt) {
  * TODO: not implemented !!!
  */
 
-void xmlParseElementContentDecl(xmlParserCtxtPtr ctxt, CHAR *name) {
+void
+xmlParseElementContentDecl(xmlParserCtxtPtr ctxt, CHAR *name) {
     /*
      * TODO This has to be parsed correctly, currently we just skip until
      *      we reach the first '>'.
@@ -2063,7 +2129,8 @@ void xmlParseElementContentDecl(xmlParserCtxtPtr ctxt, CHAR *name) {
  *
  * TODO There is a check [ VC: Unique Element Type Declaration ]
  */
-void xmlParseElementDecl(xmlParserCtxtPtr ctxt) {
+void
+xmlParseElementDecl(xmlParserCtxtPtr ctxt) {
     CHAR *name;
 
     if ((CUR == '<') && (NXT(1) == '!') &&
@@ -2115,7 +2182,8 @@ void xmlParseElementDecl(xmlParserCtxtPtr ctxt) {
  *
  * TODO There is a check [ VC: Proper Declaration/PE Nesting ]
  */
-void xmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
+void
+xmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
     xmlParseElementDecl(ctxt);
     xmlParseAttributeListDecl(ctxt);
     xmlParseEntityDecl(ctxt);
@@ -2134,7 +2202,8 @@ void xmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
  *                  '&#x' [0-9a-fA-F]+ ';'
  * return values: the value parsed
  */
-CHAR *xmlParseCharRef(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseCharRef(xmlParserCtxtPtr ctxt) {
     int val = 0;
     CHAR buf[2];
 
@@ -2155,13 +2224,13 @@ CHAR *xmlParseCharRef(xmlParserCtxtPtr ctxt) {
 	    }
 	    NEXT;
 	}
-	if (CUR != ';')
+	if (CUR == ';')
 	    NEXT;
     } else if  ((CUR == '&') && (NXT(1) == '#')) {
 	SKIP(2);
 	while (CUR != ';') {
 	    if ((CUR >= '0') && (CUR <= '9')) 
-	        val = val * 16 + (CUR - '0');
+	        val = val * 10 + (CUR - '0');
 	    else {
 	        xmlParserError(ctxt, "xmlParseCharRef: invalid value\n");
 		val = 0;
@@ -2169,7 +2238,7 @@ CHAR *xmlParseCharRef(xmlParserCtxtPtr ctxt) {
 	    }
 	    NEXT;
 	}
-	if (CUR != ';')
+	if (CUR == ';')
 	    NEXT;
     } else {
 	xmlParserError(ctxt, "xmlParseCharRef: invalid value\n");
@@ -2196,7 +2265,8 @@ CHAR *xmlParseCharRef(xmlParserCtxtPtr ctxt) {
  * [68] EntityRef ::= '&' Name ';'
  * return values: the entity ref string or NULL if directly as input stream.
  */
-CHAR *xmlParseEntityRef(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseEntityRef(xmlParserCtxtPtr ctxt) {
     CHAR *ret = NULL;
     const CHAR *q;
     CHAR *name;
@@ -2247,7 +2317,8 @@ CHAR *xmlParseEntityRef(xmlParserCtxtPtr ctxt) {
  * return values: the entity string or NULL if handled directly by pushing
  *      the entity value as the input.
  */
-CHAR *xmlParseReference(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseReference(xmlParserCtxtPtr ctxt) {
     if ((CUR == '&') && (NXT(1) == '#')) {
         return(xmlParseCharRef(ctxt));
     } else if (CUR == '&') {
@@ -2265,7 +2336,8 @@ CHAR *xmlParseReference(xmlParserCtxtPtr ctxt) {
  * [69] PEReference ::= '%' Name ';'
  * return values: the entity content or NULL if handled directly.
  */
-CHAR *xmlParsePEReference(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParsePEReference(xmlParserCtxtPtr ctxt) {
     CHAR *ret = NULL;
     CHAR *name;
     xmlEntityPtr entity;
@@ -2310,7 +2382,8 @@ CHAR *xmlParsePEReference(xmlParserCtxtPtr ctxt) {
  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
  */
 
-void xmlParseDocTypeDecl(xmlParserCtxtPtr ctxt) {
+void
+xmlParseDocTypeDecl(xmlParserCtxtPtr ctxt) {
     xmlDtdPtr dtd;
     CHAR *name;
     CHAR *ExternalID = NULL;
@@ -2583,7 +2656,8 @@ xmlNodePtr xmlParseStartTag(xmlParserCtxtPtr ctxt) {
  * return values: tagPtr receive the tag name just read
  */
 
-void xmlParseEndTag(xmlParserCtxtPtr ctxt, xmlNsPtr *nsPtr, CHAR **tagPtr) {
+void
+xmlParseEndTag(xmlParserCtxtPtr ctxt, xmlNsPtr *nsPtr, CHAR **tagPtr) {
     CHAR *namespace, *name;
     xmlNsPtr ns = NULL;
 
@@ -2634,7 +2708,8 @@ void xmlParseEndTag(xmlParserCtxtPtr ctxt, xmlNsPtr *nsPtr, CHAR **tagPtr) {
  *
  * [21] CDEnd ::= ']]>'
  */
-void xmlParseCDSect(xmlParserCtxtPtr ctxt) {
+void
+xmlParseCDSect(xmlParserCtxtPtr ctxt) {
     const CHAR *r, *s, *base;
 
     if ((CUR == '<') && (NXT(1) == '!') &&
@@ -2685,7 +2760,8 @@ void xmlParseCDSect(xmlParserCtxtPtr ctxt) {
  * [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)*
  */
 
-void xmlParseContent(xmlParserCtxtPtr ctxt) {
+void
+xmlParseContent(xmlParserCtxtPtr ctxt) {
     xmlNodePtr ret = NULL;
 
     while ((CUR != '<') || (NXT(1) != '/')) {
@@ -2889,7 +2965,8 @@ xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt) {
  * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')+
  * return values: the string giving the XML version number, or NULL
  */
-CHAR *xmlParseVersionNum(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseVersionNum(xmlParserCtxtPtr ctxt) {
     const CHAR *q = CUR_PTR;
     CHAR *ret;
 
@@ -2916,7 +2993,8 @@ CHAR *xmlParseVersionNum(xmlParserCtxtPtr ctxt) {
  * return values: the version string, e.g. "1.0"
  */
 
-CHAR *xmlParseVersionInfo(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseVersionInfo(xmlParserCtxtPtr ctxt) {
     CHAR *version = NULL;
     const CHAR *q;
 
@@ -2965,7 +3043,8 @@ CHAR *xmlParseVersionInfo(xmlParserCtxtPtr ctxt) {
  *
  * return values: the encoding name value or NULL
  */
-CHAR *xmlParseEncName(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseEncName(xmlParserCtxtPtr ctxt) {
     const CHAR *q = CUR_PTR;
     CHAR *ret = NULL;
 
@@ -2997,7 +3076,8 @@ CHAR *xmlParseEncName(xmlParserCtxtPtr ctxt) {
  * return values: the encoding value or NULL
  */
 
-CHAR *xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
+CHAR *
+xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
     CHAR *encoding = NULL;
     const CHAR *q;
 
@@ -3048,7 +3128,8 @@ CHAR *xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
  * return values: 1 if standalone, 0 otherwise
  */
 
-int xmlParseSDDecl(xmlParserCtxtPtr ctxt) {
+int
+xmlParseSDDecl(xmlParserCtxtPtr ctxt) {
     int standalone = -1;
 
     SKIP_BLANKS;
@@ -3112,7 +3193,8 @@ int xmlParseSDDecl(xmlParserCtxtPtr ctxt) {
  * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
  */
 
-void xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
+void
+xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
     CHAR *version;
 
     /*
@@ -3164,7 +3246,8 @@ void xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
  * [27] Misc ::= Comment | PI |  S
  */
 
-void xmlParseMisc(xmlParserCtxtPtr ctxt) {
+void
+xmlParseMisc(xmlParserCtxtPtr ctxt) {
     while (((CUR == '<') && (NXT(1) == '?')) ||
            ((CUR == '<') && (NXT(1) == '!') &&
 	    (NXT(2) == '-') && (NXT(3) == '-')) ||
@@ -3193,7 +3276,8 @@ void xmlParseMisc(xmlParserCtxtPtr ctxt) {
  *                as a result of the parsing.
  */
 
-int xmlParseDocument(xmlParserCtxtPtr ctxt) {
+int
+xmlParseDocument(xmlParserCtxtPtr ctxt) {
     xmlDefaultSAXHandlerInit();
 
     /*
@@ -3518,7 +3602,8 @@ xmlDocPtr xmlParseMemory(char *buffer, int size) {
  * Initialize a parser context
  */
 
-void xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
+void
+xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
 {
   /* Allocate the Input stack */
   ctxt->inputTab = (xmlParserInputPtr *) malloc(5 * sizeof(xmlParserInputPtr));
@@ -3545,7 +3630,8 @@ void xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
  * Clear (release owned resources) and reinitialize a parser context
  */
 
-void xmlClearParserCtxt(xmlParserCtxtPtr ctxt)
+void
+xmlClearParserCtxt(xmlParserCtxtPtr ctxt)
 {
   xmlClearNodeInfoSeq(&ctxt->node_seq);
   xmlInitParserCtxt(ctxt);
@@ -3563,7 +3649,8 @@ void xmlClearParserCtxt(xmlParserCtxtPtr ctxt)
  * NULL, but the filename parameter can be
  */
 
-void xmlSetupParserForBuffer(xmlParserCtxtPtr ctxt, const CHAR* buffer,
+void
+xmlSetupParserForBuffer(xmlParserCtxtPtr ctxt, const CHAR* buffer,
                              const char* filename)
 {
   xmlParserInputPtr input;
@@ -3618,7 +3705,8 @@ const xmlParserNodeInfo* xmlParserFindNodeInfo(const xmlParserCtxt* ctx,
  *
  * -- Initialize (set to initial state) node info sequence
  */
-void xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
+void
+xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
 {
   seq->length = 0;
   seq->maximum = 0;
@@ -3632,7 +3720,8 @@ void xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
  * -- Clear (release memory and reinitialize) node
  *   info sequence
  */
-void xmlClearNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
+void
+xmlClearNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
 {
   if ( seq->buffer != NULL )
     free(seq->buffer);
@@ -3685,7 +3774,8 @@ unsigned long xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeq* seq,
  *
  * Insert node info record into the sorted sequence
  */
-void xmlParserAddNodeInfo(xmlParserCtxtPtr ctx, 
+void
+xmlParserAddNodeInfo(xmlParserCtxtPtr ctx, 
                           const xmlParserNodeInfo* info)
 {
   unsigned long pos;
diff --git a/result/ent5 b/result/ent5
new file mode 100644
index 00000000..d5b34e52
--- /dev/null
+++ b/result/ent5
@@ -0,0 +1,5 @@
+<?xml version="1.0"?>
+<EXAMPLE>
+    This is an inverted exclamation sign &#161;
+    This is a space   
+</EXAMPLE>
diff --git a/test/ent5 b/test/ent5
new file mode 100644
index 00000000..adb9ea7a
--- /dev/null
+++ b/test/ent5
@@ -0,0 +1,5 @@
+<?xml version="1.0"?>
+<EXAMPLE>
+    This is an inverted exclamation sign &#xA1;
+    This is a space &#32; 
+</EXAMPLE>
diff --git a/tree.h b/tree.h
index 66793bbc..e3764d8b 100644
--- a/tree.h
+++ b/tree.h
@@ -37,9 +37,13 @@ typedef enum {
 } xmlElementType;
 
 /*
- * Currently we use only 8bit chars internal representation, but
- * the parser is not tied to that, just define UNICODE to switch to
- * a 16 bits representation.
+ * Size of an internal character representation.
+ *
+ * Currently we use 8bit chars internal representation for memory efficiency,
+ * but the parser is not tied to that, just define UNICODE to switch to
+ * a 16 bits internal representation. Note that with 8 bits wide
+ * CHARs one can still use UTF-8 to handle correctly non ISO-Latin
+ * input.
  */
 #ifdef UNICODE
 typedef unsigned short CHAR;