added HTML page indexing Daniel

* doc/index.py: added HTML page indexing Daniel
2025-08-07 06:43:02 +03:00 · 2002-10-06 21:51:18 +00:00
parent dc6d4abae7
commit 141d04ba74
2 changed files with 246 additions and 0 deletions
--- a/4
+++ b/4
@@ -1,3 +1,7 @@
 Sun Oct  6 23:50:29 CEST 2002 Daniel Veillard <daniel@veillard.com>
 	* doc/index.py: added HTML page indexing 
 Fri Oct  4 15:33:55 CEST 2002 Igor Zlatkovic <igor@stud.fh-frankfurt.de>
 	* xmlIO.c: extended Windows path normalisation to fix the base
--- a/doc/index.py
+++ b/doc/index.py
@@ -41,6 +41,13 @@ import sys
 import string
 import os
 #
 # We are not interested in parsing errors here
 #
 def callback(ctx, str):
     """Handler that deliberately does nothing.

     Both arguments are ignored and None is returned.
     """
     return None
 # Register the do-nothing callback as the libxml2 error handler.
 # None is passed as the second argument (presumably the ctx value
 # handed back to the callback -- confirm against the bindings).
 libxml2.registerErrorHandler(callback, None)
 #
 # The dictionary of required tables and the SQL commands needed
 # to create them
@@ -60,6 +67,19 @@ TABLES={
 	   KEY name (name),
 	   KEY symbol (symbol),
 	   UNIQUE KEY ID (name, symbol))""",
  "wordsHTML" : """CREATE TABLE wordsHTML (
           name varchar(50) NOT NULL,
 	   resource varchar(255) NOT NULL,
 	   section varchar(255),
 	   id varchar(50),
           relevance int,
 	   KEY name (name),
 	   KEY resource (resource),
 	   UNIQUE KEY ref (name, resource))""",
  "pages" : """CREATE TABLE pages (
           resource varchar(255) NOT NULL,
 	   title varchar(255) NOT NULL,
 	   UNIQUE KEY name (resource))""",
  "Queries" : """CREATE TABLE Queries (
           ID int(11) NOT NULL auto_increment,
 	   Value varchar(50) NOT NULL,
@@ -237,6 +257,74 @@ def addType(name, module, desc = ""):
 def addFunctype(name, module, desc = ""):
     """Pass name, module and desc to updateSymbol() with kind 'functype'.

     Returns whatever updateSymbol() returns.
     """
     status = updateSymbol(name, module, 'functype', desc)
     return status
 def addPage(resource, title):
     """Insert or refresh the row of the `pages` table for one page.

     An INSERT is tried first; when it fails (e.g. because the resource
     is already present under the table's unique key) an UPDATE of the
     title is tried instead.

     resource -- page path, used as the key of the row
     title    -- title to store for that page

     Returns the value of cursor.execute() on success, and -1 when the
     database cannot be opened, when resource is None, or when both
     statements fail.
     """
     global DB

     if DB is None:
         openMySQL()
     if DB is None:
         return -1
     if resource is None:
         return -1

     c = DB.cursor()
     # The values are handed to the driver as parameters so that it does
     # the quoting; a title containing an apostrophe used to produce an
     # invalid statement when it was pasted into the SQL text.
     # NOTE(review): assumes the connection created by openMySQL() uses
     # the '%s' placeholder style of the Python MySQL drivers -- confirm.
     try:
         ret = c.execute(
             """INSERT INTO pages (resource, title) VALUES (%s, %s)""",
             (resource, title))
     except Exception:
         try:
             ret = c.execute(
                 """UPDATE pages SET title=%s WHERE resource=%s""",
                 (title, resource))
         except Exception:
             exc_type, exc_value = sys.exc_info()[:2]
             # The original message formatted the undefined names
             # `name` and `module`, which raised NameError right here.
             print("Update page (%s, %s) failed command" % (resource, title))
             print("""UPDATE pages SET title='%s' WHERE resource='%s'""" % (title, resource))
             print("%s %s" % (exc_type, exc_value))
             return -1
     return ret
 def updateWordHTML(name, resource, desc, id, relevance):
     """Insert or refresh one (word, page) row of the `wordsHTML` table.

     An INSERT is tried first; when it fails (e.g. because the pair
     (name, resource) already exists) an UPDATE of section, id and
     relevance is tried instead.

     name      -- the indexed word
     resource  -- page on which the word was found
     desc      -- section title stored in the `section` column; apostrophes
                  are replaced by blanks and the text is cut to 99
                  characters, anything that cannot be processed that way
                  becomes ""
     id        -- anchor stored in the `id` column (None becomes "")
     relevance -- numeric weight of the association

     Returns the value of cursor.execute() on success, and -1 when the
     database cannot be opened, when name or resource is None, or when
     both statements fail.
     """
     global DB

     if DB is None:
         openMySQL()
     if DB is None:
         return -1
     if name is None:
         return -1
     if resource is None:
         return -1
     if id is None:
         id = ""
     if desc is None:
         desc = ""
     else:
         try:
             # Same clean-up as before so that stored sections do not
             # change shape.
             desc = desc.replace("'", " ")
             desc = desc[0:99]
         except Exception:
             desc = ""

     c = DB.cursor()
     # Values are passed as parameters so the driver quotes them; name,
     # resource and id used to be pasted into the SQL text unescaped.
     # NOTE(review): assumes the connection created by openMySQL() uses
     # the '%s' placeholder style of the Python MySQL drivers -- confirm.
     try:
         ret = c.execute(
             """INSERT INTO wordsHTML (name, resource, section, id, relevance) VALUES (%s, %s, %s, %s, %s)""",
             (name, resource, desc, id, relevance))
     except Exception:
         try:
             ret = c.execute(
                 """UPDATE wordsHTML SET section=%s, id=%s, relevance=%s where name=%s and resource=%s""",
                 (desc, id, relevance, name, resource))
         except Exception:
             exc_type, exc_value = sys.exc_info()[:2]
             # %s instead of %d: the diagnostic must not raise itself
             # when relevance is not a number.
             print("Update symbol (%s, %s, %s) failed command" % (name, resource, relevance))
             print("""UPDATE wordsHTML SET section='%s', id='%s', relevance='%s' where name='%s' and resource='%s'""" % (desc, id, relevance, name, resource))
             print("%s %s" % (exc_type, exc_value))
             return -1
     return ret
 #########################################################################
 #									#
 #                  Word dictionary and analysis routines		#
@@ -244,6 +332,7 @@ def addFunctype(name, module, desc = ""):
 #########################################################################
 # NOTE(review): wordsDict is not used in the code visible here;
 # presumably it is the API-symbol counterpart of wordsDictHTML.
 wordsDict = {}
 # word -> {resource: (relevance, id, section)} for the words found in
 # HTML pages.  addWordHTML() replaces the inner dictionary with None
 # when a word that already has more than 15 page entries is added again.
 wordsDictHTML = {}
 def splitIdentifier(str):
    ret = []
@@ -303,6 +392,65 @@ def addString(str, module, symbol, relevance):
    return ret
 def addWordHTML(word, resource, id, section, relevance):
     """Record one occurrence of `word` on the page `resource`.

     The entry is kept in wordsDictHTML[word][resource] as the tuple
     (relevance, id, section).  When the page already has an entry for
     the word, its id and section win (unless they are None) and the
     relevances are added.

     Returns -1 for a missing or too short word or a missing
     resource/section, 0 when the word has been (or is now being)
     dropped, and the stored relevance otherwise.
     """
     global wordsDictHTML

     if word is None or len(word) < 3:
         return -1
     if resource is None or section is None:
         return -1

     if word not in wordsDictHTML:
         wordsDictHTML[word] = {}
     else:
         pages = wordsDictHTML[word]
         # None marks a word that was dropped earlier.
         if pages is None:
             return 0
         # Drop a word once it already has more than 15 page entries.
         if len(pages) > 15:
             wordsDictHTML[word] = None
             return 0
         try:
             (old_relevance, old_id, old_section) = pages[resource]
             if old_id is not None:
                 id = old_id
             if old_section is not None:
                 section = old_section
             relevance = relevance + old_relevance
         except:
             # No usable previous entry for this page: store the new one.
             pass

     wordsDictHTML[word][resource] = (relevance, id, section)
     return relevance
 def addStringHTML(str, resource, id, section, relevance):
     """Split a text into words and hand each one to addWordHTML().

     Punctuation, line breaks and the bytes \\xc2 and \\xa0 (together
     the UTF-8 encoding of a no-break space) are turned into blanks
     before the text is split on whitespace.  Only words longer than
     two characters are passed on.

     Returns -1 when str is None or shorter than three characters,
     otherwise the sum of the values returned by addWordHTML().
     """
     if str is None or len(str) < 3:
         return -1

     separators = (".", ",", "'", '"', ";", "-", "(", ")", "{", "}",
                   "<", ">", "/", "*", ":", "\n", "\r", "\xc2", "\xa0")
     for sep in separators:
         str = str.replace(sep, " ")

     total = 0
     for word in str.split():
         if len(word) > 2:
             total = total + addWordHTML(word, resource, id, section, relevance)
     return total
 #########################################################################
 #									#
@@ -561,6 +709,83 @@ def analyzeAPI(doc):
        cur = cur.next
    return count
 #########################################################################
 #									#
 #                  Web pages parsing and analysis			#
 #									#
 #########################################################################
 import glob
 def analyzeHTMLPara(doc, resource, p, section, id):
     """Index the text content of the node `p` with relevance 5.

     The content is passed to addStringHTML() together with resource,
     id and section; `doc` is not used.  Returns the value obtained
     from addStringHTML(), or -1 if anything raises.
     """
     indexed = 0
     try:
         text = p.content
         indexed += addStringHTML(text, resource, id, section, 5)
     except:
         return -1
     return indexed
 def analyzeHTMLPre(doc, resource, p, section, id):
     """Index the text content of the node `p` with relevance 5.

     Same processing as for paragraphs: the content goes to
     addStringHTML() with resource, id and section; `doc` is not used.
     Returns the value obtained from addStringHTML(), or -1 if anything
     raises.
     """
     count = 0
     try:
         count = count + addStringHTML(p.content, resource, id, section, 5)
     except:
         return -1
     return count
 def analyzeHTML(doc, resource):
     """Index the title, headings, paragraphs and <pre> blocks of a page.

     doc      -- parsed document providing xpathNewContext()
     resource -- name under which the page is recorded

     The page is registered with addPage() using the content of
     //head/title, or "Page <resource>" when no title can be read.
     The h1/h2/h3, p and pre elements are then visited in the order
     returned by the XPath query: a heading sets the current section
     (and the current id, from its `id` or `name` attribute), a p or
     pre element is indexed under the current section and id.

     Returns the number of p and pre elements handled.
     """
     para = 0
     ctxt = doc.xpathNewContext()
     try:
         try:
             res = ctxt.xpathEval("//head/title")
             title = res[0].content
         except Exception:
             title = "Page %s" % (resource)
         addPage(resource, title)
         try:
             items = ctxt.xpathEval("//h1 | //h2 | //h3 | //p | //pre")
             section = title
             id = ""
             for item in items:
                 if item.name == 'h1' or item.name == 'h2' or item.name == 'h3':
                     section = item.content
                     # When the heading carries neither attribute the
                     # previous id is kept.
                     if item.prop("id"):
                         id = item.prop("id")
                     elif item.prop("name"):
                         id = item.prop("name")
                 elif item.name == 'p':
                     analyzeHTMLPara(doc, resource, item, section, id)
                     para = para + 1
                 elif item.name == 'pre':
                     analyzeHTMLPre(doc, resource, item, section, id)
                     para = para + 1
                 else:
                     print("Page %s, unexpected %s element" % (resource, item.name))
         except Exception:
             exc_type, exc_value = sys.exc_info()[:2]
             print("Page %s: problem analyzing" % (resource))
             print("%s %s" % (exc_type, exc_value))
     finally:
         # XPath contexts of the libxml2 bindings are not released
         # automatically; without this call one context leaked per page.
         ctxt.xpathFreeContext()
     return para
 def analyzeHTMLPages():
     """Parse and index the HTML pages of the current directory.

     The files matching *.html and tutorial/*.html are processed, except
     those whose name starts with "API" and xml.html (presumably
     generated or aggregate pages -- confirm).  A file that cannot be
     parsed or analyzed is reported and skipped.

     Returns the number of pages successfully processed.
     """
     ret = 0
     HTMLfiles = glob.glob("*.html") + glob.glob("tutorial/*.html")
     for html in HTMLfiles:
         if html[0:3] == "API" or html == "xml.html":
             continue
         try:
             doc = libxml2.htmlParseFile(html, None)
             try:
                 res = analyzeHTML(doc, html)
             finally:
                 # Documents of the libxml2 bindings must be released
                 # explicitly; every parsed page used to stay in memory.
                 doc.freeDoc()
             print("Parsed %s : %d paragraphs" % (html, res))
             ret = ret + 1
         except Exception:
             print("could not parse %s" % (html))
     return ret
 #########################################################################
 #									#
 #          Main code: open the DB, the API XML and analyze it		#
@@ -573,6 +798,23 @@ except:
    print sys.exc_type, sys.exc_value
    sys.exit(1)
 # Index the HTML pages, then store every (word, page) association.
 ret = analyzeHTMLPages()
 print("Indexed %d words in %d HTML pages" % (len(wordsDictHTML), ret))

 i = 0
 skipped = 0
 for word, refs in wordsDictHTML.items():
     # A None value marks a word that addWordHTML() dropped.
     if refs is None:
         skipped += 1
         continue
     for resource, (relevance, id, section) in refs.items():
         updateWordHTML(word, resource, section, id, relevance)
         i += 1
 print("Found %d associations in HTML pages" % (i))
 try:
    doc = loadAPI(API)
    ret = analyzeAPI(doc)