#!/usr/bin/env python
"""Search a directory of markdown wiki pages and rank them against a query.

Each ``.md`` file becomes a ``Page``. A query is scored by where it occurs:
page title first, then ``#``/``##``/``###`` headings, then the body lines.
``Pages`` holds the collection and returns the best-scoring pages.
"""

import os
import sys
import time
import logging


logging.basicConfig(level = logging.INFO, format = '%(asctime)s %(levelname)s: %(message)s')
log = logging.getLogger(__name__)


class Page:
    """A single markdown file plus the lowercase text needed to search it.

    Attributes:
        fn: path of the markdown file.
        title: file name without the ``.md`` suffix, dashes replaced by
            spaces, padded with one space on each side so that whole-word
            checks such as ``' word '`` also match at the edges.
        size: number of characters in the file.
        mtime: modification time in whole seconds.
        h1, h2, h3: heading texts (lowercase, space padded on both sides).
        lines: lowercase body lines; empty when the page was read with
            ``full=False``.
    """

    def __init__(self, fn: str, full: bool = True):
        """Create the page and immediately read it from disk.

        Args:
            fn: path of the markdown file.
            full: keep the body lines for full-text search; when false only
                the title and headings stay searchable.
        """
        self.fn = fn
        self.title = ''
        self.size = 0
        self.mtime = 0
        self.h1 = []
        self.h2 = []
        self.h3 = []
        self.lines = []
        self.read(full=full)

    def read(self, full: bool = True):
        """Load title, size, mtime, headings and (optionally) body lines.

        Any failure is logged and swallowed, leaving the fields read so far.

        Args:
            full: when false, body lines are dropped after the headings have
                been extracted.
        """
        try:
            # Strip only the trailing extension; a plain replace would also
            # eat '.md' in the middle of a name (e.g. 'readme.mdx').
            name = os.path.basename(self.fn).removesuffix('.md')
            self.title = ' ' + name.replace('-', ' ') + ' '
            self.mtime = int(os.path.getmtime(self.fn))
            with open(self.fn, 'r', encoding='utf-8') as f:
                content = f.read()
            self.size = len(content)
            # The trailing space lets ' word ' match a word that ends a line.
            self.lines = [line.strip().lower() + ' ' for line in content.splitlines() if len(line) > 1]
            # Slicing off only the '#' characters keeps the space that
            # followed them, so every heading starts with a space as well.
            self.h1 = [line[1:] for line in self.lines if line.startswith('# ')]
            self.h2 = [line[2:] for line in self.lines if line.startswith('## ')]
            self.h3 = [line[3:] for line in self.lines if line.startswith('### ')]
            if not full:
                self.lines.clear()
        except Exception as e:
            log.error(f'Wiki: page="{self.fn}" {e}')

    def search(self, text: str) -> float:
        """Score how well ``text`` matches this page.

        Args:
            text: search text; surrounding whitespace and case are ignored.

        Returns:
            A score between 0.0 and 1.0:
            1.0 exact title; 0.99 title starts with the text as whole
            word(s); 0.98 whole word(s) in title; 0.97 word prefix in title;
            0.89/0.88 whole word / word prefix in an h1 heading, 0.79/0.78
            for h2, 0.69/0.68 for h3; 0.59 substring of the title;
            0.58/0.57/0.56 substring of an h1/h2/h3 heading; 0.50 substring
            of a body line; 0.0 for no match or when the text is shorter
            than two characters.
        """
        text = text.strip().lower() if text else ''
        if len(text) < 2:
            # Always a float: callers compare and sort the scores.
            return 0.0
        title = self.title.lower()
        word = f' {text} '  # text delimited on both sides
        prefix = f' {text}'  # text at the start of a word
        if text == title.strip():
            return 1.0
        # The title carries a leading space, so the probe needs one too.
        if title.startswith(word):
            return 0.99
        if word in title:
            return 0.98
        if prefix in title:
            return 0.97
        for headings, whole, start in ((self.h1, 0.89, 0.88), (self.h2, 0.79, 0.78), (self.h3, 0.69, 0.68)):
            if any(word in h for h in headings):
                return whole
            if any(prefix in h for h in headings):
                return start
        if text in title:
            return 0.59
        for headings, score in ((self.h1, 0.58), (self.h2, 0.57), (self.h3, 0.56)):
            if any(text in h for h in headings):
                return score
        if any(text in line for line in self.lines):
            return 0.50
        return 0.0

    def get(self) -> str:
        """Return the file's current content, or '' (after logging) on error."""
        try:
            with open(self.fn, 'r', encoding='utf-8') as f:
                content = f.read()
            return content
        except Exception as e:
            log.error(f'Wiki: page="{self.fn}" {e}')
            return ''

    def __str__(self):
        return f'Page(title="{self.title.strip()}" fn="{self.fn}" mtime={self.mtime} h1={[h.strip() for h in self.h1]} h2={len(self.h2)} h3={len(self.h3)} lines={len(self.lines)} size={self.size})'


class Pages:
    """Collection of ``Page`` objects built from one directory.

    Attributes:
        path: directory scanned for ``.md`` files.
        time: creation time of this object (``time.time()``).
        size: summed ``size`` of all pages after the last build.
        full: the ``full`` flag used by the last build, ``None`` before it.
        pages: the loaded pages.
    """

    def __init__(self, path: str = 'wiki'):
        """Create an empty index.

        Args:
            path: directory to scan; defaults to ``'wiki'`` relative to the
                current working directory.
        """
        self.path = path
        self.time = time.time()
        self.size = 0
        self.full = None
        self.pages: list[Page] = []

    def build(self, full: bool = True):
        """Rescan the directory (non-recursive) and reload every ``.md`` file.

        Args:
            full: forwarded to each ``Page``; false keeps only headings.

        Raises:
            OSError: if the directory cannot be scanned.
        """
        self.pages.clear()
        self.full = full
        with os.scandir(self.path) as entries:
            for entry in entries:
                if entry.is_file() and entry.name.endswith('.md'):
                    self.pages.append(Page(entry.path, full=full))
        self.size = sum(page.size for page in self.pages)

    def search(self, text: str, topk: int = 10, full: bool = True) -> list[tuple[float, Page]]:
        """Return the best matching pages for ``text``.

        The index is built on demand when it holds no pages.

        Args:
            text: search text.
            topk: maximum number of results.
            full: used only when the index has to be built by this call.

        Returns:
            Up to ``topk`` ``(score, page)`` tuples with a score above zero,
            ordered by score and then by modification time, both descending.
        """
        if not text:
            return []
        if len(self.pages) == 0:
            self.build(full=full)
        text = text.lower()
        scored = [(page.search(text), page) for page in self.pages]
        hits = [(score, page) for score, page in scored if score > 0]
        # The key keeps Page objects out of the comparison.
        hits.sort(key=lambda hit: (hit[0], hit[1].mtime), reverse=True)
        return hits[:topk]


index = Pages()


def main() -> None:
    """Command-line entry point: search for the text given as arguments."""
    args = sys.argv[1:]
    if len(args) < 1:
        log.error("Usage: python cli/docs.py ")
        # Nothing to search for; stop instead of running an empty query.
        sys.exit(1)
    text = ' '.join(args)
    topk = 10
    full = True
    log.info(f'Search: "{text}" topk={topk}, full={full}')
    t0 = time.time()
    results = index.search(text, topk=topk, full=full)
    t1 = time.time()
    log.info(f'Results: pages={len(results)} size={index.size} time={t1-t0:.3f}')
    for score, page in results:
        log.info(f'Score: {score:.2f} {page}')
    # if len(results) > 0:
    #     log.info('Top result:')
    #     log.info(results[0][1].get())


if __name__ == "__main__":
    main()