1
0
mirror of https://github.com/vladmandic/sdnext.git synced 2026-01-27 15:02:48 +03:00
Files
sdnext/cli/docs.py
Vladimir Mandic 1b3f5405a3 fix batch processing
Signed-off-by: Vladimir Mandic <mandic00@live.com>
2025-07-31 09:56:40 -04:00

145 lines
4.7 KiB
Python
Executable File

#!/usr/bin/env python
import os
import sys
import time
import logging
# Configure the root logger at import time: INFO level, timestamped "<time> <level>: <message>" lines.
logging.basicConfig(level = logging.INFO, format = '%(asctime)s %(levelname)s: %(message)s')
# Module-level logger shared by the classes and the CLI entry point in this file.
log = logging.getLogger(__name__)
class Page():
    """A single markdown page, pre-processed for keyword search.

    Attributes:
        fn: path of the markdown file.
        title: lower-level display title derived from the file name, padded
            with one space on each side so whole-word checks can use
            ``' word '`` patterns.
        size: number of characters in the file.
        mtime: file modification time as an integer timestamp.
        h1, h2, h3: lower-cased heading texts (hash marks removed, padded
            with a leading and trailing space).
        lines: lower-cased, stripped content lines, each with a trailing
            space; emptied when the page is read with ``full=False``.
    """

    def __init__(self, fn, full: bool = True):
        """Create the page and immediately read it from disk.

        Args:
            fn: path of the markdown file.
            full: keep body lines for full-text search when True; keep only
                title and headings when False.
        """
        self.fn = fn
        self.title = ''
        self.size = 0
        self.mtime = 0
        self.h1 = []
        self.h2 = []
        self.h3 = []
        self.lines = []
        self.read(full=full)

    def read(self, full: bool = True):
        """Load the file and populate title, size, mtime, headings and lines.

        Any error is logged and swallowed, leaving the page with whatever
        fields were populated before the failure.

        Args:
            full: when False the body lines are discarded after the headings
                have been extracted.
        """
        try:
            # Title is padded with spaces so that ' word ' matches whole words at either end.
            self.title = ' ' + os.path.basename(self.fn).replace('.md', '').replace('-', ' ') + ' '
            self.mtime = int(os.path.getmtime(self.fn))
            with open(self.fn, 'r', encoding='utf-8') as f:
                content = f.read()
            self.size = len(content)
            # Trailing space on every line lets ' word ' match a word at end of line.
            self.lines = [line.strip().lower() + ' ' for line in content.splitlines() if len(line)>1]
            # Slicing removes only the hash marks, keeping the space that follows them.
            self.h1 = [line[1:] for line in self.lines if line.startswith('# ')]
            self.h2 = [line[2:] for line in self.lines if line.startswith('## ')]
            self.h3 = [line[3:] for line in self.lines if line.startswith('### ')]
            if not full:
                self.lines.clear()
        except Exception as e:
            log.error(f'Wiki: page="{self.fn}" {e}')

    def search(self, text) -> float:
        """Score how well this page matches a query.

        Scores are tiered: title matches rank above heading matches, whole
        words above word prefixes, and those above plain substrings; a match
        found only in the body scores lowest.

        Args:
            text: search query; compared case-insensitively.

        Returns:
            A float between 0.0 and 1.0. 0.0 means no match, and is also
            returned for empty queries or queries shorter than two
            characters, so the result is always numeric and safe to compare
            and sort.
        """
        if not text or len(text) < 2:
            return 0.0
        text = text.lower()
        title = self.title.lower()
        word = f' {text} '  # whole-word pattern
        prefix = f' {text}'  # start-of-word pattern

        # Title tiers.
        if text.strip() == title.strip():
            return 1.0
        # The title carries a leading pad space, so the padded form must be accepted too;
        # the unpadded form is kept for queries that themselves start with a space.
        if title.startswith((f'{text} ', word)):
            return 0.99
        if word in title:
            return 0.98
        if prefix in title:
            return 0.97

        # Heading tiers: whole word first, then word prefix, per heading level.
        for headings, word_score, prefix_score in (
            (self.h1, 0.89, 0.88),
            (self.h2, 0.79, 0.78),
            (self.h3, 0.69, 0.68),
        ):
            if any(word in h for h in headings):
                return word_score
            if any(prefix in h for h in headings):
                return prefix_score

        # Plain substring tiers.
        if text in title:
            return 0.59
        for headings, score in ((self.h1, 0.58), (self.h2, 0.57), (self.h3, 0.56)):
            if any(text in h for h in headings):
                return score

        # Body text; self.lines is empty when the page was read with full=False.
        if any(text in line for line in self.lines):
            return 0.50
        return 0.0

    def get(self):
        """Return the raw file content, or an empty string if it cannot be read (error is logged)."""
        try:
            with open(self.fn, 'r', encoding='utf-8') as f:
                content = f.read()
            return content
        except Exception as e:
            log.error(f'Wiki: page="{self.fn}" {e}')
            return ''

    def __str__(self):
        """Return a one-line summary of the page for logging."""
        return f'Page(title="{self.title.strip()}" fn="{self.fn}" mtime={self.mtime} h1={[h.strip() for h in self.h1]} h2={len(self.h2)} h3={len(self.h3)} lines={len(self.lines)} size={self.size})'
class Pages():
    """In-memory index of markdown pages with ranked keyword search."""

    def __init__(self, path: str = 'wiki'):
        """Create an empty index; pages are loaded by build().

        Args:
            path: directory scanned for ``.md`` files. Defaults to ``'wiki'``
                relative to the current working directory.
        """
        self.time = time.time()
        self.size = 0
        self.full = None  # value of `full` used by the last build(); None until built
        self.path = path
        self.pages: list[Page] = []

    def build(self, full: bool = True):
        """(Re)load every ``.md`` file found directly inside the index directory.

        Args:
            full: passed to each Page; when False only titles and headings
                are kept.

        Raises:
            OSError: if the directory cannot be scanned (e.g. it does not exist).
        """
        self.pages.clear()
        self.full = full
        with os.scandir(self.path) as entries:
            for entry in entries:
                if entry.is_file() and entry.name.endswith('.md'):
                    self.pages.append(Page(entry.path, full=full))
        self.size = sum(page.size for page in self.pages)

    def search(self, text: str, topk: int = 10, full: bool = True) -> 'list[tuple[float, Page]]':
        """Return the best-matching pages for a query.

        The index is built on first use if it has no pages yet.

        Args:
            text: search query; matched case-insensitively.
            topk: maximum number of results to return.
            full: forwarded to build() when the index has to be built here.

        Returns:
            Up to ``topk`` ``(score, page)`` tuples with score > 0, ordered by
            score and then by modification time, both descending.
        """
        if not text:
            return []
        if len(self.pages) == 0:
            self.build(full=full)
        text = text.lower()
        ranked = []
        for page in self.pages:
            score = page.search(text)
            # A page may report "no result" with a non-numeric value (e.g. an empty list
            # for too-short queries); comparing that with 0 would raise TypeError.
            if isinstance(score, (int, float)) and score > 0:
                ranked.append((score, page.mtime, page))
        # Stable sort: pages with equal score and mtime keep their index order.
        ranked.sort(key=lambda item: (item[0], item[1]), reverse=True)
        return [(score, page) for score, _, page in ranked[:topk]]
# Shared module-level index; pages are loaded lazily on the first search.
index = Pages()

if __name__ == "__main__":
    # CLI usage: python cli/docs.py <search_term>
    sys.argv.pop(0)  # drop the script name; the remaining arguments form the query
    if len(sys.argv) < 1:
        log.error("Usage: python cli/docs.py <search_term>")
        # Stop here: without a query there is nothing to search for.
        sys.exit(1)
    text = ' '.join(sys.argv)
    topk = 10
    full = True
    log.info(f'Search: "{text}" topk={topk}, full={full}')
    t0 = time.time()
    results = index.search(text, topk=topk, full=full)
    t1 = time.time()
    log.info(f'Results: pages={len(results)} size={index.size} time={t1-t0:.3f}')
    for score, page in results:
        log.info(f'Score: {score:.2f} {page}')
    # if len(results) > 0:
    #     log.info('Top result:')
    #     log.info(results[0][1].get())