mirror of
https://github.com/vladmandic/sdnext.git
synced 2026-01-27 15:02:48 +03:00
145 lines
4.7 KiB
Python
Executable File
145 lines
4.7 KiB
Python
Executable File
#!/usr/bin/env python
|
|
import os
|
|
import sys
|
|
import time
|
|
import logging
|
|
|
|
|
|
# Root logger configuration: INFO level, timestamped "LEVEL: message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
)
# Module-scoped logger used by Page and by the command-line entry point.
log = logging.getLogger(__name__)
|
|
|
|
|
|
class Page():
    """A single markdown page plus the data needed to score it against a query.

    Attributes:
        fn: path of the markdown file.
        title: file base name with '.md' removed and '-' replaced by spaces,
            padded with one space on each side so that whole-word checks such
            as ``' word ' in title`` work at the edges.
        size: number of characters in the file.
        mtime: file modification time as integer seconds.
        h1, h2, h3: lower-cased heading texts per level; each keeps a leading
            and a trailing space for the same whole-word matching reason.
        lines: lower-cased, stripped content lines, each with a trailing
            space; empty when the page was read with ``full=False``.
    """

    def __init__(self, fn, full: bool = True):
        """Create the page and immediately read ``fn`` from disk.

        Args:
            fn: path of the markdown file.
            full: when False, body lines are dropped after headings are extracted.
        """
        self.fn = fn
        self.title = ''
        self.size = 0
        self.mtime = 0
        self.h1 = []
        self.h2 = []
        self.h3 = []
        self.lines = []
        self.read(full=full)

    def read(self, full: bool = True):
        """Load title, mtime, size, headings and (optionally) body lines.

        Any exception is logged and swallowed, leaving whatever attributes
        were set before the failure.

        Args:
            full: when False, ``self.lines`` is emptied once headings are extracted.
        """
        try:
            self.title = ' ' + os.path.basename(self.fn).replace('.md', '').replace('-', ' ') + ' '
            self.mtime = int(os.path.getmtime(self.fn))
            with open(self.fn, 'r', encoding='utf-8') as f:
                content = f.read()
            self.size = len(content)
            # lines of zero or one character are skipped; a trailing space lets
            # ' word ' style checks match a word at the end of a line
            self.lines = [line.strip().lower() + ' ' for line in content.splitlines() if len(line) > 1]
            # slicing off only the '#' characters keeps the space after them,
            # so every heading starts and ends with a space
            self.h1 = [line[1:] for line in self.lines if line.startswith('# ')]
            self.h2 = [line[2:] for line in self.lines if line.startswith('## ')]
            self.h3 = [line[3:] for line in self.lines if line.startswith('### ')]
            if not full:
                self.lines.clear()
        except Exception as e:
            log.error(f'Wiki: page="{self.fn}" {e}')

    def search(self, text) -> float:
        """Score how well this page matches ``text``.

        Tiers, highest first:
            1.00       query equals the title
            0.99       title starts with the query as a whole word
            0.98/0.97  title contains the query as a whole word / word prefix
            0.89/0.88  same two checks against level-1 headings
            0.79/0.78  level-2 headings
            0.69/0.68  level-3 headings
            0.59-0.56  plain substring of title, h1, h2, h3 (in that order)
            0.50       plain substring of any body line
            0.00       no match, or query empty / shorter than two characters

        Args:
            text: query string; matching is case-insensitive.

        Returns:
            A float score; always numeric so callers can compare and sort it.
        """
        if not text or len(text) < 2:
            # must be numeric: callers filter with `score > 0`
            return 0.0
        text = text.lower()
        title = self.title.lower()

        if text.strip() == title.strip():
            return 1.0
        # the stored title is space-padded, so the prefix test has to run
        # against the unpadded form or it can never match a plain query
        if title.lstrip().startswith(f'{text.lstrip()} '):
            return 0.99
        if f' {text} ' in title:
            return 0.98
        if f' {text}' in title:
            return 0.97

        # whole-word match, then word-prefix match, per heading level
        heading_tiers = (
            (self.h1, 0.89, 0.88),
            (self.h2, 0.79, 0.78),
            (self.h3, 0.69, 0.68),
        )
        for headings, word_score, prefix_score in heading_tiers:
            if any(f' {text} ' in h for h in headings):
                return word_score
            if any(f' {text}' in h for h in headings):
                return prefix_score

        # plain substring matches
        if text in title:
            return 0.59
        for headings, score in ((self.h1, 0.58), (self.h2, 0.57), (self.h3, 0.56)):
            if any(text in h for h in headings):
                return score

        if any(text in line for line in self.lines):
            return 0.50

        return 0.0

    def get(self):
        """Return the raw file content, or '' (after logging) if it cannot be read."""
        try:
            with open(self.fn, 'r', encoding='utf-8') as f:
                content = f.read()
            return content
        except Exception as e:
            log.error(f'Wiki: page="{self.fn}" {e}')
            return ''

    def __str__(self):
        return f'Page(title="{self.title.strip()}" fn="{self.fn}" mtime={self.mtime} h1={[h.strip() for h in self.h1]} h2={len(self.h2)} h3={len(self.h3)} lines={len(self.lines)} size={self.size})'
|
|
|
|
|
|
class Pages():
    """Collection of wiki pages with ranked search across all of them.

    Attributes:
        time: timestamp taken when the collection object was created.
        size: sum of the sizes of all loaded pages.
        full: value of ``full`` used by the last build(), or None before any build.
        pages: the loaded Page objects.
    """

    def __init__(self):
        self.time = time.time()
        self.size = 0
        self.full = None
        self.pages: list[Page] = []

    def build(self, full: bool = True):
        """(Re)load every ``*.md`` file found directly inside the 'wiki' folder.

        The folder path is relative to the current working directory; errors
        from os.scandir (for example a missing folder) propagate to the caller.

        Args:
            full: passed to each Page; when False pages keep headings only.
        """
        self.pages.clear()
        self.full = full
        with os.scandir('wiki') as entries:
            for entry in entries:
                if entry.is_file() and entry.name.endswith('.md'):
                    self.pages.append(Page(entry.path, full=full))
        self.size = sum(page.size for page in self.pages)

    def search(self, text: str, topk: int = 10, full: bool = True) -> 'list[tuple[float, Page]]':
        """Return the best matching pages for ``text``.

        The index is built on first use if no pages are loaded yet.

        Args:
            text: query string; matching is case-insensitive.
            topk: maximum number of results to return.
            full: only used when the index has to be built by this call.

        Returns:
            ``(score, page)`` tuples with score > 0, ordered by score and then
            by modification time, both descending.
        """
        if not text:
            return []
        if not self.pages:
            self.build(full=full)
        text = text.lower()
        # `or 0.0` normalises any falsy non-numeric score (e.g. an empty list)
        # so the comparison and the sort below cannot raise TypeError
        scored = [(page.search(text) or 0.0, page.mtime, page) for page in self.pages]
        matches = [item for item in scored if item[0] > 0]
        matches.sort(key=lambda item: (item[0], item[1]), reverse=True)
        return [(score, page) for score, _mtime, page in matches[:topk]]
|
|
|
|
|
|
# Shared module-level index; it starts empty and Pages.search() builds it on first use.
index = Pages()
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: all arguments are joined into one search phrase.
    args = sys.argv[1:]  # slice instead of pop() so the global argv is left untouched
    if len(args) < 1:
        log.error("Usage: python cli/docs.py <search_term>")
        # without a search term there is nothing to do; stop instead of searching for ''
        sys.exit(1)
    text = ' '.join(args)
    topk = 10
    full = True
    log.info(f'Search: "{text}" topk={topk}, full={full}')
    t0 = time.time()
    results = index.search(text, topk=topk, full=full)
    t1 = time.time()
    log.info(f'Results: pages={len(results)} size={index.size} time={t1-t0:.3f}')
    for score, page in results:
        log.info(f'Score: {score:.2f} {page}')
    # if len(results) > 0:
    #     log.info('Top result:')
    #     log.info(results[0][1].get())
|