# Moondream 3 Preview VLM Implementation
# Source: https://huggingface.co/moondream/moondream3-preview
# Model: 9.3GB, gated (requires HuggingFace authentication)
# Architecture: Mixture-of-Experts (9B total params, 2B active)

import os
import re
import transformers
from PIL import Image
from modules import shared, devices, sd_models
from modules.interrogate import vqa_detection


# Debug logging - function-based to avoid circular import
debug_enabled = os.environ.get('SD_VQA_DEBUG', None) is not None


def debug(*args, **kwargs):
    if debug_enabled:
        shared.log.trace(*args, **kwargs)


# Global state
moondream3_model = None
loaded = None
image_cache = {}  # Cache encoded images for reuse


def get_settings():
    """
    Build settings dict for Moondream 3 API from global VQA options.
    Moondream 3 accepts: temperature, top_p, max_tokens
    """
    settings = {}
    if shared.opts.interrogate_vlm_max_length > 0:
        settings['max_tokens'] = shared.opts.interrogate_vlm_max_length
    if shared.opts.interrogate_vlm_temperature > 0:
        settings['temperature'] = shared.opts.interrogate_vlm_temperature
    if shared.opts.interrogate_vlm_top_p > 0:
        settings['top_p'] = shared.opts.interrogate_vlm_top_p
    return settings if settings else None


def load_model(repo: str):
    """Load Moondream 3 model."""
    global moondream3_model, loaded  # pylint: disable=global-statement
    if moondream3_model is None or loaded != repo:
        shared.log.debug(f'Interrogate load: vlm="{repo}"')
        moondream3_model = None
        moondream3_model = transformers.AutoModelForCausalLM.from_pretrained(
            repo,
            trust_remote_code=True,
            torch_dtype=devices.dtype,
            cache_dir=shared.opts.hfcache_dir,
        )
        moondream3_model.eval()
        # Initialize KV caches before moving to device (they're lazy by default)
        if hasattr(moondream3_model, '_setup_caches'):
            moondream3_model._setup_caches()  # pylint: disable=protected-access
        # Disable flex_attention decoding (can cause hangs due to torch.compile)
        if hasattr(moondream3_model, 'model') and hasattr(moondream3_model.model, 'use_flex_decoding'):
            moondream3_model.model.use_flex_decoding = False
        loaded = repo
        devices.torch_gc()
    # Move model to active device
    sd_models.move_model(moondream3_model, devices.device)
    return moondream3_model


def encode_image(image: Image.Image, cache_key: str = None):
    """
    Encode image for reuse across multiple queries.

    Args:
        image: PIL Image
        cache_key: Optional cache key for storing encoded image

    Returns:
        Encoded image tensor
    """
    if cache_key and cache_key in image_cache:
        debug(f'VQA interrogate: handler=moondream3 using cached encoding for cache_key="{cache_key}"')
        return image_cache[cache_key]
    model = load_model(loaded)
    with devices.inference_context():
        encoded = model.encode_image(image)
    if cache_key:
        image_cache[cache_key] = encoded
        debug(f'VQA interrogate: handler=moondream3 cached encoding cache_key="{cache_key}" cache_size={len(image_cache)}')
    return encoded
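
# Illustrative sketch only (not called by this module): pre-loading the model and
# pre-encoding an image with an explicit cache key so the expensive vision pass runs
# once. The cache key choice here is an example; query(use_cache=True) below builds
# its own keys.
def _example_preload_and_encode(image: Image.Image):
    repo = 'moondream/moondream3-preview'  # gated; requires HuggingFace authentication
    load_model(repo)  # downloads/loads the checkpoint and moves it to the active device
    encoded = encode_image(image, cache_key=str(id(image)))  # stored in image_cache under this key
    return encoded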

def query(image: Image.Image, question: str, repo: str, stream: bool = False,
          temperature: float = None, top_p: float = None, max_tokens: int = None,
          use_cache: bool = False, reasoning: bool = True):
    """
    Visual question answering with optional streaming.

    Args:
        image: PIL Image
        question: Question about the image
        repo: Model repository
        stream: Enable streaming output (generator)
        temperature: Sampling temperature (overrides global setting)
        top_p: Nucleus sampling parameter (overrides global setting)
        max_tokens: Maximum tokens to generate (overrides global setting)
        use_cache: Use cached image encoding if available
        reasoning: Enable reasoning mode (response may include a 'reasoning' entry)

    Returns:
        Answer dict or string (or generator if stream=True)
    """
    model = load_model(repo)
    # Build settings - per-call parameters override global settings
    settings = get_settings() or {}
    if temperature is not None:
        settings['temperature'] = temperature
    if top_p is not None:
        settings['top_p'] = top_p
    if max_tokens is not None:
        settings['max_tokens'] = max_tokens
    debug(f'VQA interrogate: handler=moondream3 method=query question="{question}" stream={stream} settings={settings}')
    # Use cached encoding if requested
    if use_cache:
        cache_key = f"{id(image)}_{question}"
        image_input = encode_image(image, cache_key)
    else:
        image_input = image
    with devices.inference_context():
        response = model.query(
            image=image_input,
            question=question,
            stream=stream,
            settings=settings if settings else None,
            reasoning=reasoning,
        )
    # Log response structure (for non-streaming)
    if not stream:
        if isinstance(response, dict):
            debug(f'VQA interrogate: handler=moondream3 response_type=dict keys={list(response.keys())}')
            if 'reasoning' in response:
                reasoning_text = response['reasoning'].get('text', '')
                if len(reasoning_text) > 100:
                    reasoning_text = reasoning_text[:100] + '...'
                debug(f'VQA interrogate: handler=moondream3 reasoning="{reasoning_text}"')
            if 'answer' in response:
                debug(f'VQA interrogate: handler=moondream3 answer="{response["answer"]}"')
    return response


def caption(image: Image.Image, repo: str, length: str = 'normal', stream: bool = False,
            temperature: float = None, top_p: float = None, max_tokens: int = None):
    """
    Generate image captions at different lengths.

    Args:
        image: PIL Image
        repo: Model repository
        length: Caption length - 'short', 'normal', or 'long'
        stream: Enable streaming output (generator)
        temperature: Sampling temperature (overrides global setting)
        top_p: Nucleus sampling parameter (overrides global setting)
        max_tokens: Maximum tokens to generate (overrides global setting)

    Returns:
        Caption dict or string (or generator if stream=True)
    """
    model = load_model(repo)
    # Build settings - per-call parameters override global settings
    settings = get_settings() or {}
    if temperature is not None:
        settings['temperature'] = temperature
    if top_p is not None:
        settings['top_p'] = top_p
    if max_tokens is not None:
        settings['max_tokens'] = max_tokens
    debug(f'VQA interrogate: handler=moondream3 method=caption length={length} stream={stream} settings={settings}')
    with devices.inference_context():
        response = model.caption(
            image,
            length=length,
            stream=stream,
            settings=settings if settings else None,
        )
    # Log response structure (for non-streaming)
    if not stream and isinstance(response, dict):
        debug(f'VQA interrogate: handler=moondream3 response_type=dict keys={list(response.keys())}')
    return response
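
# Illustrative sketch only (not called by this module): a non-streaming query with
# reasoning enabled, reading the 'answer' and 'reasoning' keys that the debug logging
# in query() above inspects. The question and sampling values are examples; the
# isinstance guard covers responses that are not dicts.
def _example_query_with_reasoning(image: Image.Image):
    repo = 'moondream/moondream3-preview'
    response = query(image, 'How many people are visible?', repo,
                     temperature=0.2, max_tokens=256, reasoning=True)
    if isinstance(response, dict):
        return response.get('answer', ''), response.get('reasoning', {}).get('text', '')
    return str(response), ''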

def point(image: Image.Image, object_name: str, repo: str):
    """
    Identify coordinates of all instances of a specific object in the image.

    Args:
        image: PIL Image
        object_name: Name of object to locate
        repo: Model repository

    Returns:
        List of (x, y) tuples with coordinates normalized to 0-1 range, or None if not found

    Example:
        [(0.733, 0.442), (0.5, 0.6)] for 2 instances
    """
    model = load_model(repo)
    debug(f'VQA interrogate: handler=moondream3 method=point object_name="{object_name}"')
    with devices.inference_context():
        result = model.point(image, object_name)
    debug(f'VQA interrogate: handler=moondream3 point_raw_result="{result}" type={type(result)}')
    if isinstance(result, dict):
        debug(f'VQA interrogate: handler=moondream3 point_raw_result_keys={list(result.keys())}')
    points = vqa_detection.parse_points(result)
    if points:
        debug(f'VQA interrogate: handler=moondream3 point_result={len(points)} points found')
        return points
    debug('VQA interrogate: handler=moondream3 point_result=not found')
    return None


def detect(image: Image.Image, object_name: str, repo: str, max_objects: int = 10):
    """
    Detect all instances of a specific object with bounding boxes.

    Args:
        image: PIL Image
        object_name: Name of object to detect
        repo: Model repository
        max_objects: Maximum number of objects to return

    Returns:
        List of detection dicts with keys:
            - 'bbox': [x1, y1, x2, y2] normalized to 0-1
            - 'label': Object label
            - 'confidence': Detection confidence (0-1)
        Returns empty list if no objects found.
    """
    model = load_model(repo)
    debug(f'VQA interrogate: handler=moondream3 method=detect object_name="{object_name}" max_objects={max_objects}')
    with devices.inference_context():
        result = model.detect(image, object_name)
    debug(f'VQA interrogate: handler=moondream3 detect_raw_result="{result}" type={type(result)}')
    if isinstance(result, dict):
        debug(f'VQA interrogate: handler=moondream3 detect_raw_result_keys={list(result.keys())}')
    detections = vqa_detection.parse_detections(result, object_name, max_objects)
    debug(f'VQA interrogate: handler=moondream3 detect_result={len(detections)} objects found')
    return detections
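
# Illustrative sketch only: converting the normalized detection dicts documented in
# detect() above into pixel-space boxes and drawing them with PIL. This is separate
# from the vqa_detection formatting helpers and exists purely to show the data layout.
def _example_draw_detections(image: Image.Image, detections: list):
    from PIL import ImageDraw  # local import keeps this optional sketch self-contained
    canvas = image.copy()
    draw = ImageDraw.Draw(canvas)
    width, height = canvas.size
    for det in detections:
        x1, y1, x2, y2 = det['bbox']  # normalized 0-1, per the detect() docstring
        box = [x1 * width, y1 * height, x2 * width, y2 * height]
        draw.rectangle(box, outline='red', width=3)
        draw.text((box[0], max(box[1] - 12, 0)), f"{det['label']} {det['confidence']:.2f}", fill='red')
    return canvas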

def predict(question: str, image: Image.Image, repo: str, model_name: str = None,
            thinking_mode: bool = False, mode: str = None, stream: bool = False,
            use_cache: bool = False, **kwargs):
    """
    Main entry point for Moondream 3 VQA - auto-detects mode from question.

    Args:
        question: The question/prompt (e.g., "caption", "where is the cat?", "describe this")
        image: PIL Image
        repo: Model repository
        model_name: Display name for logging
        thinking_mode: Enable reasoning mode for query
        mode: Force specific mode ('query', 'caption', 'caption_short', 'caption_long', 'point', 'detect')
        stream: Enable streaming output (for query/caption)
        use_cache: Use cached image encoding (for query)
        **kwargs: Additional parameters (max_objects for detect, etc.)

    Returns:
        Response string (detection data stored on VQA singleton instance.last_detection_data)
        (or generator if stream=True for query/caption modes)
    """
    debug(f'VQA interrogate: handler=moondream3 model_name="{model_name}" repo="{repo}" question="{question}" image_size={image.size if image else None} mode={mode} stream={stream}')
    # Clean question
    question = question.replace('<', '').replace('>', '').replace('_', ' ') if question else ''
    # Auto-detect mode from question if not specified
    if mode is None:
        question_lower = question.lower()
        # Caption detection
        if question in ['CAPTION', 'caption'] or 'caption' in question_lower:
            if 'more detailed' in question_lower or 'very long' in question_lower:
                mode = 'caption_long'
            elif 'detailed' in question_lower or 'long' in question_lower:
                mode = 'caption_normal'
            elif 'short' in question_lower or 'brief' in question_lower:
                mode = 'caption_short'
            else:
                # Default caption mode (matches vqa.py legacy behavior)
                if question == 'CAPTION':
                    mode = 'caption_short'
                elif question == 'DETAILED CAPTION':
                    mode = 'caption_normal'
                elif question == 'MORE DETAILED CAPTION':
                    mode = 'caption_long'
                else:
                    mode = 'caption_normal'
        # Point detection
        elif 'where is' in question_lower or 'locate' in question_lower or 'find' in question_lower or 'point' in question_lower:
            mode = 'point'
        # Object detection
        elif 'detect' in question_lower or 'bounding box' in question_lower or 'bbox' in question_lower:
            mode = 'detect'
        # Default to query
        else:
            mode = 'query'
    debug(f'VQA interrogate: handler=moondream3 mode_selected={mode}')
    # Dispatch to appropriate method
    try:
        if mode == 'caption_short':
            response = caption(image, repo, length='short', stream=stream)
        elif mode == 'caption_long':
            response = caption(image, repo, length='long', stream=stream)
        elif mode in ['caption', 'caption_normal']:
            response = caption(image, repo, length='normal', stream=stream)
        elif mode == 'point':
            # Extract object name from question - case insensitive, preserve object names
            object_name = question
            for phrase in ['point at', 'where is', 'locate', 'find']:
                object_name = re.sub(rf'\b{phrase}\b', '', object_name, flags=re.IGNORECASE)
            object_name = re.sub(r'[?.!,]', '', object_name).strip()
            object_name = re.sub(r'^\s*the\s+', '', object_name, flags=re.IGNORECASE)
            debug(f'VQA interrogate: handler=moondream3 point_extracted_object="{object_name}"')
            result = point(image, object_name, repo)
            if result:
                from modules.interrogate import vqa
                vqa.get_instance().last_detection_data = {'points': result}
                return vqa_detection.format_points_text(result)
            return "Object not found"
        elif mode == 'detect':
            # Extract object name from question - case insensitive
            object_name = question
            for phrase in ['detect', 'find all', 'bounding box', 'bbox', 'find']:
                object_name = re.sub(rf'\b{phrase}\b', '', object_name, flags=re.IGNORECASE)
            object_name = re.sub(r'[?.!,]', '', object_name).strip()
            object_name = re.sub(r'^\s*the\s+', '', object_name, flags=re.IGNORECASE)
            if ' and ' in object_name.lower():
                object_name = re.split(r'\s+and\s+', object_name, flags=re.IGNORECASE)[0].strip()
            debug(f'VQA interrogate: handler=moondream3 detect_extracted_object="{object_name}"')
            results = detect(image, object_name, repo, max_objects=kwargs.get('max_objects', 10))
            if results:
                from modules.interrogate import vqa
                vqa.get_instance().last_detection_data = {'detections': results}
                return vqa_detection.format_detections_text(results)
            return "No objects detected"
        else:  # mode == 'query'
            if len(question) < 2:
                question = "Describe this image."
            response = query(image, question, repo, stream=stream, use_cache=use_cache, reasoning=thinking_mode)
        debug(f'VQA interrogate: handler=moondream3 response_before_clean="{response}"')
        return response
    except Exception as e:
        from modules import errors
        errors.display(e, 'Moondream3')
        return f"Error: {str(e)}"
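
# Routing reference for predict() above (mirrors its auto-detection rules):
#   prompts containing 'caption'      -> caption_short / caption_normal / caption_long
#   'where is ...' / 'locate ...' / 'find ...' / 'point ...' -> point
#   'detect ...' / 'bounding box ...' / 'bbox ...'           -> detect
#   anything else                     -> query (free-form VQA)
# Illustrative sketch only (not called by this module): run detection through predict()
# and read the structured results it stores on the VQA singleton. The prompt is an example.
def _example_detect_via_predict(image: Image.Image):
    repo = 'moondream/moondream3-preview'
    text = predict('detect the dog', image, repo, max_objects=5)  # auto-routes to detect mode
    from modules.interrogate import vqa  # local import, mirroring predict() above
    data = getattr(vqa.get_instance(), 'last_detection_data', None)  # {'detections': [...]} when objects were found
    return text, data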

def clear_cache():
    """Clear image encoding cache."""
    cache_size = len(image_cache)
    image_cache.clear()
    debug(f'VQA interrogate: handler=moondream3 cleared image cache cache_size_was={cache_size}')
    shared.log.debug(f'Moondream3: Cleared image cache ({cache_size} entries)')


def unload():
    """Release Moondream 3 model from GPU/memory."""
    global moondream3_model, loaded  # pylint: disable=global-statement
    if moondream3_model is not None:
        shared.log.debug(f'Moondream3 unload: model="{loaded}"')
        sd_models.move_model(moondream3_model, devices.cpu, force=True)
        moondream3_model = None
        loaded = None
        clear_cache()
        devices.torch_gc(force=True)
    else:
        shared.log.debug('Moondream3 unload: no model loaded')
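
# Illustrative sketch only: a typical single-shot lifecycle - load an image, ask one
# question via predict(), then release the model and its encoding cache. The path and
# prompt are examples.
def _example_one_shot(path: str):
    repo = 'moondream/moondream3-preview'
    image = Image.open(path).convert('RGB')
    answer = predict('Describe this image.', image, repo, thinking_mode=False)  # routes to query mode
    unload()  # offloads the model, drops references, clears the encoding cache
    return answer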