from fractions import Fraction
import os
import time
import cv2
import numpy as np
import torch
import einops
from PIL import Image
from modules import shared, errors, timer, rife, processing
from modules.video_models.video_utils import check_av


def get_video_filename(p:processing.StableDiffusionProcessingVideo):
    from modules.images_namegen import FilenameGenerator
    namegen = FilenameGenerator(p, seed=p.seed if p is not None else 0, prompt=p.prompt if p is not None else '')
    filename = namegen.apply(shared.opts.samples_filename_pattern if shared.opts.samples_filename_pattern and len(shared.opts.samples_filename_pattern) > 0 else "[seq]-[prompt_words]")
    if shared.opts.save_to_dirs:
        dirname = namegen.apply(shared.opts.directories_filename_pattern or "[prompt_words]")
        dirfile = os.path.dirname(filename)
        dirname = os.path.join(shared.opts.outdir_video, dirname, dirfile)
    else:
        dirname = shared.opts.outdir_video
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    filename = os.path.join(dirname, filename)
    filename = namegen.sequence(filename)
    filename = namegen.sanitize(filename)
    return filename


def save_params(p, filename: str = None):
    from modules.paths import params_path
    if p is None:
        dct = {}
    else:
        dct = {
            "Prompt": p.prompt,
            "Negative prompt": p.negative_prompt,
            "Steps": p.steps,
            "Sampler": p.sampler_name,
            "Seed": p.seed,
            "Engine": p.video_engine,
            "Model": p.video_model,
            "Frames": p.frames,
            "Size": f"{p.width}x{p.height}",
            "Styles": ','.join(p.styles) if isinstance(p.styles, list) else p.styles,
        }
    params = ', '.join([f'{k}: {v}' for k, v in dct.items() if v is not None and v != ''])
    fn = filename if filename is not None else params_path
    with open(fn, "w", encoding="utf8") as file:
        file.write(params)


def images_to_tensor(images):
    if images is None or len(images) == 0:
        return None
    array = [torch.from_numpy(np.array(image)) for image in images]
    tensor = torch.stack(array, dim=0) # n h w c
    tensor = tensor.unsqueeze(0) # 1, n, h, w, c
    tensor = tensor.permute(0, 4, 1, 2, 3).contiguous() # 1, c, n, h, w
    tensor = (tensor.float() / 127.5) - 1.0 # from [0,255] to [-1,1]
    # shared.log.debug(f'Video output: images={len(images)} tensor={tensor.shape}')
    return tensor


def numpy_to_tensor(images):
    if images is None or len(images) == 0:
        return None
    images = (2.0 * images) - 1.0 # from [0,1] to [-1,1]
    array = [torch.from_numpy(images[i]) for i in range(images.shape[0])]
    tensor = torch.stack(array, dim=0) # n h w c
    tensor = tensor.unsqueeze(0) # 1, n, h, w, c
    tensor = tensor.permute(0, 4, 1, 2, 3).contiguous() # 1, c, n, h, w
    # shared.log.debug(f'Video output: images={len(images)} tensor={tensor.shape}')
    return tensor
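

# Shape walkthrough for the converters above (a sketch with hypothetical sizes;
# PIL frames are assumed to be RGB, so the channel dimension is 3):
#   frames = [Image.new('RGB', (640, 480))] * 8 # 8 blank 640x480 frames
#   t = images_to_tensor(frames)
#   assert t.shape == (1, 3, 8, 480, 640) # 1, c, n, h, w
#   assert t.min() >= -1.0 and t.max() <= 1.0 # rescaled from [0,255]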


def write_audio(
        container,
        samples: torch.Tensor,
        audio_sample_rate: int,
) -> None:
    av = check_av()
    # create stream
    audio_options = {
        'time_base': f'1/{audio_sample_rate}'
    }
    audio_stream = container.add_stream("aac", rate=audio_sample_rate, options=audio_options)
    audio_stream.codec_context.sample_rate = audio_sample_rate
    audio_stream.codec_context.layout = "stereo"
    audio_stream.codec_context.format = "fltp"
    audio_stream.codec_context.time_base = Fraction(1, audio_sample_rate)
    # audio_stream.time_base = audio_stream.codec_context.time_base # TODO audio set time-base
    shared.log.debug(f'Audio: codec={audio_stream.codec_context.name} rate={audio_stream.codec_context.sample_rate} layout={audio_stream.codec_context.layout} format={audio_stream.codec_context.format} base={audio_stream.codec_context.time_base}')
    # init input samples
    if samples.ndim == 1:
        samples = samples[:, None]
    if samples.shape[1] != 2 and samples.shape[0] == 2:
        samples = samples.T
    if samples.shape[1] != 2:
        raise ValueError(f"Expected samples with 2 channels; got shape {samples.shape}.")
    if samples.dtype != torch.int16:
        samples = torch.clip(samples, -1.0, 1.0)
        samples = (samples * 32767.0).to(torch.int16)
    audio_frames = av.AudioFrame.from_ndarray(
        samples.contiguous().reshape(1, -1).cpu().numpy(),
        format="s16",
        layout="stereo",
    )
    audio_frames.sample_rate = audio_sample_rate
    # init resampler
    audio_resampler = av.audio.resampler.AudioResampler(
        format=audio_stream.codec_context.format,
        layout=audio_stream.codec_context.layout,
        rate=audio_stream.codec_context.sample_rate,
    )
    # resample
    pts = 0
    for resampled in audio_resampler.resample(audio_frames):
        resampled.pts = resampled.pts or 0
        resampled.sample_rate = audio_frames.sample_rate
        packets = audio_stream.encode(resampled)
        for packet in packets:
            container.mux(packet)
        pts += resampled.samples
    # flush audio encoder
    for packet in audio_stream.encode():
        container.mux(packet)


def atomic_save_video(filename: str,
                      tensor:torch.Tensor,
                      audio:torch.Tensor=None,
                      fps:float=24,
                      codec:str='libx264',
                      pix_fmt:str='yuv420p',
                      options:str='',
                      aac:int=24000,
                      metadata:dict={},
                      pbar=None,
                      ):
    av = check_av()
    if av is None or av is False:
        shared.log.error('Video: ffmpeg/av not available')
        return
    savejob = shared.state.begin('Save video')
    frames, height, width, _channels = tensor.shape
    rate = round(fps)
    options_str = options
    options = {}
    for option in [option.strip() for option in options_str.split(',')]:
        if '=' in option:
            key, value = option.split('=', 1)
        elif ':' in option:
            key, value = option.split(':', 1)
        else:
            continue
        options[key.strip()] = value.strip()
    shared.log.info(f'Video: file="{filename}" codec={codec} frames={frames} width={width} height={height} fps={rate} audio={audio is not None} aac={aac} options={options}')
    video_array = torch.as_tensor(tensor, dtype=torch.uint8).numpy(force=True)
    task = pbar.add_task('encoding', total=frames) if pbar is not None else None
    if task is not None:
        pbar.update(task, description='video encoding')
    with av.open(filename, mode="w") as container:
        for k, v in metadata.items():
            container.metadata[k] = v
        stream: av.VideoStream = container.add_stream(codec, rate=rate, options=options)
        stream.width = video_array.shape[2]
        stream.height = video_array.shape[1]
        stream.pix_fmt = pix_fmt
        for img in video_array:
            frame = av.VideoFrame.from_ndarray(img, format="rgb24")
            for packet in stream.encode_lazy(frame):
                container.mux(packet)
            if task is not None:
                pbar.update(task, advance=1)
        for packet in stream.encode(): # flush
            container.mux(packet)
        if audio is not None:
            try:
                write_audio(container, audio, aac)
            except Exception as e:
                shared.log.error(f'Video audio encoding: {e}')
                errors.display(e, 'Audio')
    shared.state.outputs(filename)
    shared.state.end(savejob)
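

# Encoder options sketch: the `options` string parsed by atomic_save_video is a
# comma-separated list of key=value (or key:value) pairs forwarded to the codec
# via av's add_stream, e.g. (values hypothetical; frames_thwc is a t,h,w,c
# uint8 tensor):
#   atomic_save_video('/tmp/out.mp4', tensor=frames_thwc, fps=24,
#                     codec='libx264', options='crf=18, preset=slow')
# parses to options={'crf': '18', 'preset': 'slow'} before encoding.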


def save_video(p:processing.StableDiffusionProcessingVideo,
               pixels:torch.Tensor=None,
               audio:torch.Tensor=None,
               binary:bytes=None,
               mp4_fps:int=24,
               mp4_codec:str='libx264',
               mp4_opt:str='',
               mp4_ext:str='mp4',
               mp4_sf:bool=False, # save safetensors
               mp4_video:bool=True, # save video
               mp4_frames:bool=False, # save frames
               mp4_interpolate:int=0, # rife interpolation
               aac_sample_rate:int=24000, # audio sample rate
               stream=None, # async progress reporting stream
               metadata:dict={}, # metadata for video
               pbar=None, # progress bar for video
               ):
    output_video = None
    if binary is not None:
        output_filename = get_video_filename(p)
        output_video = f'{output_filename}.{mp4_ext}'
        try:
            with open(output_video, 'wb') as f:
                f.write(binary)
            shared.log.info(f'Video output: file="{output_video}" size={len(binary)}')
            shared.state.outputs(output_video)
        except Exception as e:
            shared.log.error(f'Video output: file="{output_video}" write error {e}')
            errors.display(e, 'video')
        return 0, output_video
    if pixels is None:
        return 0, output_video
    if isinstance(pixels, np.ndarray):
        pixels = numpy_to_tensor(pixels)
    if isinstance(pixels, list) and isinstance(pixels[0], Image.Image):
        pixels = images_to_tensor(pixels)
    if not torch.is_tensor(pixels):
        shared.log.error(f'Video: type={type(pixels)} not a tensor')
        return 0, output_video
    t_save = time.time()
    n, _c, t, h, w = pixels.shape
    size = pixels.element_size() * pixels.numel()
    shared.log.debug(f'Video: video={mp4_video} export={mp4_frames} safetensors={mp4_sf} interpolate={mp4_interpolate}')
    shared.log.debug(f'Video: encode={t} raw={size} latent={pixels.shape} audio={audio.shape if audio is not None else None} fps={mp4_fps} codec={mp4_codec} ext={mp4_ext} options="{mp4_opt}"')
    try:
        preparejob = shared.state.begin('Prepare video')
        if stream is not None:
            stream.output_queue.push(('progress', (None, 'Saving video...')))
        if mp4_interpolate > 0:
            x = pixels.squeeze(0).permute(1, 0, 2, 3)
            interpolated = rife.interpolate_nchw(x, count=mp4_interpolate+1)
            pixels = torch.stack(interpolated, dim=0)
            pixels = pixels.permute(1, 2, 0, 3, 4)
            n, _c, t, h, w = pixels.shape
        x = torch.clamp(pixels.float(), -1., 1.) * 127.5 + 127.5 # from [-1,1] to [0,255]
        x = x.detach().cpu().to(torch.uint8)
        x = einops.rearrange(x, '(m n) c t h w -> t (m h) (n w) c', n=n)
        x = x.contiguous()
        output_filename = get_video_filename(p)
        if shared.opts.save_txt:
            save_params(p, f'{output_filename}.txt')
        save_params(p)
        if mp4_sf:
            fn = f'{output_filename}.safetensors'
            shared.log.info(f'Video export: file="{fn}" type=safetensors shape={x.shape}')
            from safetensors.torch import save_file
            shared.state.outputs(fn)
            save_file({ 'frames': x }, fn, metadata={'format': 'video', 'frames': str(t), 'width': str(w), 'height': str(h), 'fps': str(mp4_fps), 'codec': mp4_codec, 'options': mp4_opt, 'ext': mp4_ext, 'interpolate': str(mp4_interpolate)})
        if mp4_frames:
            shared.log.info(f'Video frames: files="{output_filename}-00000.jpg" frames={t} width={w} height={h}')
            for i in range(t):
                image = cv2.cvtColor(x[i].numpy(), cv2.COLOR_RGB2BGR)
                fn = f'{output_filename}-{i:05d}.jpg'
                shared.state.outputs(fn)
                cv2.imwrite(fn, image)
        shared.state.end(preparejob)
        if mp4_video and (mp4_codec != 'none'):
            output_video = f'{output_filename}.{mp4_ext}'
            atomic_save_video(output_video, tensor=x, audio=audio, fps=mp4_fps, codec=mp4_codec, options=mp4_opt, aac=aac_sample_rate, metadata=metadata, pbar=pbar)
            if stream is not None:
                stream.output_queue.push(('progress', (None, f'Video {os.path.basename(output_video)} | Codec {mp4_codec} | Size {w}x{h}x{t} | FPS {mp4_fps}')))
                stream.output_queue.push(('file', output_video))
        else:
            if stream is not None:
                stream.output_queue.push(('progress', (None, '')))
    except Exception as e:
        shared.log.error(f'Video save: raw={size} {e}')
        errors.display(e, 'video')
    timer.process.add('save', time.time()-t_save)
    return t, output_video
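

# Example end-to-end call (a minimal sketch, not part of the module API; it
# assumes the host app has initialized `shared.opts` and supplies a processing
# object `p`; the frame source and parameter values are hypothetical):
#   import glob
#   frames = [Image.open(f) for f in sorted(glob.glob('frames/*.png'))]
#   pixels = images_to_tensor(frames) # 1, c, n, h, w in [-1,1]
#   count, path = save_video(p, pixels=pixels, mp4_fps=24, mp4_codec='libx264', mp4_opt='crf=18')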