diff --git a/bitmind/__init__.py b/bitmind/__init__.py
index 09fc5e90..b0552aa1 100644
--- a/bitmind/__init__.py
+++ b/bitmind/__init__.py
@@ -18,7 +18,7 @@
 # DEALINGS IN THE SOFTWARE.

-__version__ = "2.2.8"
+__version__ = "2.2.9"
 version_split = __version__.split(".")
 __spec_version__ = (
     (1000 * int(version_split[0]))
diff --git a/bitmind/synthetic_data_generation/prompt_generator.py b/bitmind/synthetic_data_generation/prompt_generator.py
index a8b8bc09..cd7823e6 100644
--- a/bitmind/synthetic_data_generation/prompt_generator.py
+++ b/bitmind/synthetic_data_generation/prompt_generator.py
@@ -118,6 +118,7 @@ def clear_gpu(self) -> None:
     def generate(
         self,
         image: Image.Image,
+        task: Optional[str] = None,
         max_new_tokens: int = 20,
         verbose: bool = False
     ) -> str:
@@ -127,6 +128,8 @@ def generate(

         Args:
             image: The image for which the description is to be generated.
+            task: The generation task ('t2i', 't2v', 'i2i', 'i2v'). If video task,
+                motion descriptions will be added.
             max_new_tokens: The maximum number of tokens to generate for each prompt.
             verbose: If True, additional logging information is printed.

@@ -185,7 +188,10 @@ def generate(
             description += '.'

         moderated_description = self.moderate(description)
-        return self.enhance(moderated_description)
+
+        if task in ['t2v', 'i2v']:
+            return self.enhance(moderated_description)
+        return moderated_description

     def moderate(self, description: str, max_new_tokens: int = 80) -> str:
         """
@@ -233,34 +239,28 @@ def enhance(self, description: str, max_new_tokens: int = 80) -> str:
         """
         Enhance a static image description to make it suitable for video
         generation by adding dynamic elements and motion.
-
+
         Args:
             description: The static image description to enhance.
             max_new_tokens: Maximum number of new tokens to generate in the enhanced text.
-
+
         Returns:
-            An enhanced description suitable for video generation, or the original
-            description if enhancement fails.
+            An enhanced description suitable for video generation.
         """
         messages = [
             {
                 "role": "system",
                 "content": (
-                    "[INST]You are an expert at converting static image descriptions "
-                    "into dynamic video prompts. Enhance the given description by "
-                    "adding natural motion and temporal elements while preserving the "
-                    "core scene. Follow these rules:\n"
-                    "1. Maintain the essential elements of the original description\n"
-                    "2. Add smooth, continuous motions that work well in video\n"
-                    "3. For portraits: Add natural facial movements or expressions\n"
-                    "4. For non-portrait images with people: Add contextually appropriate "
-                    "actions (e.g., for a beach scene, people might be walking along "
-                    "the shoreline or playing in the waves; for a cafe scene, people "
-                    "might be sipping drinks or engaging in conversation)\n"
-                    "5. For landscapes: Add environmental motion like wind or water\n"
-                    "6. For urban scenes: Add dynamic elements like people or traffic\n"
-                    "7. Keep the description concise but descriptive\n"
-                    "8. Focus on gradual, natural transitions\n"
+                    "[INST]You are an expert at converting image descriptions into video prompts. "
+                    "Analyze the existing motion in the scene and enhance it naturally:\n"
+                    "1. If motion exists in the image (falling, throwing, running, etc.):\n"
+                    "   - Maintain and emphasize that existing motion\n"
+                    "   - Add smooth continuation of the movement\n"
+                    "2. If the subject is static (sitting, standing, placed):\n"
+                    "   - Keep it stable\n"
+                    "   - Add minimal environmental motion if appropriate\n"
+                    "3. Add ONE subtle camera motion that complements the scene\n"
+                    "4. Keep the description concise and natural\n"
                     "Only respond with the enhanced description.[/INST]"
                 )
             },
@@ -280,5 +280,5 @@ def enhance(self, description: str, max_new_tokens: int = 80) -> str:
             return enhanced_text[0]['generated_text']
         except Exception as e:
-            print(f"An error occurred during motion enhancement: {e}")
-            return description
+            bt.logging.error(f"An error occurred during motion enhancement: {e}")
+            return description
\ No newline at end of file
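Note on the prompt_generator.py change above: `generate()` now threads a `task` argument through to decide whether the moderated caption gets the motion-enhancement pass, so image tasks keep a static caption while video tasks ('t2v', 'i2v') get motion added. A minimal sketch of that gating, with stand-in logic in place of the real `moderate()`/`enhance()` LLM pipelines (the string handling here is illustrative only):

```python
# Minimal sketch of the task-gated caption flow; the real PromptGenerator wraps
# LLM pipelines, so moderate()/enhance() are replaced with simple stand-ins here.
from typing import Optional

VIDEO_TASKS = {"t2v", "i2v"}

def build_prompt(description: str, task: Optional[str] = None) -> str:
    moderated = description.strip()  # stand-in for self.moderate(description)
    if task in VIDEO_TASKS:
        # stand-in for self.enhance(moderated): motion is added only for video tasks
        return moderated + " The camera slowly pans across the scene."
    return moderated  # image tasks ('t2i', 'i2i', None) keep the static caption

print(build_prompt("A lighthouse on a rocky coast", task="i2v"))
print(build_prompt("A lighthouse on a rocky coast", task="t2i"))
```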
diff --git a/bitmind/synthetic_data_generation/synthetic_data_generator.py b/bitmind/synthetic_data_generation/synthetic_data_generator.py
index 19b60607..45eab8e4 100644
--- a/bitmind/synthetic_data_generation/synthetic_data_generator.py
+++ b/bitmind/synthetic_data_generation/synthetic_data_generator.py
@@ -24,6 +24,7 @@
     T2V_MODEL_NAMES,
     T2I_MODEL_NAMES,
     I2I_MODEL_NAMES,
+    I2V_MODEL_NAMES,
     TARGET_IMAGE_SIZE,
     select_random_model,
     get_task,
@@ -152,7 +153,12 @@ def batch_generate(self, batch_size: int = 5) -> None:
             image_sample = self.image_cache.sample()
             images.append(image_sample['image'])
             bt.logging.info(f"Sampled image {i+1}/{batch_size} for captioning: {image_sample['path']}")
-            prompts.append(self.generate_prompt(image=image_sample['image'], clear_gpu=i==batch_size-1))
+            task = get_task(self.model_name) if self.model_name else None
+            prompts.append(self.generate_prompt(
+                image=image_sample['image'],
+                clear_gpu=i==batch_size-1,
+                task=task
+            ))
             bt.logging.info(f"Caption {i+1}/{batch_size} generated: {prompts[-1]}")

         # If specific model is set, use only that model
@@ -163,9 +169,12 @@ def batch_generate(self, batch_size: int = 5) -> None:
             i2i_model_names = random.sample(I2I_MODEL_NAMES, len(I2I_MODEL_NAMES))
             t2i_model_names = random.sample(T2I_MODEL_NAMES, len(T2I_MODEL_NAMES))
             t2v_model_names = random.sample(T2V_MODEL_NAMES, len(T2V_MODEL_NAMES))
+            i2v_model_names = random.sample(I2V_MODEL_NAMES, len(I2V_MODEL_NAMES))
+
             model_names = [
-                m for triple in zip_longest(t2v_model_names, t2i_model_names, i2i_model_names)
-                for m in triple if m is not None
+                m for quad in zip_longest(t2v_model_names, t2i_model_names,
+                                          i2i_model_names, i2v_model_names)
+                for m in quad if m is not None
             ]

         # Generate for each model/prompt combination
@@ -222,7 +231,7 @@ def generate(
             ValueError: If real_image is None when using annotation prompt type.
             NotImplementedError: If prompt type is not supported.
         """
-        prompt = self.generate_prompt(image, clear_gpu=True)
+        prompt = self.generate_prompt(image, clear_gpu=True, task=task)
         bt.logging.info("Generating synthetic data...")
         gen_data = self._run_generation(prompt, task, model_name, image)
         self.clear_gpu()
@@ -231,7 +240,8 @@ def generate(
     def generate_prompt(
         self,
         image: Optional[Image.Image] = None,
-        clear_gpu: bool = True
+        clear_gpu: bool = True,
+        task: Optional[str] = None
     ) -> str:
         """Generate a prompt based on the specified strategy."""
         bt.logging.info("Generating prompt")
@@ -241,7 +251,7 @@ def generate_prompt(
                     "image can't be None if self.prompt_type is 'annotation'"
                 )
             self.prompt_generator.load_models()
-            prompt = self.prompt_generator.generate(image)
+            prompt = self.prompt_generator.generate(image, task=task)
            if clear_gpu:
                 self.prompt_generator.clear_gpu()
         else:
@@ -261,9 +271,9 @@ def _run_generation(

         Args:
             prompt: The text prompt used to inspire the generation.
-            task: The generation task type ('t2i', 't2v', 'i2i', or None).
+            task: The generation task type ('t2i', 't2v', 'i2i', 'i2v', or None).
             model_name: Optional model name to use for generation.
-            image: Optional input image for image-to-image generation.
+            image: Optional input image for image-to-image or image-to-video generation.
             generate_at_target_size: If True, generate at TARGET_IMAGE_SIZE dimensions.

         Returns:
@@ -272,6 +282,10 @@ def _run_generation(
         Raises:
             RuntimeError: If generation fails.
         """
+        # Clear CUDA cache before loading model
+        torch.cuda.empty_cache()
+        gc.collect()
+
         self.load_model(model_name)
         model_config = MODELS[self.model_name]
         task = get_task(model_name) if task is None else task
@@ -289,14 +303,38 @@ def _run_generation(
             gen_args['mask_image'], mask_center = create_random_mask(image.size)
             gen_args['image'] = image
+        # prep image-to-video generation args
+        elif task == 'i2v':
+            if image is None:
+                raise ValueError("image cannot be None for image-to-video generation")
+            # Get target size from gen_args if specified, otherwise use default
+            target_size = (
+                gen_args.get('height', 768),
+                gen_args.get('width', 768)
+            )
+            if image.size[0] > target_size[0] or image.size[1] > target_size[1]:
+                image = image.resize(target_size, Image.Resampling.LANCZOS)
+            gen_args['image'] = image

         # Prepare generation arguments
         for k, v in gen_args.items():
             if isinstance(v, dict):
                 if "min" in v and "max" in v:
-                    gen_args[k] = np.random.randint(v['min'], v['max'])
+                    # For i2v, use minimum values to save memory
+                    if task == 'i2v':
+                        gen_args[k] = v['min']
+                    else:
+                        gen_args[k] = np.random.randint(v['min'], v['max'])
                 if "options" in v:
                     gen_args[k] = random.choice(v['options'])
+            # Ensure num_frames is always an integer
+            if k == 'num_frames' and isinstance(v, dict):
+                if "min" in v:
+                    gen_args[k] = v['min']
+                elif "max" in v:
+                    gen_args[k] = v['max']
+                else:
+                    gen_args[k] = 24  # Default value

         try:
             if generate_at_target_size:
@@ -307,6 +345,10 @@ def _run_generation(
                 gen_args['width'] = gen_args['resolution'][1]
                 del gen_args['resolution']

+            # Ensure num_frames is an integer before generation
+            if 'num_frames' in gen_args:
+                gen_args['num_frames'] = int(gen_args['num_frames'])
+
             truncated_prompt = truncate_prompt_if_too_long(prompt, self.model)
             bt.logging.info(f"Generating media from prompt: {truncated_prompt}")
             bt.logging.info(f"Generation args: {gen_args}")
@@ -321,8 +363,14 @@ def _run_generation(
                 pretrained_args = model_config.get('from_pretrained_args', {})
                 torch_dtype = pretrained_args.get('torch_dtype', torch.bfloat16)
                 with torch.autocast(self.device, torch_dtype, cache_enabled=False):
+                    # Clear CUDA cache before generation
+                    torch.cuda.empty_cache()
+                    gc.collect()
                     gen_output = generate(truncated_prompt, **gen_args)
             else:
+                # Clear CUDA cache before generation
+                torch.cuda.empty_cache()
+                gc.collect()
                 gen_output = generate(truncated_prompt, **gen_args)
             gen_time = time.time() - start_time

@@ -334,6 +382,8 @@ def _run_generation(
                 f"default dimensions. Error: {e}"
             )
             try:
+                # Clear CUDA cache before retry
+                torch.cuda.empty_cache()
                 gen_output = self.model(prompt=truncated_prompt)
                 gen_time = time.time() - start_time
             except Exception as fallback_error:
@@ -461,5 +511,4 @@ def clear_gpu(self) -> None:
             del self.model
             self.model = None
         gc.collect()
-        torch.cuda.empty_cache()
-
+        torch.cuda.empty_cache()
\ No newline at end of file
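The argument-resolution changes in `_run_generation` above do two things: ranged args (dicts with "min"/"max") collapse to their minimum when the task is 'i2v' to keep VRAM use down, and `num_frames` is coerced to a plain int before the pipeline call. A self-contained sketch of that resolution step; the example `gen_args` values are hypothetical, not taken from the repo's model configs:

```python
# Sketch of the per-task resolution of ranged generation args; mirrors the
# logic added to _run_generation, but operates on a copy for clarity.
import random
import numpy as np

def resolve_gen_args(gen_args: dict, task: str) -> dict:
    resolved = dict(gen_args)
    for k, v in gen_args.items():
        if isinstance(v, dict):
            if "min" in v and "max" in v:
                # i2v runs at the low end of the range to save memory
                resolved[k] = v["min"] if task == "i2v" else int(np.random.randint(v["min"], v["max"]))
            if "options" in v:
                resolved[k] = random.choice(v["options"])
    if "num_frames" in resolved:
        resolved["num_frames"] = int(resolved["num_frames"])  # pipelines expect a plain int
    return resolved

print(resolve_gen_args(
    {"num_inference_steps": {"min": 40, "max": 60}, "num_frames": {"min": 24, "max": 48}},
    task="i2v"
))
```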
diff --git a/bitmind/validator/challenge.py b/bitmind/validator/challenge.py
index 1466cbd2..9cca7724 100644
--- a/bitmind/validator/challenge.py
+++ b/bitmind/validator/challenge.py
@@ -13,7 +13,7 @@
 from bitmind.utils.uids import get_random_uids
 from bitmind.validator.reward import get_rewards
 from bitmind.validator.config import (
-    TARGET_IMAGE_SIZE, 
+    TARGET_IMAGE_SIZE,
     MIN_FRAMES,
     MAX_FRAMES,
     P_STITCH,
@@ -136,7 +136,7 @@ def sample_video_frames(self, video_cache):
             sample['video'] = sample_A['video'] + sample_B['video']

         return sample
-    
+
     def process_metadata(self, sample) -> bool:
         """Prepare challenge metadata and media for logging to Weights & Biases """
         self.metadata = {
@@ -179,4 +179,4 @@ def create_wandb_video(video_frames, fps):
         except Exception as e:
             bt.logging.error(e)
             bt.logging.error(f"{self.modality} is truncated or corrupt. Challenge skipped.")
-            return False
+            return False
\ No newline at end of file
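The challenge.py hunks above are whitespace cleanup, but the surrounding context (the MIN_FRAMES / MAX_FRAMES / P_STITCH imports and the `sample_A['video'] + sample_B['video']` line) shows that video challenges can be stitched from two sampled clips. A toy sketch of that idea; the constants, helper names, and sampling policy here are assumptions, not the repo's actual implementation:

```python
# Illustrative only: the real sample_video_frames pulls clips from the
# validator's video cache; values and helper names below are placeholders.
import random

MIN_FRAMES, MAX_FRAMES, P_STITCH = 8, 24, 0.5  # placeholder values

def sample_video(sample_clip) -> dict:
    """Return one clip, or two clips concatenated with probability P_STITCH."""
    num_frames = random.randint(MIN_FRAMES, MAX_FRAMES)
    sample = sample_clip(num_frames)
    if random.random() < P_STITCH:
        sample_b = sample_clip(num_frames)
        sample = {'video': sample['video'] + sample_b['video']}
    return sample

# toy clip source: each "frame" is just an index
print(len(sample_video(lambda n: {'video': list(range(n))})['video']))
```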
diff --git a/bitmind/validator/config.py b/bitmind/validator/config.py
index 9d7e3333..fc56eefb 100644
--- a/bitmind/validator/config.py
+++ b/bitmind/validator/config.py
@@ -17,7 +17,9 @@
     EulerDiscreteScheduler,
     DEISMultistepScheduler,
     AutoPipelineForInpainting,
-    StableDiffusionInpaintPipeline
+    StableDiffusionInpaintPipeline,
+    CogView4Pipeline,
+    CogVideoXImageToVideoPipeline
 )

 from .model_utils import (
@@ -114,7 +116,8 @@ class Modality(StrEnum):
         {"path": "bitmind/lfw"},
         {"path": "bitmind/caltech-256"},
         {"path": "bitmind/caltech-101"},
-        {"path": "bitmind/dtd"}
+        {"path": "bitmind/dtd"},
+        {"path": "bitmind/idoc-mugshots-images"}
     ],
     "semisynthetic": [
         {"path": "bitmind/face-swap"}
     ],
@@ -143,6 +146,21 @@ class Modality(StrEnum):

 # Text-to-image model configurations
 T2I_MODELS: Dict[str, Dict[str, Any]] = {
+    "THUDM/CogView4-6B": {
+        "pipeline_cls": CogView4Pipeline,
+        "from_pretrained_args": {
+            "torch_dtype": torch.bfloat16,
+            "use_safetensors": True
+        },
+        "generate_args": {
+            "guidance_scale": 3.5,
+            "num_images_per_prompt": 1,
+            "num_inference_steps": 50,
+            "width": 512,
+            "height": 512
+        },
+        "use_autocast": False
+    },
     "stabilityai/stable-diffusion-xl-base-1.0": {
         "pipeline_cls": StableDiffusionXLPipeline,
         "from_pretrained_args": {
@@ -304,12 +322,6 @@ class Modality(StrEnum):
         "scheduler": {
             "cls": DEISMultistepScheduler
         }
-    },
-    "stable-diffusion-v1-5/stable-diffusion-inpainting": {
-        "pipeline_cls": StableDiffusionInpaintPipeline,
-        "generate_args": {
-            "num_inference_steps": {"min": 40, "max": 60},
-        }
     }
 }
 I2I_MODEL_NAMES: List[str] = list(I2I_MODELS.keys())
@@ -407,13 +419,16 @@ class Modality(StrEnum):
 }
 T2V_MODEL_NAMES: List[str] = list(T2V_MODELS.keys())

+# Image-to-video model configurations
+I2V_MODELS: Dict[str, Dict[str, Any]] = {}
+I2V_MODEL_NAMES: List[str] = list(I2V_MODELS.keys())
+
 # Combined model configurations
-MODELS: Dict[str, Dict[str, Any]] = {**T2I_MODELS, **I2I_MODELS, **T2V_MODELS}
+MODELS: Dict[str, Dict[str, Any]] = {**T2I_MODELS, **I2I_MODELS, **T2V_MODELS, **I2V_MODELS}
 MODEL_NAMES: List[str] = list(MODELS.keys())

-
 def get_modality(model_name):
-    if model_name in T2V_MODEL_NAMES:
+    if model_name in T2V_MODEL_NAMES + I2V_MODEL_NAMES:
         return Modality.VIDEO
     elif model_name in T2I_MODEL_NAMES + I2I_MODEL_NAMES:
         return Modality.IMAGE
@@ -421,7 +436,7 @@ def get_modality(model_name):
 def get_output_media_type(model_name):
     if model_name in I2I_MODEL_NAMES:
         return MediaType.SEMISYNTHETIC
-    elif model_name in T2I_MODEL_NAMES + T2V_MODEL_NAMES:
+    elif model_name in T2I_MODEL_NAMES + T2V_MODEL_NAMES + I2V_MODEL_NAMES:
         return MediaType.SYNTHETIC

 def get_task(model_name):
@@ -431,15 +446,17 @@ def get_task(model_name):
         return 't2i'
     elif model_name in I2I_MODEL_NAMES:
         return 'i2i'
+    elif model_name in I2V_MODEL_NAMES:
+        return 'i2v'

 def select_random_model(task: Optional[str] = None) -> str:
     """
-    Select a random text-to-image or text-to-video model based on the specified
+    Select a random text-to-image, text-to-video, image-to-image, or image-to-video model based on the specified
     modality.

     Args:
-        modality: The type of model to select ('t2v', 't2i', 'i2i', or 'random').
+        modality: The type of model to select ('t2v', 't2i', 'i2i', 'i2v', or 'random').
             If None or 'random', randomly chooses between the valid options

     Returns:
@@ -457,5 +474,9 @@ def select_random_model(task: Optional[str] = None) -> str:
         return np.random.choice(T2V_MODEL_NAMES)
     elif task == 'i2i':
         return np.random.choice(I2I_MODEL_NAMES)
+    elif task == 'i2v':
+        if not I2V_MODEL_NAMES:
+            raise NotImplementedError("I2V models are not currently configured")
+        return np.random.choice(I2V_MODEL_NAMES)
     else:
         raise NotImplementedError(f"Unsupported task: {task}")
\ No newline at end of file
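config.py now imports CogVideoXImageToVideoPipeline and routes the (currently empty) I2V_MODELS dict through MODELS, get_modality, get_output_media_type, get_task, and select_random_model, so enabling image-to-video generation only requires adding an entry. A hypothetical entry is sketched below; the model id and generation arguments are assumptions and are not part of this diff:

```python
# Hypothetical I2V entry, not included in this change set. As shipped,
# I2V_MODELS is empty and select_random_model('i2v') raises NotImplementedError.
import torch
from diffusers import CogVideoXImageToVideoPipeline

I2V_MODELS = {
    "THUDM/CogVideoX-5b-I2V": {  # assumed model id
        "pipeline_cls": CogVideoXImageToVideoPipeline,
        "from_pretrained_args": {
            "torch_dtype": torch.bfloat16,
            "use_safetensors": True
        },
        "generate_args": {
            "guidance_scale": 6.0,                        # assumed values
            "num_inference_steps": {"min": 30, "max": 50},
            "num_frames": 49,
            "height": 480,
            "width": 720
        },
        "use_autocast": False
    }
}
I2V_MODEL_NAMES = list(I2V_MODELS.keys())
```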
diff --git a/bitmind/validator/verify_models.py b/bitmind/validator/verify_models.py
index ff79c295..ff80c9e0 100644
--- a/bitmind/validator/verify_models.py
+++ b/bitmind/validator/verify_models.py
@@ -23,10 +23,10 @@ def is_model_cached(model_name):

     # Check if the model directory exists
     if os.path.isdir(model_path):
-        bt.logging.info(f"{model_name} is in HF cache. Skipping....")
+        print(f"{model_name} is in HF cache. Skipping....")
        return True
     else:
-        bt.logging.info(f"{model_name} is not cached. Downloading....")
+        print(f"{model_name} is not cached. Downloading....")
         return False

diff --git a/requirements.txt b/requirements.txt
index eb68fdf4..947e04ef 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ scikit-learn==1.5.2

 # Deep learning tools
 transformers==4.48.0
-diffusers==0.32.2
+diffusers==0.33.1
 accelerate==1.2.0
 bitsandbytes==0.45.0
 sentencepiece==0.2.0
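verify_models.py only switches its cache-check messages from bt.logging.info to print. For reference, a standalone sketch of the same kind of check against the standard Hugging Face hub cache layout (~/.cache/huggingface/hub/models--{org}--{name}); the repo's own is_model_cached may compute model_path differently:

```python
# Sketch of an HF-cache presence check; assumes the default hub cache layout
# and HF_HOME fallback, which may differ from the repo's actual helper.
import os

def is_model_cached(model_name: str) -> bool:
    hf_home = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))
    model_path = os.path.join(hf_home, "hub", "models--" + model_name.replace("/", "--"))
    if os.path.isdir(model_path):
        print(f"{model_name} is in HF cache. Skipping....")
        return True
    print(f"{model_name} is not cached. Downloading....")
    return False

print(is_model_cached("THUDM/CogView4-6B"))
```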