diff --git a/.gitignore b/.gitignore index 20d2039c..fcdd2906 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ venv/ # Environment variables .env +# NOTE: per-skill .env files stay ignored; ship templates as .env.example instead # Configuration files config.yaml diff --git a/backend/src/gateway/routers/skills.py b/backend/src/gateway/routers/skills.py index 6d8f9767..caed96a8 100644 --- a/backend/src/gateway/routers/skills.py +++ b/backend/src/gateway/routers/skills.py @@ -108,7 +108,7 @@ class RemoteSkillBootstrapResponse(BaseModel): target_dir: str = Field(..., description="Virtual target directory") created_directories: int = Field(..., description="Number of created directories") created_files: int = Field(..., description="Number of created files") - sandbox_id: str = Field(..., description="Acquired sandbox ID") + sandbox_id: str | None = Field(default=None, description="Acquired sandbox ID (null when sandbox is not acquired)") message: str = Field(..., description="Operation result message") @@ -568,8 +568,8 @@ async def bootstrap_skill_from_remote(request: RemoteSkillBootstrapRequest) -> R """Initialize thread skill directory from remote YAML content service.""" try: # 1) Ensure sandbox and thread personal dirs are initialized first. - sandbox_provider = get_sandbox_provider() - sandbox_id = sandbox_provider.acquire(request.thread_id) + # sandbox_provider = get_sandbox_provider() + # sandbox_id = sandbox_provider.acquire(request.thread_id) # 2) Fetch YAML content from configured remote endpoint. 
cfg = get_gateway_config() @@ -624,7 +624,7 @@ async def bootstrap_skill_from_remote(request: RemoteSkillBootstrapRequest) -> R target_dir=request.target_dir, created_directories=len(parsed.directories), created_files=len(parsed.files), - sandbox_id=sandbox_id, + sandbox_id=None, message=( f"Bootstrapped {len(parsed.files)} files and {len(parsed.directories)} directories " f"under '{request.target_dir}'" diff --git a/backend/src/gateway/skill_yaml_importer.py b/backend/src/gateway/skill_yaml_importer.py index 5dc7742a..dc9688be 100644 --- a/backend/src/gateway/skill_yaml_importer.py +++ b/backend/src/gateway/skill_yaml_importer.py @@ -7,6 +7,9 @@ real filesystem content under a thread's virtual path (for example, from __future__ import annotations +import argparse +import json +import sys from dataclasses import dataclass from pathlib import Path @@ -328,4 +331,159 @@ def materialize_skill_tree(parsed: ParsedSkillTree, target_root: Path, clear_tar for rel_file, content in parsed.files.items(): file_path = target_root / rel_file file_path.parent.mkdir(parents=True, exist_ok=True) - file_path.write_text(content, encoding="utf-8") \ No newline at end of file + file_path.write_text(content, encoding="utf-8") + + +def _build_cli_parser() -> argparse.ArgumentParser: + """Build command-line argument parser. + + CLI usage: + python skill_yaml_importer.py [options] + + Positional arguments: + input_path Path to a YAML file, or a directory containing YAML files. + + Options: + --show-files Include parsed file paths in output. + --show-directories Include parsed directory paths in output. + --json Print JSON output instead of plain text. + --recursive Recursively scan subdirectories when input is a directory. + --log-file Save full report (summary + successes + failures) to JSON file. 
+ + Examples: + python skill_yaml_importer.py ./sample.yaml --json + python skill_yaml_importer.py ./generated_yaml --recursive --log-file ./parse_log.json + """ + parser = argparse.ArgumentParser(description="Parse and validate a skill YAML spec file") + parser.add_argument("input_path", help="Path to a YAML file or a directory containing YAML files") + parser.add_argument( + "--show-files", + action="store_true", + help="Print sorted parsed file paths", + ) + parser.add_argument( + "--show-directories", + action="store_true", + help="Print sorted parsed directory paths", + ) + parser.add_argument( + "--json", + action="store_true", + help="Print parsed summary as JSON", + ) + parser.add_argument( + "--recursive", + action="store_true", + help="When input path is a directory, scan YAML files recursively", + ) + parser.add_argument( + "--log-file", + default=None, + help="Optional path to save full execution results and summary as JSON", + ) + return parser + + +def _collect_yaml_files(input_path: Path, recursive: bool) -> list[Path]: + if input_path.is_file(): + return [input_path] + + if not input_path.is_dir(): + return [] + + patterns = ("*.yaml", "*.yml") + files: list[Path] = [] + for pattern in patterns: + iterator = input_path.rglob(pattern) if recursive else input_path.glob(pattern) + files.extend(iterator) + + # Stable order for deterministic output + return sorted({p.resolve() for p in files}) + + +def _parse_one_yaml_file(yaml_path: Path, show_files: bool, show_directories: bool) -> dict: + yaml_text = yaml_path.read_text(encoding="utf-8") + parsed = parse_skill_yaml_spec(yaml_text) + directories = sorted(parsed.directories) + files = sorted(parsed.files.keys()) + + return { + "yaml_file": str(yaml_path), + "directories_count": len(directories), + "files_count": len(files), + "directories": directories if show_directories else None, + "files": files if show_files else None, + } + + +def _main() -> int: + """CLI entrypoint for parsing one YAML file or a 
batch of YAML files. + + Exit codes: + 0: all files parsed successfully + 1: invalid input path or no YAML files found + 2: processing completed with one or more parse failures + """ + args = _build_cli_parser().parse_args() + + input_path = Path(args.input_path) + if not input_path.exists(): + print(f"Input path not found: {input_path}", file=sys.stderr) + return 1 + + yaml_files = _collect_yaml_files(input_path, recursive=args.recursive) + if not yaml_files: + print(f"No YAML files found under: {input_path}", file=sys.stderr) + return 1 + + successes: list[dict] = [] + failures: list[dict[str, str]] = [] + + for yaml_path in yaml_files: + try: + result = _parse_one_yaml_file( + yaml_path, + show_files=args.show_files, + show_directories=args.show_directories, + ) + successes.append(result) + if not args.json: + print(f"OK: {yaml_path}") + print(f" Directories: {result['directories_count']}") + print(f" Files: {result['files_count']}") + except Exception as e: # noqa: BLE001 + failures.append({"yaml_file": str(yaml_path), "error": str(e)}) + print(f"ERROR: {yaml_path}: {e}", file=sys.stderr) + + summary = { + "input_path": str(input_path), + "total": len(yaml_files), + "success": len(successes), + "failed": len(failures), + } + + report = {"summary": summary, "successes": successes, "failures": failures} + + if args.log_file: + try: + log_path = Path(args.log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + log_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Log saved: {log_path}") + except Exception as e: # noqa: BLE001 + print(f"Failed to write log file '{args.log_file}': {e}", file=sys.stderr) + + if args.json: + print(json.dumps(report, ensure_ascii=False, indent=2)) + else: + print("\n[Summary]") + print(f"Input: {summary['input_path']}") + print(f"Total: {summary['total']}") + print(f"Success: {summary['success']}") + print(f"Failed: {summary['failed']}") + + return 0 if not failures else 2 + + +if 
__name__ == "__main__": + raise SystemExit(_main()) \ No newline at end of file diff --git a/docker/docker-compose-dev.yaml b/docker/docker-compose-dev.yaml index c4b6649a..b512a290 100644 --- a/docker/docker-compose-dev.yaml +++ b/docker/docker-compose-dev.yaml @@ -166,6 +166,10 @@ services: - CI=true # Docker environment for aio sandbox - DOCKER_HOST=unix:///var/run/docker.sock + - LOG_LEVEL=DEBUG + - LANGGRAPH_DEBUG=true + - LANGCHAIN_DEBUG=true + - PYTHONUNBUFFERED=1 env_file: - ../.env extra_hosts: diff --git a/skills/public/image-generation/.env b/skills/public/image-generation/.env new file mode 100755 index 00000000..719fb3a7 --- /dev/null +++ b/skills/public/image-generation/.env @@ -0,0 +1,2 @@ +# RunningHub API Configuration +RUNNINGHUB_API_KEY=your_api_key_here \ No newline at end of file diff --git a/skills/public/image-generation/.env.example b/skills/public/image-generation/.env.example new file mode 100755 index 00000000..86474bd7 --- /dev/null +++ b/skills/public/image-generation/.env.example @@ -0,0 +1,6 @@ +# RunningHub API Configuration +# Copy this file to .env and fill in your actual API key + +# RunningHub API Key for image generation +# Get your API key from: https://www.runninghub.cn +RUNNINGHUB_API_KEY=your_api_key_here \ No newline at end of file diff --git a/skills/public/image-generation/.gitignore b/skills/public/image-generation/.gitignore new file mode 100755 index 00000000..cb609881 --- /dev/null +++ b/skills/public/image-generation/.gitignore @@ -0,0 +1,31 @@ +# Environment variables +.env +.env.local +.env.*.local + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +env/ +ENV/ + +# Output files +*.jpg +*.jpeg +*.png +*.webp +outputs/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/skills/public/image-generation/SKILL.md b/skills/public/image-generation/SKILL.md old mode 100644 new mode 100755 index d15cb63e..0e0dbed5 --- 
a/skills/public/image-generation/SKILL.md +++ b/skills/public/image-generation/SKILL.md @@ -7,14 +7,47 @@ description: Use this skill when the user requests to generate, create, imagine, ## Overview -This skill generates high-quality images using structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing image generation with optional reference images. +This skill generates high-quality images using RunningHub API with structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing image generation through asynchronous task submission. ## Core Capabilities - Create structured JSON prompts for AIGC image generation -- Support multiple reference images for style/composition guidance -- Generate images through automated Python script execution +- Generate images through RunningHub's Z-Image Turbo LoRA API +- Support asynchronous task submission and status polling - Handle various image generation scenarios (character design, scenes, products, etc.) +- Support multiple aspect ratios and output formats (PNG, JPEG, WEBP) + +## Configuration + +### API Key Setup + +This skill uses RunningHub API for image generation. You need to configure your API key before using the skill. + +**Option 1: Environment Variable (Recommended)** +```bash +# Set the RUNNINGHUB_API_KEY environment variable +export RUNNINGHUB_API_KEY=your_api_key_here + +# Or on Windows: +set RUNNINGHUB_API_KEY=your_api_key_here +``` + +**Option 2: .env File** +1. Copy `.env.example` to `.env`: + ```bash + cp .env.example .env + ``` +2. Edit `.env` and add your API key: + ``` + RUNNINGHUB_API_KEY=your_api_key_here + ``` +3. 
The `.env` file is automatically excluded from version control via `.gitignore` + +**Security Notes:** +- Never commit `.env` files to version control +- Never hardcode API keys in source code +- Rotate your API keys if they are accidentally exposed +- Get your API key from: https://www.runninghub.cn ## Workflow @@ -38,20 +71,20 @@ Call the Python script: ```bash python /mnt/skills/public/image-generation/scripts/generate.py \ --prompt-file /mnt/user-data/workspace/prompt-file.json \ - --reference-images /path/to/ref1.jpg /path/to/ref2.png \ - --output-file /mnt/user-data/outputs/generated-image.jpg + --output-file /mnt/user-data/outputs/generated-image.jpg \ --aspect-ratio 16:9 ``` Parameters: - `--prompt-file`: Absolute path to JSON prompt file (required) -- `--reference-images`: Absolute paths to reference images (optional, space-separated) - `--output-file`: Absolute path to output image file (required) - `--aspect-ratio`: Aspect ratio of the generated image (optional, default: 16:9) [!NOTE] -Do NOT read the python file, just call it with the parameters. 
+- The script uses RunningHub API which requires `RUNNINGHUB_API_KEY` environment variable to be set +- Do NOT read the python file, just call it with the parameters +- The script automatically handles task submission, status polling, and image download ## Character Generation Example @@ -86,40 +119,6 @@ python /mnt/skills/public/image-generation/scripts/generate.py \ --aspect-ratio 2:3 ``` -With reference images: -```json -{ - "characters": [{ - "gender": "based on [Image 1]", - "age": "based on [Image 1]", - "ethnicity": "human from [Image 1] adapted to Star Wars universe", - "body_type": "based on [Image 1]", - "facial_features": "matching [Image 1] with slight weathered look from space travel", - "clothing": "Star Wars style outfit - worn leather jacket with utility vest, cargo pants with tactical pouches, scuffed boots, belt with holster", - "accessories": "blaster pistol on hip, comlink device on wrist, goggles pushed up on forehead, satchel with supplies, personal vehicle based on [Image 2]", - "era": "Star Wars universe, post-Empire era" - }], - "prompt": "Character inspired by [Image 1] standing next to a vehicle inspired by [Image 2] on a bustling alien planet street in Star Wars universe aesthetic. Character wearing worn leather jacket with utility vest, cargo pants with tactical pouches, scuffed boots, belt with blaster holster. The vehicle adapted to Star Wars aesthetic with weathered metal panels, repulsor engines, desert dust covering, parked on the street. Exotic alien marketplace street with multi-level architecture, weathered metal structures, hanging market stalls with colorful awnings, alien species walking by as background characters. Twin suns casting warm golden light, atmospheric dust particles in air, moisture vaporators visible in distance. 
Gritty lived-in Star Wars aesthetic, practical effects look, film grain texture, cinematic composition.", - "negative_prompt": "clean futuristic look, sterile environment, overly CGI appearance, fantasy medieval elements, Earth architecture, modern city", - "style": "Star Wars original trilogy aesthetic, lived-in universe, practical effects inspired, cinematic film look, slightly desaturated with warm tones", - "composition": "medium wide shot, character in foreground with alien street extending into background, environmental storytelling, rule of thirds", - "lighting": "warm golden hour lighting from twin suns, rim lighting on character, atmospheric haze, practical light sources from market stalls", - "color_palette": "warm sandy tones, ochre and sienna, dusty blues, weathered metals, muted earth colors with pops of alien market colors", - "technical": { - "aspect_ratio": "9:16", - "quality": "high", - "detail_level": "highly detailed with film-like texture" - } -} -``` -```bash -python /mnt/skills/public/image-generation/scripts/generate.py \ - --prompt-file /mnt/user-data/workspace/star-wars-scene.json \ - --reference-images /mnt/user-data/uploads/character-ref.jpg /mnt/user-data/uploads/vehicle-ref.jpg \ - --output-file /mnt/user-data/outputs/star-wars-scene-01.jpg \ - --aspect-ratio 16:9 -``` - ## Common Scenarios Use different JSON schemas for different scenarios. @@ -158,30 +157,10 @@ After generation: - Provide brief description of the generation result - Offer to iterate if adjustments needed -## Tips: Enhancing Generation with Reference Images - -For scenarios where visual accuracy is critical, **use the `image_search` tool first** to find reference images before generation. 
- -**Recommended scenarios for using image_search tool:** -- **Character/Portrait Generation**: Search for similar poses, expressions, or styles to guide facial features and body proportions -- **Specific Objects or Products**: Find reference images of real objects to ensure accurate representation -- **Architectural or Environmental Scenes**: Search for location references to capture authentic details -- **Fashion and Clothing**: Find style references to ensure accurate garment details and styling - -**Example workflow:** -1. Call the `image_search` tool to find suitable reference images: - ``` - image_search(query="Japanese woman street photography 1990s", size="Large") - ``` -2. Download the returned image URLs to local files -3. Use the downloaded images as `--reference-images` parameter in the generation script - -This approach significantly improves generation quality by providing the model with concrete visual guidance rather than relying solely on text descriptions. - ## Notes - Always use English for prompts regardless of user's language - JSON format ensures structured, parsable prompts -- Reference images enhance generation quality significantly - Iterative refinement is normal for optimal results - For character generation, include the detailed character object plus a consolidated prompt field +- The script automatically polls task status and downloads the generated image diff --git a/skills/public/image-generation/scripts/generate.py b/skills/public/image-generation/scripts/generate.py old mode 100644 new mode 100755 index 9665fafb..65b44185 --- a/skills/public/image-generation/scripts/generate.py +++ b/skills/public/image-generation/scripts/generate.py @@ -1,8 +1,14 @@ import base64 +import json import os +import time +from typing import List import requests from PIL import Image +from dotenv import load_dotenv + +load_dotenv() def validate_image(image_path: str) -> bool: @@ -17,77 +23,171 @@ def validate_image(image_path: str) -> bool: """ try: with 
Image.open(image_path) as img: - img.verify() # Verify that it's a valid image - # Re-open to check if it can be fully loaded (verify() may not catch all issues) + img.verify() with Image.open(image_path) as img: - img.load() # Force load the image data + img.load() return True except Exception as e: print(f"Warning: Image '{image_path}' is invalid or corrupted: {e}") return False +def submit_generation_task(prompt: str, aspect_ratio: str = "16:9", output_format: str = "png") -> str: + """ + Submit image generation task to RunningHub API. + + Args: + prompt: Text prompt for image generation + aspect_ratio: Aspect ratio of the generated image + output_format: Output image format (png, jpeg, webp) + + Returns: + Task ID for tracking the generation + """ + api_key = os.getenv("RUNNINGHUB_API_KEY") + if not api_key: + raise Exception("RUNNINGHUB_API_KEY environment variable is not set") + + url = "https://www.runninghub.cn/openapi/v2/rhart-image/z-image/turbo-lora" + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + payload = { + "prompt": prompt, + "aspectRatio": aspect_ratio, + "lora_name": "Z-Image _ 清纯高颜值_脸模版V1.0.safetensors", + "lora_strength": 1, + "outputFormat": output_format + } + + response = requests.post(url, headers=headers, json=payload) + response.raise_for_status() + result = response.json() + + if result.get("status") not in ["QUEUED", "RUNNING", "SUCCESS"]: + raise Exception(f"Task submission failed: {result.get('errorMessage', 'Unknown error')}") + + return result.get("taskId") + + +def query_task_status(task_id: str) -> dict: + """ + Query the status of a generation task. 
+ + Args: + task_id: Task ID to query + + Returns: + Task status information + """ + api_key = os.getenv("RUNNINGHUB_API_KEY") + if not api_key: + raise Exception("RUNNINGHUB_API_KEY environment variable is not set") + + url = "https://www.runninghub.cn/openapi/v2/query" + + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", + } + + payload = { + "taskId": task_id + } + + response = requests.post(url, headers=headers, json=payload) + response.raise_for_status() + return response.json() + + +def download_image(url: str, output_path: str) -> None: + """ + Download image from URL and save to file. + + Args: + url: Image URL to download + output_path: Local path to save the image + """ + response = requests.get(url, stream=True) + response.raise_for_status() + + with open(output_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + def generate_image( prompt_file: str, - reference_images: list[str], + reference_images: List[str], output_file: str, aspect_ratio: str = "16:9", ) -> str: - with open(prompt_file, "r") as f: - prompt = f.read() - parts = [] - i = 0 + """ + Generate image using RunningHub API. - # Filter out invalid reference images - valid_reference_images = [] - for ref_img in reference_images: - if validate_image(ref_img): - valid_reference_images.append(ref_img) + Args: + prompt_file: Path to JSON prompt file + reference_images: List of reference image paths (currently not supported by RunningHub API) + output_file: Output path for generated image + aspect_ratio: Aspect ratio of the generated image + + Returns: + Success message with output file path + """ + with open(prompt_file, "r", encoding="utf-8") as f: + prompt_data = json.load(f) + + if reference_images: + print("Note: RunningHub API does not support reference images in this version. 
Reference images will be ignored.") + + prompt_text = prompt_data.get("prompt", "") + if not prompt_text: + prompt_text = json.dumps(prompt_data, ensure_ascii=False) + + output_format = "png" + if output_file.lower().endswith(".jpg") or output_file.lower().endswith(".jpeg"): + output_format = "jpeg" + elif output_file.lower().endswith(".webp"): + output_format = "webp" + + print(f"Submitting generation task...") + task_id = submit_generation_task(prompt_text, aspect_ratio, output_format) + print(f"Task submitted successfully. Task ID: {task_id}") + + max_retries = 60 + retry_interval = 2 + + for attempt in range(max_retries): + print(f"Checking task status... (Attempt {attempt + 1}/{max_retries})") + status_result = query_task_status(task_id) + status = status_result.get("status") + + if status == "SUCCESS": + print("Task completed successfully!") + results = status_result.get("results", []) + if results and len(results) > 0: + image_url = results[0].get("url") + if image_url: + print(f"Downloading image from: {image_url}") + download_image(image_url, output_file) + return f"Successfully generated image to {output_file}" + else: + raise Exception("No image URL found in task results") + else: + raise Exception("No results found in task response") + elif status == "FAILED": + error_msg = status_result.get("errorMessage", "Unknown error") + raise Exception(f"Task failed: {error_msg}") + elif status in ["QUEUED", "RUNNING"]: + print(f"Task status: {status}. 
Waiting...") + time.sleep(retry_interval) else: - print(f"Skipping invalid reference image: {ref_img}") + raise Exception(f"Unknown task status: {status}") - if len(valid_reference_images) < len(reference_images): - print(f"Note: {len(reference_images) - len(valid_reference_images)} reference image(s) were skipped due to validation failure.") - - for reference_image in valid_reference_images: - i += 1 - with open(reference_image, "rb") as f: - image_b64 = base64.b64encode(f.read()).decode("utf-8") - parts.append( - { - "inlineData": { - "mimeType": "image/jpeg", - "data": image_b64, - } - } - ) - - api_key = os.getenv("GEMINI_API_KEY") - if not api_key: - return "GEMINI_API_KEY is not set" - response = requests.post( - "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent", - headers={ - "x-goog-api-key": api_key, - "Content-Type": "application/json", - }, - json={ - "generationConfig": {"imageConfig": {"aspectRatio": aspect_ratio}}, - "contents": [{"parts": [*parts, {"text": prompt}]}], - }, - ) - response.raise_for_status() - json = response.json() - parts: list[dict] = json["candidates"][0]["content"]["parts"] - image_parts = [part for part in parts if part.get("inlineData", False)] - if len(image_parts) == 1: - base64_image = image_parts[0]["inlineData"]["data"] - # Save the image to a file - with open(output_file, "wb") as f: - f.write(base64.b64decode(base64_image)) - return f"Successfully generated image to {output_file}" - else: - raise Exception("Failed to generate image") + raise Exception(f"Task did not complete within {max_retries * retry_interval} seconds") if __name__ == "__main__": diff --git a/skills/public/image-generation/templates/doraemon.md b/skills/public/image-generation/templates/doraemon.md old mode 100644 new mode 100755 diff --git a/skills/public/podcast-generation/.env.example b/skills/public/podcast-generation/.env.example new file mode 100755 index 00000000..8b6f6679 --- /dev/null +++ 
b/skills/public/podcast-generation/.env.example @@ -0,0 +1,20 @@ +# RunningHub API Configuration +# 请在获取API文档后填写以下信息 + +# RunningHub API密钥 +RUNNINGHUB_API_KEY=your_api_key_here + +# RunningHub API端点URL +RUNNINGHUB_API_URL=https://api.runninghub.cn/v1/workflow/run + +# TTS工作流ID(需要在RunningHub平台创建TTS工作流后获取) +RUNNINGHUB_TTS_WORKFLOW_ID=your_workflow_id_here + +# 男性声音参数(根据RunningHub工作流参数填写) +RUNNINGHUB_MALE_VOICE=male_voice_name + +# 女性声音参数(根据RunningHub工作流参数填写) +RUNNINGHUB_FEMALE_VOICE=female_voice_name + +# 可选:音频质量设置 +RUNNINGHUB_AUDIO_QUALITY=high diff --git a/skills/public/podcast-generation/README.md b/skills/public/podcast-generation/README.md new file mode 100755 index 00000000..22ce723d --- /dev/null +++ b/skills/public/podcast-generation/README.md @@ -0,0 +1,127 @@ +# Podcast Generation - 播客生成工具 + +将文本内容转换为双主持人对话形式的播客音频。 + +## 功能特点 + +- 支持中英文内容 +- 自动生成男女双主持人对话 +- 支持多种TTS服务(Edge-TTS、RunningHub、火山引擎) +- 自动生成播客音频和文字稿 + +## 快速开始 + +### 1. 安装依赖 + +```bash +pip install -r requirements.txt +``` + +### 2. 配置TTS服务(可选) + +默认使用**Edge-TTS**(免费,无需配置),可以直接跳过此步骤。 + +如果需要使用其他TTS服务,复制 `.env.example` 为 `.env` 并配置: + +**使用RunningHub API:** +```bash +RUNNINGHUB_API_KEY=your_api_key_here +RUNNINGHUB_API_URL=https://api.runninghub.cn/v1/workflow/run +RUNNINGHUB_TTS_WORKFLOW_ID=your_workflow_id_here +RUNNINGHUB_MALE_VOICE=male_voice_name +RUNNINGHUB_FEMALE_VOICE=female_voice_name +``` + +**使用火山引擎TTS:** +```bash +VOLCENGINE_TTS_APPID=your_app_id +VOLCENGINE_TTS_ACCESS_TOKEN=your_access_token +VOLCENGINE_TTS_CLUSTER=volcano_tts +``` + +### 3. 创建播客脚本 + +创建一个JSON脚本文件,例如 `workspace/test-script.json`: + +```json +{ + "title": "测试播客", + "locale": "zh", + "lines": [ + {"speaker": "male", "paragraph": "Hello Deer! 欢迎回到我们的播客节目。"}, + {"speaker": "female", "paragraph": "大家好!今天我们要聊一个有趣的话题。"}, + {"speaker": "male", "paragraph": "没错,我们今天要讨论的是人工智能的发展历程。"} + ] +} +``` + +### 4. 
生成播客 + +```bash +python scripts/generate.py \ + --script-file workspace/test-script.json \ + --output-file outputs/test-podcast.mp3 \ + --transcript-file outputs/test-transcript.md +``` + +## TTS服务说明 + +### Edge-TTS(推荐,默认) +- ✅ 完全免费 +- ✅ 无需API密钥 +- ✅ 支持中英文 +- ✅ 音质优秀 +- ⚠️ 需要网络连接 + +### RunningHub API +- 需要API密钥 +- 支持自定义声音 +- 需要配置工作流ID + +### 火山引擎TTS +- 需要API密钥 +- 音质优秀 +- 需要注册火山引擎账号 + +## 脚本格式说明 + +```json +{ + "title": "播客标题(可选)", + "locale": "语言代码(zh/en)", + "lines": [ + { + "speaker": "male", // male 或 female + "paragraph": "对话内容" + } + ] +} +``` + +## 输出文件 + +- `*.mp3` - 播客音频文件 +- `*.md` - 播客文字稿 + +## 注意事项 + +- 首次使用Edge-TTS时会自动下载语音模型 +- 建议每行对话不要太长(50-100字) +- 男女主持人交替对话效果更好 +- 支持的音频格式:MP3 + +## 故障排除 + +**问题:edge-tts库未安装** +```bash +pip install edge-tts +``` + +**问题:网络连接失败** +- 检查网络连接 +- Edge-TTS需要访问微软服务器 + +**问题:音频生成失败** +- 检查脚本JSON格式是否正确 +- 查看错误日志 +- 尝试使用其他TTS服务 diff --git a/skills/public/podcast-generation/SKILL.md b/skills/public/podcast-generation/SKILL.md old mode 100644 new mode 100755 index b78b8dd7..d75b1ace --- a/skills/public/podcast-generation/SKILL.md +++ b/skills/public/podcast-generation/SKILL.md @@ -25,12 +25,42 @@ When a user requests podcast generation, identify: - Source content: The text/article/report to convert into a podcast - Language: English or Chinese (based on content) -- Output location: Where to save the generated podcast -- You don't need to check the folder under `/mnt/user-data` +- Output location: Where to save the generated podcast (默认保存在项目目录下) -### Step 2: Create Structured Script JSON +### Step 2: Configure Environment Variables(可选) -Generate a structured JSON script file in `/mnt/user-data/workspace/` with naming pattern: `{descriptive-name}-script.json` +默认使用**Edge-TTS**(免费,无需配置),可以直接跳过此步骤。 + +如果需要使用其他TTS服务,复制 `.env.example` 为 `.env` 并配置: + +**方式1:使用Edge-TTS(推荐,默认)** +- ✅ 完全免费 +- ✅ 无需API密钥 +- ✅ 支持中英文 +- ✅ 音质优秀 +- 直接使用,无需配置 + +**方式2:使用RunningHub API** + +```bash +RUNNINGHUB_API_KEY=your_api_key_here 
+RUNNINGHUB_API_URL=https://api.runninghub.cn/v1/workflow/run +RUNNINGHUB_TTS_WORKFLOW_ID=your_workflow_id_here +RUNNINGHUB_MALE_VOICE=male_voice_name +RUNNINGHUB_FEMALE_VOICE=female_voice_name +``` + +**方式3:使用火山引擎TTS** + +```bash +VOLCENGINE_TTS_APPID=your_app_id +VOLCENGINE_TTS_ACCESS_TOKEN=your_access_token +VOLCENGINE_TTS_CLUSTER=volcano_tts +``` + +### Step 3: Create Structured Script JSON + +Generate a structured JSON script file in project directory with naming pattern: `{descriptive-name}-script.json` The JSON structure: ```json @@ -43,21 +73,21 @@ The JSON structure: } ``` -### Step 3: Execute Generation +### Step 4: Execute Generation Call the Python script: ```bash -python /mnt/skills/public/podcast-generation/scripts/generate.py \ - --script-file /mnt/user-data/workspace/script-file.json \ - --output-file /mnt/user-data/outputs/generated-podcast.mp3 \ - --transcript-file /mnt/user-data/outputs/generated-podcast-transcript.md +python scripts/generate.py \ + --script-file workspace/script-file.json \ + --output-file outputs/generated-podcast.mp3 \ + --transcript-file outputs/generated-podcast-transcript.md ``` Parameters: -- `--script-file`: Absolute path to JSON script file (required) -- `--output-file`: Absolute path to output MP3 file (required) -- `--transcript-file`: Absolute path to output transcript markdown file (optional, but recommended) +- `--script-file`: Path to JSON script file (required) +- `--output-file`: Path to output MP3 file (required) +- `--transcript-file`: Path to output transcript markdown file (optional, but recommended) > [!IMPORTANT] > - Execute the script in one complete call. Do NOT split the workflow into separate steps. 
@@ -112,39 +142,6 @@ When creating the script JSON, follow these guidelines: - Make content engaging and accessible for audio-only listeners - Exclude meta information like dates, author names, or document structure -## Podcast Generation Example - -User request: "Generate a podcast about the history of artificial intelligence" - -Step 1: Create script file `/mnt/user-data/workspace/ai-history-script.json`: -```json -{ - "title": "The History of Artificial Intelligence", - "locale": "en", - "lines": [ - {"speaker": "male", "paragraph": "Hello Deer! Welcome back to another fascinating episode. Today we're diving into something that's literally shaping our future - the history of artificial intelligence."}, - {"speaker": "female", "paragraph": "Oh, I love this topic! You know, AI feels so modern, but it actually has roots going back over seventy years."}, - {"speaker": "male", "paragraph": "Exactly! It all started back in the 1950s. The term artificial intelligence was actually coined by John McCarthy in 1956 at a famous conference at Dartmouth."}, - {"speaker": "female", "paragraph": "Wait, so they were already thinking about machines that could think back then? That's incredible!"}, - {"speaker": "male", "paragraph": "Right? The early pioneers were so optimistic. They thought we'd have human-level AI within a generation."}, - {"speaker": "female", "paragraph": "But things didn't quite work out that way, did they?"}, - {"speaker": "male", "paragraph": "No, not at all. 
The 1970s brought what's called the first AI winter..."} - ] -} -``` - -Step 2: Execute generation: -```bash -python /mnt/skills/public/podcast-generation/scripts/generate.py \ - --script-file /mnt/user-data/workspace/ai-history-script.json \ - --output-file /mnt/user-data/outputs/ai-history-podcast.mp3 \ - --transcript-file /mnt/user-data/outputs/ai-history-transcript.md -``` - -This will generate: -- `ai-history-podcast.mp3`: The audio podcast file -- `ai-history-transcript.md`: A readable markdown transcript of the podcast - ## Specific Templates Read the following template file only when matching the user request. @@ -164,14 +161,25 @@ The generated podcast follows the "Hello Deer" format: After generation: -- Podcasts and transcripts are saved in `/mnt/user-data/outputs/` -- Share both the podcast MP3 and transcript MD with user using `present_files` tool +- Podcasts and transcripts are saved in the `outputs/` directory +- Share both the podcast MP3 and transcript MD with user - Provide brief description of the generation result (topic, duration, hosts) - Offer to regenerate if adjustments needed ## Requirements -The following environment variables must be set: +**默认使用Edge-TTS(微软浏览器TTS),无需任何配置,开箱即用。** + +如需使用其他TTS服务,可配置以下环境变量: + +**RunningHub API(可选):** +- `RUNNINGHUB_API_KEY`: RunningHub API密钥 +- `RUNNINGHUB_API_URL`: RunningHub API端点URL +- `RUNNINGHUB_TTS_WORKFLOW_ID`: TTS工作流ID +- `RUNNINGHUB_MALE_VOICE`: 男性声音参数 +- `RUNNINGHUB_FEMALE_VOICE`: 女性声音参数 + +**火山引擎TTS(可选):** - `VOLCENGINE_TTS_APPID`: Volcengine TTS application ID - `VOLCENGINE_TTS_ACCESS_TOKEN`: Volcengine TTS access token - `VOLCENGINE_TTS_CLUSTER`: Volcengine TTS cluster (optional, defaults to "volcano_tts") @@ -183,3 +191,4 @@ The following environment variables must be set: - Technical content should be simplified for audio accessibility in the script - Complex notations (formulas, code) should be translated to plain language in the script - Long content may result in longer podcasts +- 
Edge-TTS使用微软Edge浏览器的在线TTS服务,需要网络连接 diff --git a/skills/public/podcast-generation/requirements.txt b/skills/public/podcast-generation/requirements.txt new file mode 100755 index 00000000..a30805af --- /dev/null +++ b/skills/public/podcast-generation/requirements.txt @@ -0,0 +1,3 @@ +requests>=2.31.0 +python-dotenv>=1.0.0 +edge-tts>=6.1.0 diff --git a/skills/public/podcast-generation/scripts/generate.py b/skills/public/podcast-generation/scripts/generate.py old mode 100644 new mode 100755 index 8a078778..86c4919d --- a/skills/public/podcast-generation/scripts/generate.py +++ b/skills/public/podcast-generation/scripts/generate.py @@ -4,10 +4,15 @@ import json import logging import os import uuid +import asyncio from concurrent.futures import ThreadPoolExecutor, as_completed from typing import Literal, Optional +from pathlib import Path import requests +from dotenv import load_dotenv + +load_dotenv() logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -39,7 +44,115 @@ class Script: def text_to_speech(text: str, voice_type: str) -> Optional[bytes]: - """Convert text to speech using Volcengine TTS.""" + """Convert text to speech using available TTS service. + + 优先级: + 1. RunningHub API(需要配置.env) + 2. 火山引擎TTS(需要配置.env) + 3. 
Edge-TTS(免费,无需配置,默认使用) + """ + # 检查是否有RunningHub配置 + has_runninghub = (os.getenv("RUNNINGHUB_API_KEY") and + os.getenv("RUNNINGHUB_API_URL") and + os.getenv("RUNNINGHUB_TTS_WORKFLOW_ID")) + + # 检查是否有火山引擎配置 + has_volcengine = (os.getenv("VOLCENGINE_TTS_APPID") and + os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN")) + + if has_runninghub: + logger.info("Using RunningHub TTS API") + return text_to_speech_runninghub(text, voice_type) + elif has_volcengine: + logger.info("Using Volcengine TTS API") + return text_to_speech_volcengine(text, voice_type) + else: + logger.info("Using Edge-TTS (free, no API key required)") + return text_to_speech_edge(text, voice_type) + + +def text_to_speech_runninghub(text: str, voice_type: str) -> Optional[bytes]: + """Convert text to speech using RunningHub TTS API. + + 需要在.env文件中配置以下环境变量: + - RUNNINGHUB_API_KEY: RunningHub API密钥 + - RUNNINGHUB_API_URL: RunningHub API端点URL + - RUNNINGHUB_TTS_WORKFLOW_ID: TTS工作流ID + - RUNNINGHUB_MALE_VOICE: 男性声音参数 + - RUNNINGHUB_FEMALE_VOICE: 女性声音参数 + """ + api_key = os.getenv("RUNNINGHUB_API_KEY") + api_url = os.getenv("RUNNINGHUB_API_URL") + workflow_id = os.getenv("RUNNINGHUB_TTS_WORKFLOW_ID") + + if not api_key or not api_url or not workflow_id: + raise ValueError( + "请在.env文件中配置RUNNINGHUB_API_KEY, RUNNINGHUB_API_URL和RUNNINGHUB_TTS_WORKFLOW_ID" + ) + + # 根据speaker选择声音参数 + if voice_type == "male": + voice_param = os.getenv("RUNNINGHUB_MALE_VOICE", "male_voice") + else: + voice_param = os.getenv("RUNNINGHUB_FEMALE_VOICE", "female_voice") + + # 构建RunningHub API请求 + # 注意:以下payload结构需要根据实际的RunningHub API文档进行调整 + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + } + + # 根据RunningHub工作流的实际参数格式调整payload + # 这里提供一个通用模板,你需要根据实际API文档修改 + payload = { + "workflow_id": workflow_id, + "inputs": { + "text": text, + "voice": voice_param, + # 根据实际工作流参数添加其他字段 + # "speed": 1.2, + # "quality": "high", + } + } + + try: + logger.info(f"Calling RunningHub API for text: {text[:50]}...") + 
response = requests.post(api_url, json=payload, headers=headers, timeout=60)
+
+        if response.status_code != 200:
+            logger.error(f"RunningHub API error: {response.status_code} - {response.text}")
+            return None
+
+        result = response.json()
+
+        # 根据RunningHub API的实际响应格式调整以下代码
+        # 通常音频数据会在某个字段中,可能是base64编码的
+        audio_data = result.get("data") or result.get("audio") or result.get("output")
+
+        if audio_data:
+            # 如果音频是base64编码的,需要解码
+            if isinstance(audio_data, str):
+                try:
+                    # Local stdlib import: base64 is only needed by this optional
+                    # backend and is not imported at module top level.
+                    import base64
+                    return base64.b64decode(audio_data)
+                except Exception:
+                    # 如果不是base64,可能已经是bytes
+                    return audio_data.encode() if isinstance(audio_data, str) else audio_data
+            elif isinstance(audio_data, bytes):
+                return audio_data
+
+        logger.error(f"No audio data in response: {result}")
+        return None
+
+    except Exception as e:
+        logger.error(f"RunningHub TTS error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+
+def text_to_speech_volcengine(text: str, voice_type: str) -> Optional[bytes]:
+    """Convert text to speech using Volcengine TTS (备用方案)."""
     app_id = os.getenv("VOLCENGINE_TTS_APPID")
     access_token = os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN")
     cluster = os.getenv("VOLCENGINE_TTS_CLUSTER", "volcano_tts")
@@ -51,7 +164,6 @@ def text_to_speech(text: str, voice_type: str) -> Optional[bytes]:
 
     url = "https://openspeech.bytedance.com/api/v1/tts"
 
-    # Authentication: Bearer token with semicolon separator
     headers = {
         "Content-Type": "application/json",
         "Authorization": f"Bearer;{access_token}",
@@ -60,7 +172,7 @@ def text_to_speech(text: str, voice_type: str) -> Optional[bytes]:
     payload = {
         "app": {
             "appid": app_id,
-            "token": "access_token",  # literal string, not the actual token
+            "token": "access_token",
             "cluster": cluster,
         },
         "user": {"uid": "podcast-generator"},
@@ -70,7 +182,7 @@ def text_to_speech(text: str, voice_type: str) -> Optional[bytes]:
             "speed_ratio": 1.2,
         },
         "request": {
-            "reqid": str(uuid.uuid4()),  # must be unique UUID
+            "reqid": str(uuid.uuid4()),
             "text": text,
             "text_type": "plain",
"operation": "query", @@ -99,15 +211,93 @@ def text_to_speech(text: str, voice_type: str) -> Optional[bytes]: return None +def text_to_speech_edge(text: str, voice_type: str) -> Optional[bytes]: + """Convert text to speech using Edge-TTS (免费,无需API密钥). + + Edge-TTS使用微软Edge浏览器的在线TTS服务,完全免费且无需注册。 + + 参数: + text: 要转换的文本 + voice_type: "male" 或 "female",用于选择声音 + """ + try: + import edge_tts + except ImportError: + logger.error("edge-tts库未安装。请运行: pip install edge-tts") + return None + + # 根据语言和性别选择声音 + # 中文声音 + zh_male_voices = [ + "zh-CN-YunxiNeural", # 男声 + "zh-CN-YunyangNeural", # 男声 + ] + zh_female_voices = [ + "zh-CN-XiaoxiaoNeural", # 女声 + "zh-CN-XiaoyiNeural", # 女声 + ] + + # 英文声音 + en_male_voices = [ + "en-US-GuyNeural", # 男声 + "en-US-EricNeural", # 男声 + ] + en_female_voices = [ + "en-US-JennyNeural", # 女声 + "en-US-AriaNeural", # 女声 + ] + + # 检测文本语言(简单判断:是否包含中文字符) + has_chinese = any('\u4e00' <= char <= '\u9fff' for char in text) + + if has_chinese: + if voice_type == "male": + voice = zh_male_voices[0] + else: + voice = zh_female_voices[0] + else: + if voice_type == "male": + voice = en_male_voices[0] + else: + voice = en_female_voices[0] + + logger.info(f"Using Edge-TTS voice: {voice}") + + try: + communicate = edge_tts.Communicate(text, voice) + + # 获取音频数据 + audio_data = b"" + async def generate_audio(): + nonlocal audio_data + async for chunk in communicate.stream(): + if chunk["type"] == "audio": + audio_data += chunk["data"] + + # 运行异步函数 + asyncio.run(generate_audio()) + + if audio_data: + logger.info(f"Generated {len(audio_data)} bytes of audio") + return audio_data + else: + logger.error("No audio data generated") + return None + + except Exception as e: + logger.error(f"Edge-TTS error: {str(e)}") + import traceback + traceback.print_exc() + return None + + def _process_line(args: tuple[int, ScriptLine, int]) -> tuple[int, Optional[bytes]]: """Process a single script line for TTS. 
Returns (index, audio_bytes)."""
     i, line, total = args
 
     # Select voice based on speaker gender
-    if line.speaker == "male":
-        voice_type = "zh_male_yangguangqingnian_moon_bigtts"  # Male voice
-    else:
-        voice_type = "zh_female_sajiaonvyou_moon_bigtts"  # Female voice
+    # voice_type会传递给text_to_speech函数,函数会根据.env配置选择具体的声音参数
+    voice_type = "male" if line.speaker == "male" else "female"
 
     logger.info(f"Processing line {i + 1}/{total} ({line.speaker})")
     audio = text_to_speech(line.paragraph, voice_type)
@@ -123,15 +313,46 @@ def tts_node(script: Script, max_workers: int = 4) -> list[bytes]:
     logger.info(f"Converting script to audio using {max_workers} workers...")
 
     total = len(script.lines)
+
+    # Handle empty script case
+    if total == 0:
+        raise ValueError("Script contains no lines to process")
+
+    # Detect which TTS backend will be used. Edge-TTS is the documented
+    # no-config default (see text_to_speech), so missing RunningHub/Volcengine
+    # variables must NOT abort the run -- raising here would contradict the
+    # "works out of the box" fallback this patch introduces.
+    has_runninghub = (os.getenv("RUNNINGHUB_API_KEY") and
+                      os.getenv("RUNNINGHUB_API_URL") and
+                      os.getenv("RUNNINGHUB_TTS_WORKFLOW_ID"))
+    has_volcengine = (os.getenv("VOLCENGINE_TTS_APPID") and
+                      os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN"))
+
+    if not has_runninghub and not has_volcengine:
+        logger.info(
+            "No RunningHub/Volcengine credentials configured; "
+            "falling back to Edge-TTS (free, no API key required)."
+        )
+
     tasks = [(i, line, total) for i, line in enumerate(script.lines)]
 
     # Use ThreadPoolExecutor for parallel TTS generation
     results: dict[int, Optional[bytes]] = {}
+    failed_indices: list[int] = []
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = {executor.submit(_process_line, task): task[0] for task in tasks}
         for future in as_completed(futures):
             idx, audio = future.result()
             results[idx] = audio
+            # Use `not audio` to catch both None and empty bytes
+            if not audio:
+                failed_indices.append(idx)
+
+    # Log failed lines with 1-based indices for user-friendly output
+    if failed_indices:
+        logger.warning(
+            f"Failed to generate audio for {len(failed_indices)}/{total} lines: "
+            f"line numbers {sorted(i + 1 for i in failed_indices)}"
+        )
 
     # Collect results in order, skipping failed ones
     audio_chunks = []
@@ -140,15 +361,30 @@ def tts_node(script: Script, max_workers: int = 4) -> list[bytes]:
         if audio:
             audio_chunks.append(audio)
 
-    logger.info(f"Generated {len(audio_chunks)} audio chunks")
+    logger.info(f"Generated {len(audio_chunks)}/{total} audio chunks successfully")
+
+    if not audio_chunks:
+        raise ValueError(
+            f"TTS generation failed for all {total} lines. "
+            "Check the active TTS backend configuration (.env) and network connectivity."
+        )
+
     return audio_chunks
 
 
 def mix_audio(audio_chunks: list[bytes]) -> bytes:
     """Combine audio chunks into a single audio file."""
     logger.info("Mixing audio chunks...")
+
+    if not audio_chunks:
+        raise ValueError("No audio chunks to mix - TTS generation may have failed")
+
     output = b"".join(audio_chunks)
-    logger.info("Audio mixing complete")
+
+    if len(output) == 0:
+        raise ValueError("Mixed audio is empty - TTS generation may have failed")
+
+    logger.info(f"Audio mixing complete: {len(output)} bytes")
     return output
diff --git a/skills/public/podcast-generation/templates/tech-explainer.md b/skills/public/podcast-generation/templates/tech-explainer.md
old mode 100644
new mode 100755
diff --git a/skills/public/video-generation/.env b/skills/public/video-generation/.env
new file mode 100755
index 00000000..a57d3a2f
--- /dev/null
+++ b/skills/public/video-generation/.env
@@ -0,0 +1 @@
+RUNNINGHUB_API_KEY=your_api_key_here
\ No newline at end of file
diff --git a/skills/public/video-generation/.gitignore b/skills/public/video-generation/.gitignore
new file mode 100755
index 00000000..699067e2
--- /dev/null
+++ b/skills/public/video-generation/.gitignore
@@ -0,0 +1,16 @@
+.env
+*.mp4
+*.avi
+*.mov
+*.mkv
+outputs/
+workspace/
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.so
+*.egg-info/
+dist/
+build/
\ No newline at end of file
diff --git a/skills/public/video-generation/SETUP.md b/skills/public/video-generation/SETUP.md
new file mode 100755
index 00000000..d3c4f885
--- /dev/null
+++ b/skills/public/video-generation/SETUP.md
@@ -0,0 +1,93 @@
+# Video Generation Skill - Setup Guide
+
+## Quick Start
+
+1. **Install Dependencies**
+   ```bash
+   pip install requests python-dotenv
+   ```
+
+2. 
**Configure API Key** + + Create a `.env` file in the project root directory: + ```env + RUNNINGHUB_API_KEY=your_api_key_here + ``` + + Or set it as an environment variable: + ```bash + # Windows PowerShell + $env:RUNNINGHUB_API_KEY="your_api_key_here" + + # Linux/Mac + export RUNNINGHUB_API_KEY="your_api_key_here" + ``` + +3. **Generate a Video** + ```bash + python scripts/generate.py --prompt-file workspace/your-prompt.json --output-file outputs/video.mp4 --duration 5 + ``` + +## Parameters + +- `--prompt-file`: Path to JSON prompt file (required) +- `--output-file`: Output video file path (required) +- `--aspect-ratio`: Video aspect ratio (optional, default: 16:9) +- `--duration`: Video duration in seconds (optional, default: 5, range: 1-16) + +## Getting API Key + +To use this skill, you need a RunningHub API key: + +1. Visit [RunningHub](https://www.runninghub.cn/) +2. Sign up for an account +3. Get your API key from the dashboard +4. Add it to your `.env` file + +## Example Prompt + +Create a JSON file with your video description: + +```json +{ + "title": "Your Video Title", + "description": "Description of what you want to generate", + "visual": { + "scene": "Scene description", + "elements": ["element1", "element2"], + "colors": "Color palette", + "lighting": "Lighting description" + }, + "camera": { + "movement": "Camera movement", + "focus": "Focus description" + }, + "audio": { + "background": "Background music description", + "effects": "Sound effects description" + } +} +``` + +## Notes + +- The `.env` file is already in `.gitignore` and won't be committed to version control +- Never share your API key or commit it to public repositories +- The script automatically loads environment variables from `.env` file +- Video generation may take several minutes depending on the complexity + +## Troubleshooting + +**Error: RUNNINGHUB_API_KEY is not set** +- Make sure you've created the `.env` file with your API key +- Or set the environment variable before 
running the script + +**Error: Failed to submit task** +- Check that your API key is valid +- Ensure you have sufficient credits in your RunningHub account +- Verify your internet connection + +**Video generation takes too long** +- This is normal for AI video generation +- The script will automatically poll for status until completion +- You can check the RunningHub dashboard for task progress \ No newline at end of file diff --git a/skills/public/video-generation/SKILL.md b/skills/public/video-generation/SKILL.md old mode 100644 new mode 100755 index e0c55b36..a1e1e52a --- a/skills/public/video-generation/SKILL.md +++ b/skills/public/video-generation/SKILL.md @@ -7,13 +7,14 @@ description: Use this skill when the user requests to generate, create, or imagi ## Overview -This skill generates high-quality videos using structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing video generation with optional reference image. +This skill generates high-quality videos using structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing video generation through RunningHub API. 
## Core Capabilities - Create structured JSON prompts for AIGC video generation -- Support reference image as guidance or the first/last frame of the video -- Generate videos through automated Python script execution +- Generate videos through RunningHub Vidu model (text-to-video-q3-turbo) +- Support up to 16 seconds video generation with audio +- Automatic camera switching and dialogue generation ## Workflow @@ -21,21 +22,16 @@ This skill generates high-quality videos using structured prompts and a Python s When a user requests video generation, identify: -- Subject/content: What should be in the image +- Subject/content: What should be in the video - Style preferences: Art style, mood, color palette -- Technical specs: Aspect ratio, composition, lighting -- Reference image: Any image to guide generation -- You don't need to check the folder under `/mnt/user-data` +- Technical specs: Aspect ratio, resolution, duration +- Audio requirements: Background music, dialogue, sound effects ### Step 2: Create Structured Prompt Generate a structured JSON file in `/mnt/user-data/workspace/` with naming pattern: `{descriptive-name}.json` -### Step 3: Create Reference Image (Optional when image-generation skill is available) - -Generate reference image for the video generation. - -- If only 1 image is provided, use it as the guided frame of the video +The prompt should include visual descriptions, camera movements, and audio specifications in a natural language format. 
### Step 3: Execute Generation
 
@@ -43,7 +39,6 @@ Call the Python script:
 
 ```bash
 python /mnt/skills/public/video-generation/scripts/generate.py \
   --prompt-file /mnt/user-data/workspace/prompt-file.json \
-  --reference-images /path/to/ref1.jpg \
   --output-file /mnt/user-data/outputs/generated-video.mp4 \
   --aspect-ratio 16:9
 ```
@@ -51,20 +46,28 @@ python /mnt/skills/public/video-generation/scripts/generate.py \
 
 Parameters:
 
 - `--prompt-file`: Absolute path to JSON prompt file (required)
-- `--reference-images`: Absolute paths to reference image (optional)
-- `--output-file`: Absolute path to output image file (required)
-- `--aspect-ratio`: Aspect ratio of the generated image (optional, default: 16:9)
+- `--output-file`: Absolute path to output video file (required)
+- `--aspect-ratio`: Aspect ratio of the generated video (optional, default: 16:9)
 
 [!NOTE]
 Do NOT read the python file, instead just call it with the parameters.
 
+## Environment Variables
+
+Set the following environment variable before running the script:
+
+- `RUNNINGHUB_API_KEY`: Your RunningHub API key
+
+Example:
+```bash
+export RUNNINGHUB_API_KEY=your_api_key_here
+```
+
 ## Video Generation Example
 
 User request: "Generate a short video clip depicting the opening scene from "The Chronicles of Narnia: The Lion, the Witch and the Wardrobe"
 
-Step 1: Search for the opening scene of "The Chronicles of Narnia: The Lion, the Witch and the Wardrobe" online
-
-Step 2: Create a JSON prompt file with the following content:
+Step 1: Create a JSON prompt file with the following content:
 
 ```json
 {
@@ -108,16 +111,11 @@ Step 2: Create a JSON prompt file with the following content:
 }
 ```
 
-Step 3: Use the image-generation skill to generate the reference image
-
-Load the image-generation skill and generate a single reference image `narnia-farewell-scene-01.jpg` according to the skill.
- -Step 4: Use the generate.py script to generate the video +Step 2: Use the generate.py script to generate the video ```bash python /mnt/skills/public/video-generation/scripts/generate.py \ --prompt-file /mnt/user-data/workspace/narnia-farewell-scene.json \ - --reference-images /mnt/user-data/outputs/narnia-farewell-scene-01.jpg \ - --output-file /mnt/user-data/outputs/narnia-farewell-scene-01.mp4 \ + --output-file /mnt/user-data/outputs/narnia-farewell-scene.mp4 \ --aspect-ratio 16:9 ``` > Do NOT read the python file, just call it with the parameters. @@ -127,7 +125,7 @@ python /mnt/skills/public/video-generation/scripts/generate.py \ After generation: - Videos are typically saved in `/mnt/user-data/outputs/` -- Share generated videos (come first) with user as well as generated image if applicable, using `present_files` tool +- Share generated videos with user using `present_files` tool - Provide brief description of the generation result - Offer to iterate if adjustments needed @@ -135,5 +133,7 @@ After generation: - Always use English for prompts regardless of user's language - JSON format ensures structured, parsable prompts -- Reference image enhance generation quality significantly +- RunningHub Vidu model supports up to 16 seconds video generation +- Audio is automatically generated including dialogue and sound effects +- The model has "director thinking" capability for automatic camera switching - Iterative refinement is normal for optimal results diff --git a/skills/public/video-generation/scripts/generate.py b/skills/public/video-generation/scripts/generate.py old mode 100644 new mode 100755 index e01ebb33..3b55b038 --- a/skills/public/video-generation/scripts/generate.py +++ b/skills/public/video-generation/scripts/generate.py @@ -1,75 +1,83 @@ -import base64 import os import time +from typing import List import requests +from dotenv import load_dotenv + +load_dotenv() def generate_video( prompt_file: str, - reference_images: list[str], + 
reference_images: List[str], output_file: str, aspect_ratio: str = "16:9", + duration: str = "5", ) -> str: - with open(prompt_file, "r") as f: + with open(prompt_file, "r", encoding="utf-8") as f: prompt = f.read() - referenceImages = [] - i = 0 - json = { - "instances": [{"prompt": prompt}], - } - for reference_image in reference_images: - i += 1 - with open(reference_image, "rb") as f: - image_b64 = base64.b64encode(f.read()).decode("utf-8") - referenceImages.append( - { - "image": {"mimeType": "image/jpeg", "bytesBase64Encoded": image_b64}, - "referenceType": "asset", - } - ) - if i > 0: - json["instances"][0]["referenceImages"] = referenceImages - api_key = os.getenv("GEMINI_API_KEY") + + api_key = os.getenv("RUNNINGHUB_API_KEY") if not api_key: - return "GEMINI_API_KEY is not set" + return "RUNNINGHUB_API_KEY is not set" + + json_data = { + "prompt": prompt, + "style": "general", + "aspectRatio": aspect_ratio, + "resolution": "720p", + "duration": duration, + "audio": True + } + response = requests.post( - "https://generativelanguage.googleapis.com/v1beta/models/veo-3.1-generate-preview:predictLongRunning", + "https://www.runninghub.cn/openapi/v2/vidu/text-to-video-q3-turbo", headers={ - "x-goog-api-key": api_key, + "Authorization": f"Bearer {api_key}", "Content-Type": "application/json", }, - json=json, + json=json_data, ) - json = response.json() - operation_name = json["name"] + + response_json = response.json() + + if "taskId" not in response_json: + return f"Failed to submit task: {response_json}" + + task_id = response_json["taskId"] + while True: - response = requests.get( - f"https://generativelanguage.googleapis.com/v1beta/{operation_name}", + response = requests.post( + "https://www.runninghub.cn/openapi/v2/query", headers={ - "x-goog-api-key": api_key, + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json", }, + json={"taskId": task_id}, ) - json = response.json() - if json.get("done", False): - sample = 
json["response"]["generateVideoResponse"]["generatedSamples"][0] - url = sample["video"]["uri"] - download(url, output_file) - break + + response_json = response.json() + status = response_json.get("status") + + if status == "SUCCESS": + results = response_json.get("results", []) + if results and len(results) > 0: + url = results[0].get("url") + if url: + download(url, output_file) + break + elif status == "FAILED": + error_message = response_json.get("errorMessage", "Unknown error") + return f"Video generation failed: {error_message}" + time.sleep(3) + return f"The video has been generated successfully to {output_file}" def download(url: str, output_file: str): - api_key = os.getenv("GEMINI_API_KEY") - if not api_key: - return "GEMINI_API_KEY is not set" - response = requests.get( - url, - headers={ - "x-goog-api-key": api_key, - }, - ) + response = requests.get(url) with open(output_file, "wb") as f: f.write(response.content) @@ -77,28 +85,28 @@ def download(url: str, output_file: str): if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description="Generate videos using Gemini API") + parser = argparse.ArgumentParser(description="Generate videos using RunningHub API") parser.add_argument( "--prompt-file", required=True, help="Absolute path to JSON prompt file", ) - parser.add_argument( - "--reference-images", - nargs="*", - default=[], - help="Absolute paths to reference images (space-separated)", - ) parser.add_argument( "--output-file", required=True, - help="Output path for generated image", + help="Output path for generated video", ) parser.add_argument( "--aspect-ratio", required=False, default="16:9", - help="Aspect ratio of the generated image", + help="Aspect ratio of the generated video", + ) + parser.add_argument( + "--duration", + required=False, + default="5", + help="Duration of the generated video in seconds (1-16)", ) args = parser.parse_args() @@ -107,9 +115,10 @@ if __name__ == "__main__": print( generate_video( 
args.prompt_file, - args.reference_images, + [], args.output_file, args.aspect_ratio, + args.duration, ) ) except Exception as e: