Compare commits

...

2 Commits

Author SHA1 Message Date
Titan c2fc5bcd96 feat: modify default skills 2026-03-27 14:20:47 +08:00
Titan 5242df028b feat: optimize the skill injection flow 2026-03-27 13:34:30 +08:00
18 changed files with 1055 additions and 264 deletions

View File

@@ -108,7 +108,7 @@ class RemoteSkillBootstrapResponse(BaseModel):
    target_dir: str = Field(..., description="Virtual target directory")
    created_directories: int = Field(..., description="Number of created directories")
    created_files: int = Field(..., description="Number of created files")
-    sandbox_id: str = Field(..., description="Acquired sandbox ID")
+    sandbox_id: str | None = Field(default=None, description="Acquired sandbox ID (null when sandbox is not acquired)")
    message: str = Field(..., description="Operation result message")
@@ -568,8 +568,8 @@ async def bootstrap_skill_from_remote(request: RemoteSkillBootstrapRequest) -> R
    """Initialize thread skill directory from remote YAML content service."""
    try:
        # 1) Ensure sandbox and thread personal dirs are initialized first.
-        sandbox_provider = get_sandbox_provider()
-        sandbox_id = sandbox_provider.acquire(request.thread_id)
+        # sandbox_provider = get_sandbox_provider()
+        # sandbox_id = sandbox_provider.acquire(request.thread_id)

        # 2) Fetch YAML content from configured remote endpoint.
        cfg = get_gateway_config()
@@ -624,7 +624,7 @@ async def bootstrap_skill_from_remote(request: RemoteSkillBootstrapRequest) -> R
            target_dir=request.target_dir,
            created_directories=len(parsed.directories),
            created_files=len(parsed.files),
-            sandbox_id=sandbox_id,
+            sandbox_id=None,
            message=(
                f"Bootstrapped {len(parsed.files)} files and {len(parsed.directories)} directories "
                f"under '{request.target_dir}'"

View File

@@ -7,6 +7,9 @@ real filesystem content under a thread's virtual path (for example,
from __future__ import annotations

+import argparse
+import json
+import sys
from dataclasses import dataclass
from pathlib import Path
@@ -329,3 +332,158 @@ def materialize_skill_tree(parsed: ParsedSkillTree, target_root: Path, clear_tar
        file_path = target_root / rel_file
        file_path.parent.mkdir(parents=True, exist_ok=True)
        file_path.write_text(content, encoding="utf-8")
+
+
+def _build_cli_parser() -> argparse.ArgumentParser:
+    """Build the command-line argument parser.
+
+    CLI usage:
+        python skill_yaml_importer.py <input_path> [options]
+
+    Positional arguments:
+        input_path           Path to a YAML file, or a directory containing YAML files.
+
+    Options:
+        --show-files         Include parsed file paths in output.
+        --show-directories   Include parsed directory paths in output.
+        --json               Print JSON output instead of plain text.
+        --recursive          Recursively scan subdirectories when input is a directory.
+        --log-file <path>    Save the full report (summary + successes + failures) to a JSON file.
+
+    Examples:
+        python skill_yaml_importer.py ./sample.yaml --json
+        python skill_yaml_importer.py ./generated_yaml --recursive --log-file ./parse_log.json
+    """
+    parser = argparse.ArgumentParser(description="Parse and validate a skill YAML spec file")
+    parser.add_argument("input_path", help="Path to a YAML file or a directory containing YAML files")
+    parser.add_argument(
+        "--show-files",
+        action="store_true",
+        help="Print sorted parsed file paths",
+    )
+    parser.add_argument(
+        "--show-directories",
+        action="store_true",
+        help="Print sorted parsed directory paths",
+    )
+    parser.add_argument(
+        "--json",
+        action="store_true",
+        help="Print parsed summary as JSON",
+    )
+    parser.add_argument(
+        "--recursive",
+        action="store_true",
+        help="When the input path is a directory, scan YAML files recursively",
+    )
+    parser.add_argument(
+        "--log-file",
+        default=None,
+        help="Optional path to save full execution results and summary as JSON",
+    )
+    return parser
+
+
+def _collect_yaml_files(input_path: Path, recursive: bool) -> list[Path]:
+    if input_path.is_file():
+        return [input_path]
+    if not input_path.is_dir():
+        return []
+    patterns = ("*.yaml", "*.yml")
+    files: list[Path] = []
+    for pattern in patterns:
+        iterator = input_path.rglob(pattern) if recursive else input_path.glob(pattern)
+        files.extend(iterator)
+    # Stable order for deterministic output
+    return sorted({p.resolve() for p in files})
+
+
+def _parse_one_yaml_file(yaml_path: Path, show_files: bool, show_directories: bool) -> dict:
+    yaml_text = yaml_path.read_text(encoding="utf-8")
+    parsed = parse_skill_yaml_spec(yaml_text)
+    directories = sorted(parsed.directories)
+    files = sorted(parsed.files.keys())
+    return {
+        "yaml_file": str(yaml_path),
+        "directories_count": len(directories),
+        "files_count": len(files),
+        "directories": directories if show_directories else None,
+        "files": files if show_files else None,
+    }
+
+
+def _main() -> int:
+    """CLI entrypoint for parsing one YAML file or a batch of YAML files.
+
+    Exit codes:
+        0: all files parsed successfully
+        1: invalid input path or no YAML files found
+        2: processing completed with one or more parse failures
+    """
+    args = _build_cli_parser().parse_args()
+    input_path = Path(args.input_path)
+
+    if not input_path.exists():
+        print(f"Input path not found: {input_path}", file=sys.stderr)
+        return 1
+
+    yaml_files = _collect_yaml_files(input_path, recursive=args.recursive)
+    if not yaml_files:
+        print(f"No YAML files found under: {input_path}", file=sys.stderr)
+        return 1
+
+    successes: list[dict] = []
+    failures: list[dict[str, str]] = []
+
+    for yaml_path in yaml_files:
+        try:
+            result = _parse_one_yaml_file(
+                yaml_path,
+                show_files=args.show_files,
+                show_directories=args.show_directories,
+            )
+            successes.append(result)
+            if not args.json:
+                print(f"OK: {yaml_path}")
+                print(f"  Directories: {result['directories_count']}")
+                print(f"  Files: {result['files_count']}")
+        except Exception as e:  # noqa: BLE001
+            failures.append({"yaml_file": str(yaml_path), "error": str(e)})
+            print(f"ERROR: {yaml_path}: {e}", file=sys.stderr)
+
+    summary = {
+        "input_path": str(input_path),
+        "total": len(yaml_files),
+        "success": len(successes),
+        "failed": len(failures),
+    }
+    report = {"summary": summary, "successes": successes, "failures": failures}
+
+    if args.log_file:
+        try:
+            log_path = Path(args.log_file)
+            log_path.parent.mkdir(parents=True, exist_ok=True)
+            log_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
+            print(f"Log saved: {log_path}")
+        except Exception as e:  # noqa: BLE001
+            print(f"Failed to write log file '{args.log_file}': {e}", file=sys.stderr)
+
+    if args.json:
+        print(json.dumps(report, ensure_ascii=False, indent=2))
+    else:
+        print("\n[Summary]")
+        print(f"Input: {summary['input_path']}")
+        print(f"Total: {summary['total']}")
+        print(f"Success: {summary['success']}")
+        print(f"Failed: {summary['failed']}")

+    return 0 if not failures else 2
+
+
+if __name__ == "__main__":
+    raise SystemExit(_main())
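As a usage illustration of the new CLI and its documented exit codes (paths are hypothetical; a sketch, not part of the importer):

```python
# Drive the importer CLI and branch on its exit codes:
# 0 = all parsed, 1 = bad input / no YAML found, 2 = partial failures.
import subprocess
import sys

proc = subprocess.run(
    [sys.executable, "skill_yaml_importer.py", "./generated_yaml",
     "--recursive", "--log-file", "./parse_log.json"],
)
if proc.returncode == 2:
    print("Some YAML specs failed to parse; see parse_log.json", file=sys.stderr)
elif proc.returncode == 1:
    print("Input path invalid or no YAML files found", file=sys.stderr)
```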

View File

@@ -166,6 +166,10 @@ services:
      - CI=true
      # Docker environment for aio sandbox
      - DOCKER_HOST=unix:///var/run/docker.sock
+      - LOG_LEVEL=DEBUG
+      - LANGGRAPH_DEBUG=true
+      - LANGCHAIN_DEBUG=true
+      - PYTHONUNBUFFERED=1
    env_file:
      - ../.env
    extra_hosts:

View File

@@ -0,0 +1,6 @@
# RunningHub API Configuration
# Copy this file to .env and fill in your actual API key
# RunningHub API Key for image generation
# Get your API key from: https://www.runninghub.cn
RUNNINGHUB_API_KEY=your_api_key_here

skills/public/image-generation/.gitignore vendored Executable file
View File

@@ -0,0 +1,31 @@
# Environment variables
.env
.env.local
.env.*.local
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
venv/
env/
ENV/
# Output files
*.jpg
*.jpeg
*.png
*.webp
outputs/
# IDE
.vscode/
.idea/
*.swp
*.swo
# OS
.DS_Store
Thumbs.db

skills/public/image-generation/SKILL.md Normal file → Executable file
View File

@@ -7,14 +7,47 @@ description: Use this skill when the user requests to generate, create, imagine,
## Overview

-This skill generates high-quality images using structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing image generation with optional reference images.
+This skill generates high-quality images using the RunningHub API with structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing image generation through asynchronous task submission.

## Core Capabilities

- Create structured JSON prompts for AIGC image generation
-- Support multiple reference images for style/composition guidance
-- Generate images through automated Python script execution
+- Generate images through RunningHub's Z-Image Turbo LoRA API
+- Support asynchronous task submission and status polling
- Handle various image generation scenarios (character design, scenes, products, etc.)
+- Support multiple aspect ratios and output formats (PNG, JPEG, WEBP)
+
+## Configuration
+
+### API Key Setup
+
+This skill uses the RunningHub API for image generation. You need to configure your API key before using the skill.
+
+**Option 1: Environment Variable (Recommended)**
+```bash
+# Set the RUNNINGHUB_API_KEY environment variable
+export RUNNINGHUB_API_KEY=your_api_key_here
+
+# Or on Windows:
+set RUNNINGHUB_API_KEY=your_api_key_here
+```
+
+**Option 2: .env File**
+
+1. Copy `.env.example` to `.env`:
+   ```bash
+   cp .env.example .env
+   ```
+2. Edit `.env` and add your API key:
+   ```
+   RUNNINGHUB_API_KEY=your_api_key_here
+   ```
+3. The `.env` file is automatically excluded from version control via `.gitignore`
+
+**Security Notes:**
+- Never commit `.env` files to version control
+- Never hardcode API keys in source code
+- Rotate your API keys if they are accidentally exposed
+- Get your API key from: https://www.runninghub.cn
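At runtime this configuration boils down to a key lookup. A minimal sketch (assuming `python-dotenv`, which the script imports):

```python
# Resolve RUNNINGHUB_API_KEY the way the generation script does.
import os
from dotenv import load_dotenv

load_dotenv()  # reads a .env file from the working directory, if present

api_key = os.getenv("RUNNINGHUB_API_KEY")
if not api_key:
    raise RuntimeError("RUNNINGHUB_API_KEY is not set (export it or add it to .env)")
```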
## Workflow
@@ -38,20 +71,20 @@ Call the Python script:
```bash
python /mnt/skills/public/image-generation/scripts/generate.py \
    --prompt-file /mnt/user-data/workspace/prompt-file.json \
-    --reference-images /path/to/ref1.jpg /path/to/ref2.png \
-    --output-file /mnt/user-data/outputs/generated-image.jpg
+    --output-file /mnt/user-data/outputs/generated-image.jpg \
    --aspect-ratio 16:9
```

Parameters:
- `--prompt-file`: Absolute path to JSON prompt file (required)
-- `--reference-images`: Absolute paths to reference images (optional, space-separated)
- `--output-file`: Absolute path to output image file (required)
- `--aspect-ratio`: Aspect ratio of the generated image (optional, default: 16:9)

[!NOTE]
-Do NOT read the python file, just call it with the parameters.
+- The script uses the RunningHub API, which requires the `RUNNINGHUB_API_KEY` environment variable to be set
+- Do NOT read the python file, just call it with the parameters
+- The script automatically handles task submission, status polling, and image download
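A rough sketch of that submit-poll-download flow, using the endpoints that appear elsewhere in this diff (illustrative; not a verbatim copy of `generate.py`):

```python
import os
import time

import requests

API = "https://www.runninghub.cn/openapi/v2"
HEADERS = {
    "Authorization": f"Bearer {os.environ['RUNNINGHUB_API_KEY']}",
    "Content-Type": "application/json",
}

# 1) Submit the generation task.
task = requests.post(
    f"{API}/rhart-image/z-image/turbo-lora",
    headers=HEADERS,
    json={"prompt": "a lighthouse at dusk", "aspectRatio": "16:9", "outputFormat": "png"},
).json()
task_id = task["taskId"]

# 2) Poll until SUCCESS or FAILED, then 3) download the result.
while True:
    status = requests.post(f"{API}/query", headers=HEADERS, json={"taskId": task_id}).json()
    if status.get("status") == "SUCCESS":
        with open("out.png", "wb") as f:
            f.write(requests.get(status["results"][0]["url"]).content)
        break
    if status.get("status") == "FAILED":
        raise RuntimeError(status.get("errorMessage", "Unknown error"))
    time.sleep(2)  # QUEUED / RUNNING: wait and poll again
```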
## Character Generation Example
@@ -86,40 +119,6 @@ python /mnt/skills/public/image-generation/scripts/generate.py \
    --aspect-ratio 2:3
```

-With reference images:
-
-```json
-{
-  "characters": [{
-    "gender": "based on [Image 1]",
-    "age": "based on [Image 1]",
-    "ethnicity": "human from [Image 1] adapted to Star Wars universe",
-    "body_type": "based on [Image 1]",
-    "facial_features": "matching [Image 1] with slight weathered look from space travel",
-    "clothing": "Star Wars style outfit - worn leather jacket with utility vest, cargo pants with tactical pouches, scuffed boots, belt with holster",
-    "accessories": "blaster pistol on hip, comlink device on wrist, goggles pushed up on forehead, satchel with supplies, personal vehicle based on [Image 2]",
-    "era": "Star Wars universe, post-Empire era"
-  }],
-  "prompt": "Character inspired by [Image 1] standing next to a vehicle inspired by [Image 2] on a bustling alien planet street in Star Wars universe aesthetic. Character wearing worn leather jacket with utility vest, cargo pants with tactical pouches, scuffed boots, belt with blaster holster. The vehicle adapted to Star Wars aesthetic with weathered metal panels, repulsor engines, desert dust covering, parked on the street. Exotic alien marketplace street with multi-level architecture, weathered metal structures, hanging market stalls with colorful awnings, alien species walking by as background characters. Twin suns casting warm golden light, atmospheric dust particles in air, moisture vaporators visible in distance. Gritty lived-in Star Wars aesthetic, practical effects look, film grain texture, cinematic composition.",
-  "negative_prompt": "clean futuristic look, sterile environment, overly CGI appearance, fantasy medieval elements, Earth architecture, modern city",
-  "style": "Star Wars original trilogy aesthetic, lived-in universe, practical effects inspired, cinematic film look, slightly desaturated with warm tones",
-  "composition": "medium wide shot, character in foreground with alien street extending into background, environmental storytelling, rule of thirds",
-  "lighting": "warm golden hour lighting from twin suns, rim lighting on character, atmospheric haze, practical light sources from market stalls",
-  "color_palette": "warm sandy tones, ochre and sienna, dusty blues, weathered metals, muted earth colors with pops of alien market colors",
-  "technical": {
-    "aspect_ratio": "9:16",
-    "quality": "high",
-    "detail_level": "highly detailed with film-like texture"
-  }
-}
-```
-
-```bash
-python /mnt/skills/public/image-generation/scripts/generate.py \
-    --prompt-file /mnt/user-data/workspace/star-wars-scene.json \
-    --reference-images /mnt/user-data/uploads/character-ref.jpg /mnt/user-data/uploads/vehicle-ref.jpg \
-    --output-file /mnt/user-data/outputs/star-wars-scene-01.jpg \
-    --aspect-ratio 16:9
-```
-
## Common Scenarios

Use different JSON schemas for different scenarios.
@@ -158,30 +157,10 @@ After generation:
- Provide brief description of the generation result
- Offer to iterate if adjustments needed

-## Tips: Enhancing Generation with Reference Images
-
-For scenarios where visual accuracy is critical, **use the `image_search` tool first** to find reference images before generation.
-
-**Recommended scenarios for using image_search tool:**
-
-- **Character/Portrait Generation**: Search for similar poses, expressions, or styles to guide facial features and body proportions
-- **Specific Objects or Products**: Find reference images of real objects to ensure accurate representation
-- **Architectural or Environmental Scenes**: Search for location references to capture authentic details
-- **Fashion and Clothing**: Find style references to ensure accurate garment details and styling
-
-**Example workflow:**
-
-1. Call the `image_search` tool to find suitable reference images:
-   ```
-   image_search(query="Japanese woman street photography 1990s", size="Large")
-   ```
-2. Download the returned image URLs to local files
-3. Use the downloaded images as `--reference-images` parameter in the generation script
-
-This approach significantly improves generation quality by providing the model with concrete visual guidance rather than relying solely on text descriptions.
-
## Notes

- Always use English for prompts regardless of user's language
- JSON format ensures structured, parsable prompts
-- Reference images enhance generation quality significantly
- Iterative refinement is normal for optimal results
- For character generation, include the detailed character object plus a consolidated prompt field
+- The script automatically polls task status and downloads the generated image

skills/public/image-generation/scripts/generate.py Normal file → Executable file
View File

@@ -1,8 +1,14 @@
import base64
+import json
import os
+import time
+from typing import List

import requests
from PIL import Image
+from dotenv import load_dotenv
+
+load_dotenv()


def validate_image(image_path: str) -> bool:
@@ -17,77 +23,171 @@ def validate_image(image_path: str) -> bool:
    """
    try:
        with Image.open(image_path) as img:
-            img.verify()  # Verify that it's a valid image
-        # Re-open to check if it can be fully loaded (verify() may not catch all issues)
+            img.verify()
        with Image.open(image_path) as img:
-            img.load()  # Force load the image data
+            img.load()
        return True
    except Exception as e:
        print(f"Warning: Image '{image_path}' is invalid or corrupted: {e}")
        return False
+
+
+def submit_generation_task(prompt: str, aspect_ratio: str = "16:9", output_format: str = "png") -> str:
+    """
+    Submit image generation task to RunningHub API.
+
+    Args:
+        prompt: Text prompt for image generation
+        aspect_ratio: Aspect ratio of the generated image
+        output_format: Output image format (png, jpeg, webp)
+
+    Returns:
+        Task ID for tracking the generation
+    """
+    api_key = os.getenv("RUNNINGHUB_API_KEY")
+    if not api_key:
+        raise Exception("RUNNINGHUB_API_KEY environment variable is not set")
+
+    url = "https://www.runninghub.cn/openapi/v2/rhart-image/z-image/turbo-lora"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "prompt": prompt,
+        "aspectRatio": aspect_ratio,
+        "lora_name": "Z-Image _ 清纯高颜值_脸模版V1.0.safetensors",
+        "lora_strength": 1,
+        "outputFormat": output_format,
+    }
+
+    response = requests.post(url, headers=headers, json=payload)
+    response.raise_for_status()
+    result = response.json()
+
+    if result.get("status") not in ["QUEUED", "RUNNING", "SUCCESS"]:
+        raise Exception(f"Task submission failed: {result.get('errorMessage', 'Unknown error')}")
+
+    return result.get("taskId")
+
+
+def query_task_status(task_id: str) -> dict:
+    """
+    Query the status of a generation task.
+
+    Args:
+        task_id: Task ID to query
+
+    Returns:
+        Task status information
+    """
+    api_key = os.getenv("RUNNINGHUB_API_KEY")
+    if not api_key:
+        raise Exception("RUNNINGHUB_API_KEY environment variable is not set")
+
+    url = "https://www.runninghub.cn/openapi/v2/query"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+    payload = {"taskId": task_id}
+
+    response = requests.post(url, headers=headers, json=payload)
+    response.raise_for_status()
+    return response.json()
+
+
+def download_image(url: str, output_path: str) -> None:
+    """
+    Download image from URL and save to file.
+
+    Args:
+        url: Image URL to download
+        output_path: Local path to save the image
+    """
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
+    with open(output_path, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            f.write(chunk)
def generate_image(
    prompt_file: str,
-    reference_images: list[str],
+    reference_images: List[str],
    output_file: str,
    aspect_ratio: str = "16:9",
) -> str:
-    with open(prompt_file, "r") as f:
-        prompt = f.read()
-    parts = []
-    i = 0
-    # Filter out invalid reference images
-    valid_reference_images = []
-    for ref_img in reference_images:
-        if validate_image(ref_img):
-            valid_reference_images.append(ref_img)
-        else:
-            print(f"Skipping invalid reference image: {ref_img}")
-    if len(valid_reference_images) < len(reference_images):
-        print(f"Note: {len(reference_images) - len(valid_reference_images)} reference image(s) were skipped due to validation failure.")
-    for reference_image in valid_reference_images:
-        i += 1
-        with open(reference_image, "rb") as f:
-            image_b64 = base64.b64encode(f.read()).decode("utf-8")
-        parts.append(
-            {
-                "inlineData": {
-                    "mimeType": "image/jpeg",
-                    "data": image_b64,
-                }
-            }
-        )
-    api_key = os.getenv("GEMINI_API_KEY")
-    if not api_key:
-        return "GEMINI_API_KEY is not set"
-    response = requests.post(
-        "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent",
-        headers={
-            "x-goog-api-key": api_key,
-            "Content-Type": "application/json",
-        },
-        json={
-            "generationConfig": {"imageConfig": {"aspectRatio": aspect_ratio}},
-            "contents": [{"parts": [*parts, {"text": prompt}]}],
-        },
-    )
-    response.raise_for_status()
-    json = response.json()
-    parts: list[dict] = json["candidates"][0]["content"]["parts"]
-    image_parts = [part for part in parts if part.get("inlineData", False)]
-    if len(image_parts) == 1:
-        base64_image = image_parts[0]["inlineData"]["data"]
-        # Save the image to a file
-        with open(output_file, "wb") as f:
-            f.write(base64.b64decode(base64_image))
-        return f"Successfully generated image to {output_file}"
-    else:
-        raise Exception("Failed to generate image")
+    """
+    Generate image using RunningHub API.
+
+    Args:
+        prompt_file: Path to JSON prompt file
+        reference_images: List of reference image paths (currently not supported by RunningHub API)
+        output_file: Output path for generated image
+        aspect_ratio: Aspect ratio of the generated image
+
+    Returns:
+        Success message with output file path
+    """
+    with open(prompt_file, "r", encoding="utf-8") as f:
+        prompt_data = json.load(f)
+
+    if reference_images:
+        print("Note: RunningHub API does not support reference images in this version. Reference images will be ignored.")
+
+    prompt_text = prompt_data.get("prompt", "")
+    if not prompt_text:
+        prompt_text = json.dumps(prompt_data, ensure_ascii=False)
+
+    output_format = "png"
+    if output_file.lower().endswith(".jpg") or output_file.lower().endswith(".jpeg"):
+        output_format = "jpeg"
+    elif output_file.lower().endswith(".webp"):
+        output_format = "webp"
+
+    print("Submitting generation task...")
+    task_id = submit_generation_task(prompt_text, aspect_ratio, output_format)
+    print(f"Task submitted successfully. Task ID: {task_id}")
+
+    max_retries = 60
+    retry_interval = 2
+
+    for attempt in range(max_retries):
+        print(f"Checking task status... (Attempt {attempt + 1}/{max_retries})")
+        status_result = query_task_status(task_id)
+        status = status_result.get("status")
+
+        if status == "SUCCESS":
+            print("Task completed successfully!")
+            results = status_result.get("results", [])
+            if results and len(results) > 0:
+                image_url = results[0].get("url")
+                if image_url:
+                    print(f"Downloading image from: {image_url}")
+                    download_image(image_url, output_file)
+                    return f"Successfully generated image to {output_file}"
+                else:
+                    raise Exception("No image URL found in task results")
+            else:
+                raise Exception("No results found in task response")
+        elif status == "FAILED":
+            error_msg = status_result.get("errorMessage", "Unknown error")
+            raise Exception(f"Task failed: {error_msg}")
+        elif status in ["QUEUED", "RUNNING"]:
+            print(f"Task status: {status}. Waiting...")
+            time.sleep(retry_interval)
+        else:
+            raise Exception(f"Unknown task status: {status}")
+
+    raise Exception(f"Task did not complete within {max_retries * retry_interval} seconds")


if __name__ == "__main__":
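For illustration, a hedged example of invoking the rewritten function directly (paths are hypothetical; the argparse wrapper under `__main__` is truncated in this view):

```python
# Requires RUNNINGHUB_API_KEY in the environment or a .env file next to the script.
result = generate_image(
    prompt_file="/mnt/user-data/workspace/prompt-file.json",
    reference_images=[],  # ignored by the RunningHub backend in this version
    output_file="/mnt/user-data/outputs/generated-image.jpg",  # .jpg selects jpeg output
    aspect_ratio="16:9",
)
print(result)
```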

skills/public/image-generation/templates/doraemon.md Normal file → Executable file
View File

View File

@@ -0,0 +1,20 @@
# RunningHub API Configuration
# Fill in the following values after obtaining the API documentation

# RunningHub API key
RUNNINGHUB_API_KEY=your_api_key_here

# RunningHub API endpoint URL
RUNNINGHUB_API_URL=https://api.runninghub.cn/v1/workflow/run

# TTS workflow ID (obtained after creating a TTS workflow on the RunningHub platform)
RUNNINGHUB_TTS_WORKFLOW_ID=your_workflow_id_here

# Male voice parameter (per the RunningHub workflow parameters)
RUNNINGHUB_MALE_VOICE=male_voice_name

# Female voice parameter (per the RunningHub workflow parameters)
RUNNINGHUB_FEMALE_VOICE=female_voice_name

# Optional: audio quality setting
RUNNINGHUB_AUDIO_QUALITY=high

View File

@@ -0,0 +1,127 @@
# Podcast Generation

Convert text content into a two-host conversational podcast audio file.

## Features

- Supports Chinese and English content
- Automatically generates a male/female two-host dialogue
- Supports multiple TTS services (Edge-TTS, RunningHub, Volcengine)
- Produces both the podcast audio and a transcript

## Quick Start

### 1. Install dependencies

```bash
pip install -r requirements.txt
```

### 2. Configure a TTS service (optional)

**Edge-TTS** (free, no configuration required) is used by default, so you can skip this step.

To use another TTS service, copy `.env.example` to `.env` and configure it:

**Using the RunningHub API:**
```bash
RUNNINGHUB_API_KEY=your_api_key_here
RUNNINGHUB_API_URL=https://api.runninghub.cn/v1/workflow/run
RUNNINGHUB_TTS_WORKFLOW_ID=your_workflow_id_here
RUNNINGHUB_MALE_VOICE=male_voice_name
RUNNINGHUB_FEMALE_VOICE=female_voice_name
```

**Using Volcengine TTS:**
```bash
VOLCENGINE_TTS_APPID=your_app_id
VOLCENGINE_TTS_ACCESS_TOKEN=your_access_token
VOLCENGINE_TTS_CLUSTER=volcano_tts
```

### 3. Create a podcast script

Create a JSON script file, for example `workspace/test-script.json` (the dialogue is left in Chinese here because the example uses the `zh` locale):

```json
{
  "title": "测试播客",
  "locale": "zh",
  "lines": [
    {"speaker": "male", "paragraph": "Hello Deer! 欢迎回到我们的播客节目。"},
    {"speaker": "female", "paragraph": "大家好!今天我们要聊一个有趣的话题。"},
    {"speaker": "male", "paragraph": "没错,我们今天要讨论的是人工智能的发展历程。"}
  ]
}
```

### 4. Generate the podcast

```bash
python scripts/generate.py \
  --script-file workspace/test-script.json \
  --output-file outputs/test-podcast.mp3 \
  --transcript-file outputs/test-transcript.md
```

## TTS Services

### Edge-TTS (recommended, default)
- ✅ Completely free
- ✅ No API key required
- ✅ Supports Chinese and English
- ✅ Excellent audio quality
- ⚠️ Requires a network connection

### RunningHub API
- Requires an API key
- Supports custom voices
- Requires a workflow ID

### Volcengine TTS
- Requires an API key
- Excellent audio quality
- Requires a Volcengine account

## Script Format

```json
{
  "title": "Podcast title (optional)",
  "locale": "Language code: zh / en",
  "lines": [
    {
      "speaker": "male",  // male or female
      "paragraph": "Dialogue content"
    }
  ]
}
```

## Output Files

- `*.mp3` - podcast audio file
- `*.md` - podcast transcript

## Notes

- Edge-TTS downloads voice models automatically on first use
- Keep each line of dialogue reasonably short (50-100 characters)
- Alternating male and female hosts works best
- Supported audio format: MP3

## Troubleshooting

**Problem: the edge-tts library is not installed**
```bash
pip install edge-tts
```

**Problem: network connection failure**
- Check your network connection
- Edge-TTS needs to reach Microsoft's servers

**Problem: audio generation fails**
- Check that the script JSON is well-formed
- Inspect the error logs
- Try another TTS service
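A small validation sketch for the script format above (illustrative; not shipped with the skill):

```python
# Minimal structural check for a podcast script JSON.
import json

def validate_script(path: str) -> None:
    with open(path, "r", encoding="utf-8") as f:
        script = json.load(f)
    assert script.get("locale") in ("zh", "en"), "locale must be zh or en"
    lines = script.get("lines", [])
    assert lines, "script must contain at least one line"
    for i, line in enumerate(lines, start=1):
        assert line.get("speaker") in ("male", "female"), f"line {i}: bad speaker"
        assert isinstance(line.get("paragraph"), str) and line["paragraph"], f"line {i}: empty paragraph"

validate_script("workspace/test-script.json")  # hypothetical path
```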

skills/public/podcast-generation/SKILL.md Normal file → Executable file
View File

@@ -25,12 +25,42 @@ When a user requests podcast generation, identify:
- Source content: The text/article/report to convert into a podcast
- Language: English or Chinese (based on content)
-- Output location: Where to save the generated podcast
+- Output location: Where to save the generated podcast (saved under the project directory by default)
+- You don't need to check the folder under `/mnt/user-data`

-### Step 2: Create Structured Script JSON
+### Step 2: Configure Environment Variables (optional)

-Generate a structured JSON script file in `/mnt/user-data/workspace/` with naming pattern: `{descriptive-name}-script.json`
+**Edge-TTS** (free, no configuration required) is used by default; you can skip this step.
+
+To use another TTS service, copy `.env.example` to `.env` and configure it:
+
+**Option 1: Edge-TTS (recommended, default)**
+- ✅ Completely free
+- ✅ No API key required
+- ✅ Supports Chinese and English
+- ✅ Excellent audio quality
+- Works out of the box, no configuration needed
+
+**Option 2: RunningHub API**
+```bash
+RUNNINGHUB_API_KEY=your_api_key_here
+RUNNINGHUB_API_URL=https://api.runninghub.cn/v1/workflow/run
+RUNNINGHUB_TTS_WORKFLOW_ID=your_workflow_id_here
+RUNNINGHUB_MALE_VOICE=male_voice_name
+RUNNINGHUB_FEMALE_VOICE=female_voice_name
+```
+
+**Option 3: Volcengine TTS**
+```bash
+VOLCENGINE_TTS_APPID=your_app_id
+VOLCENGINE_TTS_ACCESS_TOKEN=your_access_token
+VOLCENGINE_TTS_CLUSTER=volcano_tts
+```
+
+### Step 3: Create Structured Script JSON
+
+Generate a structured JSON script file in the project directory with naming pattern: `{descriptive-name}-script.json`

The JSON structure:
```json
@@ -43,21 +73,21 @@ The JSON structure:
}
```

-### Step 3: Execute Generation
+### Step 4: Execute Generation

Call the Python script:

```bash
-python /mnt/skills/public/podcast-generation/scripts/generate.py \
-    --script-file /mnt/user-data/workspace/script-file.json \
-    --output-file /mnt/user-data/outputs/generated-podcast.mp3 \
-    --transcript-file /mnt/user-data/outputs/generated-podcast-transcript.md
+python scripts/generate.py \
+    --script-file workspace/script-file.json \
+    --output-file outputs/generated-podcast.mp3 \
+    --transcript-file outputs/generated-podcast-transcript.md
```

Parameters:
-- `--script-file`: Absolute path to JSON script file (required)
-- `--output-file`: Absolute path to output MP3 file (required)
-- `--transcript-file`: Absolute path to output transcript markdown file (optional, but recommended)
+- `--script-file`: Path to JSON script file (required)
+- `--output-file`: Path to output MP3 file (required)
+- `--transcript-file`: Path to output transcript markdown file (optional, but recommended)

> [!IMPORTANT]
> - Execute the script in one complete call. Do NOT split the workflow into separate steps.
@@ -112,39 +142,6 @@ When creating the script JSON, follow these guidelines:
- Make content engaging and accessible for audio-only listeners
- Exclude meta information like dates, author names, or document structure

-## Podcast Generation Example
-
-User request: "Generate a podcast about the history of artificial intelligence"
-
-Step 1: Create script file `/mnt/user-data/workspace/ai-history-script.json`:
-
-```json
-{
-  "title": "The History of Artificial Intelligence",
-  "locale": "en",
-  "lines": [
-    {"speaker": "male", "paragraph": "Hello Deer! Welcome back to another fascinating episode. Today we're diving into something that's literally shaping our future - the history of artificial intelligence."},
-    {"speaker": "female", "paragraph": "Oh, I love this topic! You know, AI feels so modern, but it actually has roots going back over seventy years."},
-    {"speaker": "male", "paragraph": "Exactly! It all started back in the 1950s. The term artificial intelligence was actually coined by John McCarthy in 1956 at a famous conference at Dartmouth."},
-    {"speaker": "female", "paragraph": "Wait, so they were already thinking about machines that could think back then? That's incredible!"},
-    {"speaker": "male", "paragraph": "Right? The early pioneers were so optimistic. They thought we'd have human-level AI within a generation."},
-    {"speaker": "female", "paragraph": "But things didn't quite work out that way, did they?"},
-    {"speaker": "male", "paragraph": "No, not at all. The 1970s brought what's called the first AI winter..."}
-  ]
-}
-```
-
-Step 2: Execute generation:
-
-```bash
-python /mnt/skills/public/podcast-generation/scripts/generate.py \
-    --script-file /mnt/user-data/workspace/ai-history-script.json \
-    --output-file /mnt/user-data/outputs/ai-history-podcast.mp3 \
-    --transcript-file /mnt/user-data/outputs/ai-history-transcript.md
-```
-
-This will generate:
-- `ai-history-podcast.mp3`: The audio podcast file
-- `ai-history-transcript.md`: A readable markdown transcript of the podcast
-
## Specific Templates

Read the following template file only when matching the user request.
@@ -164,14 +161,25 @@ The generated podcast follows the "Hello Deer" format:
After generation:

-- Podcasts and transcripts are saved in `/mnt/user-data/outputs/`
-- Share both the podcast MP3 and transcript MD with user using `present_files` tool
+- Podcasts and transcripts are saved in the `outputs/` directory
+- Share both the podcast MP3 and transcript MD with user
- Provide brief description of the generation result (topic, duration, hosts)
- Offer to regenerate if adjustments needed

## Requirements

-The following environment variables must be set:
+**Edge-TTS (Microsoft's browser TTS) is used by default; it works out of the box with no configuration.**
+
+To use another TTS service, configure the following environment variables:
+
+**RunningHub API (optional):**
+- `RUNNINGHUB_API_KEY`: RunningHub API key
+- `RUNNINGHUB_API_URL`: RunningHub API endpoint URL
+- `RUNNINGHUB_TTS_WORKFLOW_ID`: TTS workflow ID
+- `RUNNINGHUB_MALE_VOICE`: male voice parameter
+- `RUNNINGHUB_FEMALE_VOICE`: female voice parameter
+
+**Volcengine TTS (optional):**
- `VOLCENGINE_TTS_APPID`: Volcengine TTS application ID
- `VOLCENGINE_TTS_ACCESS_TOKEN`: Volcengine TTS access token
- `VOLCENGINE_TTS_CLUSTER`: Volcengine TTS cluster (optional, defaults to "volcano_tts")
@@ -183,3 +191,4 @@ The following environment variables must be set:
- Technical content should be simplified for audio accessibility in the script
- Complex notations (formulas, code) should be translated to plain language in the script
- Long content may result in longer podcasts
+- Edge-TTS uses Microsoft Edge's online TTS service and requires a network connection
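The selection order implied above mirrors the environment checks in `scripts/generate.py`; sketched:

```python
import os

def pick_tts_backend() -> str:
    # Priority per the docs: RunningHub if fully configured, then Volcengine, else Edge-TTS.
    if all(os.getenv(k) for k in ("RUNNINGHUB_API_KEY", "RUNNINGHUB_API_URL", "RUNNINGHUB_TTS_WORKFLOW_ID")):
        return "runninghub"
    if all(os.getenv(k) for k in ("VOLCENGINE_TTS_APPID", "VOLCENGINE_TTS_ACCESS_TOKEN")):
        return "volcengine"
    return "edge-tts"  # free default, no keys required
```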

View File

@@ -0,0 +1,3 @@
requests>=2.31.0
python-dotenv>=1.0.0
edge-tts>=6.1.0

skills/public/podcast-generation/scripts/generate.py Normal file → Executable file
View File

@@ -4,10 +4,15 @@ import json
import logging
import os
import uuid
+import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Literal, Optional
+from pathlib import Path

import requests
+from dotenv import load_dotenv
+
+load_dotenv()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@@ -39,7 +44,115 @@ class Script:
def text_to_speech(text: str, voice_type: str) -> Optional[bytes]:
-    """Convert text to speech using Volcengine TTS."""
+    """Convert text to speech using an available TTS service.
+
+    Priority:
+    1. RunningHub API (requires .env configuration)
+    2. Volcengine TTS (requires .env configuration)
+    3. Edge-TTS (free, no configuration; used by default)
+    """
+    # Check whether RunningHub is configured
+    has_runninghub = (os.getenv("RUNNINGHUB_API_KEY") and
+                      os.getenv("RUNNINGHUB_API_URL") and
+                      os.getenv("RUNNINGHUB_TTS_WORKFLOW_ID"))
+
+    # Check whether Volcengine is configured
+    has_volcengine = (os.getenv("VOLCENGINE_TTS_APPID") and
+                      os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN"))
+
+    if has_runninghub:
+        logger.info("Using RunningHub TTS API")
+        return text_to_speech_runninghub(text, voice_type)
+    elif has_volcengine:
+        logger.info("Using Volcengine TTS API")
+        return text_to_speech_volcengine(text, voice_type)
+    else:
+        logger.info("Using Edge-TTS (free, no API key required)")
+        return text_to_speech_edge(text, voice_type)
+
+
+def text_to_speech_runninghub(text: str, voice_type: str) -> Optional[bytes]:
+    """Convert text to speech using the RunningHub TTS API.
+
+    Requires the following environment variables in the .env file:
+    - RUNNINGHUB_API_KEY: RunningHub API key
+    - RUNNINGHUB_API_URL: RunningHub API endpoint URL
+    - RUNNINGHUB_TTS_WORKFLOW_ID: TTS workflow ID
+    - RUNNINGHUB_MALE_VOICE: male voice parameter
+    - RUNNINGHUB_FEMALE_VOICE: female voice parameter
+    """
+    api_key = os.getenv("RUNNINGHUB_API_KEY")
+    api_url = os.getenv("RUNNINGHUB_API_URL")
+    workflow_id = os.getenv("RUNNINGHUB_TTS_WORKFLOW_ID")
+
+    if not api_key or not api_url or not workflow_id:
+        raise ValueError(
+            "Please configure RUNNINGHUB_API_KEY, RUNNINGHUB_API_URL and RUNNINGHUB_TTS_WORKFLOW_ID in the .env file"
+        )
+
+    # Select the voice parameter based on the speaker
+    if voice_type == "male":
+        voice_param = os.getenv("RUNNINGHUB_MALE_VOICE", "male_voice")
+    else:
+        voice_param = os.getenv("RUNNINGHUB_FEMALE_VOICE", "female_voice")
+
+    # Build the RunningHub API request.
+    # Note: the payload structure below must be adapted to the actual RunningHub API docs.
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}",
+    }
+
+    # Adjust the payload to the actual parameter format of the RunningHub workflow.
+    # This is a generic template; modify it according to the real API documentation.
+    payload = {
+        "workflow_id": workflow_id,
+        "inputs": {
+            "text": text,
+            "voice": voice_param,
+            # Add other fields according to the actual workflow parameters
+            # "speed": 1.2,
+            # "quality": "high",
+        }
+    }
+
+    try:
+        logger.info(f"Calling RunningHub API for text: {text[:50]}...")
+        response = requests.post(api_url, json=payload, headers=headers, timeout=60)
+
+        if response.status_code != 200:
+            logger.error(f"RunningHub API error: {response.status_code} - {response.text}")
+            return None
+
+        result = response.json()
+
+        # Adapt the following to the actual RunningHub API response format.
+        # The audio data is usually in one of these fields, possibly base64-encoded.
+        audio_data = result.get("data") or result.get("audio") or result.get("output")
+
+        if audio_data:
+            # Decode if the audio is base64-encoded
+            if isinstance(audio_data, str):
+                try:
+                    return base64.b64decode(audio_data)
+                except Exception:
+                    # Not base64; may already be raw bytes
+                    return audio_data.encode() if isinstance(audio_data, str) else audio_data
+            elif isinstance(audio_data, bytes):
+                return audio_data
+
+        logger.error(f"No audio data in response: {result}")
+        return None
+
+    except Exception as e:
+        logger.error(f"RunningHub TTS error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+
+def text_to_speech_volcengine(text: str, voice_type: str) -> Optional[bytes]:
+    """Convert text to speech using Volcengine TTS (fallback)."""
    app_id = os.getenv("VOLCENGINE_TTS_APPID")
    access_token = os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN")
    cluster = os.getenv("VOLCENGINE_TTS_CLUSTER", "volcano_tts")
@@ -51,7 +164,6 @@ def text_to_speech(text: str, voice_type: str) -> Optional[bytes]:
    url = "https://openspeech.bytedance.com/api/v1/tts"

-    # Authentication: Bearer token with semicolon separator
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer;{access_token}",
@@ -60,7 +172,7 @@ def text_to_speech(text: str, voice_type: str) -> Optional[bytes]:
    payload = {
        "app": {
            "appid": app_id,
-            "token": "access_token",  # literal string, not the actual token
+            "token": "access_token",
            "cluster": cluster,
        },
        "user": {"uid": "podcast-generator"},
@@ -70,7 +182,7 @@ def text_to_speech(text: str, voice_type: str) -> Optional[bytes]:
            "speed_ratio": 1.2,
        },
        "request": {
-            "reqid": str(uuid.uuid4()),  # must be unique UUID
+            "reqid": str(uuid.uuid4()),
            "text": text,
            "text_type": "plain",
            "operation": "query",
@@ -99,15 +211,93 @@ def text_to_speech(text: str, voice_type: str) -> Optional[bytes]:
    return None


+def text_to_speech_edge(text: str, voice_type: str) -> Optional[bytes]:
+    """Convert text to speech using Edge-TTS (free, no API key required).
+
+    Edge-TTS uses the online TTS service of the Microsoft Edge browser; it is
+    completely free and requires no registration.
+
+    Args:
+        text: the text to convert
+        voice_type: "male" or "female", used to select a voice
+    """
+    try:
+        import edge_tts
+    except ImportError:
+        logger.error("The edge-tts library is not installed. Run: pip install edge-tts")
+        return None
+
+    # Select a voice by language and gender
+    # Chinese voices
+    zh_male_voices = [
+        "zh-CN-YunxiNeural",    # male
+        "zh-CN-YunyangNeural",  # male
+    ]
+    zh_female_voices = [
+        "zh-CN-XiaoxiaoNeural",  # female
+        "zh-CN-XiaoyiNeural",    # female
+    ]
+    # English voices
+    en_male_voices = [
+        "en-US-GuyNeural",   # male
+        "en-US-EricNeural",  # male
+    ]
+    en_female_voices = [
+        "en-US-JennyNeural",  # female
+        "en-US-AriaNeural",   # female
+    ]
+
+    # Detect the text language (simple check: does it contain Chinese characters?)
+    has_chinese = any('\u4e00' <= char <= '\u9fff' for char in text)
+
+    if has_chinese:
+        if voice_type == "male":
+            voice = zh_male_voices[0]
+        else:
+            voice = zh_female_voices[0]
+    else:
+        if voice_type == "male":
+            voice = en_male_voices[0]
+        else:
+            voice = en_female_voices[0]
+
+    logger.info(f"Using Edge-TTS voice: {voice}")
+
+    try:
+        communicate = edge_tts.Communicate(text, voice)
+
+        # Collect the audio data
+        audio_data = b""
+
+        async def generate_audio():
+            nonlocal audio_data
+            async for chunk in communicate.stream():
+                if chunk["type"] == "audio":
+                    audio_data += chunk["data"]
+
+        # Run the async generator
+        asyncio.run(generate_audio())
+
+        if audio_data:
+            logger.info(f"Generated {len(audio_data)} bytes of audio")
+            return audio_data
+        else:
+            logger.error("No audio data generated")
+            return None
+
+    except Exception as e:
+        logger.error(f"Edge-TTS error: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+
def _process_line(args: tuple[int, ScriptLine, int]) -> tuple[int, Optional[bytes]]:
    """Process a single script line for TTS. Returns (index, audio_bytes)."""
    i, line, total = args

    # Select voice based on speaker gender
-    if line.speaker == "male":
-        voice_type = "zh_male_yangguangqingnian_moon_bigtts"  # Male voice
-    else:
-        voice_type = "zh_female_sajiaonvyou_moon_bigtts"  # Female voice
+    # voice_type is passed to text_to_speech(), which selects the concrete voice
+    # according to the .env configuration.
+    voice_type = "male" if line.speaker == "male" else "female"

    logger.info(f"Processing line {i + 1}/{total} ({line.speaker})")
    audio = text_to_speech(line.paragraph, voice_type)
@@ -123,15 +313,46 @@ def tts_node(script: Script, max_workers: int = 4) -> list[bytes]:
    logger.info(f"Converting script to audio using {max_workers} workers...")
    total = len(script.lines)

+    # Handle empty script case
+    if total == 0:
+        raise ValueError("Script contains no lines to process")
+
+    # Validate required environment variables before starting TTS
+    # Check for RunningHub or Volcengine configuration (at least one is required)
+    has_runninghub = (os.getenv("RUNNINGHUB_API_KEY") and
+                      os.getenv("RUNNINGHUB_API_URL") and
+                      os.getenv("RUNNINGHUB_TTS_WORKFLOW_ID"))
+    has_volcengine = (os.getenv("VOLCENGINE_TTS_APPID") and
+                      os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN"))
+
+    if not has_runninghub and not has_volcengine:
+        raise ValueError(
+            "Missing required environment variables. Please configure either:\n"
+            "- RunningHub: RUNNINGHUB_API_KEY, RUNNINGHUB_API_URL, RUNNINGHUB_TTS_WORKFLOW_ID\n"
+            "- Volcengine: VOLCENGINE_TTS_APPID, VOLCENGINE_TTS_ACCESS_TOKEN"
+        )
+
    tasks = [(i, line, total) for i, line in enumerate(script.lines)]

    # Use ThreadPoolExecutor for parallel TTS generation
    results: dict[int, Optional[bytes]] = {}
+    failed_indices: list[int] = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(_process_line, task): task[0] for task in tasks}
        for future in as_completed(futures):
            idx, audio = future.result()
            results[idx] = audio
+            # Use `not audio` to catch both None and empty bytes
+            if not audio:
+                failed_indices.append(idx)
+
+    # Log failed lines with 1-based indices for user-friendly output
+    if failed_indices:
+        logger.warning(
+            f"Failed to generate audio for {len(failed_indices)}/{total} lines: "
+            f"line numbers {sorted(i + 1 for i in failed_indices)}"
+        )

    # Collect results in order, skipping failed ones
    audio_chunks = []
@@ -140,15 +361,30 @@ def tts_node(script: Script, max_workers: int = 4) -> list[bytes]:
        if audio:
            audio_chunks.append(audio)

-    logger.info(f"Generated {len(audio_chunks)} audio chunks")
+    logger.info(f"Generated {len(audio_chunks)}/{total} audio chunks successfully")
+
+    if not audio_chunks:
+        raise ValueError(
+            f"TTS generation failed for all {total} lines. "
+            "Please check VOLCENGINE_TTS_APPID and VOLCENGINE_TTS_ACCESS_TOKEN environment variables."
+        )
+
    return audio_chunks


def mix_audio(audio_chunks: list[bytes]) -> bytes:
    """Combine audio chunks into a single audio file."""
    logger.info("Mixing audio chunks...")
+
+    if not audio_chunks:
+        raise ValueError("No audio chunks to mix - TTS generation may have failed")
+
    output = b"".join(audio_chunks)
-    logger.info("Audio mixing complete")
+
+    if len(output) == 0:
+        raise ValueError("Mixed audio is empty - TTS generation may have failed")
+
+    logger.info(f"Audio mixing complete: {len(output)} bytes")
    return output

View File

skills/public/video-generation/.gitignore vendored Executable file
View File

@@ -0,0 +1,16 @@
.env
*.mp4
*.avi
*.mov
*.mkv
outputs/
workspace/
__pycache__/
*.pyc
*.pyo
*.pyd
.Python
*.so
*.egg-info/
dist/
build/

View File

@@ -0,0 +1,93 @@
# Video Generation Skill - Setup Guide
## Quick Start
1. **Install Dependencies**
```bash
pip install requests python-dotenv
```
2. **Configure API Key**
Create a `.env` file in the project root directory:
```env
RUNNINGHUB_API_KEY=your_api_key_here
```
Or set it as an environment variable:
```bash
# Windows PowerShell
$env:RUNNINGHUB_API_KEY="your_api_key_here"
# Linux/Mac
export RUNNINGHUB_API_KEY="your_api_key_here"
```
3. **Generate a Video**
```bash
python scripts/generate.py --prompt-file workspace/your-prompt.json --output-file outputs/video.mp4 --duration 5
```
## Parameters
- `--prompt-file`: Path to JSON prompt file (required)
- `--output-file`: Output video file path (required)
- `--aspect-ratio`: Video aspect ratio (optional, default: 16:9)
- `--duration`: Video duration in seconds (optional, default: 5, range: 1-16)
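Equivalently, the same invocation can be driven from Python (paths hypothetical; a sketch):

```python
# Call the generation script programmatically with the documented parameters.
import subprocess
import sys

subprocess.run(
    [sys.executable, "scripts/generate.py",
     "--prompt-file", "workspace/your-prompt.json",
     "--output-file", "outputs/video.mp4",
     "--duration", "5"],  # seconds; the docs state a 1-16 range
    check=True,
)
```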
## Getting API Key
To use this skill, you need a RunningHub API key:
1. Visit [RunningHub](https://www.runninghub.cn/)
2. Sign up for an account
3. Get your API key from the dashboard
4. Add it to your `.env` file
## Example Prompt
Create a JSON file with your video description:
```json
{
"title": "Your Video Title",
"description": "Description of what you want to generate",
"visual": {
"scene": "Scene description",
"elements": ["element1", "element2"],
"colors": "Color palette",
"lighting": "Lighting description"
},
"camera": {
"movement": "Camera movement",
"focus": "Focus description"
},
"audio": {
"background": "Background music description",
"effects": "Sound effects description"
}
}
```
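For convenience, a sketch (hypothetical values) that writes such a prompt file programmatically before calling the script:

```python
# Write a prompt JSON matching the documented structure.
import json

prompt = {
    "title": "Lighthouse at Dusk",
    "description": "A slow aerial approach toward a lighthouse on a rocky coast",
    "visual": {
        "scene": "rocky coastline at dusk",
        "elements": ["lighthouse", "crashing waves"],
        "colors": "deep blues with warm amber highlights",
        "lighting": "golden hour, volumetric haze",
    },
    "camera": {"movement": "slow dolly-in", "focus": "lighthouse lantern room"},
    "audio": {"background": "ambient orchestral swell", "effects": "waves, distant gulls"},
}

with open("workspace/lighthouse-prompt.json", "w", encoding="utf-8") as f:
    json.dump(prompt, f, ensure_ascii=False, indent=2)
```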
## Notes
- The `.env` file is already in `.gitignore` and won't be committed to version control
- Never share your API key or commit it to public repositories
- The script automatically loads environment variables from `.env` file
- Video generation may take several minutes depending on the complexity
## Troubleshooting
**Error: RUNNINGHUB_API_KEY is not set**
- Make sure you've created the `.env` file with your API key
- Or set the environment variable before running the script
**Error: Failed to submit task**
- Check that your API key is valid
- Ensure you have sufficient credits in your RunningHub account
- Verify your internet connection
**Video generation takes too long**
- This is normal for AI video generation
- The script will automatically poll for status until completion
- You can check the RunningHub dashboard for task progress

skills/public/video-generation/SKILL.md Normal file → Executable file
View File

@@ -7,13 +7,14 @@ description: Use this skill when the user requests to generate, create, or imagi
## Overview

-This skill generates high-quality videos using structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing video generation with optional reference image.
+This skill generates high-quality videos using structured prompts and a Python script. The workflow includes creating JSON-formatted prompts and executing video generation through the RunningHub API.

## Core Capabilities

- Create structured JSON prompts for AIGC video generation
-- Support reference image as guidance or the first/last frame of the video
-- Generate videos through automated Python script execution
+- Generate videos through RunningHub Vidu model (text-to-video-q3-turbo)
+- Support up to 16 seconds of video generation with audio
+- Automatic camera switching and dialogue generation

## Workflow
@@ -21,21 +22,16 @@ This skill generates high-quality videos using structured prompts and a Python s
When a user requests video generation, identify:

-- Subject/content: What should be in the image
+- Subject/content: What should be in the video
- Style preferences: Art style, mood, color palette
-- Technical specs: Aspect ratio, composition, lighting
-- Reference image: Any image to guide generation
+- Technical specs: Aspect ratio, resolution, duration
+- Audio requirements: Background music, dialogue, sound effects
- You don't need to check the folder under `/mnt/user-data`

### Step 2: Create Structured Prompt

Generate a structured JSON file in `/mnt/user-data/workspace/` with naming pattern: `{descriptive-name}.json`

-### Step 3: Create Reference Image (Optional when image-generation skill is available)
-
-Generate reference image for the video generation.
-- If only 1 image is provided, use it as the guided frame of the video
+The prompt should include visual descriptions, camera movements, and audio specifications in a natural language format.

### Step 3: Execute Generation
@@ -43,7 +39,6 @@ Call the Python script:
```bash
python /mnt/skills/public/video-generation/scripts/generate.py \
    --prompt-file /mnt/user-data/workspace/prompt-file.json \
-    --reference-images /path/to/ref1.jpg \
    --output-file /mnt/user-data/outputs/generated-video.mp4 \
    --aspect-ratio 16:9
```
@@ -51,20 +46,28 @@ python /mnt/skills/public/video-generation/scripts/generate.py \
Parameters:
- `--prompt-file`: Absolute path to JSON prompt file (required)
-- `--reference-images`: Absolute paths to reference image (optional)
-- `--output-file`: Absolute path to output image file (required)
-- `--aspect-ratio`: Aspect ratio of the generated image (optional, default: 16:9)
+- `--output-file`: Absolute path to output video file (required)
+- `--aspect-ratio`: Aspect ratio of the generated video (optional, default: 16:9)

[!NOTE]
Do NOT read the python file, instead just call it with the parameters.

+## Environment Variables
+
+Set the following environment variable before running the script:
+
+- `RUNNINGHUB_API_KEY`: Your RunningHub API key
+
+Example:
+```bash
+export RUNNINGHUB_API_KEY=your_api_key_here
+```
+
## Video Generation Example
User request: "Generate a short video clip depicting the opening scene from 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe'"

-Step 1: Search for the opening scene of "The Chronicles of Narnia: The Lion, the Witch and the Wardrobe" online
-
-Step 2: Create a JSON prompt file with the following content:
+Step 1: Create a JSON prompt file with the following content:

```json
{
@@ -108,16 +111,11 @@ Step 2: Create a JSON prompt file with the following content:
}
```

-Step 3: Use the image-generation skill to generate the reference image
-
-Load the image-generation skill and generate a single reference image `narnia-farewell-scene-01.jpg` according to the skill.
-
-Step 4: Use the generate.py script to generate the video
+Step 2: Use the generate.py script to generate the video

```bash
python /mnt/skills/public/video-generation/scripts/generate.py \
    --prompt-file /mnt/user-data/workspace/narnia-farewell-scene.json \
-    --reference-images /mnt/user-data/outputs/narnia-farewell-scene-01.jpg \
-    --output-file /mnt/user-data/outputs/narnia-farewell-scene-01.mp4 \
+    --output-file /mnt/user-data/outputs/narnia-farewell-scene.mp4 \
    --aspect-ratio 16:9
```

> Do NOT read the python file, just call it with the parameters.
@@ -127,7 +125,7 @@ python /mnt/skills/public/video-generation/scripts/generate.py \
After generation:

- Videos are typically saved in `/mnt/user-data/outputs/`
-- Share generated videos (come first) with user as well as generated image if applicable, using `present_files` tool
+- Share generated videos with user using `present_files` tool
- Provide brief description of the generation result
- Offer to iterate if adjustments needed
@@ -135,5 +133,7 @@ After generation:
- Always use English for prompts regardless of user's language
- JSON format ensures structured, parsable prompts
-- Reference image enhance generation quality significantly
+- RunningHub Vidu model supports up to 16 seconds of video generation
+- Audio is automatically generated, including dialogue and sound effects
+- The model has "director thinking" capability for automatic camera switching
- Iterative refinement is normal for optimal results

skills/public/video-generation/scripts/generate.py Normal file → Executable file
View File

@@ -1,75 +1,83 @@
-import base64
import os
import time
+from typing import List

import requests
+from dotenv import load_dotenv
+
+load_dotenv()


def generate_video(
    prompt_file: str,
-    reference_images: list[str],
+    reference_images: List[str],
    output_file: str,
    aspect_ratio: str = "16:9",
+    duration: str = "5",
) -> str:
-    with open(prompt_file, "r") as f:
+    with open(prompt_file, "r", encoding="utf-8") as f:
        prompt = f.read()
-    referenceImages = []
-    i = 0
-    json = {
-        "instances": [{"prompt": prompt}],
-    }
-    for reference_image in reference_images:
-        i += 1
-        with open(reference_image, "rb") as f:
-            image_b64 = base64.b64encode(f.read()).decode("utf-8")
-        referenceImages.append(
-            {
-                "image": {"mimeType": "image/jpeg", "bytesBase64Encoded": image_b64},
-                "referenceType": "asset",
-            }
-        )
-    if i > 0:
-        json["instances"][0]["referenceImages"] = referenceImages
-    api_key = os.getenv("GEMINI_API_KEY")
+
+    api_key = os.getenv("RUNNINGHUB_API_KEY")
    if not api_key:
-        return "GEMINI_API_KEY is not set"
+        return "RUNNINGHUB_API_KEY is not set"
+
+    json_data = {
+        "prompt": prompt,
+        "style": "general",
+        "aspectRatio": aspect_ratio,
+        "resolution": "720p",
+        "duration": duration,
+        "audio": True
+    }
+
    response = requests.post(
-        "https://generativelanguage.googleapis.com/v1beta/models/veo-3.1-generate-preview:predictLongRunning",
+        "https://www.runninghub.cn/openapi/v2/vidu/text-to-video-q3-turbo",
        headers={
-            "x-goog-api-key": api_key,
+            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
-        json=json,
+        json=json_data,
    )
-    json = response.json()
-    operation_name = json["name"]
+
+    response_json = response.json()
+    if "taskId" not in response_json:
+        return f"Failed to submit task: {response_json}"
+    task_id = response_json["taskId"]
+
    while True:
-        response = requests.get(
-            f"https://generativelanguage.googleapis.com/v1beta/{operation_name}",
+        response = requests.post(
+            "https://www.runninghub.cn/openapi/v2/query",
            headers={
-                "x-goog-api-key": api_key,
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json",
            },
+            json={"taskId": task_id},
        )
-        json = response.json()
-        if json.get("done", False):
-            sample = json["response"]["generateVideoResponse"]["generatedSamples"][0]
-            url = sample["video"]["uri"]
-            download(url, output_file)
-            break
+
+        response_json = response.json()
+        status = response_json.get("status")
+
+        if status == "SUCCESS":
+            results = response_json.get("results", [])
+            if results and len(results) > 0:
+                url = results[0].get("url")
+                if url:
+                    download(url, output_file)
+                    break
+        elif status == "FAILED":
+            error_message = response_json.get("errorMessage", "Unknown error")
+            return f"Video generation failed: {error_message}"
        time.sleep(3)

    return f"The video has been generated successfully to {output_file}"


def download(url: str, output_file: str):
-    api_key = os.getenv("GEMINI_API_KEY")
-    if not api_key:
-        return "GEMINI_API_KEY is not set"
-    response = requests.get(
-        url,
-        headers={
-            "x-goog-api-key": api_key,
-        },
-    )
+    response = requests.get(url)
    with open(output_file, "wb") as f:
        f.write(response.content)
@@ -77,28 +85,28 @@ def download(url: str, output_file: str):
if __name__ == "__main__":
    import argparse

-    parser = argparse.ArgumentParser(description="Generate videos using Gemini API")
+    parser = argparse.ArgumentParser(description="Generate videos using RunningHub API")
    parser.add_argument(
        "--prompt-file",
        required=True,
        help="Absolute path to JSON prompt file",
    )
-    parser.add_argument(
-        "--reference-images",
-        nargs="*",
-        default=[],
-        help="Absolute paths to reference images (space-separated)",
-    )
    parser.add_argument(
        "--output-file",
        required=True,
-        help="Output path for generated image",
+        help="Output path for generated video",
    )
    parser.add_argument(
        "--aspect-ratio",
        required=False,
        default="16:9",
-        help="Aspect ratio of the generated image",
+        help="Aspect ratio of the generated video",
+    )
+    parser.add_argument(
+        "--duration",
+        required=False,
+        default="5",
+        help="Duration of the generated video in seconds (1-16)",
    )

    args = parser.parse_args()
@@ -107,9 +115,10 @@ if __name__ == "__main__":
        print(
            generate_video(
                args.prompt_file,
-                args.reference_images,
+                [],
                args.output_file,
                args.aspect_ratio,
+                args.duration,
            )
        )
    except Exception as e: