263 Commits
v1.1.2 ... main

Author SHA1 Message Date
Harry
6cb5f23487 Merge pull request #692 from harry0703/dev
refactor: remove unnecessary close_clip calls in video processing
2025-05-16 11:03:36 +08:00
harry
83f0a54234 refactor: remove unnecessary close_clip calls in video processing 2025-05-16 11:02:59 +08:00
harry
235362b044 feat: update bug report and feature request templates 2025-05-13 16:10:53 +08:00
Harry
a789fe7e9a Merge pull request #674 from harry0703/dev 2025-05-13 16:04:15 +08:00
Harry
9a20328d7a Merge branch 'main' into dev 2025-05-13 15:51:59 +08:00
harry
fda81b2e9a feat: enhance bug report and feature request templates 2025-05-13 15:41:18 +08:00
harry
7ed4a1762d perf: set default font name to MicrosoftYaHeiBold.ttc in subtitle settings 2025-05-13 15:26:12 +08:00
harry
2bbbe5480e chore: remove PDM files and changelog script 2025-05-13 15:25:48 +08:00
harry
91e4d3ef72 feat: add Colab notebook and update documentation 2025-05-13 15:22:30 +08:00
Harry
4a33655ad7 feat: add provider ai pollinations (#667) (#671)
* feat: add provider ai pollinations

* update: enter line

---------

Co-authored-by: diepdo1810 <93646638+diepdo1810@users.noreply.github.com>
Co-authored-by: Diep Do <diepchiaser@gmail.com>
2025-05-13 10:53:32 +08:00
diepdo1810
95922908ce feat: add provider ai pollinations (#667)
* feat: add provider ai pollinations

* update: enter line

---------

Co-authored-by: Diep Do <diepchiaser@gmail.com>
2025-05-13 10:48:52 +08:00
Harry
8449303a90 Merge pull request #665 from harry0703/dev
docs: update Baidu cloud drive link to version 1.2.6
2025-05-11 12:48:29 +08:00
harry
7bee963a18 docs: update Baidu cloud drive link to version 1.2.6 2025-05-11 12:48:10 +08:00
Harry
e8f0db25ee Merge pull request #660 from harry0703/dev
Dev
2025-05-10 17:22:30 +08:00
harry
33245996c5 feat: add test for voice service 2025-05-10 17:21:13 +08:00
harry
4d5ca7f6f4 perf: validate Azure speech key and region before creating speech 2025-05-10 17:20:44 +08:00
Harry
0bfec956c5 Merge pull request #658 from harry0703/dev
bump version to 1.2.6
2025-05-10 14:14:42 +08:00
harry
fec3a8b6bd Merge branch 'add-siliconflow-tts' into dev 2025-05-10 14:13:37 +08:00
harry
3108c2e4e5 perf: bump version to 1.2.6 2025-05-10 14:13:18 +08:00
Harry
d8dd1f1acf Merge pull request #657 from harry0703/add-siliconflow-tts
feat: update SiliconFlow API Key descriptions in localization files
2025-05-10 14:12:11 +08:00
Harry
208ea5c11b Merge pull request #653 from yyhhyyyyyy/add-siliconflow-tts
feat: Increase SiliconFlow TTS services.
2025-05-10 14:11:26 +08:00
harry
71d791a9af feat: update SiliconFlow API Key descriptions in localization files 2025-05-10 14:10:42 +08:00
Harry
03a06f141c Merge pull request #655 from harry0703/dev
Dev
2025-05-10 13:27:27 +08:00
harry
4c9ac5e6df feat: loop video clips to match audio duration 2025-05-10 13:26:24 +08:00
harry
4a64e211f9 fix: correct condition for subclipping 2025-05-10 12:35:45 +08:00
harry
97c631e696 feat: improve file extension parsing using pathlib 2025-05-10 12:34:53 +08:00
harry
a601705bf4 feat: add unit tests 2025-05-10 12:34:37 +08:00
yyhhyyyyyy
45f32756a3 feat: increase siliconflow TTS services 2025-05-09 23:31:04 +08:00
yyhhyyyyyy
22f47d90de feat: add TTS services provider selection list 2025-05-09 22:14:43 +08:00
Harry
c03dc9c984 Merge pull request #652 from harry0703/dev
perf: optimize memory usage and processing performance, bump version to 1.2.5
2025-05-09 20:56:14 +08:00
harry
7569c08a62 perf: bump version to 1.2.5 2025-05-09 20:55:36 +08:00
harry
f07e5802f7 perf: optimize memory usage and processing performance 2025-05-09 20:55:12 +08:00
Harry
ffcfe8e03b Merge pull request #642 from harry0703/dev
feat: remove voice filter
2025-05-08 18:10:16 +08:00
harry
35a7ef657a feat: remove voice filter 2025-05-08 18:09:26 +08:00
Harry
250ec4f65c Merge pull request #641 from harry0703/dev
update
2025-05-08 17:39:44 +08:00
harry
5d0ffdad8a feat: update README.md for clarity and remove outdated information 2025-05-08 17:39:16 +08:00
harry
95e4d3170d feat: rename container names in docker-compose.yml 2025-05-08 17:35:12 +08:00
harry
dfa8328bb0 feat: optimize code 2025-05-08 17:34:51 +08:00
harry
5177c1871a feat: comment out interline and size parameters in video.py 2025-05-08 17:34:09 +08:00
Harry
1901c2905b Merge pull request #639 from harry0703/dev
feat: remove streamlit_authenticator
2025-05-08 15:53:06 +08:00
harry
b312c52a33 feat: remove streamlit_authenticator 2025-05-08 15:51:33 +08:00
Harry
fb974cefcf Merge pull request #638 from harry0703/dev
bump version to 1.2.4
2025-05-08 15:45:00 +08:00
harry
c7f7fa12b4 feat: optimize code and bump version to 1.2.4 2025-05-08 15:44:07 +08:00
harry
6a19e2bb29 feat: update requirements.txt and config.example.toml 2025-05-08 15:40:46 +08:00
Harry
443f5bf61e Merge pull request #632 from eren1106/fix-subtitle-bug
Fix subtitle generation not working by setting the default subtitle provider to "edge"
2025-05-08 09:10:19 +08:00
Harry
7d00e9c768 Merge pull request #617 from garylab/main
Solve subtitle header and footer was cut in some font family
2025-05-08 09:09:45 +08:00
Harry
c0ab0ba473 Merge pull request #614 from faycal-rakza/fix/comment
fix(dockerfile): comment fix
2025-05-08 09:08:55 +08:00
Gary Meng
4b2f9e42d7 Merge branch 'harry0703:main' into main 2025-05-07 11:28:57 +04:00
eren
4ce32a8851 fix: set default subtitle provider to 'edge' 2025-05-01 14:35:23 +08:00
yyhhyyyyyy
47e4cff758 feat: Add PDM support with auth & i18n enhancements (#627)
* feat: Add PDM support with auth & i18n enhancements

1. Added PDM project dependency management
   - Created pyproject.toml for dependency definitions
   - Added PDM lock file for reproducible builds
   - Created .pdm-python for virtual environment management

2. Enhanced authentication & configuration
   - Added user validation in base configuration
   - Implemented streamlit-authenticator for login functionality
   - Updated config.example.toml with user authentication fields

3. Improved internationalization (i18n)
   - Updated translation files for multiple languages (en, de, pt, vi, zh)
   - Enhanced i18n support in the web UI
   - Standardized translation structure across language files
2025-04-27 13:35:45 +08:00
Gary Meng
96e109e199 Solve subtitle header and footer was cut in some font family 2025-03-26 20:57:13 +04:00
Harry
36dffe8de3 Merge pull request #599 from bz-e/main
refactor: Refactor the get_all_azure_voices function
2025-03-23 18:45:26 +08:00
Harry
6d2e4a8081 Merge pull request #603 from garymengcom/main
Add get_all_tasks() endpoint and update .gitignore
2025-03-23 18:40:52 +08:00
faycal
a7c45b125f fix(dockerfile): comment fix 2025-03-09 00:23:55 +01:00
Guozao Meng
6c2b5b8cf4 Update .gitignore 2025-03-08 22:54:10 +04:00
Guozao Meng
91e9f3900d Add get_all_tasks() endpoint 2025-03-08 22:53:22 +04:00
evan.zhang5
ab1bd03f0b refactor: Refactor the get_all_azure_voices function to reduce the amount of code by half 2025-02-27 17:31:32 +08:00
Harry
cd0cbc8061 Merge pull request #583 from iorikingdom/main
Update requirements.txt
2025-02-10 11:08:23 +08:00
iorikingdom
c6c6390a83 Update requirements.txt 2025-02-09 02:26:43 +09:00
iorikingdom
6bfb9355cf Update requirements.txt 2025-02-09 02:20:21 +09:00
harry
34d785a246 feat: remove wechat qrcode 2025-02-07 17:07:06 +08:00
harry
c9bd480514 fix: ModuleNotFoundError: No module named 'app' 2025-02-07 17:06:26 +08:00
Harry
5349f29415 Merge pull request #579 from vipinbihari/patch-1
Update video.py - Fixing BackGround Music Volume Multiplier
2025-02-05 14:53:04 +08:00
VIPIN BIHARI
6500cafa4f Update video.py - Fixing BackGround Music Volume Multiplier
There was a typo in the MuiliplyVolume function parameter; the parameter should be named bgm_voice
2025-01-29 21:08:17 +05:30
yyhhyy
e2e92a433e feat: Add video transition effects (fadein, fadeout, slidein, slideout) 2025-01-23 12:13:04 +08:00
yyhhyyyyyy
dd90cfecbb feat: Added SlideIn and SlideOut video transition effects and optimized front-end implementation 2025-01-09 19:46:57 +08:00
yyhhyyyyyy
7a5b037ad8 feat: Add video transition effects (fadein, fadeout) 2024-12-24 22:39:48 +08:00
Harry
ee0d2371d5 Merge pull request #554 from yyhhyyyyyy/llm-logic
🐛 fix: fix the LLM logic
2024-12-12 16:54:09 +08:00
yyhhyyyyyy
c4586d37f5 🎨 style: format llm.py code 2024-12-12 14:32:17 +08:00
yyhhyyyyyy
2d8cd23fe7 🐛 fix: fix the LLM logic 2024-12-12 14:29:14 +08:00
Harry
85d446e2d0 Merge pull request #552 from yyhhyyyyyy/code-cleanup
🎨 style: Format Code
2024-12-10 14:45:11 +08:00
yyhhyyyyyy
afd064e15d 🎨 style: Format Code 2024-12-10 10:34:56 +08:00
Harry
809d6cabbb Merge pull request #548 from harry0703/dev
feat: add feature request template
2024-12-06 15:48:01 +08:00
harry
8058eed9ab feat: add feature request template 2024-12-06 15:47:04 +08:00
Harry
15ee6126a5 Merge pull request #547 from harry0703/dev
feat: add issue template
2024-12-06 15:37:45 +08:00
harry
b6a7ea2756 feat: add issue template 2024-12-06 15:37:23 +08:00
Harry
63c3402c94 Update version to 1.2.2 2024-12-06 13:45:43 +08:00
Harry
5a6dd6c7a5 Merge pull request #541 from yyhhyyyyyy/update-requirements
⬆️ deps: Upgrade dependencies to latest versions and address minor issues
2024-12-05 11:02:14 +08:00
yyhhyy
8c226322a0 Merge branch 'main' into update-requirements 2024-12-05 10:59:41 +08:00
Harry
3a7888937f Merge pull request #536 from Felix3322/main
better requirements.txt
2024-12-05 10:47:26 +08:00
yyhhyyyyyy
6760a0ad00 📝 docs: Update documentation 2024-12-05 10:34:09 +08:00
yyhhyyyyyy
6288b70ae2 ⬆️ deps: Upgrade dependencies to latest versions and address minor issues 2024-12-05 10:16:38 +08:00
Jiaying Liu
4adc010388 Update requirements.txt 2024-11-27 15:04:46 -05:00
Harry
162b5e17c3 Merge pull request #508 from flingjie/main
allow api key empty when using ollama
2024-11-20 15:45:40 +08:00
Harry
0d43ba2124 Merge pull request #505 from LucasHenriqueDiniz/main
feat: add PT-BR translation
2024-11-20 15:45:18 +08:00
Harry
080d8d82b4 Merge pull request #504 from Dreyfi/fix-403-error-pexels-request
Fix the response 403 from pexels - search_videos_pexels - failed to download videos, maybe the network is not available. if you are in China, please use a VPN.
2024-11-20 15:44:46 +08:00
Harry
fc50e16bc5 Merge pull request #486 from FLY-Open-AI/main
[Readme] Docker deployment: optimized the startup command
2024-11-20 15:44:08 +08:00
Jie.F
345b6d59a1 allow api key empty when using ollama
the ollama API key is not required
2024-10-08 09:44:39 +08:00
Dreyfi
4ec19fd56a Add headers with user_agent to save_video request 2024-09-30 15:48:54 +10:00
Lucas Diniz
136630ec60 feat: add PT-BR translation 2024-09-29 19:30:12 -03:00
Dreyfi
9d3d99a595 Fix the response 403 from pexels
search_videos_pexels - failed to download videos, maybe the network is not available. if you are in China, please use a VPN.
2024-09-28 16:25:53 +10:00
wangyanfei
747c745ec0 [Readme] Docker deployment: optimized the startup command. The latest Docker releases install docker compose as a plugin, so the startup command is now `docker compose up` 2024-08-31 07:22:05 +08:00
Harry
a53ca843e8 Merge pull request #467 from harry0703/dev
update readme
2024-07-26 18:23:52 +08:00
harry
8b18d84d8a update readme 2024-07-26 18:23:04 +08:00
Harry
edc4df6eb5 Merge pull request #466 from harry0703/dev
fixed: subtitle generation failure
2024-07-26 17:56:32 +08:00
harry
5ed98d317c fixed: subtitle generation failure 2024-07-26 17:55:26 +08:00
Harry
c22ef5f1d2 Merge pull request #462 from harry0703/dev
update readme
2024-07-25 15:00:07 +08:00
harry
bcc9621976 update readme 2024-07-25 14:59:45 +08:00
Harry
6512e3f140 Merge pull request #461 from harry0703/dev
Optimize memory usage in moviepy
2024-07-25 13:58:46 +08:00
harry
931e1a0caa Optimize memory usage in moviepy
Upgrade version number to 1.2.0
2024-07-25 13:57:39 +08:00
yyhhyy
84ae8e5248 Merge pull request #460 from yyhhyyyyyy/code-formatting
Code Formatting
2024-07-25 13:39:05 +08:00
yyhhyyyyyy
5c2db3aa92 resolve issue with video concatenation
order always being random
2024-07-25 13:36:21 +08:00
yyhhyyyyyy
905841965a Format project code 2024-07-24 14:59:06 +08:00
Harry
bbd4e94941 Merge pull request #459 from yyhhyyyyyy/customize-subtitle-position
feat: support custom subtitle positioning
2024-07-24 14:35:50 +08:00
yyhhyyyyyy
b89250874b Change default value to 70.0 2024-07-24 14:31:56 +08:00
yyhhyyyyyy
e8b20c697d feat: support custom subtitle positioning 2024-07-24 14:25:20 +08:00
Harry
e64041c93d Merge pull request #458 from yyhhyyyyyy/refactor-task-add-subtitle-api
Refactor task.py and add subtitle API
2024-07-24 11:47:27 +08:00
yyhhyyyyyy
17b4a61e64 1.Refactor task.py to encapsulate separable functions.
2.Add a new subtitle API.
2024-07-23 17:00:23 +08:00
Harry
6d520a4266 Merge pull request #453 from yyhhyyyyyy/fit-oneapi
fit(oneapi):Fix the issue where model_name is always empty when using OneAPI as the LLM source.
2024-07-22 10:38:10 +08:00
yyhhyyyyyy
7ff8467f9d Fix the issue where model_name is always empty
when using OneAPI as the LLM source.
2024-07-20 09:36:19 +08:00
Harry
4cf9cefb5c Merge pull request #450 from yyhhyyyyyy/fit-subtitle-correct
fit(subtitle):Fix subtitle correction logic
2024-07-20 08:25:25 +08:00
yyhhyyyyyy
33534db8bb 1. .gitignore ignores the models folder
2. Fix subtitle correction logic
2024-07-19 15:00:17 +08:00
Harry
ec16f1c41b Merge pull request #449 from harry0703/dev
update readme
2024-07-19 14:21:56 +08:00
harry
9653d7d18a update readme 2024-07-19 14:21:35 +08:00
Harry
36a367d713 Merge pull request #448 from yyhhyyyyyy/add-rate
feat(azure_tts_v1): Allows to control the speed of speech generation.
2024-07-19 14:17:15 +08:00
yyhhyyyyyy
77b304537a Speech Rate 2024-07-19 11:15:36 +08:00
yyhhyyyyyy
63fb848a17 1. Add azure_tts_v1 to control the speed of speech 2024-07-19 11:06:34 +08:00
Harry
6853163905 Merge pull request #447 from harry0703/dev
update readme
2024-07-15 14:09:55 +08:00
harry
052c29b579 update readme 2024-07-15 14:09:33 +08:00
Harry
df62529f2a Merge pull request #443 from harry0703/dev
update readme
2024-07-09 13:41:04 +08:00
harry
934eff13ae update readme 2024-07-09 13:40:43 +08:00
Harry
0472338184 Merge pull request #437 from harry0703/dev
support baidu ERNIE llm
2024-07-03 21:13:51 +08:00
harry
66c81a04bf support baidu ERNIE llm 2024-07-03 21:12:21 +08:00
Harry
8dd66cf624 Merge pull request #435 from harry0703/dev
update readme
2024-07-02 10:00:53 +08:00
harry
dca23d99e4 update readme 2024-07-02 09:57:53 +08:00
Harry
42560cc7f5 Merge pull request #421 from harry0703/dev
update readme
2024-06-21 11:01:41 +08:00
harry
11478063e7 update readme 2024-06-21 11:01:15 +08:00
Harry
bf0dbcc045 Merge pull request #414 from harry0703/dev
update readme
2024-06-15 17:37:36 +08:00
harry
43df593ac3 update readme 2024-06-15 17:36:37 +08:00
Harry
7cf21c6541 Merge pull request #408 from harry0703/dev
update readme
2024-06-11 11:50:48 +08:00
harry
f76f905833 update readme 2024-06-11 11:48:04 +08:00
Harry
0f27c26042 Merge pull request #399 from harry0703/dev
update readme
2024-06-04 10:36:18 +08:00
harry
e1d7318cee update readme 2024-06-04 10:34:32 +08:00
Harry
6408c31b7f Merge pull request #391 from harry0703/dev
update readme
2024-05-28 18:41:24 +08:00
harry
b0d694db08 update readme 2024-05-28 14:51:03 +08:00
Harry
730c2a461a Merge pull request #381 from harry0703/dev
update readme
2024-05-23 18:21:05 +08:00
harry
bdb49a4c82 update readme 2024-05-23 18:20:45 +08:00
Harry
a4692060a0 Merge pull request #372 from harry0703/dev
enhanced exception handling for generating terms
2024-05-17 17:12:13 +08:00
harry
fc6844dd19 enhanced exception handling for generating terms 2024-05-17 17:11:35 +08:00
Harry
d740a6babd Merge pull request #370 from harry0703/dev
update readme
2024-05-17 08:44:01 +08:00
harry
9c58991830 update readme 2024-05-17 08:43:35 +08:00
Harry
09ad53a60e Merge pull request #369 from harry0703/dev
update version to 1.1.9
2024-05-16 16:36:54 +08:00
harry
bb4333db95 update version to 1.1.9 2024-05-16 16:36:26 +08:00
Harry
dc460c25d2 Merge pull request #368 from harry0703/dev
enhanced exception handling for llm and optimized video concatenation
2024-05-16 16:35:56 +08:00
harry
4ca8d8d8ae enhanced exception handling for llm and optimized video concatenation 2024-05-16 16:34:31 +08:00
harry
37e56239d9 remove log 2024-05-16 16:23:58 +08:00
Harry
ba8613baeb Merge pull request #366 from ATtendev/ft/shuffle
update shuffle video
2024-05-16 15:10:09 +08:00
Harry
fd8ed3bd69 Merge pull request #367 from harry0703/dev
update readme
2024-05-16 15:07:59 +08:00
harry
1b8b65f642 update readme 2024-05-16 15:07:31 +08:00
AT
2128a5fae5 chore: true shuffle video
- random duration splits video
- shuffle splits video
2024-05-16 10:11:10 +07:00
Harry
2976652509 Merge pull request #365 from harry0703/dev
update version to 1.1.8
2024-05-15 22:42:15 +08:00
harry
eb2b07b615 update version to 1.1.8 2024-05-15 22:41:52 +08:00
Harry
289c06799a Merge pull request #364 from harry0703/dev
support pixabay
2024-05-15 22:40:22 +08:00
harry
afb4eff3e5 support pixabay 2024-05-15 22:39:01 +08:00
harry
bdf7af0a12 fix the bug with the incorrect bgm_type parameter 2024-05-15 11:33:16 +08:00
Harry
ee931d1933 Merge pull request #362 from harry0703/dev
update readme
2024-05-14 13:48:43 +08:00
harry
cbd3495426 update readme 2024-05-14 13:48:18 +08:00
Harry
a864be83da Merge pull request #357 from harry0703/dev
support deepseek llm
2024-05-13 21:49:21 +08:00
harry
fee226a149 support deepseek llm 2024-05-13 21:48:53 +08:00
Harry
b3f549b4db Merge pull request #356 from harry0703/dev
update readme
2024-05-13 18:37:29 +08:00
harry
4c21a23a8b update readme 2024-05-13 18:36:58 +08:00
Harry
8d84b5b530 Merge pull request #355 from harry0703/dev
support voice preview, update version to 1.1.6
2024-05-13 18:34:13 +08:00
harry
6de3d6eedc 1. support voice preview
2. update version to 1.1.6
2024-05-13 18:29:59 +08:00
Harry
ee680d24cc Merge pull request #348 from harry0703/dev
update example configuration
2024-05-10 14:49:53 +08:00
harry
fb3aadeccc update example configuration 2024-05-10 14:49:18 +08:00
Harry
5254577e5c Merge pull request #346 from harry0703/dev
update wechat qrcode
2024-05-10 08:46:23 +08:00
harry
c15a3bdd13 update wechat qrcode 2024-05-10 08:46:01 +08:00
Harry
d8d8a5b602 Merge pull request #340 from harry0703/dev
update readme
2024-05-07 09:59:48 +08:00
harry
e258617d4f update readme 2024-05-07 09:58:07 +08:00
Harry
ce64602c08 Merge pull request #338 from harry0703/dev
update readme
2024-05-05 18:09:02 +08:00
harry
f729124a48 update readme 2024-05-05 18:08:32 +08:00
Harry
367018b3f7 Merge pull request #337 from harry0703/dev
update readme
2024-05-05 18:07:52 +08:00
harry
b055755689 update readme 2024-05-05 18:07:30 +08:00
Harry
c8894a851b Merge pull request #332 from harry0703/dev
update readme
2024-04-28 22:52:40 +08:00
harry
28c5bc372f update readme 2024-04-28 22:50:49 +08:00
Harry
7ed2603442 Merge pull request #331 from harry0703/dev
update readme
2024-04-28 17:20:32 +08:00
harry
1f8b41d2b3 update readme 2024-04-28 17:19:59 +08:00
Harry
ee09cf64d5 Merge pull request #330 from harry0703/dev
update readme
2024-04-28 17:17:16 +08:00
harry
4e886a1a73 update readme 2024-04-28 17:15:48 +08:00
Harry
661d8cb5ab Merge pull request #325 from harry0703/dev
support local videos
2024-04-27 08:52:10 +08:00
harry
4de02f4429 update version and readme 2024-04-27 08:35:40 +08:00
harry
5d06530a39 support local videos 2024-04-27 08:33:44 +08:00
Harry
4596804bcf Merge pull request #321 from harry0703/dev
update tips and add hide_config
2024-04-26 14:49:20 +08:00
harry
c161ab3124 add hide_config 2024-04-26 14:48:33 +08:00
harry
376955d4a0 update tips 2024-04-26 14:43:11 +08:00
Harry
b08b79f9cf Merge pull request #320 from harry0703/dev
1. Added exception handling when loading the Whisper model. 2. Check if the file exists before verifying the subtitles.
2024-04-26 11:15:17 +08:00
harry
7582022fa9 1. Added exception handling when loading the Whisper model.
2. Check if the file exists before verifying the subtitles.
2024-04-26 11:14:35 +08:00
Harry
8dbce05344 Merge pull request #319 from harry0703/dev
update readme
2024-04-26 11:09:09 +08:00
harry
4d91a83858 update readme 2024-04-26 11:08:44 +08:00
Harry
8e93dd7ca0 Merge pull request #318 from KPCOFGS/main
Updated the English README file
2024-04-26 10:28:30 +08:00
Shi Sheng
d2277715df Update README-en.md 2024-04-25 15:06:29 -04:00
Shi Sheng
8861526e0a Update README-en.md 2024-04-25 15:03:51 -04:00
Shi Sheng
f38fc60394 Update README-en.md 2024-04-25 15:01:56 -04:00
Shi Sheng
e77bff3ffb Update README-en.md 2024-04-25 15:00:53 -04:00
Shi Sheng
59a518ce9d Update README-en.md 2024-04-25 14:59:39 -04:00
Harry
d922ff2576 Merge pull request #315 from harry0703/dev
update readme
2024-04-25 10:50:53 +08:00
harry
84b3ef13c0 update readme 2024-04-25 10:49:28 +08:00
Harry
a53da162ac Merge pull request #311 from harry0703/dev
RUN chmod 777 /MoneyPrinterTurbo
2024-04-24 13:48:08 +08:00
harry
e77389ffb5 RUN chmod 777 /MoneyPrinterTurbo 2024-04-24 13:47:37 +08:00
Harry
2be09365d9 Merge pull request #310 from harry0703/dev
update readme
2024-04-24 13:46:25 +08:00
harry
f85c11118d update readme 2024-04-24 13:45:57 +08:00
Kevin Zhang
419abd760e Merge pull request #290 from KevinZhang19870314/main
doc: landing for MoneyPrinterTurbo
2024-04-23 10:58:07 +08:00
Harry
5c0a905c09 Merge pull request #302 from harry0703/dev
update docker base image
2024-04-22 18:33:34 +08:00
harry
3c5ef29775 update docker base image 2024-04-22 18:33:00 +08:00
Harry
bc45d4bcf3 Merge pull request #301 from harry0703/dev
optimize UI and code
2024-04-22 17:59:40 +08:00
harry
e38c79bfad rollback docker base image 2024-04-22 17:59:04 +08:00
harry
c7c7b4847e optimize code 2024-04-22 16:25:13 +08:00
harry
6a5c4e9e73 optimize UI 2024-04-22 16:24:32 +08:00
harry
2eb6d4b5cc update example configuration and library 2024-04-22 16:00:33 +08:00
Harry
13f3abffd0 Merge pull request #294 from harry0703/dev
fixed: WS_OPEN_ERROR_UNDERLYING_IO_OPEN_FAILED
2024-04-22 08:41:10 +08:00
harry
6164920eaa fixed: WS_OPEN_ERROR_UNDERLYING_IO_OPEN_FAILED, thanks to https://github.com/oicid 2024-04-22 08:40:32 +08:00
kevin.zhang
a8d528f41c Merge branch 'main' of https://github.com/KevinZhang19870314/MoneyPrinterTurbo 2024-04-19 14:23:59 +08:00
kevin.zhang
259b3e94fc doc: sites 2024-04-19 14:22:45 +08:00
Kevin Zhang
ffcb52fd46 Merge pull request #286 from KevinZhang19870314/dev 2024-04-18 18:18:32 +08:00
kevin.zhang
ee4337e847 chore: test sites changes 2024-04-18 18:08:47 +08:00
kevin.zhang
16bcec96ee chore: fix 2024-04-18 18:07:53 +08:00
kevin.zhang
cc231e0b62 chore: fix 2024-04-18 18:07:10 +08:00
kevin.zhang
cb2c53334e chore: fix 2024-04-18 17:58:58 +08:00
kevin.zhang
643b0fb30c chore: fix 2024-04-18 17:43:29 +08:00
kevin.zhang
6b97e7dbd4 chore: fix 2024-04-18 17:38:13 +08:00
kevin.zhang
4804aa1e04 chore: ci/cd 2024-04-18 17:35:37 +08:00
kevin.zhang
c04e3988fe Merge branch 'main' of https://github.com/KevinZhang19870314/MoneyPrinterTurbo 2024-04-18 16:15:29 +08:00
kevin.zhang
c9eb16e0a9 chore: fix 2024-04-18 15:44:57 +08:00
kevin.zhang
2d599db892 feat: add sites for MoneyPrinterTurbo 2024-04-18 15:23:02 +08:00
Harry
ec50cd0184 Merge pull request #285 from vuisme/main
Update requirements.txt
2024-04-18 15:10:22 +08:00
cpanel10x
a1a1a51881 Update requirements.txt
Fix error
streamlit 1.32.0 depends on packaging<24 
git-changelog 2.5.2 depends on packaging>=24.0
2024-04-18 14:01:12 +07:00
Harry
0066bab3ec Merge pull request #283 from KevinZhang19870314/dev
chore: add changelog auto gen
2024-04-18 10:41:40 +08:00
kevin.zhang
2f461d961c Merge branch 'dev' of https://github.com/KevinZhang19870314/MoneyPrinterTurbo into dev 2024-04-18 10:29:22 +08:00
kevin.zhang
77c250ce18 chore: add gitattributes 2024-04-18 10:29:05 +08:00
kevin.zhang
3f941c8dd9 chore: add changelog auto gen 2024-04-18 09:48:58 +08:00
Harry
ab5ae7072b Merge pull request #280 from harry0703/dev
v1.1.3
2024-04-17 17:26:47 +08:00
harry
93da539519 update version to 1.1.3 2024-04-17 17:25:28 +08:00
harry
5280159f41 added microsoft yahei fonts 2024-04-17 17:25:10 +08:00
harry
729c407c30 fixed: ValidationError: 1 validation error for VideoParams video_subject Field required 2024-04-17 17:24:00 +08:00
harry
73ec0cf7ad Merge remote-tracking branch 'origin/dev' into dev 2024-04-17 17:17:55 +08:00
harry
a5273b31b3 Merge branch 'main' into dev 2024-04-17 17:13:41 +08:00
Harry
f83c374ef4 Merge pull request #277 from vuisme/main
Add Vietnamese
2024-04-17 17:12:18 +08:00
Harry
da010c476f Merge pull request #278 from KevinZhang19870314/dev
chore: optimize Dockerfile
2024-04-17 17:10:50 +08:00
kevin.zhang
78f5ce7cdd Merge branch 'dev' of https://github.com/KevinZhang19870314/MoneyPrinterTurbo into dev 2024-04-17 16:02:06 +08:00
kevin.zhang
27553114b0 chore: optimize Dockerfile 2024-04-17 16:01:45 +08:00
vuisme
4ae022c059 clean 2024-04-17 11:07:20 +07:00
vuisme
70ed2b5c82 clean 2024-04-17 11:07:07 +07:00
vuisme
89f001742a rm 2024-04-17 11:04:45 +07:00
cpanel10x
6c8b4f665d Merge branch 'harry0703:main' into main 2024-04-17 10:59:22 +07:00
vuisme
1c35e50563 Add vietnamese and sample font Vietnamese. String pre-translated by chatGPT 2024-04-17 10:57:16 +07:00
Harry
add34a92f7 Merge pull request #267 from KevinZhang19870314/dev
feat: add support for maximum concurrency of /api/v1/videos
2024-04-17 11:09:03 +08:00
Harry
a44016a7cf Merge pull request #275 from harry0703/dev
add timeout
2024-04-17 10:56:18 +08:00
harry
0521d46826 add timeout 2024-04-17 10:55:57 +08:00
cpanel10x
f48fb24dcf Merge branch 'harry0703:main' into main 2024-04-17 09:47:16 +07:00
Harry
996cd55462 Merge pull request #272 from harry0703/dev
update readme
2024-04-16 22:15:23 +08:00
harry
97640d2199 update readme 2024-04-16 22:14:51 +08:00
Kevin Zhang
f6857d63f9 Merge branch 'harry0703:dev' into dev 2024-04-16 17:51:46 +08:00
kevin.zhang
abe12abd7b feat: add support for maximum concurrency of /api/v1/videos 2024-04-16 17:47:56 +08:00
cpanel10x
7e8c901fd4 Merge branch 'harry0703:main' into main 2024-04-16 15:28:30 +07:00
Harry
1cee2bbb8d Merge pull request #266 from harry0703/dev
update readme
2024-04-16 14:25:57 +08:00
harry
4d176c6107 update readme 2024-04-16 14:25:35 +08:00
Harry
e499f8a1c4 Merge pull request #265 from harry0703/dev
update readme
2024-04-16 09:48:28 +08:00
harry
d82b5f76e0 update readme 2024-04-16 09:47:01 +08:00
cpanel10x
1ac643f4d0 Merge branch 'harry0703:main' into main 2024-04-10 16:01:43 +07:00
cpanel10x
fea396585f Merge branch 'harry0703:main' into main 2024-04-10 14:19:13 +07:00
cpanel10x
3fe6ff42c8 Merge branch 'harry0703:main' into main 2024-04-09 16:04:36 +07:00
cpanel10x
a71e54fd7c Merge branch 'harry0703:main' into main 2024-03-28 10:22:36 +07:00
cpanel10x
fee660fb8c Added Dev Container Folder 2024-03-27 15:27:34 +07:00
73 changed files with 4188 additions and 1383 deletions

.github/ISSUE_TEMPLATE/bug_report.yml (new file, 87 lines)

@@ -0,0 +1,87 @@
name: 🐛 Bug | Bug Report
description: 报告错误或异常问题 | Report an error or unexpected behavior
title: "[Bug]: "
labels:
  - bug
body:
  - type: markdown
    attributes:
      value: |
        **提交问题前,请确保您已阅读以下文档:[Getting Started (English)](https://github.com/harry0703/MoneyPrinterTurbo/blob/main/README-en.md#system-requirements-) 或 [快速开始 (中文)](https://github.com/harry0703/MoneyPrinterTurbo/blob/main/README.md#%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B-)。**
        **Before submitting an issue, please make sure you've read the following documentation: [Getting Started (English)](https://github.com/harry0703/MoneyPrinterTurbo/blob/main/README-en.md#system-requirements-) or [快速开始 (Chinese)](https://github.com/harry0703/MoneyPrinterTurbo/blob/main/README.md#%E5%BF%AB%E9%80%9F%E5%BC%80%E5%A7%8B-).**
  - type: textarea
    attributes:
      label: 问题描述 | Current Behavior
      description: |
        描述您遇到的问题
        Describe the issue you're experiencing
      placeholder: |
        当我执行...操作时,程序出现了...问题
        When I perform..., the program shows...
    validations:
      required: true
  - type: textarea
    attributes:
      label: 重现步骤 | Steps to Reproduce
      description: |
        详细描述如何重现此问题
        Describe in detail how to reproduce this issue
      placeholder: |
        1. 打开...
        2. 点击...
        3. 出现错误...
        1. Open...
        2. Click on...
        3. Error occurs...
    validations:
      required: true
  - type: textarea
    attributes:
      label: 错误日志 | Error Logs
      description: |
        请提供相关错误信息或日志(注意不要包含敏感信息)
        Please provide any error messages or logs (be careful not to include sensitive information)
      placeholder: |
        错误信息、日志或截图...
        Error messages, logs, or screenshots...
    validations:
      required: true
  - type: input
    attributes:
      label: Python 版本 | Python Version
      description: |
        您使用的 Python 版本
        The Python version you're using
      placeholder: v3.13.0, v3.10.0, etc.
    validations:
      required: true
  - type: input
    attributes:
      label: 操作系统 | Operating System
      description: |
        您的操作系统信息
        Your operating system information
      placeholder: macOS 14.1, Windows 11, Ubuntu 22.04, etc.
    validations:
      required: true
  - type: input
    attributes:
      label: MoneyPrinterTurbo 版本 | Version
      description: |
        您使用的 MoneyPrinterTurbo 版本
        The version of MoneyPrinterTurbo you're using
      placeholder: v1.2.2, etc.
    validations:
      required: true
  - type: textarea
    attributes:
      label: 补充信息 | Additional Information
      description: |
        其他对解决问题有帮助的信息(如截图、视频等)
        Any other information that might help solve the issue (screenshots, videos, etc.)
    validations:
      required: false

.github/ISSUE_TEMPLATE/config.yml (new file, 1 line)

@@ -0,0 +1 @@
blank_issues_enabled: false

Feature request issue template (new file, 29 lines)

@@ -0,0 +1,29 @@
name: ✨ 增加功能 | Feature Request
description: 为此项目提出一个新想法或建议 | Suggest a new idea for this project
title: "[Feature]: "
labels:
  - enhancement
body:
  - type: textarea
    attributes:
      label: 需求描述 | Problem Statement
      description: |
        请描述您希望解决的问题或需求
        Please describe the problem you want to solve
      placeholder: |
        我在使用过程中遇到了...
        I encountered... when using this project
    validations:
      required: true
  - type: textarea
    attributes:
      label: 建议的解决方案 | Proposed Solution
      description: |
        请描述您认为可行的解决方案或实现方式
        Please describe your suggested solution or implementation
      placeholder: |
        可以考虑添加...功能来解决这个问题
        Consider adding... feature to address this issue
    validations:
      required: true

.gitignore (14 lines changed)

@@ -12,3 +12,17 @@
/**/.streamlit
__pycache__
logs/
node_modules
# VuePress 默认临时文件目录
/sites/docs/.vuepress/.temp
# VuePress 默认缓存目录
/sites/docs/.vuepress/.cache
# VuePress 默认构建生成的静态文件目录
/sites/docs/.vuepress/dist
# 模型目录
/models/
./models/*
venv/
.venv

Dockerfile

@@ -1,9 +1,12 @@
# Use an official Python runtime as a parent image
FROM python:3.10-slim
FROM python:3.11-slim-bullseye
# Set the working directory in the container
WORKDIR /MoneyPrinterTurbo
# 设置/MoneyPrinterTurbo目录权限为777
RUN chmod 777 /MoneyPrinterTurbo
ENV PYTHONPATH="/MoneyPrinterTurbo"
# Install system dependencies
@@ -16,12 +19,15 @@ RUN apt-get update && apt-get install -y \
# Fix security policy for ImageMagick
RUN sed -i '/<policy domain="path" rights="none" pattern="@\*"/d' /etc/ImageMagick-6/policy.xml
# Copy the current directory contents into the container at /MoneyPrinterTurbo
COPY . .
# Copy only the requirements.txt first to leverage Docker cache
COPY requirements.txt ./
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Now copy the rest of the codebase into the image
COPY . .
# Expose the port the app runs on
EXPOSE 8501
@@ -35,4 +41,4 @@ CMD ["streamlit", "run", "./webui/Main.py","--browser.serverAddress=127.0.0.1","
## For Linux or MacOS:
# docker run -v $(pwd)/config.toml:/MoneyPrinterTurbo/config.toml -v $(pwd)/storage:/MoneyPrinterTurbo/storage -p 8501:8501 moneyprinterturbo
## For Windows:
# docker run -v %cd%/config.toml:/MoneyPrinterTurbo/config.toml -v %cd%/storage:/MoneyPrinterTurbo/storage -p 8501:8501 moneyprinterturbo
# docker run -v ${PWD}/config.toml:/MoneyPrinterTurbo/config.toml -v ${PWD}/storage:/MoneyPrinterTurbo/storage -p 8501:8501 moneyprinterturbo

README-en.md

@@ -10,9 +10,9 @@
<h3>English | <a href="README.md">简体中文</a></h3>
> Thanks to [RootFTW](https://github.com/Root-FTW) for the translation
<div align="center">
<a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FMoneyPrinterTurbo | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
Simply provide a <b>topic</b> or <b>keyword</b> for a video, and it will automatically generate the video copy, video
materials, video subtitles, and video background music before synthesizing a high-definition short video.
@@ -35,10 +35,19 @@ like to express our special thanks to
**RecCloud (AI-Powered Multimedia Service Platform)** for providing a free `AI Video Generator` service based on this
project. It allows for online use without deployment, which is very convenient.
https://reccloud.com
- Chinese version: https://reccloud.cn
- English version: https://reccloud.com
![](docs/reccloud.com.jpg)
## Thanks for Sponsorship 🙏
Thanks to Picwish https://picwish.com for supporting and sponsoring this project, enabling continuous updates and maintenance.
Picwish focuses on the **image processing field**, providing a rich set of **image processing tools** that extremely simplify complex operations, truly making image processing easier.
![picwish.jpg](docs/picwish.com.jpg)
## Features 🎯
- [x] Complete **MVC architecture**, **clearly structured** code, easy to maintain, supports both `API`
@@ -51,29 +60,22 @@ https://reccloud.com
satisfactory one
- [x] Supports setting the **duration of video clips**, facilitating adjustments to material switching frequency
- [x] Supports video copy in both **Chinese** and **English**
- [x] Supports **multiple voice** synthesis
- [x] Supports **multiple voice** synthesis, with **real-time preview** of effects
- [x] Supports **subtitle generation**, with adjustable `font`, `position`, `color`, `size`, and also
supports `subtitle outlining`
- [x] Supports **background music**, either random or specified music files, with adjustable `background music volume`
- [x] Video material sources are **high-definition** and **royalty-free**
- [x] Supports integration with various models such as **OpenAI**, **moonshot**, **Azure**, **gpt4free**, **one-api**,
**qianwen**, **Google Gemini**, **Ollama** and more
❓[How to Use the Free OpenAI GPT-3.5 Model?](https://github.com/harry0703/MoneyPrinterTurbo/blob/main/README-en.md#common-questions-)
- [x] Video material sources are **high-definition** and **royalty-free**, and you can also use your own **local materials**
- [x] Supports integration with various models such as **OpenAI**, **Moonshot**, **Azure**, **gpt4free**, **one-api**, **Qwen**, **Google Gemini**, **Ollama**, **DeepSeek**, **ERNIE**, **Pollinations** and more
### Future Plans 📅
- [ ] Introduce support for GPT-SoVITS dubbing
- [ ] Enhance voice synthesis with large models for a more natural and emotionally resonant voice output
- [ ] Incorporate video transition effects to ensure a smoother viewing experience
- [ ] Improve the relevance of video content
- [ ] Add options for video length: short, medium, long
- [ ] Package the application into a one-click launch bundle for Windows and macOS for ease of use
- [ ] Enable the use of custom materials
- [ ] Offer voiceover and background music options with real-time preview
- [ ] Support a wider range of voice synthesis providers, such as OpenAI TTS, Azure TTS
- [ ] Automate the upload process to the YouTube platform
- [ ] GPT-SoVITS dubbing support
- [ ] Optimize voice synthesis using large models for more natural and emotionally rich voice output
- [ ] Add video transition effects for a smoother viewing experience
- [ ] Add more video material sources, improve the matching between video materials and script
- [ ] Add video length options: short, medium, long
- [ ] Support more voice synthesis providers, such as OpenAI TTS
- [ ] Automate upload to YouTube platform
## Video Demos 📺
@@ -111,10 +113,34 @@ https://reccloud.com
</tbody>
</table>
## System Requirements 📦
- Recommended minimum 4 CPU cores or more, 4G of memory or more, GPU is not required
- Windows 10 or MacOS 11.0, and their later versions
## Quick Start 🚀
### Run in Google Colab
Want to try MoneyPrinterTurbo without setting up a local environment? Run it directly in Google Colab!
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/harry0703/MoneyPrinterTurbo/blob/main/docs/MoneyPrinterTurbo.ipynb)
### Windows
Google Drive (v1.2.6): https://drive.google.com/file/d/1HsbzfT7XunkrCrHw5ncUjFX8XX4zAuUh/view?usp=sharing
After downloading, it is recommended to **double-click** `update.bat` first to update to the **latest code**, then double-click `start.bat` to launch
After launching, the browser will open automatically (if it opens blank, it is recommended to use **Chrome** or **Edge**)
### Other Systems
One-click startup packages have not been created yet. See the **Installation & Deployment** section below. It is recommended to use **docker** for deployment, which is more convenient.
## Installation & Deployment 📥
- Try to avoid using **Chinese paths** to prevent unpredictable issues
- Ensure your **network** is stable, meaning you can access foreign websites normally
### Prerequisites
#### ① Clone the Project
@@ -128,11 +154,6 @@ git clone https://github.com/harry0703/MoneyPrinterTurbo.git
- Follow the instructions in the `config.toml` file to configure `pexels_api_keys` and `llm_provider`, and according to
the llm_provider's service provider, set up the corresponding API Key
#### ③ Configure Large Language Models (LLM)
- To use `GPT-4.0` or `GPT-3.5`, you need an `API Key` from `OpenAI`. If you don't have one, you can set `llm_provider`
to `g4f` (a free-to-use GPT library https://github.com/xtekky/gpt4free)
### Docker Deployment 🐳
#### ① Launch the Docker Container
@@ -148,6 +169,8 @@ cd MoneyPrinterTurbo
docker-compose up
```
> Note: The latest version of Docker will automatically install docker compose as a plugin, and the start command is adjusted to `docker compose up`
#### ② Access the Web Interface
Open your browser and visit http://0.0.0.0:8501
@@ -160,13 +183,12 @@ Open your browser and visit http://0.0.0.0:8080/docs Or http://0.0.0.0:8080/redo
#### ① Create a Python Virtual Environment
It is recommended to create a Python virtual environment
using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)
It is recommended to create a Python virtual environment using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html)
```shell
git clone https://github.com/harry0703/MoneyPrinterTurbo.git
cd MoneyPrinterTurbo
conda create -n MoneyPrinterTurbo python=3.10
conda create -n MoneyPrinterTurbo python=3.11
conda activate MoneyPrinterTurbo
pip install -r requirements.txt
```
@@ -175,10 +197,9 @@ pip install -r requirements.txt
###### Windows:
- Download https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
- Download https://imagemagick.org/script/download.php Choose the Windows version, make sure to select the **static library** version, such as ImageMagick-7.1.1-32-Q16-x64-**static**.exe
- Install the downloaded ImageMagick, **do not change the installation path**
- Modify the `config.toml` configuration file, set `imagemagick_path` to your actual installation path (if you didn't
change the path during installation, just uncomment it)
- Modify the `config.toml` configuration file, set `imagemagick_path` to your actual installation path
###### MacOS:
@@ -205,14 +226,12 @@ Note that you need to execute the following commands in the `root directory` of
###### Windows
```bat
conda activate MoneyPrinterTurbo
webui.bat
```
###### MacOS or Linux
```shell
conda activate MoneyPrinterTurbo
sh webui.sh
```
@@ -231,13 +250,15 @@ online for a quick experience.
A list of all supported voices can be viewed here: [Voice List](./docs/voice-list.txt)
2024-04-16 v1.1.2 Added 9 new Azure voice synthesis voices that require API KEY configuration. These voices sound more realistic.
## Subtitle Generation 📜
Currently, there are 2 ways to generate subtitles:
- edge: Faster generation speed, better performance, no specific requirements for computer configuration, but the
- **edge**: Faster generation speed, better performance, no specific requirements for computer configuration, but the
quality may be unstable
- whisper: Slower generation speed, poorer performance, specific requirements for computer configuration, but more
- **whisper**: Slower generation speed, poorer performance, specific requirements for computer configuration, but more
reliable quality
You can switch between them by modifying the `subtitle_provider` in the `config.toml` configuration file
@@ -245,7 +266,31 @@ You can switch between them by modifying the `subtitle_provider` in the `config.
It is recommended to use `edge` mode, and switch to `whisper` mode if the quality of the subtitles generated is not
satisfactory.
> If left blank, it means no subtitles will be generated.
> Note:
>
> 1. In whisper mode, you need to download a model file from HuggingFace, about 3GB in size, please ensure good internet connectivity
> 2. If left blank, it means no subtitles will be generated.
> Since HuggingFace is not accessible in China, you can use the following methods to download the `whisper-large-v3` model file
Download links:
- Baidu Netdisk: https://pan.baidu.com/s/11h3Q6tsDtjQKTjUu3sc5cA?pwd=xjs9
- Quark Netdisk: https://pan.quark.cn/s/3ee3d991d64b
After downloading the model, extract it and place the entire directory in `.\MoneyPrinterTurbo\models`,
The final file path should look like this: `.\MoneyPrinterTurbo\models\whisper-large-v3`
```
MoneyPrinterTurbo
├─models
│ └─whisper-large-v3
│ config.json
│ model.bin
│ preprocessor_config.json
│ tokenizer.json
│ vocabulary.json
```
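For a quick check that the locally placed model is actually picked up, here is a minimal sketch using the faster-whisper package (the library behind the Systran/faster-whisper-large-v3 weights referenced in this README); the audio path, device, and compute type are illustrative assumptions rather than settings taken from the project:

```python
# Minimal sketch: load whisper-large-v3 from the local ./models directory instead of
# downloading it from HuggingFace. The audio path and device settings are assumptions.
from faster_whisper import WhisperModel

model = WhisperModel(
    "./models/whisper-large-v3",  # the directory extracted from the netdisk archive
    device="cpu",                 # use "cuda" if a GPU is available
    compute_type="int8",          # smaller memory footprint on CPU
)

segments, info = model.transcribe("storage/tasks/example/audio.mp3")
for seg in segments:
    print(f"[{seg.start:.2f} -> {seg.end:.2f}] {seg.text}")
```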
## Background Music 🎵
@@ -260,20 +305,6 @@ own fonts.
## Common Questions 🤔
### ❓How to Use the Free OpenAI GPT-3.5 Model?
[OpenAI has announced that ChatGPT with 3.5 is now free](https://openai.com/blog/start-using-chatgpt-instantly), and developers have wrapped it into an API for direct usage.
**Ensure you have Docker installed and running**. Execute the following command to start the Docker service:
```shell
docker run -p 3040:3040 missuo/freegpt35
```
Once successfully started, modify the `config.toml` configuration as follows:
- Set `llm_provider` to `openai`
- Fill in `openai_api_key` with any value, for example, '123456'
- Change `openai_base_url` to `http://localhost:3040/v1/`
- Set `openai_model_name` to `gpt-3.5-turbo`
### ❓RuntimeError: No ffmpeg exe could be found
Normally, ffmpeg will be automatically downloaded and detected.
@@ -293,24 +324,6 @@ actual installation path.
ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
```
### ❓Error generating audio or downloading videos
[issue 56](https://github.com/harry0703/MoneyPrinterTurbo/issues/56)
```
failed to generate audio, maybe the network is not available.
if you are in China, please use a VPN.
```
[issue 44](https://github.com/harry0703/MoneyPrinterTurbo/issues/44)
```
failed to download videos, maybe the network is not available.
if you are in China, please use a VPN.
```
This is likely due to network issues preventing access to foreign services. Please use a VPN to resolve this.
### ❓ImageMagick is not installed on your computer
[issue 33](https://github.com/harry0703/MoneyPrinterTurbo/issues/33)
@@ -325,16 +338,48 @@ For Linux systems, you can manually install it, refer to https://cn.linux-consol
Thanks to [@wangwenqiao666](https://github.com/wangwenqiao666) for their research and exploration
### ❓ImageMagick's security policy prevents operations related to temporary file @/tmp/tmpur5hyyto.txt
You can find these policies in ImageMagick's configuration file policy.xml.
This file is usually located in /etc/ImageMagick-`X`/ or a similar location in the ImageMagick installation directory.
Modify the entry containing `pattern="@"`, change `rights="none"` to `rights="read|write"` to allow read and write operations on files.
### ❓OSError: [Errno 24] Too many open files
This issue is caused by the system's limit on the number of open files. You can solve it by modifying the system's file open limit.
Check the current limit:
```shell
ulimit -n
```
If it's too low, you can increase it, for example:
```shell
ulimit -n 10240
```
### ❓Whisper model download failed, with the following error
LocalEntryNotFoundError: Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and
outgoing traffic has been disabled.
To enable repo look-ups and downloads online, pass 'local_files_only=False' as input.
or
An error occurred while synchronizing the model Systran/faster-whisper-large-v3 from the Hugging Face Hub:
An error happened while trying to locate the files on the Hub and we cannot find the appropriate snapshot folder for the
specified revision on the local disk. Please check your internet connection and try again.
Trying to load the model directly from the local cache, if it exists.
Solution: [Click to see how to manually download the model from netdisk](#subtitle-generation-)
## Feedback & Suggestions 📢
- You can submit an [issue](https://github.com/harry0703/MoneyPrinterTurbo/issues) or
a [pull request](https://github.com/harry0703/MoneyPrinterTurbo/pulls).
## Reference Projects 📚
This project is based on https://github.com/FujiwaraChoki/MoneyPrinter and has been refactored with a lot of
optimizations and added functionalities. Thanks to the original author for their spirit of open source.
## License 📝
Click to view the [`LICENSE`](LICENSE) file

README.md (215 lines changed)

@@ -9,6 +9,9 @@
</p>
<br>
<h3>简体中文 | <a href="README-en.md">English</a></h3>
<div align="center">
<a href="https://trendshift.io/repositories/8731" target="_blank"><img src="https://trendshift.io/api/badge/repositories/8731" alt="harry0703%2FMoneyPrinterTurbo | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</div>
<br>
只需提供一个视频 <b>主题</b> 或 <b>关键词</b> ,就可以全自动生成视频文案、视频素材、视频字幕、视频背景音乐,然后合成一个高清的短视频。
<br>
@@ -26,7 +29,6 @@
## 特别感谢 🙏
由于该项目的 **部署** 和 **使用**,对于一些小白用户来说,还是 **有一定的门槛**,在此特别感谢
**录咖(AI智能多媒体服务平台)** 网站基于该项目,提供的免费`AI视频生成器`服务,可以不用部署,直接在线使用,非常方便。
- 中文版:https://reccloud.cn
@@ -34,6 +36,14 @@
![](docs/reccloud.cn.jpg)
## 感谢赞助 🙏
感谢佐糖 https://picwish.cn 对该项目的支持和赞助,使得该项目能够持续的更新和维护。
佐糖专注于**图像处理领域**,提供丰富的**图像处理工具**,将复杂操作极致简化,真正实现让图像处理更简单。
![picwish.jpg](docs/picwish.jpg)
## 功能特性 🎯
- [x] 完整的 **MVC架构**,代码 **结构清晰**,易于维护,支持 `API` 和 `Web界面`
@@ -42,15 +52,15 @@
- [x] 竖屏 9:16,`1080x1920`
- [x] 横屏 16:9,`1920x1080`
- [x] 支持 **批量视频生成**,可以一次生成多个视频,然后选择一个最满意的
- [x] 支持 **视频片段时长**设置,方便调节素材切换频率
- [x] 支持 **视频片段时长** 设置,方便调节素材切换频率
- [x] 支持 **中文****英文** 视频文案
- [x] 支持 **多种语音** 合成
- [x] 支持 **多种语音** 合成,可 **实时试听** 效果
- [x] 支持 **字幕生成**,可以调整 `字体`、`位置`、`颜色`、`大小`,同时支持`字幕描边`设置
- [x] 支持 **背景音乐**,随机或者指定音乐文件,可设置`背景音乐音量`
- [x] 视频素材来源 **高清**,而且 **无版权**
- [x] 支持 **OpenAI**、**moonshot**、**Azure**、**gpt4free**、**one-api**、**通义千问**、**Google Gemini**、**Ollama** 等多种模型接入
- [x] 视频素材来源 **高清**,而且 **无版权**,也可以使用自己的 **本地素材**
- [x] 支持 **OpenAI**、**Moonshot**、**Azure**、**gpt4free**、**one-api**、**通义千问**、**Google Gemini**、**Ollama**、**DeepSeek**、**文心一言**、**Pollinations** 等多种模型接入
- 中国用户建议使用 **DeepSeek** 或 **Moonshot** 作为大模型提供商(国内可直接访问,不需要VPN。注册就送额度,基本够用)
❓[如何使用免费的 **OpenAI GPT-3.5** 模型?](https://github.com/harry0703/MoneyPrinterTurbo?tab=readme-ov-file#%E5%B8%B8%E8%A7%81%E9%97%AE%E9%A2%98-)
### 后期计划 📅
@@ -59,16 +69,9 @@
- [ ] 增加视频转场效果,使其看起来更加的流畅
- [ ] 增加更多视频素材来源,优化视频素材和文案的匹配度
- [ ] 增加视频长度选项:短、中、长
- [ ] 打包成一键启动包(Windows、macOS),方便使用
- [ ] 增加免费网络代理,让访问OpenAI和素材下载不再受限
- [ ] 可以使用自己的素材
- [ ] 朗读声音和背景音乐,提供实时试听
- [ ] 支持更多的语音合成服务商,比如 OpenAI TTS, Azure TTS
- [ ] 支持更多的语音合成服务商,比如 OpenAI TTS
- [ ] 自动上传到YouTube平台
## 交流讨论 💬
<img src="docs/wechat-03.jpg" width="300">
## 视频演示 📺
### 竖屏 9:16
@@ -77,12 +80,14 @@
<thead>
<tr>
<th align="center"><g-emoji class="g-emoji" alias="arrow_forward">▶️</g-emoji> 《如何增加生活的乐趣》</th>
<th align="center"><g-emoji class="g-emoji" alias="arrow_forward">▶️</g-emoji> 《金钱的作用》<br>更真实的合成声音</th>
<th align="center"><g-emoji class="g-emoji" alias="arrow_forward">▶️</g-emoji> 《生命的意义是什么》</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center"><video src="https://github.com/harry0703/MoneyPrinterTurbo/assets/4928832/a84d33d5-27a2-4aba-8fd0-9fb2bd91c6a6"></video></td>
<td align="center"><video src="https://github.com/harry0703/MoneyPrinterTurbo/assets/4928832/af2f3b0b-002e-49fe-b161-18ba91c055e8"></video></td>
<td align="center"><video src="https://github.com/harry0703/MoneyPrinterTurbo/assets/4928832/112c9564-d52b-4472-99ad-970b75f66476"></video></td>
</tr>
</tbody>
@@ -106,16 +111,34 @@
</table>
## 配置要求 📦
- 建议最低 CPU 4核或以上,内存 8G 或以上,显卡非必须
- 建议最低 CPU **4核** 或以上,内存 **4G** 或以上,显卡非必须
- Windows 10 或 MacOS 11.0 以上系统
## 快速开始 🚀
### 在 Google Colab 中运行
免去本地环境配置,点击直接在 Google Colab 中快速体验 MoneyPrinterTurbo
[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/harry0703/MoneyPrinterTurbo/blob/main/docs/MoneyPrinterTurbo.ipynb)
### Windows一键启动包
下载一键启动包,解压直接使用(路径不要有 **中文**、**特殊字符**、**空格**)
- 百度网盘 (v1.2.6): https://pan.baidu.com/s/1wg0UaIyXpO3SqIpaq790SQ?pwd=sbqx 提取码: sbqx
- Google Drive (v1.2.6): https://drive.google.com/file/d/1HsbzfT7XunkrCrHw5ncUjFX8XX4zAuUh/view?usp=sharing
下载后,建议先**双击执行** `update.bat` 更新到**最新代码**,然后双击 `start.bat` 启动
启动后,会自动打开浏览器(如果打开是空白,建议换成 **Chrome** 或者 **Edge** 打开)
## 安装部署 📥
> 不想部署的可以直接下载安装包,解压直接使用
- **Windows** 版本下载地址
- 百度网盘: https://pan.baidu.com/s/1BB3SGtAFTytzFLS5t2d8Gg?pwd=5bry
### 前提条件
- 尽量不要使用 **中文路径**,避免出现一些无法预料的问题
- 请确保你的 **网络** 是正常的,VPN需要打开`全局流量`模式
@@ -125,20 +148,12 @@
git clone https://github.com/harry0703/MoneyPrinterTurbo.git
```
#### ② 修改配置文件
#### ② 修改配置文件(可选,建议启动后也可以在 WebUI 里面配置)
- 将 `config.example.toml` 文件复制一份,命名为 `config.toml`
- 按照 `config.toml` 文件中的说明,配置好 `pexels_api_keys``llm_provider`,并根据 llm_provider 对应的服务商,配置相关的
API Key
#### ③ 配置大模型(LLM)
- 如果要使用 `GPT-4.0` 或 `GPT-3.5`,需要有 `OpenAI` 的 `API Key`,如果没有,可以将 `llm_provider` 设置为 `g4f` (
一个免费使用GPT的开源库 https://github.com/xtekky/gpt4free ,但是该免费的服务,稳定性较差,有时候可以用,有时候用不了)
- 或者可以使用到 [月之暗面](https://platform.moonshot.cn/console/api-keys) 申请。注册就送
15元体验金,可以对话1500次左右。然后设置 `llm_provider="moonshot"` 和 `moonshot_api_key`
- 也可以使用 通义千问,具体请看配置文件里面的注释说明
### Docker部署 🐳
#### ① 启动Docker
@@ -146,6 +161,7 @@ git clone https://github.com/harry0703/MoneyPrinterTurbo.git
如果未安装 Docker,请先安装 https://www.docker.com/products/docker-desktop/
如果是Windows系统,请参考微软的文档:
1. https://learn.microsoft.com/zh-cn/windows/wsl/install
2. https://learn.microsoft.com/zh-cn/windows/wsl/tutorials/wsl-containers
@@ -154,6 +170,8 @@ cd MoneyPrinterTurbo
docker-compose up
```
> 注意:最新版的docker安装时会自动以插件的形式安装docker compose,启动命令调整为docker compose up
#### ② 访问Web界面
打开浏览器,访问 http://0.0.0.0:8501
@@ -176,36 +194,31 @@ docker-compose up
```shell
git clone https://github.com/harry0703/MoneyPrinterTurbo.git
cd MoneyPrinterTurbo
conda create -n MoneyPrinterTurbo python=3.10
conda create -n MoneyPrinterTurbo python=3.11
conda activate MoneyPrinterTurbo
pip install -r requirements.txt
```
#### ② 安装好 ImageMagick
###### Windows:
- Windows:
- 下载 https://imagemagick.org/script/download.php 选择Windows版本,切记一定要选择 **静态库** 版本,比如
ImageMagick-7.1.1-32-Q16-x64-**static**.exe
- 安装下载好的 ImageMagick,**注意不要修改安装路径**
- 修改 `配置文件 config.toml` 中的 `imagemagick_path` 为你的 **实际安装路径**
- 下载 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-30-Q16-x64-static.exe
- 安装下载好的 ImageMagick,注意不要修改安装路径
- 修改 `配置文件 config.toml` 中的 `imagemagick_path` 为你的实际安装路径(如果安装的时候没有修改路径,直接取消注释即可)
###### MacOS:
```shell
brew install imagemagick
````
###### Ubuntu
```shell
sudo apt-get install imagemagick
```
###### CentOS
```shell
sudo yum install ImageMagick
```
- MacOS:
```shell
brew install imagemagick
````
- Ubuntu
```shell
sudo apt-get install imagemagick
```
- CentOS
```shell
sudo yum install ImageMagick
```
#### ③ 启动Web界面 🌐
@@ -214,17 +227,16 @@ sudo yum install ImageMagick
###### Windows
```bat
conda activate MoneyPrinterTurbo
webui.bat
```
###### MacOS or Linux
```shell
conda activate MoneyPrinterTurbo
sh webui.sh
```
启动后,会自动打开浏览器
启动后,会自动打开浏览器(如果打开是空白,建议换成 **Chrome** 或者 **Edge** 打开)
#### ④ 启动API服务 🚀
@@ -238,6 +250,8 @@ python main.py
所有支持的声音列表,可以查看:[声音列表](./docs/voice-list.txt)
2024-04-16 v1.1.2 新增了9种Azure的语音合成声音需要配置API KEY该声音合成的更加真实。
## 字幕生成 📜
当前支持2种字幕生成方式:
@@ -250,17 +264,20 @@ python main.py
建议使用 `edge` 模式,如果生成的字幕质量不好,再切换到 `whisper` 模式
> 注意:
1. whisper 模式下需要到 HuggingFace 下载一个模型文件,大约 3GB 左右,请确保网络通畅
2. 如果留空,表示不生成字幕。
> 由于国内无法访问 HuggingFace,可以使用以下方法下载 `whisper-large-v3` 的模型文件
下载地址:
- 百度网盘: https://pan.baidu.com/s/11h3Q6tsDtjQKTjUu3sc5cA?pwd=xjs9
- 夸克网盘:https://pan.quark.cn/s/3ee3d991d64b
模型下载后解压,整个目录放到 `.\MoneyPrinterTurbo\models` 里面,
最终的文件路径应该是这样: `.\MoneyPrinterTurbo\models\whisper-large-v3`
```
MoneyPrinterTurbo
├─models
@@ -283,26 +300,6 @@ MoneyPrinterTurbo
## 常见问题 🤔
### ❓如何使用免费的OpenAI GPT-3.5模型?
[OpenAI宣布ChatGPT里面3.5已经免费了](https://openai.com/blog/start-using-chatgpt-instantly),有开发者将其封装成了API,可以直接调用
**确保你安装和启动了docker服务**,执行以下命令启动docker服务
```shell
docker run -p 3040:3040 missuo/freegpt35
```
启动成功后,修改 `config.toml` 中的配置
- `llm_provider` 设置为 `openai`
- `openai_api_key` 随便填写一个即可,比如 '123456'
- `openai_base_url` 改为 `http://localhost:3040/v1/`
- `openai_model_name` 改为 `gpt-3.5-turbo`
### ❓AttributeError: 'str' object has no attribute 'choices'
这个问题是由于 OpenAI 或者其他 LLM 没有返回正确的回复导致的。
大概率是网络原因,使用 **VPN**,或者设置 `openai_base_url` 为你的代理,应该就可以解决了。
### ❓RuntimeError: No ffmpeg exe could be found
通常情况下ffmpeg 会被自动下载,并且会被自动检测到。
@@ -321,52 +318,14 @@ Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variabl
ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
```
### ❓生成音频时报错或下载视频报错
[issue 56](https://github.com/harry0703/MoneyPrinterTurbo/issues/56)
```
failed to generate audio, maybe the network is not available.
if you are in China, please use a VPN.
```
[issue 44](https://github.com/harry0703/MoneyPrinterTurbo/issues/44)
```
failed to download videos, maybe the network is not available.
if you are in China, please use a VPN.
```
这个大概率是网络原因,无法访问境外的服务,请使用VPN解决。
### ❓ImageMagick is not installed on your computer
[issue 33](https://github.com/harry0703/MoneyPrinterTurbo/issues/33)
1. 按照 `示例配置` 里面提供的 `下载地址`
,安装 https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe, 用静态库
2. 不要安装在中文路径里面,避免出现一些无法预料的问题
[issue 54](https://github.com/harry0703/MoneyPrinterTurbo/issues/54#issuecomment-2017842022)
如果是linux系统,可以手动安装,参考 https://cn.linux-console.net/?p=16978
感谢 [@wangwenqiao666](https://github.com/wangwenqiao666)的研究探索
### ❓ImageMagick的安全策略阻止了与临时文件@/tmp/tmpur5hyyto.txt相关的操作
[issue 92](https://github.com/harry0703/MoneyPrinterTurbo/issues/92)
可以在ImageMagick的配置文件policy.xml中找到这些策略。
这个文件通常位于 /etc/ImageMagick-`X`/ 或 ImageMagick 安装目录的类似位置。
修改包含`pattern="@"`的条目,将`rights="none"`更改为`rights="read|write"`以允许对文件的读写操作。
感谢 [@chenhengzh](https://github.com/chenhengzh)的研究探索
### ❓OSError: [Errno 24] Too many open files
[issue 100](https://github.com/harry0703/MoneyPrinterTurbo/issues/100)
这个问题是由于系统打开文件数限制导致的,可以通过修改系统的文件打开数限制来解决。
查看当前限制
@@ -381,42 +340,30 @@ ulimit -n
ulimit -n 10240
```
### ❓AttributeError: module 'PIL.Image' has no attribute 'ANTIALIAS'
### ❓Whisper 模型下载失败,出现如下错误
[issue 101](https://github.com/harry0703/MoneyPrinterTurbo/issues/101),
[issue 83](https://github.com/harry0703/MoneyPrinterTurbo/issues/83),
[issue 70](https://github.com/harry0703/MoneyPrinterTurbo/issues/70)
LocalEntryNotFoundError: Cannot find an appropriate cached snapshot folder for the specified revision on the local disk and
outgoing traffic has been disabled.
To enable repo look-ups and downloads online, pass 'local_files_only=False' as input.
先看下当前的 Pillow 版本是多少
或者
```shell
pip list |grep Pillow
```
An error occurred while synchronizing the model Systran/faster-whisper-large-v3 from the Hugging Face Hub:
An error happened while trying to locate the files on the Hub and we cannot find the appropriate snapshot folder for the
specified revision on the local disk. Please check your internet connection and try again.
Trying to load the model directly from the local cache, if it exists.
如果是 10.x 的版本,可以尝试下降级看看,有用户反馈降级后正常
```shell
pip uninstall Pillow
pip install Pillow==9.5.0
# 或者降级到 8.4.0
pip install Pillow==8.4.0
```
解决方法:[点击查看如何从网盘手动下载模型](#%E5%AD%97%E5%B9%95%E7%94%9F%E6%88%90-)
## 反馈建议 📢
- 可以提交 [issue](https://github.com/harry0703/MoneyPrinterTurbo/issues)
或者 [pull request](https://github.com/harry0703/MoneyPrinterTurbo/pulls)。
## 参考项目 📚
该项目基于 https://github.com/FujiwaraChoki/MoneyPrinter 重构而来,做了大量的优化,增加了更多的功能。
感谢原作者的开源精神。
## 许可证 📝
点击查看 [`LICENSE`](LICENSE) 文件
## Star History
[![Star History Chart](https://api.star-history.com/svg?repos=harry0703/MoneyPrinterTurbo&type=Date)](https://star-history.com/#harry0703/MoneyPrinterTurbo&Date)

ASGI application module

@@ -1,12 +1,13 @@
"""Application implementation - ASGI."""
import os
from fastapi import FastAPI, Request
from fastapi.exceptions import RequestValidationError
from fastapi.responses import JSONResponse
from loguru import logger
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from loguru import logger
from app.config import config
from app.models.exception import HttpException
@@ -24,7 +25,9 @@ def exception_handler(request: Request, e: HttpException):
def validation_exception_handler(request: Request, e: RequestValidationError):
return JSONResponse(
status_code=400,
content=utils.get_response(status=400, data=e.errors(), message='field required'),
content=utils.get_response(
status=400, data=e.errors(), message="field required"
),
)
@@ -61,7 +64,9 @@ app.add_middleware(
)
task_dir = utils.task_dir()
app.mount("/tasks", StaticFiles(directory=task_dir, html=True, follow_symlink=True), name="")
app.mount(
"/tasks", StaticFiles(directory=task_dir, html=True, follow_symlink=True), name=""
)
public_dir = utils.public_dir()
app.mount("/", StaticFiles(directory=public_dir, html=True), name="")

View File

@@ -10,7 +10,9 @@ from app.utils import utils
def __init_logger():
# _log_file = utils.storage_dir("logs/server.log")
_lvl = config.log_level
root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
root_dir = os.path.dirname(
os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
)
def format_record(record):
# Get the full file path from the log record
@@ -21,10 +23,13 @@ def __init_logger():
record["file"].path = f"./{relative_path}"
# Return the modified format string
# Adjust the format here as needed
_format = '<green>{time:%Y-%m-%d %H:%M:%S}</> | ' + \
'<level>{level}</> | ' + \
'"{file.path}:{line}":<blue> {function}</> ' + \
'- <level>{message}</>' + "\n"
_format = (
"<green>{time:%Y-%m-%d %H:%M:%S}</> | "
+ "<level>{level}</> | "
+ '"{file.path}:{line}":<blue> {function}</> '
+ "- <level>{message}</>"
+ "\n"
)
return _format
logger.remove()

View File

@@ -1,7 +1,8 @@
import os
import socket
import toml
import shutil
import socket
import toml
from loguru import logger
root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
@@ -17,7 +18,7 @@ def load_config():
example_file = f"{root_dir}/config.example.toml"
if os.path.isfile(example_file):
shutil.copyfile(example_file, config_file)
logger.info(f"copy config.example.toml to config.toml")
logger.info("copy config.example.toml to config.toml")
logger.info(f"load config from file: {config_file}")
@@ -25,7 +26,7 @@ def load_config():
_config_ = toml.load(config_file)
except Exception as e:
logger.warning(f"load config failed: {str(e)}, try to load as utf-8-sig")
with open(config_file, mode="r", encoding='utf-8-sig') as fp:
with open(config_file, mode="r", encoding="utf-8-sig") as fp:
_cfg_content = fp.read()
_config_ = toml.loads(_cfg_content)
return _config_
@@ -34,9 +35,8 @@ def load_config():
def save_config():
with open(config_file, "w", encoding="utf-8") as f:
_cfg["app"] = app
_cfg["whisper"] = whisper
_cfg["pexels"] = pexels
_cfg["azure"] = azure
_cfg["siliconflow"] = siliconflow
_cfg["ui"] = ui
f.write(toml.dumps(_cfg))
@@ -44,9 +44,15 @@ def save_config():
_cfg = load_config()
app = _cfg.get("app", {})
whisper = _cfg.get("whisper", {})
pexels = _cfg.get("pexels", {})
proxy = _cfg.get("proxy", {})
azure = _cfg.get("azure", {})
ui = _cfg.get("ui", {})
siliconflow = _cfg.get("siliconflow", {})
ui = _cfg.get(
"ui",
{
"hide_log": False,
},
)
hostname = socket.gethostname()
@@ -54,9 +60,11 @@ log_level = _cfg.get("log_level", "DEBUG")
listen_host = _cfg.get("listen_host", "0.0.0.0")
listen_port = _cfg.get("listen_port", 8080)
project_name = _cfg.get("project_name", "MoneyPrinterTurbo")
project_description = _cfg.get("project_description",
"<a href='https://github.com/harry0703/MoneyPrinterTurbo'>https://github.com/harry0703/MoneyPrinterTurbo</a>")
project_version = _cfg.get("project_version", "1.1.2")
project_description = _cfg.get(
"project_description",
"<a href='https://github.com/harry0703/MoneyPrinterTurbo'>https://github.com/harry0703/MoneyPrinterTurbo</a>",
)
project_version = _cfg.get("project_version", "1.2.6")
reload_debug = False
imagemagick_path = app.get("imagemagick_path", "")

View File

@@ -7,14 +7,14 @@ from app.models.exception import HttpException
def get_task_id(request: Request):
task_id = request.headers.get('x-task-id')
task_id = request.headers.get("x-task-id")
if not task_id:
task_id = uuid4()
return str(task_id)
def get_api_key(request: Request):
api_key = request.headers.get('x-api-key')
api_key = request.headers.get("x-api-key")
return api_key
@@ -23,5 +23,9 @@ def verify_token(request: Request):
if token != config.app.get("api_key", ""):
request_id = get_task_id(request)
request_url = request.url
user_agent = request.headers.get('user-agent')
raise HttpException(task_id=request_id, status_code=401, message=f"invalid token: {request_url}, {user_agent}")
user_agent = request.headers.get("user-agent")
raise HttpException(
task_id=request_id,
status_code=401,
message=f"invalid token: {request_url}, {user_agent}",
)

View File

@@ -0,0 +1,64 @@
import threading
from typing import Any, Callable, Dict
class TaskManager:
def __init__(self, max_concurrent_tasks: int):
self.max_concurrent_tasks = max_concurrent_tasks
self.current_tasks = 0
self.lock = threading.Lock()
self.queue = self.create_queue()
def create_queue(self):
raise NotImplementedError()
def add_task(self, func: Callable, *args: Any, **kwargs: Any):
with self.lock:
if self.current_tasks < self.max_concurrent_tasks:
print(f"add task: {func.__name__}, current_tasks: {self.current_tasks}")
self.execute_task(func, *args, **kwargs)
else:
print(
f"enqueue task: {func.__name__}, current_tasks: {self.current_tasks}"
)
self.enqueue({"func": func, "args": args, "kwargs": kwargs})
def execute_task(self, func: Callable, *args: Any, **kwargs: Any):
thread = threading.Thread(
target=self.run_task, args=(func, *args), kwargs=kwargs
)
thread.start()
def run_task(self, func: Callable, *args: Any, **kwargs: Any):
try:
with self.lock:
self.current_tasks += 1
func(*args, **kwargs) # call the function here, passing *args and **kwargs.
finally:
self.task_done()
def check_queue(self):
with self.lock:
if (
self.current_tasks < self.max_concurrent_tasks
and not self.is_queue_empty()
):
task_info = self.dequeue()
func = task_info["func"]
args = task_info.get("args", ())
kwargs = task_info.get("kwargs", {})
self.execute_task(func, *args, **kwargs)
def task_done(self):
with self.lock:
self.current_tasks -= 1
self.check_queue()
def enqueue(self, task: Dict):
raise NotImplementedError()
def dequeue(self):
raise NotImplementedError()
def is_queue_empty(self):
raise NotImplementedError()
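For orientation, here is a minimal usage sketch of this base class, assuming the repository package is importable. A hypothetical `SimpleListManager` subclass (not part of the repo) backs the abstract queue hooks with a plain list; `add_task` starts each callable on a worker thread until `max_concurrent_tasks` is reached, and queued tasks are started later by `task_done()`/`check_queue()`.
```python
import time

from app.controllers.manager.base_manager import TaskManager


class SimpleListManager(TaskManager):
    """Illustrative subclass: a plain list stands in for a real queue."""

    def create_queue(self):
        return []

    def enqueue(self, task):
        self.queue.append(task)

    def dequeue(self):
        return self.queue.pop(0)

    def is_queue_empty(self):
        return len(self.queue) == 0


def fake_render(task_id: str):
    time.sleep(1)  # stand-in for a long-running render job
    print(f"finished: {task_id}")


manager = SimpleListManager(max_concurrent_tasks=2)
for i in range(5):
    # up to two tasks run concurrently; the rest wait in the queue
    # and are started as earlier tasks finish
    manager.add_task(fake_render, task_id=f"task-{i}")
```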

View File

@@ -0,0 +1,18 @@
from queue import Queue
from typing import Dict
from app.controllers.manager.base_manager import TaskManager
class InMemoryTaskManager(TaskManager):
def create_queue(self):
return Queue()
def enqueue(self, task: Dict):
self.queue.put(task)
def dequeue(self):
return self.queue.get()
def is_queue_empty(self):
return self.queue.empty()

View File

@@ -0,0 +1,56 @@
import json
from typing import Dict
import redis
from app.controllers.manager.base_manager import TaskManager
from app.models.schema import VideoParams
from app.services import task as tm
FUNC_MAP = {
"start": tm.start,
# 'start_test': tm.start_test
}
class RedisTaskManager(TaskManager):
def __init__(self, max_concurrent_tasks: int, redis_url: str):
self.redis_client = redis.Redis.from_url(redis_url)
super().__init__(max_concurrent_tasks)
def create_queue(self):
return "task_queue"
def enqueue(self, task: Dict):
task_with_serializable_params = task.copy()
if "params" in task["kwargs"] and isinstance(
task["kwargs"]["params"], VideoParams
):
task_with_serializable_params["kwargs"]["params"] = task["kwargs"][
"params"
].dict()
# Convert the function object to its name
task_with_serializable_params["func"] = task["func"].__name__
self.redis_client.rpush(self.queue, json.dumps(task_with_serializable_params))
def dequeue(self):
task_json = self.redis_client.lpop(self.queue)
if task_json:
task_info = json.loads(task_json)
# Convert the function name back to a function object
task_info["func"] = FUNC_MAP[task_info["func"]]
if "params" in task_info["kwargs"] and isinstance(
task_info["kwargs"]["params"], dict
):
task_info["kwargs"]["params"] = VideoParams(
**task_info["kwargs"]["params"]
)
return task_info
return None
def is_queue_empty(self):
return self.redis_client.llen(self.queue) == 0
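Because Redis can only store strings, the manager round-trips each task through JSON: the callable is replaced by its name (looked up again in `FUNC_MAP` on the way out) and the Pydantic `VideoParams` is flattened to a dict. A rough sketch of that round trip, assuming a local Redis server and the repository modules on the path:
```python
from app.controllers.manager.redis_manager import RedisTaskManager
from app.models.schema import VideoParams
from app.services import task as tm

manager = RedisTaskManager(
    max_concurrent_tasks=1, redis_url="redis://localhost:6379/0"
)

params = VideoParams(video_subject="spring flowers")
# stored as {"func": "start", "args": [], "kwargs": {"params": {...}, ...}} JSON
manager.enqueue(
    {"func": tm.start, "args": (), "kwargs": {"task_id": "demo", "params": params}}
)

task_info = manager.dequeue()
# the function object and the VideoParams instance are rebuilt from the payload
assert task_info["func"] is tm.start
assert isinstance(task_info["kwargs"]["params"], VideoParams)
```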

View File

@@ -1,9 +1,13 @@
from fastapi import APIRouter
from fastapi import Request
from fastapi import APIRouter, Request
router = APIRouter()
@router.get("/ping", tags=["Health Check"], description="检查服务可用性", response_description="pong")
@router.get(
"/ping",
tags=["Health Check"],
description="检查服务可用性",
response_description="pong",
)
def ping(request: Request) -> str:
return "pong"

View File

@@ -1,10 +1,10 @@
from fastapi import APIRouter, Depends
from fastapi import APIRouter
def new_router(dependencies=None):
router = APIRouter()
router.tags = ['V1']
router.prefix = '/api/v1'
router.tags = ["V1"]
router.prefix = "/api/v1"
# Apply the authentication dependency to all routes
if dependencies:
router.dependencies = dependencies

View File

@@ -1,31 +1,45 @@
from fastapi import Request
from app.controllers.v1.base import new_router
from app.models.schema import VideoScriptResponse, VideoScriptRequest, VideoTermsResponse, VideoTermsRequest
from app.models.schema import (
VideoScriptRequest,
VideoScriptResponse,
VideoTermsRequest,
VideoTermsResponse,
)
from app.services import llm
from app.utils import utils
# 认证依赖项
# authentication dependency
# router = new_router(dependencies=[Depends(base.verify_token)])
router = new_router()
@router.post("/scripts", response_model=VideoScriptResponse, summary="Create a script for the video")
@router.post(
"/scripts",
response_model=VideoScriptResponse,
summary="Create a script for the video",
)
def generate_video_script(request: Request, body: VideoScriptRequest):
video_script = llm.generate_script(video_subject=body.video_subject,
language=body.video_language,
paragraph_number=body.paragraph_number)
response = {
"video_script": video_script
}
video_script = llm.generate_script(
video_subject=body.video_subject,
language=body.video_language,
paragraph_number=body.paragraph_number,
)
response = {"video_script": video_script}
return utils.get_response(200, response)
@router.post("/terms", response_model=VideoTermsResponse, summary="Generate video terms based on the video script")
@router.post(
"/terms",
response_model=VideoTermsResponse,
summary="Generate video terms based on the video script",
)
def generate_video_terms(request: Request, body: VideoTermsRequest):
video_terms = llm.generate_terms(video_subject=body.video_subject,
video_script=body.video_script,
amount=body.amount)
response = {
"video_terms": video_terms
}
video_terms = llm.generate_terms(
video_subject=body.video_subject,
video_script=body.video_script,
amount=body.amount,
)
response = {"video_terms": video_terms}
return utils.get_response(200, response)

View File

@@ -1,49 +1,124 @@
import os
import glob
import os
import pathlib
import shutil
from typing import Union
from fastapi import Request, Depends, Path, BackgroundTasks, UploadFile
from fastapi.responses import FileResponse, StreamingResponse
from fastapi import BackgroundTasks, Depends, Path, Request, UploadFile
from fastapi.params import File
from fastapi.responses import FileResponse, StreamingResponse
from loguru import logger
from app.config import config
from app.controllers import base
from app.controllers.manager.memory_manager import InMemoryTaskManager
from app.controllers.manager.redis_manager import RedisTaskManager
from app.controllers.v1.base import new_router
from app.models.exception import HttpException
from app.models.schema import TaskVideoRequest, TaskQueryResponse, TaskResponse, TaskQueryRequest, \
BgmUploadResponse, BgmRetrieveResponse, TaskDeletionResponse
from app.services import task as tm
from app.models.schema import (
AudioRequest,
BgmRetrieveResponse,
BgmUploadResponse,
SubtitleRequest,
TaskDeletionResponse,
TaskQueryRequest,
TaskQueryResponse,
TaskResponse,
TaskVideoRequest,
)
from app.services import state as sm
from app.services import task as tm
from app.utils import utils
# Authentication dependency
# router = new_router(dependencies=[Depends(base.verify_token)])
router = new_router()
_enable_redis = config.app.get("enable_redis", False)
_redis_host = config.app.get("redis_host", "localhost")
_redis_port = config.app.get("redis_port", 6379)
_redis_db = config.app.get("redis_db", 0)
_redis_password = config.app.get("redis_password", None)
_max_concurrent_tasks = config.app.get("max_concurrent_tasks", 5)
redis_url = f"redis://:{_redis_password}@{_redis_host}:{_redis_port}/{_redis_db}"
# Select the appropriate task manager based on the configuration
if _enable_redis:
task_manager = RedisTaskManager(
max_concurrent_tasks=_max_concurrent_tasks, redis_url=redis_url
)
else:
task_manager = InMemoryTaskManager(max_concurrent_tasks=_max_concurrent_tasks)
@router.post("/videos", response_model=TaskResponse, summary="Generate a short video")
def create_video(background_tasks: BackgroundTasks, request: Request, body: TaskVideoRequest):
def create_video(
background_tasks: BackgroundTasks, request: Request, body: TaskVideoRequest
):
return create_task(request, body, stop_at="video")
@router.post("/subtitle", response_model=TaskResponse, summary="Generate subtitle only")
def create_subtitle(
background_tasks: BackgroundTasks, request: Request, body: SubtitleRequest
):
return create_task(request, body, stop_at="subtitle")
@router.post("/audio", response_model=TaskResponse, summary="Generate audio only")
def create_audio(
background_tasks: BackgroundTasks, request: Request, body: AudioRequest
):
return create_task(request, body, stop_at="audio")
def create_task(
request: Request,
body: Union[TaskVideoRequest, SubtitleRequest, AudioRequest],
stop_at: str,
):
task_id = utils.get_uuid()
request_id = base.get_task_id(request)
try:
task = {
"task_id": task_id,
"request_id": request_id,
"params": body.dict(),
"params": body.model_dump(),
}
sm.state.update_task(task_id)
background_tasks.add_task(tm.start, task_id=task_id, params=body)
logger.success(f"video created: {utils.to_json(task)}")
task_manager.add_task(tm.start, task_id=task_id, params=body, stop_at=stop_at)
logger.success(f"Task created: {utils.to_json(task)}")
return utils.get_response(200, task)
except ValueError as e:
raise HttpException(task_id=task_id, status_code=400, message=f"{request_id}: {str(e)}")
raise HttpException(
task_id=task_id, status_code=400, message=f"{request_id}: {str(e)}"
)
from fastapi import Query
@router.get("/tasks", response_model=TaskQueryResponse, summary="Get all tasks")
def get_all_tasks(request: Request, page: int = Query(1, ge=1), page_size: int = Query(10, ge=1)):
request_id = base.get_task_id(request)
tasks, total = sm.state.get_all_tasks(page, page_size)
response = {
"tasks": tasks,
"total": total,
"page": page,
"page_size": page_size,
}
return utils.get_response(200, response)
@router.get("/tasks/{task_id}", response_model=TaskQueryResponse, summary="Query task status")
def get_task(request: Request, task_id: str = Path(..., description="Task ID"),
query: TaskQueryRequest = Depends()):
@router.get(
"/tasks/{task_id}", response_model=TaskQueryResponse, summary="Query task status"
)
def get_task(
request: Request,
task_id: str = Path(..., description="Task ID"),
query: TaskQueryRequest = Depends(),
):
endpoint = config.app.get("endpoint", "")
if not endpoint:
endpoint = str(request.base_url)
@@ -76,10 +151,16 @@ def get_task(request: Request, task_id: str = Path(..., description="Task ID"),
task["combined_videos"] = urls
return utils.get_response(200, task)
raise HttpException(task_id=task_id, status_code=404, message=f"{request_id}: task not found")
raise HttpException(
task_id=task_id, status_code=404, message=f"{request_id}: task not found"
)
@router.delete("/tasks/{task_id}", response_model=TaskDeletionResponse, summary="Delete a generated short video task")
@router.delete(
"/tasks/{task_id}",
response_model=TaskDeletionResponse,
summary="Delete a generated short video task",
)
def delete_video(request: Request, task_id: str = Path(..., description="Task ID")):
request_id = base.get_task_id(request)
task = sm.state.get_task(task_id)
@@ -93,32 +174,40 @@ def delete_video(request: Request, task_id: str = Path(..., description="Task ID
logger.success(f"video deleted: {utils.to_json(task)}")
return utils.get_response(200)
raise HttpException(task_id=task_id, status_code=404, message=f"{request_id}: task not found")
raise HttpException(
task_id=task_id, status_code=404, message=f"{request_id}: task not found"
)
@router.get("/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files")
@router.get(
"/musics", response_model=BgmRetrieveResponse, summary="Retrieve local BGM files"
)
def get_bgm_list(request: Request):
suffix = "*.mp3"
song_dir = utils.song_dir()
files = glob.glob(os.path.join(song_dir, suffix))
bgm_list = []
for file in files:
bgm_list.append({
"name": os.path.basename(file),
"size": os.path.getsize(file),
"file": file,
})
response = {
"files": bgm_list
}
bgm_list.append(
{
"name": os.path.basename(file),
"size": os.path.getsize(file),
"file": file,
}
)
response = {"files": bgm_list}
return utils.get_response(200, response)
@router.post("/musics", response_model=BgmUploadResponse, summary="Upload the BGM file to the songs directory")
@router.post(
"/musics",
response_model=BgmUploadResponse,
summary="Upload the BGM file to the songs directory",
)
def upload_bgm_file(request: Request, file: UploadFile = File(...)):
request_id = base.get_task_id(request)
# check file ext
if file.filename.endswith('mp3'):
if file.filename.endswith("mp3"):
song_dir = utils.song_dir()
save_path = os.path.join(song_dir, file.filename)
# save file
@@ -126,26 +215,26 @@ def upload_bgm_file(request: Request, file: UploadFile = File(...)):
# If the file already exists, it will be overwritten
file.file.seek(0)
buffer.write(file.file.read())
response = {
"file": save_path
}
response = {"file": save_path}
return utils.get_response(200, response)
raise HttpException('', status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded")
raise HttpException(
"", status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded"
)
@router.get("/stream/{file_path:path}")
async def stream_video(request: Request, file_path: str):
tasks_dir = utils.task_dir()
video_path = os.path.join(tasks_dir, file_path)
range_header = request.headers.get('Range')
range_header = request.headers.get("Range")
video_size = os.path.getsize(video_path)
start, end = 0, video_size - 1
length = video_size
if range_header:
range_ = range_header.split('bytes=')[1]
start, end = [int(part) if part else None for part in range_.split('-')]
range_ = range_header.split("bytes=")[1]
start, end = [int(part) if part else None for part in range_.split("-")]
if start is None:
start = video_size - end
end = video_size - 1
@@ -154,7 +243,7 @@ async def stream_video(request: Request, file_path: str):
length = end - start + 1
def file_iterator(file_path, offset=0, bytes_to_read=None):
with open(file_path, 'rb') as f:
with open(file_path, "rb") as f:
f.seek(offset, os.SEEK_SET)
remaining = bytes_to_read or video_size
while remaining > 0:
@@ -165,10 +254,12 @@ async def stream_video(request: Request, file_path: str):
remaining -= len(data)
yield data
response = StreamingResponse(file_iterator(video_path, start, length), media_type='video/mp4')
response.headers['Content-Range'] = f'bytes {start}-{end}/{video_size}'
response.headers['Accept-Ranges'] = 'bytes'
response.headers['Content-Length'] = str(length)
response = StreamingResponse(
file_iterator(video_path, start, length), media_type="video/mp4"
)
response.headers["Content-Range"] = f"bytes {start}-{end}/{video_size}"
response.headers["Accept-Ranges"] = "bytes"
response.headers["Content-Length"] = str(length)
response.status_code = 206 # Partial Content
return response
@@ -187,8 +278,10 @@ async def download_video(_: Request, file_path: str):
file_path = pathlib.Path(video_path)
filename = file_path.stem
extension = file_path.suffix
headers = {
"Content-Disposition": f"attachment; filename={filename}{extension}"
}
return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}",
media_type=f'video/{extension[1:]}')
headers = {"Content-Disposition": f"attachment; filename={filename}{extension}"}
return FileResponse(
path=video_path,
headers=headers,
filename=f"{filename}{extension}",
media_type=f"video/{extension[1:]}",
)
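Taken together, the new routes let a client create audio-only or subtitle-only jobs and poll them afterwards. A rough end-to-end sketch against a locally running instance; the host/port, script text, and polling interval are assumptions, and authentication is off unless `verify_token` is re-enabled on the router:
```python
import time

import requests

BASE = "http://127.0.0.1:8080/api/v1"  # assumed local deployment

# create an audio-only task; fields mirror AudioRequest in app/models/schema.py
body = {
    "video_script": "Spring turns the hills into a sea of flowers.",
    "voice_name": "zh-CN-XiaoxiaoNeural-Female",
    "voice_rate": 1.2,
}
task = requests.post(f"{BASE}/audio", json=body, timeout=30).json()["data"]
task_id = task["task_id"]

# poll the task until it completes or fails
while True:
    data = requests.get(f"{BASE}/tasks/{task_id}", timeout=30).json()["data"]
    if data.get("state") in (1, -1):  # TASK_STATE_COMPLETE / TASK_STATE_FAILED
        break
    time.sleep(2)
print(data)
```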

View File

@@ -1,8 +1,25 @@
PUNCTUATIONS = [
"?", ",", ".", "", ";", ":", "!", "",
"", "", "", "", "", "", "", "...",
"?",
",",
".",
"",
";",
":",
"!",
"",
"",
"",
"",
"",
"",
"",
"",
"...",
]
TASK_STATE_FAILED = -1
TASK_STATE_COMPLETE = 1
TASK_STATE_PROCESSING = 4
FILE_TYPE_VIDEOS = ["mp4", "mov", "mkv", "webm"]
FILE_TYPE_IMAGES = ["jpg", "jpeg", "png", "bmp"]

View File

@@ -5,16 +5,18 @@ from loguru import logger
class HttpException(Exception):
def __init__(self, task_id: str, status_code: int, message: str = '', data: Any = None):
def __init__(
self, task_id: str, status_code: int, message: str = "", data: Any = None
):
self.message = message
self.status_code = status_code
self.data = data
# 获取异常堆栈信息
# Retrieve the exception stack trace information.
tb_str = traceback.format_exc().strip()
if not tb_str or tb_str == "NoneType: None":
msg = f'HttpException: {status_code}, {task_id}, {message}'
msg = f"HttpException: {status_code}, {task_id}, {message}"
else:
msg = f'HttpException: {status_code}, {task_id}, {message}\n{tb_str}'
msg = f"HttpException: {status_code}, {task_id}, {message}\n{tb_str}"
if status_code == 400:
logger.warning(msg)

View File

@@ -1,11 +1,16 @@
from enum import Enum
from typing import Any, Optional
from pydantic import BaseModel
import warnings
from enum import Enum
from typing import Any, List, Optional, Union
import pydantic
from pydantic import BaseModel
# Ignore specific Pydantic warnings
warnings.filterwarnings("ignore", category=UserWarning, message="Field name.*shadows an attribute in parent.*")
warnings.filterwarnings(
"ignore",
category=UserWarning,
message="Field name.*shadows an attribute in parent.*",
)
class VideoConcatMode(str, Enum):
@@ -13,6 +18,15 @@ class VideoConcatMode(str, Enum):
sequential = "sequential"
class VideoTransitionMode(str, Enum):
none = None
shuffle = "Shuffle"
fade_in = "FadeIn"
fade_out = "FadeOut"
slide_in = "SlideIn"
slide_out = "SlideOut"
class VideoAspect(str, Enum):
landscape = "16:9"
portrait = "9:16"
@@ -28,52 +42,18 @@ class VideoAspect(str, Enum):
return 1080, 1920
class _Config:
arbitrary_types_allowed = True
@pydantic.dataclasses.dataclass(config=_Config)
class MaterialInfo:
provider: str = "pexels"
url: str = ""
duration: int = 0
# VoiceNames = [
# # zh-CN
# "female-zh-CN-XiaoxiaoNeural",
# "female-zh-CN-XiaoyiNeural",
# "female-zh-CN-liaoning-XiaobeiNeural",
# "female-zh-CN-shaanxi-XiaoniNeural",
#
# "male-zh-CN-YunjianNeural",
# "male-zh-CN-YunxiNeural",
# "male-zh-CN-YunxiaNeural",
# "male-zh-CN-YunyangNeural",
#
# # "female-zh-HK-HiuGaaiNeural",
# # "female-zh-HK-HiuMaanNeural",
# # "male-zh-HK-WanLungNeural",
# #
# # "female-zh-TW-HsiaoChenNeural",
# # "female-zh-TW-HsiaoYuNeural",
# # "male-zh-TW-YunJheNeural",
#
# # en-US
#
# "female-en-US-AnaNeural",
# "female-en-US-AriaNeural",
# "female-en-US-AvaNeural",
# "female-en-US-EmmaNeural",
# "female-en-US-JennyNeural",
# "female-en-US-MichelleNeural",
#
# "male-en-US-AndrewNeural",
# "male-en-US-BrianNeural",
# "male-en-US-ChristopherNeural",
# "male-en-US-EricNeural",
# "male-en-US-GuyNeural",
# "male-en-US-RogerNeural",
# "male-en-US-SteffanNeural",
# ]
class VideoParams:
class VideoParams(BaseModel):
"""
{
"video_subject": "",
@@ -87,27 +67,36 @@ class VideoParams:
"stroke_width": 1.5
}
"""
video_subject: str
video_script: str = "" # 用于生成视频的脚本
video_terms: Optional[str | list] = None # 用于生成视频的关键词
video_script: str = "" # Script used to generate the video
video_terms: Optional[str | list] = None # Keywords used to generate the video
video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
video_concat_mode: Optional[VideoConcatMode] = VideoConcatMode.random.value
video_transition_mode: Optional[VideoTransitionMode] = None
video_clip_duration: Optional[int] = 5
video_count: Optional[int] = 1
video_source: Optional[str] = "pexels"
video_materials: Optional[List[MaterialInfo]] = (
None # Materials used to generate the video
)
video_language: Optional[str] = "" # auto detect
voice_name: Optional[str] = ""
voice_volume: Optional[float] = 1.0
voice_rate: Optional[float] = 1.0
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2
subtitle_enabled: Optional[bool] = True
subtitle_position: Optional[str] = "bottom" # top, bottom, center
custom_position: float = 70.0
font_name: Optional[str] = "STHeitiMedium.ttc"
text_fore_color: Optional[str] = "#FFFFFF"
text_background_color: Optional[str] = "transparent"
text_background_color: Union[bool, str] = True
font_size: int = 60
stroke_color: Optional[str] = "#000000"
@@ -116,6 +105,38 @@ class VideoParams:
paragraph_number: Optional[int] = 1
class SubtitleRequest(BaseModel):
video_script: str
video_language: Optional[str] = ""
voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female"
voice_volume: Optional[float] = 1.0
voice_rate: Optional[float] = 1.2
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2
subtitle_position: Optional[str] = "bottom"
font_name: Optional[str] = "STHeitiMedium.ttc"
text_fore_color: Optional[str] = "#FFFFFF"
text_background_color: Union[bool, str] = True
font_size: int = 60
stroke_color: Optional[str] = "#000000"
stroke_width: float = 1.5
video_source: Optional[str] = "local"
subtitle_enabled: Optional[str] = "true"
class AudioRequest(BaseModel):
video_script: str
video_language: Optional[str] = ""
voice_name: Optional[str] = "zh-CN-XiaoxiaoNeural-Female"
voice_volume: Optional[float] = 1.0
voice_rate: Optional[float] = 1.2
bgm_type: Optional[str] = "random"
bgm_file: Optional[str] = ""
bgm_volume: Optional[float] = 0.2
video_source: Optional[str] = "local"
class VideoScriptParams:
"""
{
@@ -124,6 +145,7 @@ class VideoScriptParams:
"paragraph_number": 1
}
"""
video_subject: Optional[str] = "春天的花海"
video_language: Optional[str] = ""
paragraph_number: Optional[int] = 1
@@ -137,14 +159,17 @@ class VideoTermsParams:
"amount": 5
}
"""
video_subject: Optional[str] = "春天的花海"
video_script: Optional[str] = "春天的花海,如诗如画般展现在眼前。万物复苏的季节里,大地披上了一袭绚丽多彩的盛装。金黄的迎春、粉嫩的樱花、洁白的梨花、艳丽的郁金香……"
video_script: Optional[str] = (
"春天的花海,如诗如画般展现在眼前。万物复苏的季节里,大地披上了一袭绚丽多彩的盛装。金黄的迎春、粉嫩的樱花、洁白的梨花、艳丽的郁金香……"
)
amount: Optional[int] = 5
class BaseResponse(BaseModel):
status: int = 200
message: Optional[str] = 'success'
message: Optional[str] = "success"
data: Any = None
@@ -179,9 +204,7 @@ class TaskResponse(BaseResponse):
"example": {
"status": 200,
"message": "success",
"data": {
"task_id": "6c85c8cc-a77a-42b9-bc30-947815aa0558"
}
"data": {"task_id": "6c85c8cc-a77a-42b9-bc30-947815aa0558"},
},
}
@@ -200,8 +223,8 @@ class TaskQueryResponse(BaseResponse):
],
"combined_videos": [
"http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/combined-1.mp4"
]
}
],
},
},
}
@@ -220,8 +243,8 @@ class TaskDeletionResponse(BaseResponse):
],
"combined_videos": [
"http://127.0.0.1:8080/tasks/6c85c8cc-a77a-42b9-bc30-947815aa0558/combined-1.mp4"
]
}
],
},
},
}
@@ -234,7 +257,7 @@ class VideoScriptResponse(BaseResponse):
"message": "success",
"data": {
"video_script": "春天的花海,是大自然的一幅美丽画卷。在这个季节里,大地复苏,万物生长,花朵争相绽放,形成了一片五彩斑斓的花海..."
}
},
},
}
@@ -245,9 +268,7 @@ class VideoTermsResponse(BaseResponse):
"example": {
"status": 200,
"message": "success",
"data": {
"video_terms": ["sky", "tree"]
}
"data": {"video_terms": ["sky", "tree"]},
},
}
@@ -263,10 +284,10 @@ class BgmRetrieveResponse(BaseResponse):
{
"name": "output013.mp3",
"size": 1891269,
"file": "/MoneyPrinterTurbo/resource/songs/output013.mp3"
"file": "/MoneyPrinterTurbo/resource/songs/output013.mp3",
}
]
}
},
},
}
@@ -277,8 +298,6 @@ class BgmUploadResponse(BaseResponse):
"example": {
"status": 200,
"message": "success",
"data": {
"file": "/MoneyPrinterTurbo/resource/songs/example.mp3"
}
"data": {"file": "/MoneyPrinterTurbo/resource/songs/example.mp3"},
},
}
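Since `VideoParams` is now a proper Pydantic `BaseModel` (and the router calls `body.model_dump()`), request payloads can be validated and serialized directly. A small sketch, assuming Pydantic v2 and the repository modules on the path; the field values are illustrative:
```python
from app.models.schema import VideoAspect, VideoParams

params = VideoParams(
    video_subject="a sea of spring flowers",
    video_aspect=VideoAspect.portrait,  # "9:16"
    video_clip_duration=5,
    subtitle_enabled=True,
)

payload = params.model_dump()      # plain dict, safe to queue or log
restored = VideoParams(**payload)  # round-trips back into a validated model
assert restored.video_subject == params.video_subject
```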

View File

@@ -6,9 +6,10 @@ Resources:
1. https://fastapi.tiangolo.com/tutorial/bigger-applications
"""
from fastapi import APIRouter
from app.controllers.v1 import video, llm
from app.controllers.v1 import llm, video
root_api_router = APIRouter()
# v1

View File

@@ -1,193 +1,299 @@
import json
import logging
import re
import json
import requests
from typing import List
import g4f
from loguru import logger
from openai import OpenAI
from openai import AzureOpenAI
from openai import AzureOpenAI, OpenAI
from openai.types.chat import ChatCompletion
from app.config import config
_max_retries = 5
def _generate_response(prompt: str) -> str:
content = ""
llm_provider = config.app.get("llm_provider", "openai")
logger.info(f"llm provider: {llm_provider}")
if llm_provider == "g4f":
model_name = config.app.get("g4f_model_name", "")
if not model_name:
model_name = "gpt-3.5-turbo-16k-0613"
import g4f
content = g4f.ChatCompletion.create(
model=model_name,
messages=[{"role": "user", "content": prompt}],
)
else:
api_version = "" # for azure
if llm_provider == "moonshot":
api_key = config.app.get("moonshot_api_key")
model_name = config.app.get("moonshot_model_name")
base_url = "https://api.moonshot.cn/v1"
elif llm_provider == "ollama":
# api_key = config.app.get("openai_api_key")
api_key = "ollama" # any string works but you are required to have one
model_name = config.app.get("ollama_model_name")
base_url = config.app.get("ollama_base_url", "")
if not base_url:
base_url = "http://localhost:11434/v1"
elif llm_provider == "openai":
api_key = config.app.get("openai_api_key")
model_name = config.app.get("openai_model_name")
base_url = config.app.get("openai_base_url", "")
if not base_url:
base_url = "https://api.openai.com/v1"
elif llm_provider == "oneapi":
api_key = config.app.get("oneapi_api_key")
model_name = config.app.get("oneapi_model_name")
base_url = config.app.get("oneapi_base_url", "")
elif llm_provider == "azure":
api_key = config.app.get("azure_api_key")
model_name = config.app.get("azure_model_name")
base_url = config.app.get("azure_base_url", "")
api_version = config.app.get("azure_api_version", "2024-02-15-preview")
elif llm_provider == "gemini":
api_key = config.app.get("gemini_api_key")
model_name = config.app.get("gemini_model_name")
base_url = "***"
elif llm_provider == "qwen":
api_key = config.app.get("qwen_api_key")
model_name = config.app.get("qwen_model_name")
base_url = "***"
elif llm_provider == "cloudflare":
api_key = config.app.get("cloudflare_api_key")
model_name = config.app.get("cloudflare_model_name")
account_id = config.app.get("cloudflare_account_id")
base_url = "***"
else:
raise ValueError("llm_provider is not set, please set it in the config.toml file.")
if not api_key:
raise ValueError(f"{llm_provider}: api_key is not set, please set it in the config.toml file.")
if not model_name:
raise ValueError(f"{llm_provider}: model_name is not set, please set it in the config.toml file.")
if not base_url:
raise ValueError(f"{llm_provider}: base_url is not set, please set it in the config.toml file.")
if llm_provider == "qwen":
import dashscope
from dashscope.api_entities.dashscope_response import GenerationResponse
dashscope.api_key = api_key
response = dashscope.Generation.call(
try:
content = ""
llm_provider = config.app.get("llm_provider", "openai")
logger.info(f"llm provider: {llm_provider}")
if llm_provider == "g4f":
model_name = config.app.get("g4f_model_name", "")
if not model_name:
model_name = "gpt-3.5-turbo-16k-0613"
content = g4f.ChatCompletion.create(
model=model_name,
messages=[{"role": "user", "content": prompt}]
messages=[{"role": "user", "content": prompt}],
)
else:
api_version = "" # for azure
if llm_provider == "moonshot":
api_key = config.app.get("moonshot_api_key")
model_name = config.app.get("moonshot_model_name")
base_url = "https://api.moonshot.cn/v1"
elif llm_provider == "ollama":
# api_key = config.app.get("openai_api_key")
api_key = "ollama" # any string works but you are required to have one
model_name = config.app.get("ollama_model_name")
base_url = config.app.get("ollama_base_url", "")
if not base_url:
base_url = "http://localhost:11434/v1"
elif llm_provider == "openai":
api_key = config.app.get("openai_api_key")
model_name = config.app.get("openai_model_name")
base_url = config.app.get("openai_base_url", "")
if not base_url:
base_url = "https://api.openai.com/v1"
elif llm_provider == "oneapi":
api_key = config.app.get("oneapi_api_key")
model_name = config.app.get("oneapi_model_name")
base_url = config.app.get("oneapi_base_url", "")
elif llm_provider == "azure":
api_key = config.app.get("azure_api_key")
model_name = config.app.get("azure_model_name")
base_url = config.app.get("azure_base_url", "")
api_version = config.app.get("azure_api_version", "2024-02-15-preview")
elif llm_provider == "gemini":
api_key = config.app.get("gemini_api_key")
model_name = config.app.get("gemini_model_name")
base_url = "***"
elif llm_provider == "qwen":
api_key = config.app.get("qwen_api_key")
model_name = config.app.get("qwen_model_name")
base_url = "***"
elif llm_provider == "cloudflare":
api_key = config.app.get("cloudflare_api_key")
model_name = config.app.get("cloudflare_model_name")
account_id = config.app.get("cloudflare_account_id")
base_url = "***"
elif llm_provider == "deepseek":
api_key = config.app.get("deepseek_api_key")
model_name = config.app.get("deepseek_model_name")
base_url = config.app.get("deepseek_base_url")
if not base_url:
base_url = "https://api.deepseek.com"
elif llm_provider == "ernie":
api_key = config.app.get("ernie_api_key")
secret_key = config.app.get("ernie_secret_key")
base_url = config.app.get("ernie_base_url")
model_name = "***"
if not secret_key:
raise ValueError(
f"{llm_provider}: secret_key is not set, please set it in the config.toml file."
)
elif llm_provider == "pollinations":
try:
base_url = config.app.get("pollinations_base_url", "")
if not base_url:
base_url = "https://text.pollinations.ai/openai"
model_name = config.app.get("pollinations_model_name", "openai-fast")
# Prepare the payload
payload = {
"model": model_name,
"messages": [
{"role": "user", "content": prompt}
],
"seed": 101 # Optional but helps with reproducibility
}
# Optional parameters if configured
if config.app.get("pollinations_private"):
payload["private"] = True
if config.app.get("pollinations_referrer"):
payload["referrer"] = config.app.get("pollinations_referrer")
headers = {
"Content-Type": "application/json"
}
# Make the API request
response = requests.post(base_url, headers=headers, json=payload)
response.raise_for_status()
result = response.json()
if result and "choices" in result and len(result["choices"]) > 0:
content = result["choices"][0]["message"]["content"]
return content.replace("\n", "")
else:
raise Exception(f"[{llm_provider}] returned an invalid response format")
except requests.exceptions.RequestException as e:
raise Exception(f"[{llm_provider}] request failed: {str(e)}")
except Exception as e:
raise Exception(f"[{llm_provider}] error: {str(e)}")
if llm_provider not in ["pollinations", "ollama"]: # Skip validation for providers that don't require API key
if not api_key:
raise ValueError(
f"{llm_provider}: api_key is not set, please set it in the config.toml file."
)
if not model_name:
raise ValueError(
f"{llm_provider}: model_name is not set, please set it in the config.toml file."
)
if not base_url:
raise ValueError(
f"{llm_provider}: base_url is not set, please set it in the config.toml file."
)
if llm_provider == "qwen":
import dashscope
from dashscope.api_entities.dashscope_response import GenerationResponse
dashscope.api_key = api_key
response = dashscope.Generation.call(
model=model_name, messages=[{"role": "user", "content": prompt}]
)
if response:
if isinstance(response, GenerationResponse):
status_code = response.status_code
if status_code != 200:
raise Exception(
f'[{llm_provider}] returned an error response: "{response}"'
)
content = response["output"]["text"]
return content.replace("\n", "")
else:
raise Exception(
f'[{llm_provider}] returned an invalid response: "{response}"'
)
else:
raise Exception(f"[{llm_provider}] returned an empty response")
if llm_provider == "gemini":
import google.generativeai as genai
genai.configure(api_key=api_key, transport="rest")
generation_config = {
"temperature": 0.5,
"top_p": 1,
"top_k": 1,
"max_output_tokens": 2048,
}
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_ONLY_HIGH",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_ONLY_HIGH",
},
]
model = genai.GenerativeModel(
model_name=model_name,
generation_config=generation_config,
safety_settings=safety_settings,
)
try:
response = model.generate_content(prompt)
candidates = response.candidates
generated_text = candidates[0].content.parts[0].text
except (AttributeError, IndexError) as e:
print("Gemini Error:", e)
return generated_text
if llm_provider == "cloudflare":
response = requests.post(
f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model_name}",
headers={"Authorization": f"Bearer {api_key}"},
json={
"messages": [
{
"role": "system",
"content": "You are a friendly assistant",
},
{"role": "user", "content": prompt},
]
},
)
result = response.json()
logger.info(result)
return result["result"]["response"]
if llm_provider == "ernie":
response = requests.post(
"https://aip.baidubce.com/oauth/2.0/token",
params={
"grant_type": "client_credentials",
"client_id": api_key,
"client_secret": secret_key,
}
)
access_token = response.json().get("access_token")
url = f"{base_url}?access_token={access_token}"
payload = json.dumps(
{
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.5,
"top_p": 0.8,
"penalty_score": 1,
"disable_search": False,
"enable_citation": False,
"response_format": "text",
}
)
headers = {"Content-Type": "application/json"}
response = requests.request(
"POST", url, headers=headers, data=payload
).json()
return response.get("result")
if llm_provider == "azure":
client = AzureOpenAI(
api_key=api_key,
api_version=api_version,
azure_endpoint=base_url,
)
else:
client = OpenAI(
api_key=api_key,
base_url=base_url,
)
response = client.chat.completions.create(
model=model_name, messages=[{"role": "user", "content": prompt}]
)
if response:
if isinstance(response, GenerationResponse):
status_code = response.status_code
if status_code != 200:
raise Exception(
f"[{llm_provider}] returned an error response: \"{response}\"")
content = response["output"]["text"]
return content.replace("\n", "")
if isinstance(response, ChatCompletion):
content = response.choices[0].message.content
else:
raise Exception(
f"[{llm_provider}] returned an invalid response: \"{response}\"")
f'[{llm_provider}] returned an invalid response: "{response}", please check your network '
f"connection and try again."
)
else:
raise Exception(
f"[{llm_provider}] returned an empty response")
f"[{llm_provider}] returned an empty response, please check your network connection and try again."
)
if llm_provider == "gemini":
import google.generativeai as genai
genai.configure(api_key=api_key, transport='rest')
generation_config = {
"temperature": 0.5,
"top_p": 1,
"top_k": 1,
"max_output_tokens": 2048,
}
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_ONLY_HIGH"
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_ONLY_HIGH"
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_ONLY_HIGH"
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_ONLY_HIGH"
},
]
model = genai.GenerativeModel(model_name=model_name,
generation_config=generation_config,
safety_settings=safety_settings)
try:
response = model.generate_content(prompt)
candidates = response.candidates
generated_text = candidates[0].content.parts[0].text
except (AttributeError, IndexError) as e:
print("Gemini Error:", e)
return generated_text
if llm_provider == "cloudflare":
import requests
response = requests.post(
f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model_name}",
headers={"Authorization": f"Bearer {api_key}"},
json={
"messages": [
{"role": "system", "content": "You are a friendly assistant"},
{"role": "user", "content": prompt}
]
}
)
result = response.json()
logger.info(result)
return result["result"]["response"]
if llm_provider == "azure":
client = AzureOpenAI(
api_key=api_key,
api_version=api_version,
azure_endpoint=base_url,
)
else:
client = OpenAI(
api_key=api_key,
base_url=base_url,
)
response = client.chat.completions.create(
model=model_name,
messages=[{"role": "user", "content": prompt}]
)
if response:
if isinstance(response, ChatCompletion):
content = response.choices[0].message.content
else:
raise Exception(
f"[{llm_provider}] returned an invalid response: \"{response}\", please check your network "
f"connection and try again.")
else:
raise Exception(
f"[{llm_provider}] returned an empty response, please check your network connection and try again.")
return content.replace("\n", "")
return content.replace("\n", "")
except Exception as e:
return f"Error: {str(e)}"
def generate_script(video_subject: str, language: str = "", paragraph_number: int = 1) -> str:
def generate_script(
video_subject: str, language: str = "", paragraph_number: int = 1
) -> str:
prompt = f"""
# Role: Video Script Generator
@@ -213,11 +319,8 @@ Generate a script for a video, depending on the subject of the video.
final_script = ""
logger.info(f"subject: {video_subject}")
logger.debug(f"prompt: \n{prompt}")
response = _generate_response(prompt=prompt)
# Return the generated script
if response:
def format_response(response):
# Clean the script
# Remove asterisks, hashes
response = response.replace("*", "")
@@ -231,18 +334,35 @@ Generate a script for a video, depending on the subject of the video.
paragraphs = response.split("\n\n")
# Select the specified number of paragraphs
selected_paragraphs = paragraphs[:paragraph_number]
# selected_paragraphs = paragraphs[:paragraph_number]
# Join the selected paragraphs into a single string
final_script = "\n\n".join(selected_paragraphs)
return "\n\n".join(paragraphs)
# Print to console the number of paragraphs used
# logger.info(f"number of paragraphs used: {len(selected_paragraphs)}")
for i in range(_max_retries):
try:
response = _generate_response(prompt=prompt)
if response:
final_script = format_response(response)
else:
logging.error("gpt returned an empty response")
# g4f may return an error message
if final_script and "当日额度已消耗完" in final_script:
raise ValueError(final_script)
if final_script:
break
except Exception as e:
logger.error(f"failed to generate script: {e}")
if i < _max_retries:
logger.warning(f"failed to generate video script, trying again... {i + 1}")
if "Error: " in final_script:
logger.error(f"failed to generate video script: {final_script}")
else:
logging.error("gpt returned an empty response")
logger.success(f"completed: \n{final_script}")
return final_script
logger.success(f"completed: \n{final_script}")
return final_script.strip()
def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
@@ -273,25 +393,37 @@ Please note that you must use English for generating video search terms; Chinese
""".strip()
logger.info(f"subject: {video_subject}")
logger.debug(f"prompt: \n{prompt}")
response = _generate_response(prompt)
search_terms = []
response = ""
for i in range(_max_retries):
try:
response = _generate_response(prompt)
if "Error: " in response:
logger.error(f"failed to generate video script: {response}")
return response
search_terms = json.loads(response)
if not isinstance(search_terms, list) or not all(
isinstance(term, str) for term in search_terms
):
logger.error("response is not a list of strings.")
continue
try:
search_terms = json.loads(response)
if not isinstance(search_terms, list) or not all(isinstance(term, str) for term in search_terms):
raise ValueError("response is not a list of strings.")
except Exception as e:
logger.warning(f"failed to generate video terms: {str(e)}")
if response:
match = re.search(r"\[.*]", response)
if match:
try:
search_terms = json.loads(match.group())
except Exception as e:
logger.warning(f"failed to generate video terms: {str(e)}")
pass
except (json.JSONDecodeError, ValueError):
# logger.warning(f"gpt returned an unformatted response. attempting to clean...")
# Attempt to extract list-like string and convert to list
match = re.search(r'\["(?:[^"\\]|\\.)*"(?:,\s*"[^"\\]*")*\]', response)
if match:
try:
search_terms = json.loads(match.group())
except json.JSONDecodeError:
logger.error(f"could not parse response: {response}")
return []
if search_terms and len(search_terms) > 0:
break
if i < _max_retries:
logger.warning(f"failed to generate video terms, trying again... {i + 1}")
logger.success(f"completed: \n{search_terms}")
return search_terms
@@ -299,9 +431,14 @@ Please note that you must use English for generating video search terms; Chinese
if __name__ == "__main__":
video_subject = "生命的意义是什么"
script = generate_script(video_subject=video_subject, language="zh-CN", paragraph_number=1)
# print("######################")
# print(script)
# search_terms = generate_terms(video_subject=video_subject, video_script=script, amount=5)
# print("######################")
# print(search_terms)
script = generate_script(
video_subject=video_subject, language="zh-CN", paragraph_number=1
)
print("######################")
print(script)
search_terms = generate_terms(
video_subject=video_subject, video_script=script, amount=5
)
print("######################")
print(search_terms)

View File

@@ -1,57 +1,62 @@
import os
import random
from typing import List
from urllib.parse import urlencode
import requests
from typing import List
from loguru import logger
from moviepy.video.io.VideoFileClip import VideoFileClip
from app.config import config
from app.models.schema import VideoAspect, VideoConcatMode, MaterialInfo
from app.models.schema import MaterialInfo, VideoAspect, VideoConcatMode
from app.utils import utils
requested_count = 0
def round_robin_api_key():
pexels_api_keys = config.app.get("pexels_api_keys")
if not pexels_api_keys:
def get_api_key(cfg_key: str):
api_keys = config.app.get(cfg_key)
if not api_keys:
raise ValueError(
f"\n\n##### pexels_api_keys is not set #####\n\nPlease set it in the config.toml file: {config.config_file}\n\n{utils.to_json(config.app)}")
f"\n\n##### {cfg_key} is not set #####\n\nPlease set it in the config.toml file: {config.config_file}\n\n"
f"{utils.to_json(config.app)}"
)
# if only one key is provided, return it
if isinstance(pexels_api_keys, str):
return pexels_api_keys
if isinstance(api_keys, str):
return api_keys
global requested_count
requested_count += 1
return pexels_api_keys[requested_count % len(pexels_api_keys)]
return api_keys[requested_count % len(api_keys)]
def search_videos(search_term: str,
minimum_duration: int,
video_aspect: VideoAspect = VideoAspect.portrait,
) -> List[MaterialInfo]:
def search_videos_pexels(
search_term: str,
minimum_duration: int,
video_aspect: VideoAspect = VideoAspect.portrait,
) -> List[MaterialInfo]:
aspect = VideoAspect(video_aspect)
video_orientation = aspect.name
video_width, video_height = aspect.to_resolution()
api_key = get_api_key("pexels_api_keys")
headers = {
"Authorization": round_robin_api_key()
"Authorization": api_key,
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
}
proxies = config.pexels.get("proxies", None)
# Build URL
params = {
"query": search_term,
"per_page": 20,
"orientation": video_orientation
}
params = {"query": search_term, "per_page": 20, "orientation": video_orientation}
query_url = f"https://api.pexels.com/videos/search?{urlencode(params)}"
logger.info(f"searching videos: {query_url}, with proxies: {proxies}")
logger.info(f"searching videos: {query_url}, with proxies: {config.proxy}")
try:
r = requests.get(query_url, headers=headers, proxies=proxies, verify=False)
r = requests.get(
query_url,
headers=headers,
proxies=config.proxy,
verify=False,
timeout=(30, 60),
)
response = r.json()
video_items = []
if "videos" not in response:
@@ -83,6 +88,62 @@ def search_videos(search_term: str,
return []
def search_videos_pixabay(
search_term: str,
minimum_duration: int,
video_aspect: VideoAspect = VideoAspect.portrait,
) -> List[MaterialInfo]:
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
api_key = get_api_key("pixabay_api_keys")
# Build URL
params = {
"q": search_term,
"video_type": "all", # Accepted values: "all", "film", "animation"
"per_page": 50,
"key": api_key,
}
query_url = f"https://pixabay.com/api/videos/?{urlencode(params)}"
logger.info(f"searching videos: {query_url}, with proxies: {config.proxy}")
try:
r = requests.get(
query_url, proxies=config.proxy, verify=False, timeout=(30, 60)
)
response = r.json()
video_items = []
if "hits" not in response:
logger.error(f"search videos failed: {response}")
return video_items
videos = response["hits"]
# loop through each video in the result
for v in videos:
duration = v["duration"]
# check if video has desired minimum duration
if duration < minimum_duration:
continue
video_files = v["videos"]
# loop through each url to determine the best quality
for video_type in video_files:
video = video_files[video_type]
w = int(video["width"])
# h = int(video["height"])
if w >= video_width:
item = MaterialInfo()
item.provider = "pixabay"
item.url = video["url"]
item.duration = duration
video_items.append(item)
break
return video_items
except Exception as e:
logger.error(f"search videos failed: {str(e)}")
return []
def save_video(video_url: str, save_dir: str = "") -> str:
if not save_dir:
save_dir = utils.storage_dir("cache_videos")
@@ -100,10 +161,21 @@ def save_video(video_url: str, save_dir: str = "") -> str:
logger.info(f"video already exists: {video_path}")
return video_path
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}
# if video does not exist, download it
proxies = config.pexels.get("proxies", None)
with open(video_path, "wb") as f:
f.write(requests.get(video_url, proxies=proxies, verify=False, timeout=(60, 240)).content)
f.write(
requests.get(
video_url,
headers=headers,
proxies=config.proxy,
verify=False,
timeout=(60, 240),
).content
)
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
try:
@@ -116,27 +188,34 @@ def save_video(video_url: str, save_dir: str = "") -> str:
except Exception as e:
try:
os.remove(video_path)
except Exception as e:
except Exception:
pass
logger.warning(f"invalid video file: {video_path} => {str(e)}")
return ""
def download_videos(task_id: str,
search_terms: List[str],
video_aspect: VideoAspect = VideoAspect.portrait,
video_contact_mode: VideoConcatMode = VideoConcatMode.random,
audio_duration: float = 0.0,
max_clip_duration: int = 5,
) -> List[str]:
def download_videos(
task_id: str,
search_terms: List[str],
source: str = "pexels",
video_aspect: VideoAspect = VideoAspect.portrait,
video_contact_mode: VideoConcatMode = VideoConcatMode.random,
audio_duration: float = 0.0,
max_clip_duration: int = 5,
) -> List[str]:
valid_video_items = []
valid_video_urls = []
found_duration = 0.0
search_videos = search_videos_pexels
if source == "pixabay":
search_videos = search_videos_pixabay
for search_term in search_terms:
# logger.info(f"searching videos for '{search_term}'")
video_items = search_videos(search_term=search_term,
minimum_duration=max_clip_duration,
video_aspect=video_aspect)
video_items = search_videos(
search_term=search_term,
minimum_duration=max_clip_duration,
video_aspect=video_aspect,
)
logger.info(f"found {len(video_items)} videos for '{search_term}'")
for item in video_items:
@@ -146,7 +225,8 @@ def download_videos(task_id: str,
found_duration += item.duration
logger.info(
f"found total videos: {len(valid_video_items)}, required duration: {audio_duration} seconds, found duration: {found_duration} seconds")
f"found total videos: {len(valid_video_items)}, required duration: {audio_duration} seconds, found duration: {found_duration} seconds"
)
video_paths = []
material_directory = config.app.get("material_directory", "").strip()
@@ -162,14 +242,18 @@ def download_videos(task_id: str,
for item in valid_video_items:
try:
logger.info(f"downloading video: {item.url}")
saved_video_path = save_video(video_url=item.url, save_dir=material_directory)
saved_video_path = save_video(
video_url=item.url, save_dir=material_directory
)
if saved_video_path:
logger.info(f"video saved: {saved_video_path}")
video_paths.append(saved_video_path)
seconds = min(max_clip_duration, item.duration)
total_duration += seconds
if total_duration > audio_duration:
logger.info(f"total duration of downloaded videos: {total_duration} seconds, skip downloading more")
logger.info(
f"total duration of downloaded videos: {total_duration} seconds, skip downloading more"
)
break
except Exception as e:
logger.error(f"failed to download video: {utils.to_json(item)} => {str(e)}")
@@ -178,4 +262,6 @@ def download_videos(task_id: str,
if __name__ == "__main__":
download_videos("test123", ["cat"], audio_duration=100)
download_videos(
"test123", ["Money Exchange Medium"], audio_duration=100, source="pixabay"
)

View File

@@ -1,12 +1,12 @@
import ast
from abc import ABC, abstractmethod
from app.config import config
from app.models import const
# Base class for state management
class BaseState(ABC):
@abstractmethod
def update_task(self, task_id: str, state: int, progress: int = 0, **kwargs):
pass
@@ -15,19 +15,36 @@ class BaseState(ABC):
def get_task(self, task_id: str):
pass
@abstractmethod
def get_all_tasks(self, page: int, page_size: int):
pass
# Memory state management
class MemoryState(BaseState):
def __init__(self):
self._tasks = {}
def update_task(self, task_id: str, state: int = const.TASK_STATE_PROCESSING, progress: int = 0, **kwargs):
def get_all_tasks(self, page: int, page_size: int):
start = (page - 1) * page_size
end = start + page_size
tasks = list(self._tasks.values())
total = len(tasks)
return tasks[start:end], total
def update_task(
self,
task_id: str,
state: int = const.TASK_STATE_PROCESSING,
progress: int = 0,
**kwargs,
):
progress = int(progress)
if progress > 100:
progress = 100
self._tasks[task_id] = {
"task_id": task_id,
"state": state,
"progress": progress,
**kwargs,
@@ -43,17 +60,46 @@ class MemoryState(BaseState):
# Redis state management
class RedisState(BaseState):
def __init__(self, host='localhost', port=6379, db=0, password=None):
def __init__(self, host="localhost", port=6379, db=0, password=None):
import redis
self._redis = redis.StrictRedis(host=host, port=port, db=db, password=password)
def update_task(self, task_id: str, state: int = const.TASK_STATE_PROCESSING, progress: int = 0, **kwargs):
def get_all_tasks(self, page: int, page_size: int):
start = (page - 1) * page_size
end = start + page_size
tasks = []
cursor = 0
total = 0
while True:
cursor, keys = self._redis.scan(cursor, count=page_size)
total += len(keys)
if total > start:
for key in keys[max(0, start - total):end - total]:
task_data = self._redis.hgetall(key)
task = {
k.decode("utf-8"): self._convert_to_original_type(v) for k, v in task_data.items()
}
tasks.append(task)
if len(tasks) >= page_size:
break
if cursor == 0 or len(tasks) >= page_size:
break
return tasks, total
def update_task(
self,
task_id: str,
state: int = const.TASK_STATE_PROCESSING,
progress: int = 0,
**kwargs,
):
progress = int(progress)
if progress > 100:
progress = 100
fields = {
"task_id": task_id,
"state": state,
"progress": progress,
**kwargs,
@@ -67,7 +113,10 @@ class RedisState(BaseState):
if not task_data:
return None
task = {key.decode('utf-8'): self._convert_to_original_type(value) for key, value in task_data.items()}
task = {
key.decode("utf-8"): self._convert_to_original_type(value)
for key, value in task_data.items()
}
return task
def delete_task(self, task_id: str):
@@ -79,7 +128,7 @@ class RedisState(BaseState):
Convert the value from byte string to its original data type.
You can extend this method to handle other data types as needed.
"""
value_str = value.decode('utf-8')
value_str = value.decode("utf-8")
try:
# try to convert byte string array to list
@@ -100,4 +149,10 @@ _redis_port = config.app.get("redis_port", 6379)
_redis_db = config.app.get("redis_db", 0)
_redis_password = config.app.get("redis_password", None)
state = RedisState(host=_redis_host, port=_redis_port, db=_redis_db, password=_redis_password) if _enable_redis else MemoryState()
state = (
RedisState(
host=_redis_host, port=_redis_port, db=_redis_db, password=_redis_password
)
if _enable_redis
else MemoryState()
)
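
A minimal usage sketch of the state backends above, assuming the module is importable as app.services.state (the task service imports it as "from app.services import state as sm") and using MemoryState so no Redis server is needed:

from app.services.state import MemoryState

state = MemoryState()

# update_task clamps progress to 100 and stores any extra keyword fields
state.update_task("demo-1", state=1, progress=250, script="hello world")
state.update_task("demo-2", state=1, progress=40)

# get_all_tasks pages through the stored tasks and also returns the total count
tasks, total = state.get_all_tasks(page=1, page_size=1)
print(total)                 # 2
print(tasks[0]["progress"])  # 100 (clamped from 250)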

View File

@@ -1,9 +1,9 @@
import json
import os.path
import re
from timeit import default_timer as timer
from faster_whisper import WhisperModel
from timeit import default_timer as timer
from loguru import logger
from app.config import config
@@ -23,10 +23,23 @@ def create(audio_file, subtitle_file: str = ""):
if not os.path.isdir(model_path) or not os.path.isfile(model_bin_file):
model_path = model_size
logger.info(f"loading model: {model_path}, device: {device}, compute_type: {compute_type}")
model = WhisperModel(model_size_or_path=model_path,
device=device,
compute_type=compute_type)
logger.info(
f"loading model: {model_path}, device: {device}, compute_type: {compute_type}"
)
try:
model = WhisperModel(
model_size_or_path=model_path, device=device, compute_type=compute_type
)
except Exception as e:
logger.error(
f"failed to load model: {e} \n\n"
f"********************************************\n"
f"this may be caused by network issue. \n"
f"please download the model manually and put it in the 'models' folder. \n"
f"see [README.md FAQ](https://github.com/harry0703/MoneyPrinterTurbo) for more details.\n"
f"********************************************\n\n"
)
return None
logger.info(f"start, output file: {subtitle_file}")
if not subtitle_file:
@@ -40,7 +53,9 @@ def create(audio_file, subtitle_file: str = ""):
vad_parameters=dict(min_silence_duration_ms=500),
)
logger.info(f"detected language: '{info.language}', probability: {info.language_probability:.2f}")
logger.info(
f"detected language: '{info.language}', probability: {info.language_probability:.2f}"
)
start = timer()
subtitles = []
@@ -53,11 +68,9 @@ def create(audio_file, subtitle_file: str = ""):
msg = "[%.2fs -> %.2fs] %s" % (seg_start, seg_end, seg_text)
logger.debug(msg)
subtitles.append({
"msg": seg_text,
"start_time": seg_start,
"end_time": seg_end
})
subtitles.append(
{"msg": seg_text, "start_time": seg_start, "end_time": seg_end}
)
for segment in segments:
words_idx = 0
@@ -75,7 +88,7 @@ def create(audio_file, subtitle_file: str = ""):
is_segmented = True
seg_end = word.end
# 如果包含标点,则断句
# If it contains punctuation, then break the sentence.
seg_text += word.word
if utils.str_contains_punctuation(word.word):
@@ -110,7 +123,11 @@ def create(audio_file, subtitle_file: str = ""):
for subtitle in subtitles:
text = subtitle.get("msg")
if text:
lines.append(utils.text_to_srt(idx, text, subtitle.get("start_time"), subtitle.get("end_time")))
lines.append(
utils.text_to_srt(
idx, text, subtitle.get("start_time"), subtitle.get("end_time")
)
)
idx += 1
sub = "\n".join(lines) + "\n"
@@ -120,16 +137,19 @@ def create(audio_file, subtitle_file: str = ""):
def file_to_subtitles(filename):
if not filename or not os.path.isfile(filename):
return []
times_texts = []
current_times = None
current_text = ""
index = 0
with open(filename, 'r', encoding="utf-8") as f:
with open(filename, "r", encoding="utf-8") as f:
for line in f:
times = re.findall("([0-9]*:[0-9]*:[0-9]*,[0-9]*)", line)
if times:
current_times = line
elif line.strip() == '' and current_times:
elif line.strip() == "" and current_times:
index += 1
times_texts.append((index, current_times.strip(), current_text.strip()))
current_times, current_text = None, ""
@@ -138,27 +158,124 @@ def file_to_subtitles(filename):
return times_texts
def levenshtein_distance(s1, s2):
if len(s1) < len(s2):
return levenshtein_distance(s2, s1)
if len(s2) == 0:
return len(s1)
previous_row = range(len(s2) + 1)
for i, c1 in enumerate(s1):
current_row = [i + 1]
for j, c2 in enumerate(s2):
insertions = previous_row[j + 1] + 1
deletions = current_row[j] + 1
substitutions = previous_row[j] + (c1 != c2)
current_row.append(min(insertions, deletions, substitutions))
previous_row = current_row
return previous_row[-1]
def similarity(a, b):
distance = levenshtein_distance(a.lower(), b.lower())
max_length = max(len(a), len(b))
return 1 - (distance / max_length)
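
A quick worked example of the pair above, assuming the module is importable as app.services.subtitle: levenshtein_distance counts single-character edits, and similarity normalizes by the longer string (case-insensitively), which correct() compares against its 0.8 merge threshold.

from app.services.subtitle import levenshtein_distance, similarity

# "kitten" -> "sitting" takes 3 edits (k->s, e->i, append g)
print(levenshtein_distance("kitten", "sitting"))  # 3

# 1 - 3/7 ≈ 0.571, below the 0.8 threshold used in correct()
print(round(similarity("kitten", "SITTING"), 3))  # 0.571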
def correct(subtitle_file, video_script):
subtitle_items = file_to_subtitles(subtitle_file)
script_lines = utils.split_string_by_punctuations(video_script)
corrected = False
if len(subtitle_items) == len(script_lines):
for i in range(len(script_lines)):
script_line = script_lines[i].strip()
subtitle_line = subtitle_items[i][2]
if script_line != subtitle_line:
logger.warning(f"line {i + 1}, script: {script_line}, subtitle: {subtitle_line}")
subtitle_items[i] = (subtitle_items[i][0], subtitle_items[i][1], script_line)
new_subtitle_items = []
script_index = 0
subtitle_index = 0
while script_index < len(script_lines) and subtitle_index < len(subtitle_items):
script_line = script_lines[script_index].strip()
subtitle_line = subtitle_items[subtitle_index][2].strip()
if script_line == subtitle_line:
new_subtitle_items.append(subtitle_items[subtitle_index])
script_index += 1
subtitle_index += 1
else:
combined_subtitle = subtitle_line
start_time = subtitle_items[subtitle_index][1].split(" --> ")[0]
end_time = subtitle_items[subtitle_index][1].split(" --> ")[1]
next_subtitle_index = subtitle_index + 1
while next_subtitle_index < len(subtitle_items):
next_subtitle = subtitle_items[next_subtitle_index][2].strip()
if similarity(
script_line, combined_subtitle + " " + next_subtitle
) > similarity(script_line, combined_subtitle):
combined_subtitle += " " + next_subtitle
end_time = subtitle_items[next_subtitle_index][1].split(" --> ")[1]
next_subtitle_index += 1
else:
break
if similarity(script_line, combined_subtitle) > 0.8:
logger.warning(
f"Merged/Corrected - Script: {script_line}, Subtitle: {combined_subtitle}"
)
new_subtitle_items.append(
(
len(new_subtitle_items) + 1,
f"{start_time} --> {end_time}",
script_line,
)
)
corrected = True
else:
logger.warning(
f"Mismatch - Script: {script_line}, Subtitle: {combined_subtitle}"
)
new_subtitle_items.append(
(
len(new_subtitle_items) + 1,
f"{start_time} --> {end_time}",
script_line,
)
)
corrected = True
script_index += 1
subtitle_index = next_subtitle_index
# Process the remaining lines of the script.
while script_index < len(script_lines):
logger.warning(f"Extra script line: {script_lines[script_index]}")
if subtitle_index < len(subtitle_items):
new_subtitle_items.append(
(
len(new_subtitle_items) + 1,
subtitle_items[subtitle_index][1],
script_lines[script_index],
)
)
subtitle_index += 1
else:
new_subtitle_items.append(
(
len(new_subtitle_items) + 1,
"00:00:00,000 --> 00:00:00,000",
script_lines[script_index],
)
)
script_index += 1
corrected = True
if corrected:
with open(subtitle_file, "w", encoding="utf-8") as fd:
for item in subtitle_items:
fd.write(f"{item[0]}\n{item[1]}\n{item[2]}\n\n")
logger.info(f"subtitle corrected")
for i, item in enumerate(new_subtitle_items):
fd.write(f"{i + 1}\n{item[1]}\n{item[2]}\n\n")
logger.info("Subtitle corrected")
else:
logger.success(f"subtitle is correct")
logger.success("Subtitle is correct")
if __name__ == "__main__":

View File

@@ -7,52 +7,42 @@ from loguru import logger
from app.config import config
from app.models import const
from app.models.schema import VideoParams, VideoConcatMode
from app.services import llm, material, voice, video, subtitle
from app.models.schema import VideoConcatMode, VideoParams
from app.services import llm, material, subtitle, video, voice
from app.services import state as sm
from app.utils import utils
def start(task_id, params: VideoParams):
"""
{
"video_subject": "",
"video_aspect": "横屏 16:9西瓜视频",
"voice_name": "女生-晓晓",
"enable_bgm": false,
"font_name": "STHeitiMedium 黑体-中",
"text_color": "#FFFFFF",
"font_size": 60,
"stroke_color": "#000000",
"stroke_width": 1.5
}
"""
logger.info(f"start task: {task_id}")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
video_subject = params.video_subject
voice_name = voice.parse_voice_name(params.voice_name)
paragraph_number = params.paragraph_number
n_threads = params.n_threads
max_clip_duration = params.video_clip_duration
def generate_script(task_id, params):
logger.info("\n\n## generating video script")
video_script = params.video_script.strip()
if not video_script:
video_script = llm.generate_script(video_subject=video_subject, language=params.video_language,
paragraph_number=paragraph_number)
video_script = llm.generate_script(
video_subject=params.video_subject,
language=params.video_language,
paragraph_number=params.paragraph_number,
)
else:
logger.debug(f"video script: \n{video_script}")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10)
if not video_script:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video script.")
return None
return video_script
def generate_terms(task_id, params, video_script):
logger.info("\n\n## generating video terms")
video_terms = params.video_terms
if not video_terms:
video_terms = llm.generate_terms(video_subject=video_subject, video_script=video_script, amount=5)
video_terms = llm.generate_terms(
video_subject=params.video_subject, video_script=video_script, amount=5
)
else:
if isinstance(video_terms, str):
video_terms = [term.strip() for term in re.split(r'[,]', video_terms)]
video_terms = [term.strip() for term in re.split(r"[,]", video_terms)]
elif isinstance(video_terms, list):
video_terms = [term.strip() for term in video_terms]
else:
@@ -60,7 +50,16 @@ def start(task_id, params: VideoParams):
logger.debug(f"video terms: {utils.to_json(video_terms)}")
script_file = path.join(utils.task_dir(task_id), f"script.json")
if not video_terms:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error("failed to generate video terms.")
return None
return video_terms
def save_script_data(task_id, video_script, video_terms, params):
script_file = path.join(utils.task_dir(task_id), "script.json")
script_data = {
"script": video_script,
"search_terms": video_terms,
@@ -70,80 +69,120 @@ def start(task_id, params: VideoParams):
with open(script_file, "w", encoding="utf-8") as f:
f.write(utils.to_json(script_data))
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
def generate_audio(task_id, params, video_script):
logger.info("\n\n## generating audio")
audio_file = path.join(utils.task_dir(task_id), f"audio.mp3")
sub_maker = voice.tts(text=video_script, voice_name=voice_name, voice_file=audio_file)
audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
sub_maker = voice.tts(
text=video_script,
voice_name=voice.parse_voice_name(params.voice_name),
voice_rate=params.voice_rate,
voice_file=audio_file,
)
if sub_maker is None:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"failed to generate audio, maybe the network is not available. if you are in China, please use a VPN.")
return
"""failed to generate audio:
1. check if the language of the voice matches the language of the video script.
2. check if the network is available. If you are in China, it is recommended to use a VPN and enable the global traffic mode.
""".strip()
)
return None, None, None
audio_duration = voice.get_audio_duration(sub_maker)
audio_duration = math.ceil(audio_duration)
audio_duration = math.ceil(voice.get_audio_duration(sub_maker))
return audio_file, audio_duration, sub_maker
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
subtitle_path = ""
if params.subtitle_enabled:
subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
subtitle_fallback = False
if subtitle_provider == "edge":
voice.create_subtitle(text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path)
if not os.path.exists(subtitle_path):
subtitle_fallback = True
logger.warning("subtitle file not found, fallback to whisper")
def generate_subtitle(task_id, params, video_script, sub_maker, audio_file):
if not params.subtitle_enabled:
return ""
if subtitle_provider == "whisper" or subtitle_fallback:
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
logger.info("\n\n## correcting subtitle")
subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
subtitle_path = path.join(utils.task_dir(task_id), "subtitle.srt")
subtitle_provider = config.app.get("subtitle_provider", "edge").strip().lower()
logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"subtitle file is invalid: {subtitle_path}")
subtitle_path = ""
subtitle_fallback = False
if subtitle_provider == "edge":
voice.create_subtitle(
text=video_script, sub_maker=sub_maker, subtitle_file=subtitle_path
)
if not os.path.exists(subtitle_path):
subtitle_fallback = True
logger.warning("subtitle file not found, fallback to whisper")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
if subtitle_provider == "whisper" or subtitle_fallback:
subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
logger.info("\n\n## correcting subtitle")
subtitle.correct(subtitle_file=subtitle_path, video_script=video_script)
logger.info("\n\n## downloading videos")
downloaded_videos = material.download_videos(task_id=task_id,
search_terms=video_terms,
video_aspect=params.video_aspect,
video_contact_mode=params.video_concat_mode,
audio_duration=audio_duration * params.video_count,
max_clip_duration=max_clip_duration,
)
if not downloaded_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"failed to download videos, maybe the network is not available. if you are in China, please use a VPN.")
return
subtitle_lines = subtitle.file_to_subtitles(subtitle_path)
if not subtitle_lines:
logger.warning(f"subtitle file is invalid: {subtitle_path}")
return ""
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
return subtitle_path
def get_video_materials(task_id, params, video_terms, audio_duration):
if params.video_source == "local":
logger.info("\n\n## preprocess local materials")
materials = video.preprocess_video(
materials=params.video_materials, clip_duration=params.video_clip_duration
)
if not materials:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"no valid materials found, please check the materials and try again."
)
return None
return [material_info.url for material_info in materials]
else:
logger.info(f"\n\n## downloading videos from {params.video_source}")
downloaded_videos = material.download_videos(
task_id=task_id,
search_terms=video_terms,
source=params.video_source,
video_aspect=params.video_aspect,
video_contact_mode=params.video_concat_mode,
audio_duration=audio_duration * params.video_count,
max_clip_duration=params.video_clip_duration,
)
if not downloaded_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
logger.error(
"failed to download videos, maybe the network is not available. if you are in China, please use a VPN."
)
return None
return downloaded_videos
def generate_final_videos(
task_id, params, downloaded_videos, audio_file, subtitle_path
):
final_video_paths = []
combined_video_paths = []
video_concat_mode = params.video_concat_mode
if params.video_count > 1:
video_concat_mode = VideoConcatMode.random
video_concat_mode = (
params.video_concat_mode if params.video_count == 1 else VideoConcatMode.random
)
video_transition_mode = params.video_transition_mode
_progress = 50
for i in range(params.video_count):
index = i + 1
combined_video_path = path.join(utils.task_dir(task_id), f"combined-{index}.mp4")
combined_video_path = path.join(
utils.task_dir(task_id), f"combined-{index}.mp4"
)
logger.info(f"\n\n## combining video: {index} => {combined_video_path}")
video.combine_videos(combined_video_path=combined_video_path,
video_paths=downloaded_videos,
audio_file=audio_file,
video_aspect=params.video_aspect,
video_concat_mode=video_concat_mode,
max_clip_duration=max_clip_duration,
threads=n_threads)
video.combine_videos(
combined_video_path=combined_video_path,
video_paths=downloaded_videos,
audio_file=audio_file,
video_aspect=params.video_aspect,
video_concat_mode=video_concat_mode,
video_transition_mode=video_transition_mode,
max_clip_duration=params.video_clip_duration,
threads=params.n_threads,
)
_progress += 50 / params.video_count / 2
sm.state.update_task(task_id, progress=_progress)
@@ -151,13 +190,13 @@ def start(task_id, params: VideoParams):
final_video_path = path.join(utils.task_dir(task_id), f"final-{index}.mp4")
logger.info(f"\n\n## generating video: {index} => {final_video_path}")
# Put everything together
video.generate_video(video_path=combined_video_path,
audio_path=audio_file,
subtitle_path=subtitle_path,
output_file=final_video_path,
params=params,
)
video.generate_video(
video_path=combined_video_path,
audio_path=audio_file,
subtitle_path=subtitle_path,
output_file=final_video_path,
params=params,
)
_progress += 50 / params.video_count / 2
sm.state.update_task(task_id, progress=_progress)
@@ -165,11 +204,136 @@ def start(task_id, params: VideoParams):
final_video_paths.append(final_video_path)
combined_video_paths.append(combined_video_path)
logger.success(f"task {task_id} finished, generated {len(final_video_paths)} videos.")
return final_video_paths, combined_video_paths
def start(task_id, params: VideoParams, stop_at: str = "video"):
logger.info(f"start task: {task_id}, stop_at: {stop_at}")
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=5)
if type(params.video_concat_mode) is str:
params.video_concat_mode = VideoConcatMode(params.video_concat_mode)
# 1. Generate script
video_script = generate_script(task_id, params)
if not video_script or "Error: " in video_script:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=10)
if stop_at == "script":
sm.state.update_task(
task_id, state=const.TASK_STATE_COMPLETE, progress=100, script=video_script
)
return {"script": video_script}
# 2. Generate terms
video_terms = ""
if params.video_source != "local":
video_terms = generate_terms(task_id, params, video_script)
if not video_terms:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
save_script_data(task_id, video_script, video_terms, params)
if stop_at == "terms":
sm.state.update_task(
task_id, state=const.TASK_STATE_COMPLETE, progress=100, terms=video_terms
)
return {"script": video_script, "terms": video_terms}
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=20)
# 3. Generate audio
audio_file, audio_duration, sub_maker = generate_audio(
task_id, params, video_script
)
if not audio_file:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=30)
if stop_at == "audio":
sm.state.update_task(
task_id,
state=const.TASK_STATE_COMPLETE,
progress=100,
audio_file=audio_file,
)
return {"audio_file": audio_file, "audio_duration": audio_duration}
# 4. Generate subtitle
subtitle_path = generate_subtitle(
task_id, params, video_script, sub_maker, audio_file
)
if stop_at == "subtitle":
sm.state.update_task(
task_id,
state=const.TASK_STATE_COMPLETE,
progress=100,
subtitle_path=subtitle_path,
)
return {"subtitle_path": subtitle_path}
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=40)
# 5. Get video materials
downloaded_videos = get_video_materials(
task_id, params, video_terms, audio_duration
)
if not downloaded_videos:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
if stop_at == "materials":
sm.state.update_task(
task_id,
state=const.TASK_STATE_COMPLETE,
progress=100,
materials=downloaded_videos,
)
return {"materials": downloaded_videos}
sm.state.update_task(task_id, state=const.TASK_STATE_PROCESSING, progress=50)
# 6. Generate final videos
final_video_paths, combined_video_paths = generate_final_videos(
task_id, params, downloaded_videos, audio_file, subtitle_path
)
if not final_video_paths:
sm.state.update_task(task_id, state=const.TASK_STATE_FAILED)
return
logger.success(
f"task {task_id} finished, generated {len(final_video_paths)} videos."
)
kwargs = {
"videos": final_video_paths,
"combined_videos": combined_video_paths
"combined_videos": combined_video_paths,
"script": video_script,
"terms": video_terms,
"audio_file": audio_file,
"audio_duration": audio_duration,
"subtitle_path": subtitle_path,
"materials": downloaded_videos,
}
sm.state.update_task(task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs)
sm.state.update_task(
task_id, state=const.TASK_STATE_COMPLETE, progress=100, **kwargs
)
return kwargs
if __name__ == "__main__":
task_id = "task_id"
params = VideoParams(
video_subject="金钱的作用",
voice_name="zh-CN-XiaoyiNeural-Female",
voice_rate=1.0,
)
start(task_id, params, stop_at="video")
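
The refactored start() can be stopped at any intermediate stage via stop_at ("script", "terms", "audio", "subtitle", "materials", or the default "video"); each early exit marks the task complete and returns only that stage's artifacts. A hedged sketch of a script-only run, reusing the VideoParams construction from the __main__ block above:

params = VideoParams(
    video_subject="金钱的作用",
    voice_name="zh-CN-XiaoyiNeural-Female",
    voice_rate=1.0,
)
result = start("demo-task", params, stop_at="script")
if result:
    print(result["script"])  # only the generated script; no audio or video is produced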

View File

@@ -0,0 +1,21 @@
from moviepy import Clip, vfx
# FadeIn
def fadein_transition(clip: Clip, t: float) -> Clip:
return clip.with_effects([vfx.FadeIn(t)])
# FadeOut
def fadeout_transition(clip: Clip, t: float) -> Clip:
return clip.with_effects([vfx.FadeOut(t)])
# SlideIn
def slidein_transition(clip: Clip, t: float, side: str) -> Clip:
return clip.with_effects([vfx.SlideIn(t, side)])
# SlideOut
def slideout_transition(clip: Clip, t: float, side: str) -> Clip:
return clip.with_effects([vfx.SlideOut(t, side)])
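
A hedged usage sketch of the helpers above with the MoviePy 2.x API they wrap ("sample.mp4" is a hypothetical input file):

from moviepy import VideoFileClip

clip = VideoFileClip("sample.mp4")
clip = fadein_transition(clip, 1)
clip = slideout_transition(clip, 1, side="left")
clip.write_videofile("sample-with-transitions.mp4", fps=30, logger=None)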

View File

@@ -1,14 +1,102 @@
import glob
import itertools
import os
import random
import gc
import shutil
from typing import List
from PIL import ImageFont
from loguru import logger
from moviepy.editor import *
from moviepy import (
AudioFileClip,
ColorClip,
CompositeAudioClip,
CompositeVideoClip,
ImageClip,
TextClip,
VideoFileClip,
afx,
concatenate_videoclips,
)
from moviepy.video.tools.subtitles import SubtitlesClip
from PIL import ImageFont
from app.models.schema import VideoAspect, VideoParams, VideoConcatMode
from app.models import const
from app.models.schema import (
MaterialInfo,
VideoAspect,
VideoConcatMode,
VideoParams,
VideoTransitionMode,
)
from app.services.utils import video_effects
from app.utils import utils
class SubClippedVideoClip:
def __init__(self, file_path, start_time=None, end_time=None, width=None, height=None, duration=None):
self.file_path = file_path
self.start_time = start_time
self.end_time = end_time
self.width = width
self.height = height
if duration is None:
self.duration = end_time - start_time
else:
self.duration = duration
def __str__(self):
return f"SubClippedVideoClip(file_path={self.file_path}, start_time={self.start_time}, end_time={self.end_time}, duration={self.duration}, width={self.width}, height={self.height})"
audio_codec = "aac"
video_codec = "libx264"
fps = 30
def close_clip(clip):
if clip is None:
return
try:
# close main resources
if hasattr(clip, 'reader') and clip.reader is not None:
clip.reader.close()
# close audio resources
if hasattr(clip, 'audio') and clip.audio is not None:
if hasattr(clip.audio, 'reader') and clip.audio.reader is not None:
clip.audio.reader.close()
del clip.audio
# close mask resources
if hasattr(clip, 'mask') and clip.mask is not None:
if hasattr(clip.mask, 'reader') and clip.mask.reader is not None:
clip.mask.reader.close()
del clip.mask
# handle child clips in composite clips
if hasattr(clip, 'clips') and clip.clips:
for child_clip in clip.clips:
if child_clip is not clip: # avoid possible circular references
close_clip(child_clip)
# clear clip list
if hasattr(clip, 'clips'):
clip.clips = []
except Exception as e:
logger.error(f"failed to close clip: {str(e)}")
del clip
gc.collect()
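
close_clip releases the reader, audio, and mask resources that MoviePy otherwise keeps open; the hunks below call it after every write. A minimal hedged sketch of the intended pattern ("input.mp4" is a hypothetical local file):

clip = VideoFileClip("input.mp4").subclipped(0, 5)
clip.write_videofile("clip-0-5.mp4", fps=fps, codec=video_codec, logger=None)
close_clip(clip)  # free the readers promptly instead of waiting for GC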
def delete_files(files: List[str] | str):
if isinstance(files, str):
files = [files]
for file in files:
try:
os.remove(file)
except:
pass
def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
if not bgm_type:
@@ -26,98 +114,200 @@ def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
return ""
def combine_videos(combined_video_path: str,
video_paths: List[str],
audio_file: str,
video_aspect: VideoAspect = VideoAspect.portrait,
video_concat_mode: VideoConcatMode = VideoConcatMode.random,
max_clip_duration: int = 5,
threads: int = 2,
) -> str:
def combine_videos(
combined_video_path: str,
video_paths: List[str],
audio_file: str,
video_aspect: VideoAspect = VideoAspect.portrait,
video_concat_mode: VideoConcatMode = VideoConcatMode.random,
video_transition_mode: VideoTransitionMode = None,
max_clip_duration: int = 5,
threads: int = 2,
) -> str:
audio_clip = AudioFileClip(audio_file)
audio_duration = audio_clip.duration
logger.info(f"max duration of audio: {audio_duration} seconds")
logger.info(f"audio duration: {audio_duration} seconds")
# Required duration of each clip
req_dur = audio_duration / len(video_paths)
req_dur = max_clip_duration
logger.info(f"each clip will be maximum {req_dur} seconds long")
logger.info(f"maximum clip duration: {req_dur} seconds")
output_dir = os.path.dirname(combined_video_path)
aspect = VideoAspect(video_aspect)
video_width, video_height = aspect.to_resolution()
clips = []
processed_clips = []
subclipped_items = []
video_duration = 0
for video_path in video_paths:
clip = VideoFileClip(video_path)
clip_duration = clip.duration
clip_w, clip_h = clip.size
close_clip(clip)
start_time = 0
while start_time < clip_duration:
end_time = min(start_time + max_clip_duration, clip_duration)
if clip_duration - start_time >= max_clip_duration:
subclipped_items.append(SubClippedVideoClip(file_path=video_path, start_time=start_time, end_time=end_time, width=clip_w, height=clip_h))
start_time = end_time
if video_concat_mode.value == VideoConcatMode.sequential.value:
break
# shuffle subclipped_items into random order
if video_concat_mode.value == VideoConcatMode.random.value:
random.shuffle(subclipped_items)
logger.debug(f"total subclipped items: {len(subclipped_items)}")
# Add downloaded clips over and over until the duration of the audio (max_duration) has been reached
while video_duration < audio_duration:
# random video_paths order
if video_concat_mode.value == VideoConcatMode.random.value:
random.shuffle(video_paths)
for i, subclipped_item in enumerate(subclipped_items):
if video_duration > audio_duration:
break
for video_path in video_paths:
clip = VideoFileClip(video_path).without_audio()
# Check if clip is longer than the remaining audio
if (audio_duration - video_duration) < clip.duration:
clip = clip.subclip(0, (audio_duration - video_duration))
# Only shorten clips if the calculated clip length (req_dur) is shorter than the actual clip to prevent still image
elif req_dur < clip.duration:
clip = clip.subclip(0, req_dur)
clip = clip.set_fps(30)
logger.debug(f"processing clip {i+1}: {subclipped_item.width}x{subclipped_item.height}, current duration: {video_duration:.2f}s, remaining: {audio_duration - video_duration:.2f}s")
try:
clip = VideoFileClip(subclipped_item.file_path).subclipped(subclipped_item.start_time, subclipped_item.end_time)
clip_duration = clip.duration
# Not all videos are same size, so we need to resize them
clip_w, clip_h = clip.size
if clip_w != video_width or clip_h != video_height:
clip_ratio = clip.w / clip.h
video_ratio = video_width / video_height
logger.debug(f"resizing clip, source: {clip_w}x{clip_h}, ratio: {clip_ratio:.2f}, target: {video_width}x{video_height}, ratio: {video_ratio:.2f}")
if clip_ratio == video_ratio:
# Scale proportionally (aspect ratios match)
clip = clip.resize((video_width, video_height))
clip = clip.resized(new_size=(video_width, video_height))
else:
# Scale the video proportionally
if clip_ratio > video_ratio:
# Scale proportionally to the target width
scale_factor = video_width / clip_w
else:
# Scale proportionally to the target height
scale_factor = video_height / clip_h
new_width = int(clip_w * scale_factor)
new_height = int(clip_h * scale_factor)
clip_resized = clip.resize(newsize=(new_width, new_height))
background = ColorClip(size=(video_width, video_height), color=(0, 0, 0))
clip = CompositeVideoClip([
background.set_duration(clip.duration),
clip_resized.set_position("center")
])
background = ColorClip(size=(video_width, video_height), color=(0, 0, 0)).with_duration(clip_duration)
clip_resized = clip.resized(new_size=(new_width, new_height)).with_position("center")
clip = CompositeVideoClip([background, clip_resized])
logger.info(f"resizing video to {video_width} x {video_height}, clip size: {clip_w} x {clip_h}")
shuffle_side = random.choice(["left", "right", "top", "bottom"])
if video_transition_mode.value == VideoTransitionMode.none.value:
clip = clip
elif video_transition_mode.value == VideoTransitionMode.fade_in.value:
clip = video_effects.fadein_transition(clip, 1)
elif video_transition_mode.value == VideoTransitionMode.fade_out.value:
clip = video_effects.fadeout_transition(clip, 1)
elif video_transition_mode.value == VideoTransitionMode.slide_in.value:
clip = video_effects.slidein_transition(clip, 1, shuffle_side)
elif video_transition_mode.value == VideoTransitionMode.slide_out.value:
clip = video_effects.slideout_transition(clip, 1, shuffle_side)
elif video_transition_mode.value == VideoTransitionMode.shuffle.value:
transition_funcs = [
lambda c: video_effects.fadein_transition(c, 1),
lambda c: video_effects.fadeout_transition(c, 1),
lambda c: video_effects.slidein_transition(c, 1, shuffle_side),
lambda c: video_effects.slideout_transition(c, 1, shuffle_side),
]
shuffle_transition = random.choice(transition_funcs)
clip = shuffle_transition(clip)
if clip.duration > max_clip_duration:
clip = clip.subclip(0, max_clip_duration)
clip = clip.subclipped(0, max_clip_duration)
clips.append(clip)
# write the clip to a temp file
clip_file = f"{output_dir}/temp-clip-{i+1}.mp4"
clip.write_videofile(clip_file, logger=None, fps=fps, codec=video_codec)
close_clip(clip)
processed_clips.append(SubClippedVideoClip(file_path=clip_file, duration=clip.duration, width=clip_w, height=clip_h))
video_duration += clip.duration
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info(f"writing")
# https://github.com/harry0703/MoneyPrinterTurbo/issues/111#issuecomment-2032354030
video_clip.write_videofile(filename=combined_video_path,
threads=threads,
logger=None,
temp_audiofile_path=output_dir,
audio_codec="aac",
fps=30,
)
video_clip.close()
logger.success(f"completed")
except Exception as e:
logger.error(f"failed to process clip: {str(e)}")
# loop processed clips until the video duration matches or exceeds the audio duration.
if video_duration < audio_duration:
logger.warning(f"video duration ({video_duration:.2f}s) is shorter than audio duration ({audio_duration:.2f}s), looping clips to match audio length.")
base_clips = processed_clips.copy()
for clip in itertools.cycle(base_clips):
if video_duration >= audio_duration:
break
processed_clips.append(clip)
video_duration += clip.duration
logger.info(f"video duration: {video_duration:.2f}s, audio duration: {audio_duration:.2f}s, looped {len(processed_clips)-len(base_clips)} clips")
# merge video clips progressively, avoid loading all videos at once to avoid memory overflow
logger.info("starting clip merging process")
if not processed_clips:
logger.warning("no clips available for merging")
return combined_video_path
# if there is only one clip, use it directly
if len(processed_clips) == 1:
logger.info("using single clip directly")
shutil.copy(processed_clips[0].file_path, combined_video_path)
delete_files(processed_clips)
logger.info("video combining completed")
return combined_video_path
# create initial video file as base
base_clip_path = processed_clips[0].file_path
temp_merged_video = f"{output_dir}/temp-merged-video.mp4"
temp_merged_next = f"{output_dir}/temp-merged-next.mp4"
# copy first clip as initial merged video
shutil.copy(base_clip_path, temp_merged_video)
# merge remaining video clips one by one
for i, clip in enumerate(processed_clips[1:], 1):
logger.info(f"merging clip {i}/{len(processed_clips)-1}, duration: {clip.duration:.2f}s")
try:
# load current base video and next clip to merge
base_clip = VideoFileClip(temp_merged_video)
next_clip = VideoFileClip(clip.file_path)
# merge these two clips
merged_clip = concatenate_videoclips([base_clip, next_clip])
# save merged result to temp file
merged_clip.write_videofile(
filename=temp_merged_next,
threads=threads,
logger=None,
temp_audiofile_path=output_dir,
audio_codec=audio_codec,
fps=fps,
)
close_clip(base_clip)
close_clip(next_clip)
close_clip(merged_clip)
# replace base file with new merged file
delete_files(temp_merged_video)
os.rename(temp_merged_next, temp_merged_video)
except Exception as e:
logger.error(f"failed to merge clip: {str(e)}")
continue
# after merging, rename final result to target file name
os.rename(temp_merged_video, combined_video_path)
# clean temp files
clip_files = [clip.file_path for clip in processed_clips]
delete_files(clip_files)
logger.info("video combining completed")
return combined_video_path
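
A hedged call sketch for the new combine_videos signature, using hypothetical local paths; video_transition_mode is the parameter added in this change:

combine_videos(
    combined_video_path="/tmp/combined.mp4",
    video_paths=["/tmp/a.mp4", "/tmp/b.mp4"],
    audio_file="/tmp/narration.mp3",
    video_aspect=VideoAspect.portrait,
    video_concat_mode=VideoConcatMode.random,
    video_transition_mode=VideoTransitionMode.fade_in,
    max_clip_duration=5,
    threads=2,
)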
def wrap_text(text, max_width, font='Arial', fontsize=60):
# 创建字体对象
def wrap_text(text, max_width, font="Arial", fontsize=60):
# Create ImageFont
font = ImageFont.truetype(font, fontsize)
def get_text_size(inner_text):
@@ -129,13 +319,11 @@ def wrap_text(text, max_width, font='Arial', fontsize=60):
if width <= max_width:
return text, height
# logger.warning(f"wrapping text, max_width: {max_width}, text_width: {width}, text: {text}")
processed = True
_wrapped_lines_ = []
words = text.split(" ")
_txt_ = ''
_txt_ = ""
for word in words:
_before = _txt_
_txt_ += f"{word} "
@@ -151,14 +339,13 @@ def wrap_text(text, max_width, font='Arial', fontsize=60):
_wrapped_lines_.append(_txt_)
if processed:
_wrapped_lines_ = [line.strip() for line in _wrapped_lines_]
result = '\n'.join(_wrapped_lines_).strip()
result = "\n".join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
# logger.warning(f"wrapped text: {result}")
return result, height
_wrapped_lines_ = []
chars = list(text)
_txt_ = ''
_txt_ = ""
for word in chars:
_txt_ += word
_width, _height = get_text_size(_txt_)
@@ -166,24 +353,24 @@ def wrap_text(text, max_width, font='Arial', fontsize=60):
continue
else:
_wrapped_lines_.append(_txt_)
_txt_ = ''
_txt_ = ""
_wrapped_lines_.append(_txt_)
result = '\n'.join(_wrapped_lines_).strip()
result = "\n".join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
# logger.warning(f"wrapped text: {result}")
return result, height
def generate_video(video_path: str,
audio_path: str,
subtitle_path: str,
output_file: str,
params: VideoParams,
):
def generate_video(
video_path: str,
audio_path: str,
subtitle_path: str,
output_file: str,
params: VideoParams,
):
aspect = VideoAspect(params.video_aspect)
video_width, video_height = aspect.to_resolution()
logger.info(f"start, video size: {video_width} x {video_height}")
logger.info(f"generating video: {video_width} x {video_height}")
logger.info(f" ① video: {video_path}")
logger.info(f" ② audio: {audio_path}")
logger.info(f" ③ subtitle: {subtitle_path}")
@@ -199,46 +386,71 @@ def generate_video(video_path: str,
if not params.font_name:
params.font_name = "STHeitiMedium.ttc"
font_path = os.path.join(utils.font_dir(), params.font_name)
if os.name == 'nt':
if os.name == "nt":
font_path = font_path.replace("\\", "/")
logger.info(f"using font: {font_path}")
logger.info(f" font: {font_path}")
def create_text_clip(subtitle_item):
params.font_size = int(params.font_size)
params.stroke_width = int(params.stroke_width)
phrase = subtitle_item[1]
max_width = video_width * 0.9
wrapped_txt, txt_height = wrap_text(phrase,
max_width=max_width,
font=font_path,
fontsize=params.font_size
)
wrapped_txt, txt_height = wrap_text(
phrase, max_width=max_width, font=font_path, fontsize=params.font_size
)
interline = int(params.font_size * 0.25)
size=(int(max_width), int(txt_height + params.font_size * 0.25 + (interline * (wrapped_txt.count("\n") + 1))))
_clip = TextClip(
wrapped_txt,
text=wrapped_txt,
font=font_path,
fontsize=params.font_size,
font_size=params.font_size,
color=params.text_fore_color,
bg_color=params.text_background_color,
stroke_color=params.stroke_color,
stroke_width=params.stroke_width,
print_cmd=False,
# interline=interline,
# size=size,
)
duration = subtitle_item[0][1] - subtitle_item[0][0]
_clip = _clip.set_start(subtitle_item[0][0])
_clip = _clip.set_end(subtitle_item[0][1])
_clip = _clip.set_duration(duration)
_clip = _clip.with_start(subtitle_item[0][0])
_clip = _clip.with_end(subtitle_item[0][1])
_clip = _clip.with_duration(duration)
if params.subtitle_position == "bottom":
_clip = _clip.set_position(('center', video_height * 0.95 - _clip.h))
_clip = _clip.with_position(("center", video_height * 0.95 - _clip.h))
elif params.subtitle_position == "top":
_clip = _clip.set_position(('center', video_height * 0.1))
else:
_clip = _clip.set_position(('center', 'center'))
_clip = _clip.with_position(("center", video_height * 0.05))
elif params.subtitle_position == "custom":
# Ensure the subtitle is fully within the screen bounds
margin = 10 # Additional margin, in pixels
max_y = video_height - _clip.h - margin
min_y = margin
custom_y = (video_height - _clip.h) * (params.custom_position / 100)
custom_y = max(
min_y, min(custom_y, max_y)
) # Constrain the y value within the valid range
_clip = _clip.with_position(("center", custom_y))
else: # center
_clip = _clip.with_position(("center", "center"))
return _clip
video_clip = VideoFileClip(video_path)
audio_clip = AudioFileClip(audio_path).volumex(params.voice_volume)
video_clip = VideoFileClip(video_path).without_audio()
audio_clip = AudioFileClip(audio_path).with_effects(
[afx.MultiplyVolume(params.voice_volume)]
)
def make_textclip(text):
return TextClip(
text=text,
font=font_path,
font_size=params.font_size,
)
if subtitle_path and os.path.exists(subtitle_path):
sub = SubtitlesClip(subtitles=subtitle_path, encoding='utf-8')
sub = SubtitlesClip(
subtitles=subtitle_path, encoding="utf-8", make_textclip=make_textclip
)
text_clips = []
for item in sub.subtitles:
clip = create_text_clip(subtitle_item=item)
@@ -248,75 +460,72 @@ def generate_video(video_path: str,
bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_file:
try:
bgm_clip = (AudioFileClip(bgm_file)
.volumex(params.bgm_volume)
.audio_fadeout(3))
bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration)
bgm_clip = AudioFileClip(bgm_file).with_effects(
[
afx.MultiplyVolume(params.bgm_volume),
afx.AudioFadeOut(3),
afx.AudioLoop(duration=video_clip.duration),
]
)
audio_clip = CompositeAudioClip([audio_clip, bgm_clip])
except Exception as e:
logger.error(f"failed to add bgm: {str(e)}")
video_clip = video_clip.set_audio(audio_clip)
video_clip.write_videofile(output_file,
audio_codec="aac",
temp_audiofile_path=output_dir,
threads=params.n_threads or 2,
logger=None,
fps=30,
)
video_clip = video_clip.with_audio(audio_clip)
video_clip.write_videofile(
output_file,
audio_codec=audio_codec,
temp_audiofile_path=output_dir,
threads=params.n_threads or 2,
logger=None,
fps=fps,
)
video_clip.close()
logger.success(f"completed")
del video_clip
if __name__ == "__main__":
txt_en = "Here's your guide to travel hacks for budget-friendly adventures"
txt_zh = "测试长字段这是您的旅行技巧指南帮助您进行预算友好的冒险"
font = utils.resource_dir() + "/fonts/STHeitiMedium.ttc"
for txt in [txt_en, txt_zh]:
t, h = wrap_text(text=txt, max_width=1000, font=font, fontsize=60)
print(t)
def preprocess_video(materials: List[MaterialInfo], clip_duration=4):
for material in materials:
if not material.url:
continue
task_id = "aa563149-a7ea-49c2-b39f-8c32cc225baf"
task_dir = utils.task_dir(task_id)
video_file = f"{task_dir}/combined-1.mp4"
audio_file = f"{task_dir}/audio.mp3"
subtitle_file = f"{task_dir}/subtitle.srt"
output_file = f"{task_dir}/final.mp4"
ext = utils.parse_extension(material.url)
try:
clip = VideoFileClip(material.url)
except Exception:
clip = ImageClip(material.url)
# video_paths = []
# for file in os.listdir(utils.storage_dir("test")):
# if file.endswith(".mp4"):
# video_paths.append(os.path.join(utils.storage_dir("test"), file))
#
# combine_videos(combined_video_path=video_file,
# audio_file=audio_file,
# video_paths=video_paths,
# video_aspect=VideoAspect.portrait,
# video_concat_mode=VideoConcatMode.random,
# max_clip_duration=5,
# threads=2)
width = clip.size[0]
height = clip.size[1]
if width < 480 or height < 480:
logger.warning(f"low resolution material: {width}x{height}, minimum 480x480 required")
continue
cfg = VideoParams()
cfg.video_aspect = VideoAspect.portrait
cfg.font_name = "STHeitiMedium.ttc"
cfg.font_size = 60
cfg.stroke_color = "#000000"
cfg.stroke_width = 1.5
cfg.text_fore_color = "#FFFFFF"
cfg.text_background_color = "transparent"
cfg.bgm_type = "random"
cfg.bgm_file = ""
cfg.bgm_volume = 1.0
cfg.subtitle_enabled = True
cfg.subtitle_position = "bottom"
cfg.n_threads = 2
cfg.paragraph_number = 1
if ext in const.FILE_TYPE_IMAGES:
logger.info(f"processing image: {material.url}")
# Create an image clip and set its duration to clip_duration seconds
clip = (
ImageClip(material.url)
.with_duration(clip_duration)
.with_position("center")
)
# Apply a zoom effect using the resized method.
# A lambda function makes the zoom dynamic over time: t is the current time and
# clip.duration is the clip's total length, so the scale grows linearly from
# 1.0 (100%) at t=0 to 1 + clip_duration * 0.03 at the end
# (1.12, i.e. 112%, for the default clip_duration of 4 seconds).
zoom_clip = clip.resized(
lambda t: 1 + (clip_duration * 0.03) * (t / clip.duration)
)
cfg.voice_volume = 1.0
# Optionally, create a composite video clip containing the zoomed clip.
# This is useful when you want to add other elements to the video.
final_clip = CompositeVideoClip([zoom_clip])
generate_video(video_path=video_file,
audio_path=audio_file,
subtitle_path=subtitle_file,
output_file=output_file,
params=cfg
)
# Output the video to a file.
video_file = f"{material.url}.mp4"
final_clip.write_videofile(video_file, fps=30, logger=None)
close_clip(clip)
material.url = video_file
logger.success(f"image processed: {video_file}")
return materials
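
A hedged usage sketch of preprocess_video, assuming MaterialInfo can be constructed with defaults and its url field assigned afterwards (the function itself mutates material.url the same way); the image path is hypothetical and must be at least 480x480:

from app.models.schema import MaterialInfo

m = MaterialInfo()
m.url = "/path/to/photo.jpg"
materials = preprocess_video([m], clip_duration=4)
print(materials[0].url)  # "/path/to/photo.jpg.mp4", a 4-second zooming clip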

View File

@@ -2,21 +2,48 @@ import asyncio
import os
import re
from datetime import datetime
from typing import Union
from xml.sax.saxutils import unescape
import edge_tts
import requests
from edge_tts import SubMaker, submaker
from edge_tts.submaker import mktimestamp
from loguru import logger
from edge_tts import submaker, SubMaker
import edge_tts
from moviepy.video.tools import subtitles
from app.config import config
from app.utils import utils
def get_siliconflow_voices() -> list[str]:
"""
Get the list of SiliconFlow voices.
Returns:
    A list of voices, formatted as ["siliconflow:FunAudioLLM/CosyVoice2-0.5B:alex", ...]
"""
# SiliconFlow voice list with the corresponding gender (used for display)
voices_with_gender = [
("FunAudioLLM/CosyVoice2-0.5B", "alex", "Male"),
("FunAudioLLM/CosyVoice2-0.5B", "anna", "Female"),
("FunAudioLLM/CosyVoice2-0.5B", "bella", "Female"),
("FunAudioLLM/CosyVoice2-0.5B", "benjamin", "Male"),
("FunAudioLLM/CosyVoice2-0.5B", "charles", "Male"),
("FunAudioLLM/CosyVoice2-0.5B", "claire", "Female"),
("FunAudioLLM/CosyVoice2-0.5B", "david", "Male"),
("FunAudioLLM/CosyVoice2-0.5B", "diana", "Female"),
]
# Add the "siliconflow:" prefix and format as display names
return [
f"siliconflow:{model}:{voice}-{gender}"
for model, voice, gender in voices_with_gender
]
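
The display names produced above are later split apart by tts(); a small sketch of the round trip:

name = get_siliconflow_voices()[0]
print(name)  # "siliconflow:FunAudioLLM/CosyVoice2-0.5B:alex-Male"
_, model, voice_with_gender = name.split(":")
print(model)                            # "FunAudioLLM/CosyVoice2-0.5B"
print(voice_with_gender.split("-")[0])  # "alex"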
def get_all_azure_voices(filter_locals=None) -> list[str]:
if filter_locals is None:
filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW"]
voices_str = """
azure_voices_str = """
Name: af-ZA-AdriNeural
Gender: Female
@@ -302,21 +329,33 @@ Gender: Female
Name: en-US-AnaNeural
Gender: Female
Name: en-US-AndrewMultilingualNeural
Gender: Male
Name: en-US-AndrewNeural
Gender: Male
Name: en-US-AriaNeural
Gender: Female
Name: en-US-AvaMultilingualNeural
Gender: Female
Name: en-US-AvaNeural
Gender: Female
Name: en-US-BrianMultilingualNeural
Gender: Male
Name: en-US-BrianNeural
Gender: Male
Name: en-US-ChristopherNeural
Gender: Male
Name: en-US-EmmaMultilingualNeural
Gender: Female
Name: en-US-EmmaNeural
Gender: Female
@@ -602,12 +641,24 @@ Gender: Male
Name: it-IT-ElsaNeural
Gender: Female
Name: it-IT-GiuseppeNeural
Name: it-IT-GiuseppeMultilingualNeural
Gender: Male
Name: it-IT-IsabellaNeural
Gender: Female
Name: iu-Cans-CA-SiqiniqNeural
Gender: Female
Name: iu-Cans-CA-TaqqiqNeural
Gender: Male
Name: iu-Latn-CA-SiqiniqNeural
Gender: Female
Name: iu-Latn-CA-TaqqiqNeural
Gender: Male
Name: ja-JP-KeitaNeural
Gender: Male
@@ -644,7 +695,7 @@ Gender: Male
Name: kn-IN-SapnaNeural
Gender: Female
Name: ko-KR-HyunsuNeural
Name: ko-KR-HyunsuMultilingualNeural
Gender: Male
Name: ko-KR-InJoonNeural
@@ -758,7 +809,7 @@ Gender: Male
Name: pt-BR-FranciscaNeural
Gender: Female
Name: pt-BR-ThalitaNeural
Name: pt-BR-ThalitaMultilingualNeural
Gender: Female
Name: pt-PT-DuarteNeural
@@ -988,27 +1039,20 @@ Name: zh-CN-XiaoxiaoMultilingualNeural-V2
Gender: Female
""".strip()
voices = []
name = ''
for line in voices_str.split("\n"):
line = line.strip()
if not line:
continue
if line.startswith("Name: "):
name = line[6:].strip()
if line.startswith("Gender: "):
gender = line[8:].strip()
if name and gender:
# voices.append({
# "name": name,
# "gender": gender,
# })
if filter_locals:
for filter_local in filter_locals:
if name.lower().startswith(filter_local.lower()):
voices.append(f"{name}-{gender}")
else:
voices.append(f"{name}-{gender}")
name = ''
# Regex pattern that matches the Name and Gender lines
pattern = re.compile(r"Name:\s*(.+)\s*Gender:\s*(.+)\s*", re.MULTILINE)
# Find all matches with the regex
matches = pattern.findall(azure_voices_str)
for name, gender in matches:
# Apply the locale filter
if filter_locals and any(
name.lower().startswith(fl.lower()) for fl in filter_locals
):
voices.append(f"{name}-{gender}")
elif not filter_locals:
voices.append(f"{name}-{gender}")
voices.sort()
return voices
@@ -1023,38 +1067,81 @@ def parse_voice_name(name: str):
def is_azure_v2_voice(voice_name: str):
voice_name = parse_voice_name(voice_name)
print(voice_name)
if voice_name.endswith("-V2"):
return voice_name.replace("-V2", "").strip()
return ""
def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
def is_siliconflow_voice(voice_name: str):
"""Check whether the voice name is a SiliconFlow voice."""
return voice_name.startswith("siliconflow:")
def tts(
text: str,
voice_name: str,
voice_rate: float,
voice_file: str,
voice_volume: float = 1.0,
) -> Union[SubMaker, None]:
if is_azure_v2_voice(voice_name):
return azure_tts_v2(text, voice_name, voice_file)
return azure_tts_v1(text, voice_name, voice_file)
elif is_siliconflow_voice(voice_name):
# Extract the model and voice from voice_name
# Format: siliconflow:model:voice-Gender
parts = voice_name.split(":")
if len(parts) >= 3:
model = parts[1]
# Remove the gender suffix, e.g. "alex-Male" -> "alex"
voice_with_gender = parts[2]
voice = voice_with_gender.split("-")[0]
# Build the full voice parameter in "model:voice" format
full_voice = f"{model}:{voice}"
return siliconflow_tts(
text, model, full_voice, voice_rate, voice_file, voice_volume
)
else:
logger.error(f"Invalid siliconflow voice name format: {voice_name}")
return None
return azure_tts_v1(text, voice_name, voice_rate, voice_file)
def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
def convert_rate_to_percent(rate: float) -> str:
if rate == 1.0:
return "+0%"
percent = round((rate - 1.0) * 100)
if percent > 0:
return f"+{percent}%"
else:
return f"{percent}%"
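
A few worked values of the rate-to-percent mapping that edge-tts expects:

print(convert_rate_to_percent(1.0))   # "+0%"
print(convert_rate_to_percent(1.25))  # "+25%"
print(convert_rate_to_percent(0.8))   # "-20%"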
def azure_tts_v1(
text: str, voice_name: str, voice_rate: float, voice_file: str
) -> Union[SubMaker, None]:
voice_name = parse_voice_name(voice_name)
text = text.strip()
rate_str = convert_rate_to_percent(voice_rate)
for i in range(3):
try:
logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
async def _do() -> SubMaker:
communicate = edge_tts.Communicate(text, voice_name)
communicate = edge_tts.Communicate(text, voice_name, rate=rate_str)
sub_maker = edge_tts.SubMaker()
with open(voice_file, "wb") as file:
async for chunk in communicate.stream():
if chunk["type"] == "audio":
file.write(chunk["data"])
elif chunk["type"] == "WordBoundary":
sub_maker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
sub_maker.create_sub(
(chunk["offset"], chunk["duration"]), chunk["text"]
)
return sub_maker
sub_maker = asyncio.run(_do())
if not sub_maker or not sub_maker.subs:
logger.warning(f"failed, sub_maker is None or sub_maker.subs is None")
logger.warning("failed, sub_maker is None or sub_maker.subs is None")
continue
logger.info(f"completed, output file: {voice_file}")
@@ -1064,7 +1151,145 @@ def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
return None
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
def siliconflow_tts(
text: str,
model: str,
voice: str,
voice_rate: float,
voice_file: str,
voice_volume: float = 1.0,
) -> Union[SubMaker, None]:
"""
Generate speech with the SiliconFlow API.
Args:
text: the text to convert to speech
model: the model name, e.g. "FunAudioLLM/CosyVoice2-0.5B"
voice: the voice name, e.g. "FunAudioLLM/CosyVoice2-0.5B:alex"
voice_rate: the speech speed, in the range [0.25, 4.0]
voice_file: the output audio file path
voice_volume: the speech volume, in the range [0.6, 5.0], converted to SiliconFlow's gain range [-10, 10]
Returns:
A SubMaker object, or None
"""
text = text.strip()
api_key = config.siliconflow.get("api_key", "")
if not api_key:
logger.error("SiliconFlow API key is not set")
return None
# Convert voice_volume to SiliconFlow's gain range
# The default voice_volume of 1.0 maps to a gain of 0
gain = voice_volume - 1.0
# Clamp the gain to the [-10, 10] range
gain = max(-10, min(10, gain))
url = "https://api.siliconflow.cn/v1/audio/speech"
payload = {
"model": model,
"input": text,
"voice": voice,
"response_format": "mp3",
"sample_rate": 32000,
"stream": False,
"speed": voice_rate,
"gain": gain,
}
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
for i in range(3):  # up to 3 attempts
try:
logger.info(
f"start siliconflow tts, model: {model}, voice: {voice}, try: {i + 1}"
)
response = requests.post(url, json=payload, headers=headers)
if response.status_code == 200:
# Save the audio file
with open(voice_file, "wb") as f:
f.write(response.content)
# Create an empty SubMaker object
sub_maker = SubMaker()
# Get the actual duration of the audio file
try:
# Try to get the audio duration with moviepy
from moviepy import AudioFileClip
audio_clip = AudioFileClip(voice_file)
audio_duration = audio_clip.duration
audio_clip.close()
# Convert the audio duration to 100-nanosecond units for edge_tts compatibility
audio_duration_100ns = int(audio_duration * 10000000)
# Use text splitting to create more accurate subtitles:
# split the text into sentences at punctuation marks
sentences = utils.split_string_by_punctuations(text)
if sentences:
# Estimate each sentence's duration, proportional to its character count
total_chars = sum(len(s) for s in sentences)
char_duration = (
audio_duration_100ns / total_chars if total_chars > 0 else 0
)
current_offset = 0
for sentence in sentences:
if not sentence.strip():
continue
# Compute the duration of the current sentence
sentence_chars = len(sentence)
sentence_duration = int(sentence_chars * char_duration)
# Append to the SubMaker
sub_maker.subs.append(sentence)
sub_maker.offset.append(
(current_offset, current_offset + sentence_duration)
)
# Advance the offset
current_offset += sentence_duration
else:
# If the text cannot be split, use the whole text as a single subtitle
sub_maker.subs = [text]
sub_maker.offset = [(0, audio_duration_100ns)]
except Exception as e:
logger.warning(f"Failed to create accurate subtitles: {str(e)}")
# Fall back to a simple subtitle
sub_maker.subs = [text]
# Use the actual audio duration; assume 10 seconds if it could not be determined
sub_maker.offset = [
(
0,
audio_duration_100ns
if "audio_duration_100ns" in locals()
else 10000000,
)
]
logger.success(f"siliconflow tts succeeded: {voice_file}")
print("s", sub_maker.subs, sub_maker.offset)
return sub_maker
else:
logger.error(
f"siliconflow tts failed with status code {response.status_code}: {response.text}"
)
except Exception as e:
logger.error(f"siliconflow tts failed: {str(e)}")
return None
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> Union[SubMaker, None]:
voice_name = is_azure_v2_voice(voice_name)
if not voice_name:
logger.error(f"invalid voice name: {voice_name}")
@@ -1074,8 +1299,12 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
def _format_duration_to_offset(duration) -> int:
if isinstance(duration, str):
time_obj = datetime.strptime(duration, "%H:%M:%S.%f")
milliseconds = (time_obj.hour * 3600000) + (time_obj.minute * 60000) + (time_obj.second * 1000) + (
time_obj.microsecond // 1000)
milliseconds = (
(time_obj.hour * 3600000)
+ (time_obj.minute * 60000)
+ (time_obj.second * 1000)
+ (time_obj.microsecond // 1000)
)
return milliseconds * 10000
if isinstance(duration, int):
@@ -1108,20 +1337,33 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
# Creates an instance of a speech config with specified subscription key and service region.
speech_key = config.azure.get("speech_key", "")
service_region = config.azure.get("speech_region", "")
audio_config = speechsdk.audio.AudioOutputConfig(filename=voice_file, use_default_speaker=True)
speech_config = speechsdk.SpeechConfig(subscription=speech_key,
region=service_region)
if not speech_key or not service_region:
logger.error("Azure speech key or region is not set")
return None
audio_config = speechsdk.audio.AudioOutputConfig(
filename=voice_file, use_default_speaker=True
)
speech_config = speechsdk.SpeechConfig(
subscription=speech_key, region=service_region
)
speech_config.speech_synthesis_voice_name = voice_name
# speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary,
# value='true')
speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
value='true')
speech_config.set_property(
property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
value="true",
)
speech_config.set_speech_synthesis_output_format(
speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
speech_synthesizer = speechsdk.SpeechSynthesizer(audio_config=audio_config,
speech_config=speech_config)
speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)
speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3
)
speech_synthesizer = speechsdk.SpeechSynthesizer(
audio_config=audio_config, speech_config=speech_config
)
speech_synthesizer.synthesis_word_boundary.connect(
speech_synthesizer_word_boundary_cb
)
result = speech_synthesizer.speak_text_async(text).get()
if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
@@ -1129,9 +1371,13 @@ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None
return sub_maker
elif result.reason == speechsdk.ResultReason.Canceled:
cancellation_details = result.cancellation_details
logger.error(f"azure v2 speech synthesis canceled: {cancellation_details.reason}")
logger.error(
f"azure v2 speech synthesis canceled: {cancellation_details.reason}"
)
if cancellation_details.reason == speechsdk.CancellationReason.Error:
logger.error(f"azure v2 speech synthesis error: {cancellation_details.error_details}")
logger.error(
f"azure v2 speech synthesis error: {cancellation_details.error_details}"
)
logger.info(f"completed, output file: {voice_file}")
except Exception as e:
logger.error(f"failed, error: {str(e)}")
@@ -1168,11 +1414,7 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
"""
start_t = mktimestamp(start_time).replace(".", ",")
end_t = mktimestamp(end_time).replace(".", ",")
return (
f"{idx}\n"
f"{start_t} --> {end_t}\n"
f"{sub_text}\n"
)
return f"{idx}\n{start_t} --> {end_t}\n{sub_text}\n"
start_time = -1.0
sub_items = []
@@ -1229,12 +1471,16 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
try:
sbs = subtitles.file_to_subtitles(subtitle_file, encoding="utf-8")
duration = max([tb for ((ta, tb), txt) in sbs])
logger.info(f"completed, subtitle file created: {subtitle_file}, duration: {duration}")
logger.info(
f"completed, subtitle file created: {subtitle_file}, duration: {duration}"
)
except Exception as e:
logger.error(f"failed, error: {str(e)}")
os.remove(subtitle_file)
else:
logger.warning(f"failed, sub_items len: {len(sub_items)}, script_lines len: {len(script_lines)}")
logger.warning(
f"failed, sub_items len: {len(sub_items)}, script_lines len: {len(script_lines)}"
)
except Exception as e:
logger.error(f"failed, error: {str(e)}")
@@ -1258,7 +1504,6 @@ if __name__ == "__main__":
voices = get_all_azure_voices()
print(len(voices))
async def _do():
temp_dir = utils.storage_dir("temp")
@@ -1307,12 +1552,13 @@ if __name__ == "__main__":
for voice_name in voice_names:
voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
subtitle_file = f"{temp_dir}/tts.mp3.srt"
sub_maker = azure_tts_v2(text=text, voice_name=voice_name, voice_file=voice_file)
sub_maker = azure_tts_v2(
text=text, voice_name=voice_name, voice_file=voice_file
)
create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
audio_duration = get_audio_duration(sub_maker)
print(f"voice: {voice_name}, audio duration: {audio_duration}s")
loop = asyncio.get_event_loop_policy().get_event_loop()
try:
loop.run_until_complete(_do())

View File

@@ -1,12 +1,13 @@
import json
import locale
import os
import platform
from pathlib import Path
import threading
from typing import Any
from loguru import logger
import json
from uuid import uuid4
import urllib3
from loguru import logger
from app.models import const
@@ -15,44 +16,44 @@ urllib3.disable_warnings()
def get_response(status: int, data: Any = None, message: str = ""):
obj = {
'status': status,
"status": status,
}
if data:
obj['data'] = data
obj["data"] = data
if message:
obj['message'] = message
obj["message"] = message
return obj
def to_json(obj):
try:
# 定义一个辅助函数来处理不同类型的对象
# Define a helper function to handle different types of objects
def serialize(o):
# 如果对象是可序列化类型,直接返回
# If the object is a serializable type, return it directly
if isinstance(o, (int, float, bool, str)) or o is None:
return o
# 如果对象是二进制数据转换为base64编码的字符串
# If the object is binary data, convert it to a base64-encoded string
elif isinstance(o, bytes):
return "*** binary data ***"
# 如果对象是字典,递归处理每个键值对
# If the object is a dictionary, recursively process each key-value pair
elif isinstance(o, dict):
return {k: serialize(v) for k, v in o.items()}
# 如果对象是列表或元组,递归处理每个元素
# If the object is a list or tuple, recursively process each element
elif isinstance(o, (list, tuple)):
return [serialize(item) for item in o]
# 如果对象是自定义类型尝试返回其__dict__属性
elif hasattr(o, '__dict__'):
# If the object is a custom type, attempt to return its __dict__ attribute
elif hasattr(o, "__dict__"):
return serialize(o.__dict__)
# 其他情况返回None或者可以选择抛出异常
# Return None for other cases (or choose to raise an exception)
else:
return None
# 使用serialize函数处理输入对象
# Use the serialize function to process the input object
serialized_obj = serialize(obj)
# 序列化处理后的对象为JSON字符串
# Serialize the processed object into a JSON string
return json.dumps(serialized_obj, ensure_ascii=False, indent=4)
except Exception as e:
except Exception:
return None
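# --- Editor's illustrative sketch (not part of this diff): how get_response and
# --- to_json above fit together; the Payload class below is hypothetical.
from app.utils import utils

class Payload:
    def __init__(self):
        self.task_id = "demo"
        self.raw = b"\x00\x01"  # bytes are rendered as "*** binary data ***"

resp = utils.get_response(200, data=Payload(), message="ok")
print(utils.to_json(resp))  # indented JSON string, or None if serialization fails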
@@ -67,10 +68,13 @@ def root_dir():
return os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
def storage_dir(sub_dir: str = ""):
def storage_dir(sub_dir: str = "", create: bool = False):
d = os.path.join(root_dir(), "storage")
if sub_dir:
d = os.path.join(d, sub_dir)
if create and not os.path.exists(d):
os.makedirs(d)
return d
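# --- Editor's illustrative sketch (not part of this diff): the new create flag
# --- makes storage_dir create the sub-directory under ./storage on demand.
from app.utils import utils

temp_dir = utils.storage_dir("temp", create=True)  # e.g. <project>/storage/temp, created if missing
cache_dir = utils.storage_dir("cache_videos")      # path only; nothing is created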
@@ -91,7 +95,7 @@ def task_dir(sub_dir: str = ""):
def font_dir(sub_dir: str = ""):
d = resource_dir(f"fonts")
d = resource_dir("fonts")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
@@ -100,7 +104,7 @@ def font_dir(sub_dir: str = ""):
def song_dir(sub_dir: str = ""):
d = resource_dir(f"songs")
d = resource_dir("songs")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
@@ -109,7 +113,7 @@ def song_dir(sub_dir: str = ""):
def public_dir(sub_dir: str = ""):
d = resource_dir(f"public")
d = resource_dir("public")
if sub_dir:
d = os.path.join(d, sub_dir)
if not os.path.exists(d):
@@ -179,7 +183,7 @@ def split_string_by_punctuations(s):
next_char = s[i + 1]
if char == "." and previous_char.isdigit() and next_char.isdigit():
# 取现1万按2.5%收取手续费, 2.5 中的 . 不能作为换行标记
# In the case of "withdraw 10,000, charged at 2.5% fee", the dot in "2.5" should not be treated as a line break marker
txt += char
continue
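# --- Editor's illustrative sketch (not part of this diff): a standalone mini-version
# --- of the decimal-point guard above, not the full split_string_by_punctuations.
# --- A "." flanked by digits (e.g. the "2.5" in a 2.5% fee) stays inside the
# --- current segment instead of starting a new one.
def split_keeping_decimals(s: str):
    segments, current = [], ""
    for i, ch in enumerate(s):
        prev_ch = s[i - 1] if i > 0 else ""
        next_ch = s[i + 1] if i < len(s) - 1 else ""
        if ch == "." and prev_ch.isdigit() and next_ch.isdigit():
            current += ch  # decimal point between digits: keep it
            continue
        if ch == ".":
            if current:
                segments.append(current)
            current = ""
        else:
            current += ch
    if current:
        segments.append(current)
    return segments

print(split_keeping_decimals("fee is 2.5% today. thanks."))  # ['fee is 2.5% today', ' thanks']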
@@ -196,7 +200,8 @@ def split_string_by_punctuations(s):
def md5(text):
import hashlib
return hashlib.md5(text.encode('utf-8')).hexdigest()
return hashlib.md5(text.encode("utf-8")).hexdigest()
def get_system_locale():
@@ -206,7 +211,7 @@ def get_system_locale():
# en_US, en_GB return en
language_code = loc[0].split("_")[0]
return language_code
except Exception as e:
except Exception:
return "en"
@@ -219,3 +224,7 @@ def load_locales(i18n_dir):
with open(os.path.join(root, file), "r", encoding="utf-8") as f:
_locales[lang] = json.loads(f.read())
return _locales
def parse_extension(filename):
return Path(filename).suffix.lower().lstrip('.')
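# --- Editor's illustrative sketch (not part of this diff): parse_extension returns
# --- the lower-cased suffix without the leading dot, or "" when there is none.
from app.utils import utils

print(utils.parse_extension("video.MP4"))            # "mp4"
print(utils.parse_extension("/tmp/archive.tar.gz"))  # "gz" (only the last suffix)
print(utils.parse_extension("README"))               # ""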

View File

@@ -1,170 +1,214 @@
[app]
# Pexels API Key
# Register at https://www.pexels.com/api/ to get your API key.
# You can use multiple keys to avoid rate limits.
# For example: pexels_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
# Note the format: each key must be wrapped in double quotes, and multiple keys separated by commas.
pexels_api_keys = []
video_source = "pexels" # "pexels" or "pixabay"
# If you don't have an OpenAI API Key, you can use g4f instead, or a provider such as the Moonshot API
# Whether to hide the configuration panel
hide_config = false
# 支持的提供商 (Supported providers):
# openai
# moonshot (月之暗面)
# oneapi
# g4f
# azure
# qwen (通义千问)
# gemini
llm_provider="openai"
# Pexels API Key
# Register at https://www.pexels.com/api/ to get your API key.
# You can use multiple keys to avoid rate limits.
# For example: pexels_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
# Note the format: each key must be wrapped in double quotes, and multiple keys separated by commas.
pexels_api_keys = []
########## Ollama Settings
# No need to set it unless you want to use your own proxy
ollama_base_url = ""
# Check your available models at https://ollama.com/library
ollama_model_name = ""
# Pixabay API Key
# Register at https://pixabay.com/api/docs/ to get your API key.
# You can use multiple keys to avoid rate limits.
# For example: pixabay_api_keys = ["123adsf4567adf89","abd1321cd13efgfdfhi"]
# Note the format: each key must be wrapped in double quotes, and multiple keys separated by commas.
pixabay_api_keys = []
########## OpenAI API Key
# Get your API key at https://platform.openai.com/api-keys
openai_api_key = ""
# No need to set it unless you want to use your own proxy
openai_base_url = ""
# Check your available models at https://platform.openai.com/account/limits
openai_model_name = "gpt-4-turbo-preview"
# 支持的提供商 (Supported providers):
# openai
# moonshot (月之暗面)
# azure
# qwen (通义千问)
# deepseek
# gemini
# ollama
# g4f
# oneapi
# cloudflare
# ernie (文心一言)
llm_provider = "openai"
########## Moonshot API Key
# Visit https://platform.moonshot.cn/console/api-keys to get your API key.
moonshot_api_key=""
moonshot_base_url = "https://api.moonshot.cn/v1"
moonshot_model_name = "moonshot-v1-8k"
########## Pollinations AI Settings
# Visit https://pollinations.ai/ to learn more
# API Key is optional - leave empty for public access
pollinations_api_key = ""
# Default base URL for Pollinations API
pollinations_base_url = "https://pollinations.ai/api/v1"
# Default model for text generation
pollinations_model_name = "openai-fast"
########## OneAPI API Key
# Visit https://github.com/songquanpeng/one-api to get your API key
oneapi_api_key=""
oneapi_base_url=""
oneapi_model_name=""
########## Ollama Settings
# No need to set it unless you want to use your own proxy
ollama_base_url = ""
# Check your available models at https://ollama.com/library
ollama_model_name = ""
########## G4F
# Visit https://github.com/xtekky/gpt4free to get more details
# Supported model list: https://github.com/xtekky/gpt4free/blob/main/g4f/models.py
g4f_model_name = "gpt-3.5-turbo-16k-0613"
########## OpenAI API Key
# Get your API key at https://platform.openai.com/api-keys
openai_api_key = ""
# No need to set it unless you want to use your own proxy
openai_base_url = ""
# Check your available models at https://platform.openai.com/account/limits
openai_model_name = "gpt-4o-mini"
########## Azure API Key
# Visit https://learn.microsoft.com/zh-cn/azure/ai-services/openai/ to get more details
# API documentation: https://learn.microsoft.com/zh-cn/azure/ai-services/openai/reference
azure_api_key = ""
azure_base_url=""
azure_model_name="gpt-35-turbo" # replace with your model deployment name
azure_api_version = "2024-02-15-preview"
########## Moonshot API Key
# Visit https://platform.moonshot.cn/console/api-keys to get your API key.
moonshot_api_key = ""
moonshot_base_url = "https://api.moonshot.cn/v1"
moonshot_model_name = "moonshot-v1-8k"
########## Gemini API Key
gemini_api_key=""
gemini_model_name = "gemini-1.0-pro"
########## OneAPI API Key
# Visit https://github.com/songquanpeng/one-api to get your API key
oneapi_api_key = ""
oneapi_base_url = ""
oneapi_model_name = ""
########## Qwen API Key
# Visit https://dashscope.console.aliyun.com/apiKey to get your API key
# Visit below links to get more details
# https://tongyi.aliyun.com/qianwen/
# https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction
qwen_api_key = ""
qwen_model_name = "qwen-max"
########## G4F
# Visit https://github.com/xtekky/gpt4free to get more details
# Supported model list: https://github.com/xtekky/gpt4free/blob/main/g4f/models.py
g4f_model_name = "gpt-3.5-turbo"
########## Azure API Key
# Visit https://learn.microsoft.com/zh-cn/azure/ai-services/openai/ to get more details
# API documentation: https://learn.microsoft.com/zh-cn/azure/ai-services/openai/reference
azure_api_key = ""
azure_base_url = ""
azure_model_name = "gpt-35-turbo" # replace with your model deployment name
azure_api_version = "2024-02-15-preview"
########## Gemini API Key
gemini_api_key = ""
gemini_model_name = "gemini-1.0-pro"
########## Qwen API Key
# Visit https://dashscope.console.aliyun.com/apiKey to get your API key
# Visit below links to get more details
# https://tongyi.aliyun.com/qianwen/
# https://help.aliyun.com/zh/dashscope/developer-reference/model-introduction
qwen_api_key = ""
qwen_model_name = "qwen-max"
# Subtitle Provider, "edge" or "whisper"
# If empty, the subtitle will not be generated
subtitle_provider = "edge"
########## DeepSeek API Key
# Visit https://platform.deepseek.com/api_keys to get your API key
deepseek_api_key = ""
deepseek_base_url = "https://api.deepseek.com"
deepseek_model_name = "deepseek-chat"
#
# ImageMagick
#
# Once you have installed it, ImageMagick will be automatically detected, except on Windows!
# On Windows, for example "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
# Download from https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
# Subtitle Provider, "edge" or "whisper"
# If empty, the subtitle will not be generated
subtitle_provider = "edge"
# imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
#
# ImageMagick
#
# Once you have installed it, ImageMagick will be automatically detected, except on Windows!
# On Windows, for example "C:\Program Files (x86)\ImageMagick-7.1.1-Q16-HDRI\magick.exe"
# Download from https://imagemagick.org/archive/binaries/ImageMagick-7.1.1-29-Q16-x64-static.exe
# imagemagick_path = "C:\\Program Files (x86)\\ImageMagick-7.1.1-Q16\\magick.exe"
#
# FFMPEG
#
# 通常情况下ffmpeg 会被自动下载,并且会被自动检测到。
# 但是如果你的环境有问题,无法自动下载,可能会遇到如下错误:
# RuntimeError: No ffmpeg exe could be found.
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path下载地址https://www.gyan.dev/ffmpeg/builds/
#
# FFMPEG
#
# 通常情况下ffmpeg 会被自动下载,并且会被自动检测到。
# 但是如果你的环境有问题,无法自动下载,可能会遇到如下错误:
# RuntimeError: No ffmpeg exe could be found.
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# 此时你可以手动下载 ffmpeg 并设置 ffmpeg_path下载地址https://www.gyan.dev/ffmpeg/builds/
# Under normal circumstances, ffmpeg is downloaded automatically and detected automatically.
# However, if there is an issue with your environment that prevents automatic downloading, you might encounter the following error:
# RuntimeError: No ffmpeg exe could be found.
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# In such cases, you can manually download ffmpeg and set the ffmpeg_path, download link: https://www.gyan.dev/ffmpeg/builds/
# Under normal circumstances, ffmpeg is downloaded automatically and detected automatically.
# However, if there is an issue with your environment that prevents automatic downloading, you might encounter the following error:
# RuntimeError: No ffmpeg exe could be found.
# Install ffmpeg on your system, or set the IMAGEIO_FFMPEG_EXE environment variable.
# In such cases, you can manually download ffmpeg and set the ffmpeg_path, download link: https://www.gyan.dev/ffmpeg/builds/
# ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
#########################################################################################
# ffmpeg_path = "C:\\Users\\harry\\Downloads\\ffmpeg.exe"
#########################################################################################
# 当视频生成成功后API服务提供的视频下载接入点默认为当前服务的地址和监听端口
# 比如 http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# 如果你需要使用域名对外提供服务一般会用nginx做代理则可以设置为你的域名
# 比如 https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# endpoint="https://xxxx.com"
# 当视频生成成功后API服务提供的视频下载接入点默认为当前服务的地址和监听端口
# 比如 http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# 如果你需要使用域名对外提供服务一般会用nginx做代理则可以设置为你的域名
# 比如 https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# endpoint="https://xxxx.com"
# When the video is successfully generated, the API service provides a download endpoint for the video, defaulting to the service's current address and listening port.
# For example, http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# If you need to provide the service externally using a domain name (usually done with nginx as a proxy), you can set it to your domain name.
# For example, https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# endpoint="https://xxxx.com"
endpoint=""
# When the video is successfully generated, the API service provides a download endpoint for the video, defaulting to the service's current address and listening port.
# For example, http://127.0.0.1:8080/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# If you need to provide the service externally using a domain name (usually done with nginx as a proxy), you can set it to your domain name.
# For example, https://xxxx.com/tasks/6357f542-a4e1-46a1-b4c9-bf3bd0df5285/final-1.mp4
# endpoint="https://xxxx.com"
endpoint = ""
# Video material storage location
# material_directory = "" # Indicates that video materials will be downloaded to the default folder, the default folder is ./storage/cache_videos under the current project
# material_directory = "/user/harry/videos" # Indicates that video materials will be downloaded to a specified folder
# material_directory = "task" # Indicates that video materials will be downloaded to the current task's folder, this method does not allow sharing of already downloaded video materials
# Video material storage location
# material_directory = "" # Indicates that video materials will be downloaded to the default folder, the default folder is ./storage/cache_videos under the current project
# material_directory = "/user/harry/videos" # Indicates that video materials will be downloaded to a specified folder
# material_directory = "task" # Indicates that video materials will be downloaded to the current task's folder, this method does not allow sharing of already downloaded video materials
# 视频素材存放位置
# material_directory = "" #表示将视频素材下载到默认的文件夹,默认文件夹为当前项目下的 ./storage/cache_videos
# material_directory = "/user/harry/videos" #表示将视频素材下载到指定的文件夹中
# material_directory = "task" #表示将视频素材下载到当前任务的文件夹中,这种方式无法共享已经下载的视频素材
# 视频素材存放位置
# material_directory = "" #表示将视频素材下载到默认的文件夹,默认文件夹为当前项目下的 ./storage/cache_videos
# material_directory = "/user/harry/videos" #表示将视频素材下载到指定的文件夹中
# material_directory = "task" #表示将视频素材下载到当前任务的文件夹中,这种方式无法共享已经下载的视频素材
material_directory = ""
material_directory = ""
# Used for state management of the task
enable_redis = false
redis_host = "localhost"
redis_port = 6379
redis_db = 0
redis_password = ""
# Maximum number of concurrent text-to-video generation tasks
max_concurrent_tasks = 5
# Used for state management of the task
enable_redis = false
redis_host = "localhost"
redis_port = 6379
redis_db = 0
[whisper]
# Only effective when subtitle_provider is "whisper"
# Only effective when subtitle_provider is "whisper"
# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")
# Run on GPU with FP16
# model = WhisperModel(model_size, device="cuda", compute_type="float16")
# Run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# Run on GPU with INT8
# model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
# Run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
# Run on CPU with INT8
# model = WhisperModel(model_size, device="cpu", compute_type="int8")
# recommended model_size: "large-v3"
model_size="large-v3"
# if you want to use GPU, set device="cuda"
device="CPU"
compute_type="int8"
# recommended model_size: "large-v3"
model_size = "large-v3"
# if you want to use GPU, set device="cuda"
device = "CPU"
compute_type = "int8"
[pexels]
video_concat_mode="sequential" # "random" or "sequential"
[pexels.proxies]
### Use a proxy to access the Pexels API
### Format: "http://<username>:<password>@<proxy>:<port>"
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
# http = "http://10.10.1.10:3128"
# https = "http://10.10.1.10:1080"
[proxy]
### Use a proxy to access the Pexels API
### Format: "http://<username>:<password>@<proxy>:<port>"
### Example: "http://user:pass@proxy:1234"
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
# http = "http://10.10.1.10:3128"
# https = "http://10.10.1.10:1080"
[azure]
# Azure Speech API Key
# Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
speech_key=""
speech_region=""
# Azure Speech API Key
# Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
speech_key = ""
speech_region = ""
[siliconflow]
# SiliconFlow API Key
# Get your API key at https://siliconflow.cn
api_key = ""
[ui]
# UI related settings
# 是否隐藏日志信息
# Whether to hide logs in the UI
hide_log = false

View File

@@ -6,7 +6,7 @@ services:
build:
context: .
dockerfile: Dockerfile
container_name: "webui"
container_name: "moneyprinterturbo-webui"
ports:
- "8501:8501"
command: [ "streamlit", "run", "./webui/Main.py","--browser.serverAddress=127.0.0.1","--server.enableCORS=True","--browser.gatherUsageStats=False" ]
@@ -16,7 +16,7 @@ services:
build:
context: .
dockerfile: Dockerfile
container_name: "api"
container_name: "moneyprinterturbo-api"
ports:
- "8080:8080"
command: [ "python3", "main.py" ]

View File

@@ -0,0 +1,118 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# MoneyPrinterTurbo Setup Guide\n",
"\n",
"This notebook will guide you through the process of setting up [MoneyPrinterTurbo](https://github.com/harry0703/MoneyPrinterTurbo)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Clone Repository and Install Dependencies\n",
"\n",
"First, we'll clone the repository from GitHub and install all required packages:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "S8Eu-aQarY_B"
},
"outputs": [],
"source": [
"!git clone https://github.com/harry0703/MoneyPrinterTurbo.git\n",
"%cd MoneyPrinterTurbo\n",
"!pip install -q -r requirements.txt\n",
"!pip install pyngrok --quiet"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Configure ngrok for Remote Access\n",
"\n",
"We'll use ngrok to create a secure tunnel to expose our local Streamlit server to the internet.\n",
"\n",
"**Important**: You need to get your authentication token from the [ngrok dashboard](https://dashboard.ngrok.com/get-started/your-authtoken) to use this service."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyngrok import ngrok\n",
"\n",
"# Terminate any existing ngrok tunnels\n",
"ngrok.kill()\n",
"\n",
"# Set your authentication token\n",
"# Replace \"your_ngrok_auth_token\" with your actual token\n",
"ngrok.set_auth_token(\"your_ngrok_auth_token\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Launch Application and Generate Public URL\n",
"\n",
"Now we'll start the Streamlit server and create an ngrok tunnel to make it accessible online:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"collapsed": true,
"id": "oahsIOXmwjl9",
"outputId": "ee23a96c-af21-4207-deb7-9fab69e0c05e"
},
"outputs": [],
"source": [
"import subprocess\n",
"import time\n",
"\n",
"print(\"🚀 Starting MoneyPrinterTurbo...\")\n",
"# Start Streamlit server on port 8501\n",
"streamlit_proc = subprocess.Popen([\n",
" \"streamlit\", \"run\", \"./webui/Main.py\", \"--server.port=8501\"\n",
"])\n",
"\n",
"# Wait for the server to initialize\n",
"time.sleep(5)\n",
"\n",
"print(\"🌐 Creating ngrok tunnel to expose the MoneyPrinterTurbo...\")\n",
"public_url = ngrok.connect(8501, bind_tls=True)\n",
"\n",
"print(\"✅ Deployment complete! Access your MoneyPrinterTurbo at:\")\n",
"print(public_url)"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 252 KiB

After

Width:  |  Height:  |  Size: 113 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 172 KiB

BIN
docs/picwish.com.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 148 KiB

BIN
docs/picwish.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 178 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 100 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 384 KiB

After

Width:  |  Height:  |  Size: 667 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 340 KiB

After

Width:  |  Height:  |  Size: 654 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 165 KiB

16
main.py
View File

@@ -1,8 +1,16 @@
import uvicorn
from loguru import logger
from app.config import config
if __name__ == '__main__':
logger.info("start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs")
uvicorn.run(app="app.asgi:app", host=config.listen_host, port=config.listen_port, reload=config.reload_debug,
log_level="warning")
if __name__ == "__main__":
logger.info(
"start server, docs: http://127.0.0.1:" + str(config.listen_port) + "/docs"
)
uvicorn.run(
app="app.asgi:app",
host=config.listen_host,
port=config.listen_port,
reload=config.reload_debug,
log_level="warning",
)

View File

@@ -1,25 +1,16 @@
requests~=2.31.0
moviepy~=2.0.0.dev2
openai~=1.13.3
faster-whisper~=1.0.1
edge_tts~=6.1.10
uvicorn~=0.27.1
fastapi~=0.110.0
tomli~=2.0.1
streamlit~=1.32.0
loguru~=0.7.2
aiohttp~=3.9.3
urllib3~=2.2.1
pillow~=9.5.0
pydantic~=2.6.3
g4f~=0.2.5.4
dashscope~=1.15.0
google.generativeai~=0.4.1
python-multipart~=0.0.9
redis==5.0.3
# if you use pillow~=10.3.0, you will get "PIL.Image' has no attribute 'ANTIALIAS'" error when resize video
# please install opencv-python to fix "PIL.Image' has no attribute 'ANTIALIAS'" error
opencv-python
# for azure speech
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
azure-cognitiveservices-speech~=1.37.0
moviepy==2.1.2
streamlit==1.45.0
edge_tts==6.1.19
fastapi==0.115.6
uvicorn==0.32.1
openai==1.56.1
faster-whisper==1.1.0
loguru==0.7.3
google.generativeai==0.8.3
dashscope==1.20.14
g4f==0.5.2.2
azure-cognitiveservices-speech==1.41.1
redis==5.2.0
python-multipart==0.0.19
pyyaml
requests>=2.31.0

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

40
test/README.md Normal file
View File

@@ -0,0 +1,40 @@
# MoneyPrinterTurbo Test Directory
This directory contains unit tests for the **MoneyPrinterTurbo** project.
## Directory Structure
- `services/`: Tests for components in the `app/services` directory
- `test_video.py`: Tests for the video service
- `test_task.py`: Tests for the task service
- `test_voice.py`: Tests for the voice service
## Running Tests
You can run the tests using Python's built-in `unittest` framework:
```bash
# Run all tests
python -m unittest discover -s test
# Run a specific test file
python -m unittest test/services/test_video.py
# Run a specific test class
python -m unittest test.services.test_video.TestVideoService
# Run a specific test method
python -m unittest test.services.test_video.TestVideoService.test_preprocess_video
```
## Adding New Tests
To add tests for other components, follow these guidelines (a minimal skeleton is shown after the list):
1. Create test files prefixed with `test_` in the appropriate subdirectory
2. Use `unittest.TestCase` as the base class for your test classes
3. Name test methods with the `test_` prefix
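A minimal skeleton that follows these guidelines (the file name, class, and assertion below are placeholders, not existing project tests):
```python
# test/services/test_example.py (placeholder)
import sys
import unittest
from pathlib import Path

# add the project root to the Python path, as the existing tests do
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from app.utils import utils


class TestExampleService(unittest.TestCase):
    def test_parse_extension(self):
        self.assertEqual(utils.parse_extension("clip.MP4"), "mp4")


if __name__ == "__main__":
    unittest.main()
```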
## Test Resources
Place any resource files required for testing in the `test/resources` directory.

1
test/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Unit test package for test

BIN
test/resources/1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.6 KiB

BIN
test/resources/1.png.mp4 Normal file

Binary file not shown.

BIN
test/resources/2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.1 KiB

BIN
test/resources/2.png.mp4 Normal file

Binary file not shown.

BIN
test/resources/3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.8 KiB

BIN
test/resources/3.png.mp4 Normal file

Binary file not shown.

BIN
test/resources/4.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.6 KiB

BIN
test/resources/5.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.8 KiB

BIN
test/resources/6.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.8 KiB

BIN
test/resources/7.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.4 KiB

BIN
test/resources/8.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.3 KiB

BIN
test/resources/9.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.0 KiB

View File

@@ -0,0 +1 @@
# Unit test package for services

View File

@@ -0,0 +1,66 @@
import unittest
import os
import sys
from pathlib import Path
# add project root to python path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from app.services import task as tm
from app.models.schema import MaterialInfo, VideoParams
resources_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources")
class TestTaskService(unittest.TestCase):
def setUp(self):
pass
def tearDown(self):
pass
def test_task_local_materials(self):
task_id = "00000000-0000-0000-0000-000000000000"
video_materials=[]
for i in range(1, 4):
video_materials.append(MaterialInfo(
provider="local",
url=os.path.join(resources_dir, f"{i}.png"),
duration=0
))
params = VideoParams(
video_subject="金钱的作用",
video_script="金钱不仅是交换媒介,更是社会资源的分配工具。它能满足基本生存需求,如食物和住房,也能提供教育、医疗等提升生活品质的机会。拥有足够的金钱意味着更多选择权,比如职业自由或创业可能。但金钱的作用也有边界,它无法直接购买幸福、健康或真诚的人际关系。过度追逐财富可能导致价值观扭曲,忽视精神层面的需求。理想的状态是理性看待金钱,将其作为实现目标的工具而非终极目的。",
video_terms="money importance, wealth and society, financial freedom, money and happiness, role of money",
video_aspect="9:16",
video_concat_mode="random",
video_transition_mode="None",
video_clip_duration=3,
video_count=1,
video_source="local",
video_materials=video_materials,
video_language="",
voice_name="zh-CN-XiaoxiaoNeural-Female",
voice_volume=1.0,
voice_rate=1.0,
bgm_type="random",
bgm_file="",
bgm_volume=0.2,
subtitle_enabled=True,
subtitle_position="bottom",
custom_position=70.0,
font_name="MicrosoftYaHeiBold.ttc",
text_fore_color="#FFFFFF",
text_background_color=True,
font_size=60,
stroke_color="#000000",
stroke_width=1.5,
n_threads=2,
paragraph_number=1
)
result = tm.start(task_id=task_id, params=params)
print(result)
if __name__ == "__main__":
unittest.main()

View File

@@ -0,0 +1,85 @@
import unittest
import os
import sys
from pathlib import Path
from moviepy import (
VideoFileClip,
)
# add project root to python path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from app.models.schema import MaterialInfo
from app.services import video as vd
from app.utils import utils
resources_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources")
class TestVideoService(unittest.TestCase):
def setUp(self):
self.test_img_path = os.path.join(resources_dir, "1.png")
def tearDown(self):
pass
def test_preprocess_video(self):
if not os.path.exists(self.test_img_path):
self.fail(f"test image not found: {self.test_img_path}")
# test preprocess_video function
m = MaterialInfo()
m.url = self.test_img_path
m.provider = "local"
print(m)
materials = vd.preprocess_video([m], clip_duration=4)
print(materials)
# verify result
self.assertIsNotNone(materials)
self.assertEqual(len(materials), 1)
self.assertTrue(materials[0].url.endswith(".mp4"))
# moviepy get video info
clip = VideoFileClip(materials[0].url)
print(clip)
# clean generated test video file
if os.path.exists(materials[0].url):
os.remove(materials[0].url)
def test_wrap_text(self):
"""test text wrapping function"""
try:
font_path = os.path.join(utils.font_dir(), "STHeitiMedium.ttc")
if not os.path.exists(font_path):
self.fail(f"font file not found: {font_path}")
# test english text wrapping
test_text_en = "This is a test text for wrapping long sentences in english language"
wrapped_text_en, text_height_en = vd.wrap_text(
text=test_text_en,
max_width=300,
font=font_path,
fontsize=30
)
print(wrapped_text_en, text_height_en)
# verify text is wrapped
self.assertIn("\n", wrapped_text_en)
# test chinese text wrapping
test_text_zh = "这是一段用来测试中文长句换行的文本内容,应该会根据宽度限制进行换行处理"
wrapped_text_zh, text_height_zh = vd.wrap_text(
text=test_text_zh,
max_width=300,
font=font_path,
fontsize=30
)
print(wrapped_text_zh, text_height_zh)
# verify chinese text is wrapped
self.assertIn("\n", wrapped_text_zh)
except Exception as e:
self.fail(f"test wrap_text failed: {str(e)}")
if __name__ == "__main__":
unittest.main()

107
test/services/test_voice.py Normal file
View File

@@ -0,0 +1,107 @@
import asyncio
import unittest
import os
import sys
from pathlib import Path
# add project root to python path
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from app.utils import utils
from app.services import voice as vs
temp_dir = utils.storage_dir("temp")
text_en = """
What is the meaning of life?
This question has puzzled philosophers, scientists, and thinkers of all kinds for centuries.
Throughout history, various cultures and individuals have come up with their interpretations and beliefs around the purpose of life.
Some say it's to seek happiness and self-fulfillment, while others believe it's about contributing to the welfare of others and making a positive impact in the world.
Despite the myriad of perspectives, one thing remains clear: the meaning of life is a deeply personal concept that varies from one person to another.
It's an existential inquiry that encourages us to reflect on our values, desires, and the essence of our existence.
"""
text_zh = """
预计未来3天深圳冷空气活动频繁未来两天持续阴天有小雨出门带好雨具
10-11日持续阴天有小雨日温差小气温在13-17℃之间体感阴凉
12日天气短暂好转早晚清凉
"""
voice_rate=1.0
voice_volume=1.0
class TestVoiceService(unittest.TestCase):
def setUp(self):
self.loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.loop)
def tearDown(self):
self.loop.close()
def test_siliconflow(self):
voice_name = "siliconflow:FunAudioLLM/CosyVoice2-0.5B:alex-Male"
voice_name = vs.parse_voice_name(voice_name)
async def _do():
parts = voice_name.split(":")
if len(parts) >= 3:
model = parts[1]
# Strip the gender suffix, e.g. "alex-Male" -> "alex"
voice_with_gender = parts[2]
voice = voice_with_gender.split("-")[0]
# Build the full voice parameter in the "model:voice" format
full_voice = f"{model}:{voice}"
voice_file = f"{temp_dir}/tts-siliconflow-{voice}.mp3"
subtitle_file = f"{temp_dir}/tts-siliconflow-{voice}.srt"
sub_maker = vs.siliconflow_tts(
text=text_zh, model=model, voice=full_voice, voice_file=voice_file, voice_rate=voice_rate, voice_volume=voice_volume
)
if not sub_maker:
self.fail("siliconflow tts failed")
vs.create_subtitle(sub_maker=sub_maker, text=text_zh, subtitle_file=subtitle_file)
audio_duration = vs.get_audio_duration(sub_maker)
print(f"voice: {voice_name}, audio duration: {audio_duration}s")
else:
self.fail("siliconflow invalid voice name")
self.loop.run_until_complete(_do())
def test_azure_tts_v1(self):
voice_name = "zh-CN-XiaoyiNeural-Female"
voice_name = vs.parse_voice_name(voice_name)
print(voice_name)
voice_file = f"{temp_dir}/tts-azure-v1-{voice_name}.mp3"
subtitle_file = f"{temp_dir}/tts-azure-v1-{voice_name}.srt"
sub_maker = vs.azure_tts_v1(
text=text_zh, voice_name=voice_name, voice_file=voice_file, voice_rate=voice_rate
)
if not sub_maker:
self.fail("azure tts v1 failed")
vs.create_subtitle(sub_maker=sub_maker, text=text_zh, subtitle_file=subtitle_file)
audio_duration = vs.get_audio_duration(sub_maker)
print(f"voice: {voice_name}, audio duration: {audio_duration}s")
def test_azure_tts_v2(self):
voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
voice_name = vs.parse_voice_name(voice_name)
print(voice_name)
async def _do():
voice_file = f"{temp_dir}/tts-azure-v2-{voice_name}.mp3"
subtitle_file = f"{temp_dir}/tts-azure-v2-{voice_name}.srt"
sub_maker = vs.azure_tts_v2(
text=text_zh, voice_name=voice_name, voice_file=voice_file
)
if not sub_maker:
self.fail("azure tts v2 failed")
vs.create_subtitle(sub_maker=sub_maker, text=text_zh, subtitle_file=subtitle_file)
audio_duration = vs.get_audio_duration(sub_maker)
print(f"voice: {voice_name}, audio duration: {audio_duration}s")
self.loop.run_until_complete(_do())
if __name__ == "__main__":
# python -m unittest test.services.test_voice.TestVoiceService.test_azure_tts_v1
# python -m unittest test.services.test_voice.TestVoiceService.test_azure_tts_v2
unittest.main()

File diff suppressed because it is too large.

View File

@@ -1,6 +1,14 @@
{
"Language": "German",
"Language": "Deutsch",
"Translation": {
"Login Required": "Anmeldung erforderlich",
"Please login to access settings": "Bitte melden Sie sich an, um auf die Einstellungen zuzugreifen",
"Username": "Benutzername",
"Password": "Passwort",
"Login": "Anmelden",
"Login Error": "Anmeldefehler",
"Incorrect username or password": "Falscher Benutzername oder Passwort",
"Please enter your username and password": "Bitte geben Sie Ihren Benutzernamen und Ihr Passwort ein",
"Video Script Settings": "**Drehbuch / Topic des Videos**",
"Video Subject": "Worum soll es in dem Video gehen? (Geben Sie ein Keyword an, :red[Dank KI wird automatisch ein Drehbuch generieren])",
"Script Language": "Welche Sprache soll zum Generieren von Drehbüchern verwendet werden? :red[KI generiert anhand dieses Begriffs das Drehbuch]",
@@ -10,12 +18,19 @@
"Generate Video Keywords": "Klicken Sie, um KI zum Generieren zu verwenden [Video Keywords] basierend auf dem **Drehbuch**",
"Please Enter the Video Subject": "Bitte geben Sie zuerst das Drehbuch an",
"Generating Video Script and Keywords": "KI generiert ein Drehbuch und Schlüsselwörter...",
"Generating Video Keywords": "AI is generating video keywords...",
"Generating Video Keywords": "KI generiert Video-Schlüsselwörter...",
"Video Keywords": "Video Schlüsselwörter (:blue[① Optional, KI generiert ② Verwende **, (Kommas)** zur Trennung der Wörter, in englischer Sprache])",
"Video Settings": "**Video Einstellungen**",
"Video Concat Mode": "Videoverkettungsmodus",
"Random": "Zufällige Verkettung (empfohlen)",
"Sequential": "Sequentielle Verkettung",
"Video Transition Mode": "Video Übergangsmodus",
"None": "Kein Übergang",
"Shuffle": "Zufällige Übergänge",
"FadeIn": "FadeIn",
"FadeOut": "FadeOut",
"SlideIn": "SlideIn",
"SlideOut": "SlideOut",
"Video Ratio": "Video-Seitenverhältnis",
"Portrait": "Portrait 9:16",
"Landscape": "Landschaft 16:9",
@@ -23,9 +38,10 @@
"Number of Videos Generated Simultaneously": "Anzahl der parallel generierten Videos",
"Audio Settings": "**Audio Einstellungen**",
"Speech Synthesis": "Sprachausgabe",
"Speech Region": "Region(:red[Required[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key(:red[Required[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Region": "Region(:red[Erforderlich[Region abrufen](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API-Schlüssel(:red[Erforderlich[API-Schlüssel abrufen](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Lautstärke der Sprachausgabe",
"Speech Rate": "Lesegeschwindigkeit (1,0 bedeutet 1x)",
"Male": "Männlich",
"Female": "Weiblich",
"Background Music": "Hintergrundmusik",
@@ -41,6 +57,7 @@
"Top": "Oben",
"Center": "Mittig",
"Bottom": "Unten (empfohlen)",
"Custom": "Benutzerdefinierte Position (70, was 70% von oben bedeutet)",
"Font Size": "Schriftgröße für Untertitel",
"Font Color": "Schriftfarbe",
"Stroke Color": "Kontur",
@@ -50,16 +67,39 @@
"Generating Video": "Video wird erstellt, bitte warten...",
"Start Generating Video": "Beginne mit der Generierung",
"Video Generation Completed": "Video erfolgreich generiert",
"Video Generation Failed": "Video Generierung fehlgeschlagen",
"You can download the generated video from the following links": "Sie können das generierte Video über die folgenden Links herunterladen",
"Basic Settings": "**Grunde Instellungen**",
"Pexels API Key": "Pexels API Key (:red[Required] [Get API Key](https://www.pexels.com/api/))",
"Language": "Language",
"LLM Provider": "LLM Provider",
"API Key": "API Key (:red[Required])",
"Base Url": "Base Url",
"Model Name": "Model Name",
"Please Enter the LLM API Key": "Please Enter the **LLM API Key**",
"Please Enter the Pexels API Key": "Please Enter the **Pexels API Key**",
"Get Help": "If you need help, or have any questions, you can join discord for help: https://harryai.cc"
"Basic Settings": "**Grundeinstellungen** (:blue[Klicken zum Erweitern])",
"Language": "Sprache",
"Pexels API Key": "Pexels API-Schlüssel ([API-Schlüssel abrufen](https://www.pexels.com/api/))",
"Pixabay API Key": "Pixabay API-Schlüssel ([API-Schlüssel abrufen](https://pixabay.com/api/docs/#api_search_videos))",
"LLM Provider": "KI-Modellanbieter",
"API Key": "API-Schlüssel (:red[Erforderlich])",
"Base Url": "Basis-URL",
"Account ID": "Konto-ID (Aus dem Cloudflare-Dashboard)",
"Model Name": "Modellname",
"Please Enter the LLM API Key": "Bitte geben Sie den **KI-Modell API-Schlüssel** ein",
"Please Enter the Pexels API Key": "Bitte geben Sie den **Pexels API-Schlüssel** ein",
"Please Enter the Pixabay API Key": "Bitte geben Sie den **Pixabay API-Schlüssel** ein",
"Get Help": "Wenn Sie Hilfe benötigen oder Fragen haben, können Sie dem Discord beitreten: https://harryai.cc",
"Video Source": "Videoquelle",
"TikTok": "TikTok (TikTok-Unterstützung kommt bald)",
"Bilibili": "Bilibili (Bilibili-Unterstützung kommt bald)",
"Xiaohongshu": "Xiaohongshu (Xiaohongshu-Unterstützung kommt bald)",
"Local file": "Lokale Datei",
"Play Voice": "Sprachausgabe abspielen",
"Voice Example": "Dies ist ein Beispieltext zum Testen der Sprachsynthese",
"Synthesizing Voice": "Sprachsynthese läuft, bitte warten...",
"TTS Provider": "Sprachsynthese-Anbieter auswählen",
"TTS Servers": "TTS-Server",
"No voices available for the selected TTS server. Please select another server.": "Keine Stimmen für den ausgewählten TTS-Server verfügbar. Bitte wählen Sie einen anderen Server.",
"SiliconFlow API Key": "SiliconFlow API-Schlüssel",
"SiliconFlow TTS Settings": "SiliconFlow TTS-Einstellungen",
"Speed: Range [0.25, 4.0], default is 1.0": "Geschwindigkeit: Bereich [0.25, 4.0], Standardwert ist 1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Lautstärke: Verwendet die Sprachlautstärke-Einstellung, Standardwert 1.0 entspricht Verstärkung 0",
"Hide Log": "Protokoll ausblenden",
"Hide Basic Settings": "Basis-Einstellungen ausblenden\n\nWenn diese Option deaktiviert ist, wird die Basis-Einstellungen-Leiste nicht auf der Seite angezeigt.\n\nWenn Sie sie erneut anzeigen möchten, setzen Sie `hide_config = false` in `config.toml`",
"LLM Settings": "**LLM-Einstellungen**",
"Video Source Settings": "**Videoquellen-Einstellungen**"
}
}

View File

@@ -1,6 +1,14 @@
{
"Language": "English",
"Translation": {
"Login Required": "Login Required",
"Please login to access settings": "Please login to access settings",
"Username": "Username",
"Password": "Password",
"Login": "Login",
"Login Error": "Login Error",
"Incorrect username or password": "Incorrect username or password",
"Please enter your username and password": "Please enter your username and password",
"Video Script Settings": "**Video Script Settings**",
"Video Subject": "Video Subject (Provide a keyword, :red[AI will automatically generate] video script)",
"Script Language": "Language for Generating Video Script (AI will automatically output based on the language of your subject)",
@@ -16,6 +24,13 @@
"Video Concat Mode": "Video Concatenation Mode",
"Random": "Random Concatenation (Recommended)",
"Sequential": "Sequential Concatenation",
"Video Transition Mode": "Video Transition Mode",
"None": "None",
"Shuffle": "Shuffle",
"FadeIn": "FadeIn",
"FadeOut": "FadeOut",
"SlideIn": "SlideIn",
"SlideOut": "SlideOut",
"Video Ratio": "Video Aspect Ratio",
"Portrait": "Portrait 9:16",
"Landscape": "Landscape 16:9",
@@ -26,6 +41,7 @@
"Speech Region": "Region(:red[Required[Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key(:red[Required[Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Speech Volume (1.0 represents 100%)",
"Speech Rate": "Speech Rate (1.0 means 1x speed)",
"Male": "Male",
"Female": "Female",
"Background Music": "Background Music",
@@ -41,6 +57,7 @@
"Top": "Top",
"Center": "Center",
"Bottom": "Bottom (Recommended)",
"Custom": "Custom position (70, indicating 70% down from the top)",
"Font Size": "Subtitle Font Size",
"Font Color": "Subtitle Font Color",
"Stroke Color": "Subtitle Outline Color",
@@ -50,8 +67,10 @@
"Generating Video": "Generating video, please wait...",
"Start Generating Video": "Start Generating Video",
"Video Generation Completed": "Video Generation Completed",
"Video Generation Failed": "Video Generation Failed",
"You can download the generated video from the following links": "You can download the generated video from the following links",
"Pexels API Key": "Pexels API Key (:red[Required] [Get API Key](https://www.pexels.com/api/))",
"Pexels API Key": "Pexels API Key ([Get API Key](https://www.pexels.com/api/))",
"Pixabay API Key": "Pixabay API Key ([Get API Key](https://pixabay.com/api/docs/#api_search_videos))",
"Basic Settings": "**Basic Settings** (:blue[Click to expand])",
"Language": "Language",
"LLM Provider": "LLM Provider",
@@ -61,6 +80,26 @@
"Model Name": "Model Name",
"Please Enter the LLM API Key": "Please Enter the **LLM API Key**",
"Please Enter the Pexels API Key": "Please Enter the **Pexels API Key**",
"Get Help": "If you need help, or have any questions, you can join discord for help: https://harryai.cc"
"Please Enter the Pixabay API Key": "Please Enter the **Pixabay API Key**",
"Get Help": "If you need help, or have any questions, you can join discord for help: https://harryai.cc",
"Video Source": "Video Source",
"TikTok": "TikTok (TikTok support is coming soon)",
"Bilibili": "Bilibili (Bilibili support is coming soon)",
"Xiaohongshu": "Xiaohongshu (Xiaohongshu support is coming soon)",
"Local file": "Local file",
"Play Voice": "Play Voice",
"Voice Example": "This is an example text for testing speech synthesis",
"Synthesizing Voice": "Synthesizing voice, please wait...",
"TTS Provider": "Select the voice synthesis provider",
"TTS Servers": "TTS Servers",
"No voices available for the selected TTS server. Please select another server.": "No voices available for the selected TTS server. Please select another server.",
"SiliconFlow API Key": "SiliconFlow API Key [Click to get](https://cloud.siliconflow.cn/account/ak)",
"SiliconFlow TTS Settings": "SiliconFlow TTS Settings",
"Speed: Range [0.25, 4.0], default is 1.0": "Speed: Range [0.25, 4.0], default is 1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Volume: Uses Speech Volume setting, default 1.0 maps to gain 0",
"Hide Log": "Hide Log",
"Hide Basic Settings": "Hide Basic Settings\n\nHidden, the basic settings panel will not be displayed on the page.\n\nIf you need to display it again, please set `hide_config = false` in `config.toml`",
"LLM Settings": "**LLM Settings**",
"Video Source Settings": "**Video Source Settings**"
}
}

105
webui/i18n/pt.json Normal file
View File

@@ -0,0 +1,105 @@
{
"Language": "Português Brasileiro",
"Translation": {
"Login Required": "Login Necessário",
"Please login to access settings": "Por favor, faça login para acessar as configurações",
"Username": "Nome de usuário",
"Password": "Senha",
"Login": "Entrar",
"Login Error": "Erro de Login",
"Incorrect username or password": "Nome de usuário ou senha incorretos",
"Please enter your username and password": "Por favor, digite seu nome de usuário e senha",
"Video Script Settings": "**Configurações do Roteiro do Vídeo**",
"Video Subject": "Tema do Vídeo (Forneça uma palavra-chave, :red[a IA irá gerar automaticamente] o roteiro do vídeo)",
"Script Language": "Idioma para Gerar o Roteiro do Vídeo (a IA irá gerar automaticamente com base no idioma do seu tema)",
"Generate Video Script and Keywords": "Clique para usar a IA para gerar o [Roteiro do Vídeo] e as [Palavras-chave do Vídeo] com base no **tema**",
"Auto Detect": "Detectar Automaticamente",
"Video Script": "Roteiro do Vídeo (:blue[① Opcional, gerado pela IA ② Pontuação adequada ajuda na geração de legendas])",
"Generate Video Keywords": "Clique para usar a IA para gerar [Palavras-chave do Vídeo] com base no **roteiro**",
"Please Enter the Video Subject": "Por favor, insira o Roteiro do Vídeo primeiro",
"Generating Video Script and Keywords": "A IA está gerando o roteiro do vídeo e as palavras-chave...",
"Generating Video Keywords": "A IA está gerando as palavras-chave do vídeo...",
"Video Keywords": "Palavras-chave do Vídeo (:blue[① Opcional, gerado pela IA ② Use **vírgulas em inglês** para separar, somente em inglês])",
"Video Settings": "**Configurações do Vídeo**",
"Video Concat Mode": "Modo de Concatenação de Vídeo",
"Random": "Concatenação Aleatória (Recomendado)",
"Sequential": "Concatenação Sequencial",
"Video Transition Mode": "Modo de Transição de Vídeo",
"None": "Nenhuma Transição",
"Shuffle": "Transição Aleatória",
"FadeIn": "FadeIn",
"FadeOut": "FadeOut",
"SlideIn": "SlideIn",
"SlideOut": "SlideOut",
"Video Ratio": "Proporção do Vídeo",
"Portrait": "Retrato 9:16",
"Landscape": "Paisagem 16:9",
"Clip Duration": "Duração Máxima dos Clipes de Vídeo (segundos)",
"Number of Videos Generated Simultaneously": "Número de Vídeos Gerados Simultaneamente",
"Audio Settings": "**Configurações de Áudio**",
"Speech Synthesis": "Voz de Síntese de Fala",
"Speech Region": "Região(:red[Obrigatório[Obter Região](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "Chave da API(:red[Obrigatório[Obter Chave da API](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Volume da Fala (1.0 representa 100%)",
"Speech Rate": "Velocidade da Fala (1.0 significa velocidade 1x)",
"Male": "Masculino",
"Female": "Feminino",
"Background Music": "Música de Fundo",
"No Background Music": "Sem Música de Fundo",
"Random Background Music": "Música de Fundo Aleatória",
"Custom Background Music": "Música de Fundo Personalizada",
"Custom Background Music File": "Por favor, insira o caminho do arquivo para a música de fundo personalizada:",
"Background Music Volume": "Volume da Música de Fundo (0.2 representa 20%, a música de fundo não deve ser muito alta)",
"Subtitle Settings": "**Configurações de Legendas**",
"Enable Subtitles": "Ativar Legendas (Se desmarcado, as configurações abaixo não terão efeito)",
"Font": "Fonte da Legenda",
"Position": "Posição da Legenda",
"Top": "Superior",
"Center": "Centralizar",
"Bottom": "Inferior (Recomendado)",
"Custom": "Posição personalizada (70, indicando 70% abaixo do topo)",
"Font Size": "Tamanho da Fonte da Legenda",
"Font Color": "Cor da Fonte da Legenda",
"Stroke Color": "Cor do Contorno da Legenda",
"Stroke Width": "Largura do Contorno da Legenda",
"Generate Video": "Gerar Vídeo",
"Video Script and Subject Cannot Both Be Empty": "O Tema do Vídeo e o Roteiro do Vídeo não podem estar ambos vazios",
"Generating Video": "Gerando vídeo, por favor aguarde...",
"Start Generating Video": "Começar a Gerar Vídeo",
"Video Generation Completed": "Geração do Vídeo Concluída",
"Video Generation Failed": "Falha na Geração do Vídeo",
"You can download the generated video from the following links": "Você pode baixar o vídeo gerado a partir dos seguintes links",
"Basic Settings": "**Configurações Básicas** (:blue[Clique para expandir])",
"Language": "Idioma",
"Pexels API Key": "Chave da API do Pexels ([Obter Chave da API](https://www.pexels.com/api/))",
"Pixabay API Key": "Chave da API do Pixabay ([Obter Chave da API](https://pixabay.com/api/docs/#api_search_videos))",
"LLM Provider": "Provedor LLM",
"API Key": "Chave da API (:red[Obrigatório])",
"Base Url": "URL Base",
"Account ID": "ID da Conta (Obter no painel do Cloudflare)",
"Model Name": "Nome do Modelo",
"Please Enter the LLM API Key": "Por favor, insira a **Chave da API LLM**",
"Please Enter the Pexels API Key": "Por favor, insira a **Chave da API do Pexels**",
"Please Enter the Pixabay API Key": "Por favor, insira a **Chave da API do Pixabay**",
"Get Help": "Se precisar de ajuda ou tiver alguma dúvida, você pode entrar no discord para obter ajuda: https://harryai.cc",
"Video Source": "Fonte do Vídeo",
"TikTok": "TikTok (Suporte para TikTok em breve)",
"Bilibili": "Bilibili (Suporte para Bilibili em breve)",
"Xiaohongshu": "Xiaohongshu (Suporte para Xiaohongshu em breve)",
"Local file": "Arquivo local",
"Play Voice": "Reproduzir Voz",
"Voice Example": "Este é um exemplo de texto para testar a síntese de fala",
"Synthesizing Voice": "Sintetizando voz, por favor aguarde...",
"TTS Provider": "Selecione o provedor de síntese de voz",
"TTS Servers": "Servidores TTS",
"No voices available for the selected TTS server. Please select another server.": "Não há vozes disponíveis para o servidor TTS selecionado. Por favor, selecione outro servidor.",
"SiliconFlow API Key": "Chave API do SiliconFlow",
"SiliconFlow TTS Settings": "Configurações do SiliconFlow TTS",
"Speed: Range [0.25, 4.0], default is 1.0": "Velocidade: Intervalo [0.25, 4.0], o padrão é 1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Volume: Usa a configuração de Volume de Fala, o padrão 1.0 corresponde ao ganho 0",
"Hide Log": "Ocultar Log",
"Hide Basic Settings": "Ocultar Configurações Básicas\n\nOculto, o painel de configurações básicas não será exibido na página.\n\nSe precisar exibi-lo novamente, defina `hide_config = false` em `config.toml`",
"LLM Settings": "**Configurações do LLM**",
"Video Source Settings": "**Configurações da Fonte do Vídeo**"
}
}

105
webui/i18n/vi.json Normal file
View File

@@ -0,0 +1,105 @@
{
"Language": "Tiếng Việt",
"Translation": {
"Login Required": "Yêu cầu đăng nhập",
"Please login to access settings": "Vui lòng đăng nhập để truy cập cài đặt",
"Username": "Tên đăng nhập",
"Password": "Mật khẩu",
"Login": "Đăng nhập",
"Login Error": "Lỗi đăng nhập",
"Incorrect username or password": "Tên đăng nhập hoặc mật khẩu không chính xác",
"Please enter your username and password": "Vui lòng nhập tên đăng nhập và mật khẩu của bạn",
"Video Script Settings": "**Cài Đặt Kịch Bản Video**",
"Video Subject": "Chủ Đề Video (Cung cấp một từ khóa, :red[AI sẽ tự động tạo ra] kịch bản video)",
"Script Language": "Ngôn Ngữ cho Việc Tạo Kịch Bản Video (AI sẽ tự động xuất ra dựa trên ngôn ngữ của chủ đề của bạn)",
"Generate Video Script and Keywords": "Nhấn để sử dụng AI để tạo [Kịch Bản Video] và [Từ Khóa Video] dựa trên **chủ đề**",
"Auto Detect": "Tự Động Phát Hiện",
"Video Script": "Kịch Bản Video (:blue[① Tùy chọn, AI tạo ra ② Dấu câu chính xác giúp việc tạo phụ đề)",
"Generate Video Keywords": "Nhấn để sử dụng AI để tạo [Từ Khóa Video] dựa trên **kịch bản**",
"Please Enter the Video Subject": "Vui lòng Nhập Kịch Bản Video Trước",
"Generating Video Script and Keywords": "AI đang tạo kịch bản video và từ khóa...",
"Generating Video Keywords": "AI đang tạo từ khóa video...",
"Video Keywords": "Từ Khóa Video (:blue[① Tùy chọn, AI tạo ra ② Sử dụng dấu phẩy **Tiếng Anh** để phân tách, chỉ sử dụng Tiếng Anh])",
"Video Settings": "**Cài Đặt Video**",
"Video Concat Mode": "Chế Độ Nối Video",
"Random": "Nối Ngẫu Nhiên (Được Khuyến Nghị)",
"Sequential": "Nối Theo Thứ Tự",
"Video Transition Mode": "Chế Độ Chuyển Đổi Video",
"None": "Không Có Chuyển Đổi",
"Shuffle": "Chuyển Đổi Ngẫu Nhiên",
"FadeIn": "FadeIn",
"FadeOut": "FadeOut",
"SlideIn": "SlideIn",
"SlideOut": "SlideOut",
"Video Ratio": "Tỷ Lệ Khung Hình Video",
"Portrait": "Dọc 9:16",
"Landscape": "Ngang 16:9",
"Clip Duration": "Thời Lượng Tối Đa Của Đoạn Video (giây)",
"Number of Videos Generated Simultaneously": "Số Video Được Tạo Ra Đồng Thời",
"Audio Settings": "**Cài Đặt Âm Thanh**",
"Speech Synthesis": "Giọng Đọc Văn Bản",
"Speech Region": "Vùng(:red[Bắt Buộc[Lấy Vùng](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "Khóa API(:red[Bắt Buộc[Lấy Khóa API](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Âm Lượng Giọng Đọc (1.0 đại diện cho 100%)",
"Speech Rate": "Tốc độ đọc (1.0 biểu thị tốc độ gốc)",
"Male": "Nam",
"Female": "Nữ",
"Background Music": "Âm Nhạc Nền",
"No Background Music": "Không Có Âm Nhạc Nền",
"Random Background Music": "Âm Nhạc Nền Ngẫu Nhiên",
"Custom Background Music": "Âm Nhạc Nền Tùy Chỉnh",
"Custom Background Music File": "Vui lòng nhập đường dẫn tệp cho âm nhạc nền tùy chỉnh:",
"Background Music Volume": "Âm Lượng Âm Nhạc Nền (0.2 đại diện cho 20%, âm nhạc nền không nên quá to)",
"Subtitle Settings": "**Cài Đặt Phụ Đề**",
"Enable Subtitles": "Bật Phụ Đề (Nếu không chọn, các cài đặt dưới đây sẽ không có hiệu lực)",
"Font": "Phông Chữ Phụ Đề",
"Position": "Vị Trí Phụ Đề",
"Top": "Trên",
"Center": "Giữa",
"Bottom": "Dưới (Được Khuyến Nghị)",
"Custom": "Vị trí tùy chỉnh (70, chỉ ra là cách đầu trang 70%)",
"Font Size": "Cỡ Chữ Phụ Đề",
"Font Color": "Màu Chữ Phụ Đề",
"Stroke Color": "Màu Viền Phụ Đề",
"Stroke Width": "Độ Rộng Viền Phụ Đề",
"Generate Video": "Tạo Video",
"Video Script and Subject Cannot Both Be Empty": "Chủ Đề Video và Kịch Bản Video không thể cùng trống",
"Generating Video": "Đang tạo video, vui lòng đợi...",
"Start Generating Video": "Bắt Đầu Tạo Video",
"Video Generation Completed": "Hoàn Tất Tạo Video",
"Video Generation Failed": "Tạo Video Thất Bại",
"You can download the generated video from the following links": "Bạn có thể tải video được tạo ra từ các liên kết sau",
"Basic Settings": "**Cài Đặt Cơ Bản** (:blue[Nhấp để mở rộng])",
"Language": "Ngôn Ngữ",
"Pexels API Key": "Khóa API Pexels ([Lấy Khóa API](https://www.pexels.com/api/))",
"Pixabay API Key": "Khóa API Pixabay ([Lấy Khóa API](https://pixabay.com/api/docs/#api_search_videos))",
"LLM Provider": "Nhà Cung Cấp LLM",
"API Key": "Khóa API (:red[Bắt Buộc])",
"Base Url": "Url Cơ Bản",
"Account ID": "ID Tài Khoản (Lấy từ bảng điều khiển Cloudflare)",
"Model Name": "Tên Mô Hình",
"Please Enter the LLM API Key": "Vui lòng Nhập **Khóa API LLM**",
"Please Enter the Pexels API Key": "Vui lòng Nhập **Khóa API Pexels**",
"Please Enter the Pixabay API Key": "Vui lòng Nhập **Khóa API Pixabay**",
"Get Help": "Nếu bạn cần giúp đỡ hoặc có bất kỳ câu hỏi nào, bạn có thể tham gia discord để được giúp đỡ: https://harryai.cc",
"Video Source": "Nguồn Video",
"TikTok": "TikTok (Hỗ trợ TikTok sắp ra mắt)",
"Bilibili": "Bilibili (Hỗ trợ Bilibili sắp ra mắt)",
"Xiaohongshu": "Xiaohongshu (Hỗ trợ Xiaohongshu sắp ra mắt)",
"Local file": "Tệp cục bộ",
"Play Voice": "Phát Giọng Nói",
"Voice Example": "Đây là văn bản mẫu để kiểm tra tổng hợp giọng nói",
"Synthesizing Voice": "Đang tổng hợp giọng nói, vui lòng đợi...",
"TTS Provider": "Chọn nhà cung cấp tổng hợp giọng nói",
"TTS Servers": "Máy chủ TTS",
"No voices available for the selected TTS server. Please select another server.": "Không có giọng nói nào cho máy chủ TTS đã chọn. Vui lòng chọn máy chủ khác.",
"SiliconFlow API Key": "Khóa API SiliconFlow",
"SiliconFlow TTS Settings": "Cài đặt SiliconFlow TTS",
"Speed: Range [0.25, 4.0], default is 1.0": "Tốc độ: Phạm vi [0.25, 4.0], mặc định là 1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "Âm lượng: Sử dụng cài đặt Âm lượng Giọng nói, mặc định 1.0 tương ứng với tăng ích 0",
"Hide Log": "Ẩn Nhật Ký",
"Hide Basic Settings": "Ẩn Cài Đặt Cơ Bản\n\nẨn, thanh cài đặt cơ bản sẽ không hiển thị trên trang web.\n\nNếu bạn muốn hiển thị lại, vui lòng đặt `hide_config = false` trong `config.toml`",
"LLM Settings": "**Cài Đặt LLM**",
"Video Source Settings": "**Cài Đặt Nguồn Video**"
}
}
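Each locale file pairs a display `Language` name with a flat `Translation` map keyed by the English source string; a missing key simply falls back to English. A minimal sketch of loading and querying such a file, assuming hypothetical `load_locale`/`tr` helpers and a `webui/i18n/vi.json` path (names and path are illustrative, not taken from the repository):

```python
import json
from pathlib import Path

def load_locale(path: str) -> dict:
    """Read a locale JSON file of the form {"Language": ..., "Translation": {...}}."""
    with Path(path).open(encoding="utf-8") as f:
        return json.load(f)

def tr(locale: dict, key: str) -> str:
    """Look up a UI string; fall back to the English key if no translation exists."""
    return locale.get("Translation", {}).get(key, key)

# Hypothetical usage; the actual file layout in the project may differ.
locale = load_locale("webui/i18n/vi.json")
print(tr(locale, "Generate Video"))    # -> "Tạo Video"
print(tr(locale, "Some Missing Key"))  # -> falls back to the English key itself
```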

View File

@@ -1,6 +1,14 @@
{
"Language": "简体中文",
"Translation": {
"Login Required": "需要登录",
"Please login to access settings": "请登录后访问配置设置 (:gray[默认用户名: admin, 密码: admin, 您可以在 config.toml 中修改])",
"Username": "用户名",
"Password": "密码",
"Login": "登录",
"Login Error": "登录错误",
"Incorrect username or password": "用户名或密码不正确",
"Please enter your username and password": "请输入用户名和密码",
"Video Script Settings": "**文案设置**",
"Video Subject": "视频主题(给定一个关键词,:red[AI自动生成]视频文案)",
"Script Language": "生成视频脚本的语言一般情况AI会自动根据你输入的主题语言输出",
@@ -16,16 +24,24 @@
"Video Concat Mode": "视频拼接模式",
"Random": "随机拼接(推荐)",
"Sequential": "顺序拼接",
"Video Transition Mode": "视频转场模式",
"None": "无转场",
"Shuffle": "随机转场",
"FadeIn": "渐入",
"FadeOut": "渐出",
"SlideIn": "滑动入",
"SlideOut": "滑动出",
"Video Ratio": "视频比例",
"Portrait": "竖屏 9:16抖音视频",
"Landscape": "横屏 16:9西瓜视频",
"Clip Duration": "视频片段最大时长(秒)",
"Clip Duration": "视频片段最大时长(秒)**不是视频总长度**,是指每个**合成片段**的长度)",
"Number of Videos Generated Simultaneously": "同时生成视频数量",
"Audio Settings": "**音频设置**",
"Speech Synthesis": "朗读声音(:red[尽量与文案语言保持一致]",
"Speech Region": "服务区域(:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key(:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Synthesis": "朗读声音(:red[**与文案语言保持一致**。注意V2版效果更好但是需要API KEY]",
"Speech Region": "服务区域 (:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key (:red[必填,密钥1 或 密钥2 均可 [点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "朗读音量1.0表示100%",
"Speech Rate": "朗读速度1.0表示1倍速",
"Male": "男性",
"Female": "女性",
"Background Music": "背景音乐",
@@ -41,6 +57,7 @@
"Top": "顶部",
"Center": "中间",
"Bottom": "底部(推荐)",
"Custom": "自定义位置70表示离顶部70%的位置)",
"Font Size": "字幕大小",
"Font Color": "字幕颜色",
"Stroke Color": "描边颜色",
@@ -50,10 +67,12 @@
"Generating Video": "正在生成视频,请稍候...",
"Start Generating Video": "开始生成视频",
"Video Generation Completed": "视频生成完成",
"Video Generation Failed": "视频生成失败",
"You can download the generated video from the following links": "你可以从以下链接下载生成的视频",
"Basic Settings": "**基础设置** (:blue[点击展开])",
"Language": "界面语言",
"Pexels API Key": "Pexels API Key (:red[必填] [点击获取](https://www.pexels.com/api/))",
"Pexels API Key": "Pexels API Key ([点击获取](https://www.pexels.com/api/)) :red[推荐使用]",
"Pixabay API Key": "Pixabay API Key ([点击获取](https://pixabay.com/api/docs/#api_search_videos)) :red[可以不用配置,如果 Pexels 无法使用再选择Pixabay]",
"LLM Provider": "大模型提供商",
"API Key": "API Key (:red[必填,需要到大模型提供商的后台申请])",
"Base Url": "Base Url (可选)",
@@ -61,6 +80,26 @@
"Model Name": "模型名称 (:blue[需要到大模型提供商的后台确认被授权的模型名称])",
"Please Enter the LLM API Key": "请先填写大模型 **API Key**",
"Please Enter the Pexels API Key": "请先填写 **Pexels API Key**",
"Get Help": "有任何问题或建议,可以加入 **微信群** 求助或讨论https://harryai.cc"
"Please Enter the Pixabay API Key": "请先填写 **Pixabay API Key**",
"Get Help": "有任何问题或建议,可以加入 **微信群** 求助或讨论https://harryai.cc",
"Video Source": "视频来源",
"TikTok": "抖音 (TikTok 支持中,敬请期待)",
"Bilibili": "哔哩哔哩 (Bilibili 支持中,敬请期待)",
"Xiaohongshu": "小红书 (Xiaohongshu 支持中,敬请期待)",
"Local file": "本地文件",
"Play Voice": "试听语音合成",
"Voice Example": "这是一段测试语音合成的示例文本",
"Synthesizing Voice": "语音合成中,请稍候...",
"TTS Provider": "语音合成提供商",
"TTS Servers": "TTS服务器",
"No voices available for the selected TTS server. Please select another server.": "当前选择的TTS服务器没有可用的声音请选择其他服务器。",
"SiliconFlow API Key": "硅基流动API密钥 [点击获取](https://cloud.siliconflow.cn/account/ak)",
"SiliconFlow TTS Settings": "硅基流动TTS设置",
"Speed: Range [0.25, 4.0], default is 1.0": "语速范围 [0.25, 4.0]默认值为1.0",
"Volume: Uses Speech Volume setting, default 1.0 maps to gain 0": "音量使用朗读音量设置默认值1.0对应增益0",
"Hide Log": "隐藏日志",
"Hide Basic Settings": "隐藏基础设置\n\n隐藏后基础设置面板将不会显示在页面中。\n\n如需要再次显示请在 `config.toml` 中设置 `hide_config = false`",
"LLM Settings": "**大模型设置**",
"Video Source Settings": "**视频源设置**"
}
}
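Both locale files describe the same behaviour for "Hide Basic Settings": once hidden, the basic-settings panel only reappears after setting `hide_config = false` in `config.toml`. A minimal sketch of reading that flag at startup, assuming `hide_config` sits at the top level of the file (the real config may nest it under a section):

```python
import tomllib  # Python 3.11+; use the third-party "tomli" package on older versions
from pathlib import Path

def basic_settings_hidden(config_path: str = "config.toml") -> bool:
    """Return True when the basic-settings panel should be hidden from the web UI."""
    cfg_file = Path(config_path)
    if not cfg_file.exists():
        return False  # no config file -> show the panel by default
    with cfg_file.open("rb") as f:
        cfg = tomllib.load(f)
    # Assumed top-level key; set `hide_config = false` to show the panel again.
    return bool(cfg.get("hide_config", False))

if __name__ == "__main__":
    print("hide basic settings:", basic_settings_hidden())
```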