init

2024-03-11 16:37:49 +08:00
parent d4f7b53b84
commit 06df797234
71 changed files with 2725 additions and 1 deletions
--- a/app/init.py
+++ b/app/init.py
--- a/app/asgi.py
+++ b/app/asgi.py
@@ -0,0 +1,60 @@
+"""Application implementation - ASGI."""
+
+from fastapi import FastAPI, Request
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse
+from loguru import logger
+from fastapi.staticfiles import StaticFiles
+
+from app.config import config
+from app.models.exception import HttpException
+from app.router import root_api_router
+from app.utils import utils
+
+
+def exception_handler(request: Request, e: HttpException):
+    return JSONResponse(
+        status_code=e.status_code,
+        content=utils.get_response(e.status_code, e.data, e.message),
+    )
+
+
+def validation_exception_handler(request: Request, e: RequestValidationError):
+    return JSONResponse(
+        status_code=400,
+        content=utils.get_response(status=400, data=e.errors(), message='field required'),
+    )
+
+
+def get_application() -> FastAPI:
+    """Initialize FastAPI application.
+
+    Returns:
+       FastAPI: Application object instance.
+
+    """
+    instance = FastAPI(
+        title=config.project_name,
+        description=config.project_description,
+        version=config.project_version,
+        debug=False,
+    )
+    instance.include_router(root_api_router)
+    instance.add_exception_handler(HttpException, exception_handler)
+    instance.add_exception_handler(RequestValidationError, validation_exception_handler)
+    return instance
+
+
+# Module-level ASGI application object.
+app = get_application()
+public_dir = utils.public_dir()
+# Serve the files under public_dir at "/" (html=True). This mount is added
+# after get_application() has already included the API router.
+app.mount("/", StaticFiles(directory=public_dir, html=True), name="")
+
+
+@app.on_event("shutdown")
+def shutdown_event():
+    logger.info("shutdown event")
+
+
+@app.on_event("startup")
+def startup_event():
+    logger.info("startup event")
--- a/app/config/init.py
+++ b/app/config/init.py
@@ -0,0 +1,51 @@
+import os
+import sys
+
+from loguru import logger
+
+from app.config import config
+from app.utils import utils
+
+
+def __init_logger():
+    _log_file = utils.storage_dir("logs/server.log")
+    _lvl = config.log_level
+    root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+
+    def format_record(record):
+        # 获取日志记录中的文件全路径
+        file_path = record["file"].path
+        # 将绝对路径转换为相对于项目根目录的路径
+        relative_path = os.path.relpath(file_path, root_dir)
+        # 更新记录中的文件路径
+        record["file"].path = f"./{relative_path}"
+        # 返回修改后的格式字符串
+        # 您可以根据需要调整这里的格式
+        _format = '<green>{time:%Y-%m-%d %H:%M:%S}</> | ' + \
+                  '<level>{level}</> | ' + \
+                  '"{file.path}:{line}":<blue> {function}</> ' + \
+                  '- <level>{message}</>' + "\n"
+        return _format
+
+    logger.remove()
+
+    logger.add(
+        sys.stdout,
+        level=_lvl,
+        format=format_record,
+        colorize=True,
+    )
+
+    logger.add(
+        _log_file,
+        level=_lvl,
+        format=format_record,
+        rotation="00:00",
+        retention="3 days",
+        backtrace=True,
+        diagnose=True,
+        enqueue=True,
+    )
+
+
+__init_logger()
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -0,0 +1,31 @@
+import os
+
+import tomli
+from loguru import logger
+
+root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+config_file = f"{root_dir}/config.toml"
+logger.info(f"load config from file: {config_file}")
+
+with open(config_file, mode="rb") as fp:
+    _cfg = tomli.load(fp)
+
+app = _cfg.get("app", {})
+whisper = _cfg.get("whisper", {})
+
+hostname = os.uname().nodename
+
+log_level = _cfg.get("log_level", "DEBUG")
+listen_host = _cfg.get("listen_host", "0.0.0.0")
+listen_port = _cfg.get("listen_port", 8080)
+project_name = _cfg.get("project_name", "MoneyPrinterTurbo")
+project_description = _cfg.get("project_description", "MoneyPrinterTurbo\n by 抖音-网旭哈瑞.AI")
+project_version = _cfg.get("project_version", "1.0.0")
+reload_debug = False
+
+__cfg = {
+    "hostname": hostname,
+    "listen_host": listen_host,
+    "listen_port": listen_port,
+}
+logger.info(__cfg)
--- a/app/controllers/base.py
+++ b/app/controllers/base.py
@@ -0,0 +1,27 @@
+from uuid import uuid4
+
+from fastapi import Request
+
+from app.config import config
+from app.models.exception import HttpException
+
+
+def get_task_id(request: Request):
+    task_id = request.headers.get('x-task-id')
+    if not task_id:
+        task_id = uuid4()
+    return str(task_id)
+
+
+def get_api_key(request: Request):
+    api_key = request.headers.get('x-api-key')
+    return api_key
+
+
+def verify_token(request: Request):
+    token = get_api_key(request)
+    if token != config.app.get("api_key", ""):
+        request_id = get_task_id(request)
+        request_url = request.url
+        user_agent = request.headers.get('user-agent')
+        raise HttpException(task_id=request_id, status_code=401, message=f"invalid token: {request_url}, {user_agent}")
--- a/app/controllers/ping.py
+++ b/app/controllers/ping.py
@@ -0,0 +1,9 @@
+from fastapi import APIRouter
+from fastapi import Request
+
+router = APIRouter()
+
+
+@router.get("/ping", tags=["Health Check"], description="检查服务可用性", response_description="pong")
+def ping(request: Request) -> str:
+    return "pong"
--- a/app/controllers/v1/base.py
+++ b/app/controllers/v1/base.py
@@ -0,0 +1,11 @@
+from fastapi import APIRouter, Depends
+
+
+def new_router(dependencies=None):
+    router = APIRouter()
+    router.tags = ['V1']
+    router.prefix = '/api/v1'
+    # 将认证依赖项应用于所有路由
+    if dependencies:
+        router.dependencies = dependencies
+    return router
--- a/app/controllers/v1/video.py
+++ b/app/controllers/v1/video.py
@@ -0,0 +1,44 @@
+from os import path
+
+from fastapi import Request, Depends, Path
+from loguru import logger
+
+from app.controllers import base
+from app.controllers.v1.base import new_router
+from app.models.exception import HttpException
+from app.models.schema import TaskVideoRequest, TaskQueryResponse, TaskResponse, TaskQueryRequest
+from app.services import task as tm
+from app.utils import utils
+
+# Authentication dependency (currently disabled):
+# router = new_router(dependencies=[Depends(base.verify_token)])
+router = new_router()
+
+
+@router.post("/videos", response_model=TaskResponse, summary="使用主题来生成短视频")
+async def create_video(request: Request, body: TaskVideoRequest):
+    task_id = utils.get_uuid()
+    request_id = base.get_task_id(request)
+    try:
+        task = {
+            "task_id": task_id,
+            "request_id": request_id,
+        }
+        body_dict = body.dict()
+        task.update(body_dict)
+        result = tm.start(task_id=task_id, params=body)
+        task["result"] = result
+        logger.success(f"video created: {utils.to_json(task)}")
+        return utils.get_response(200, task)
+    except ValueError as e:
+        raise HttpException(task_id=task_id, status_code=400, message=f"{request_id}: {str(e)}")
+
+
+@router.get("/tasks/{task_id}", response_model=TaskQueryResponse, summary="查询任务状态")
+async def get_task(request: Request, task_id: str = Path(..., description="任务ID"),
+                   query: TaskQueryRequest = Depends()):
+    request_id = base.get_task_id(request)
+    data = query.dict()
+    data["task_id"] = task_id
+    raise HttpException(task_id=task_id, status_code=404,
+                        message=f"{request_id}: task not found", data=data)
--- a/app/models/init.py
+++ b/app/models/init.py
--- a/app/models/const.py
+++ b/app/models/const.py
@@ -0,0 +1,4 @@
+punctuations = [
+    "?", ",", ".", "、", ";",
+    "？", "，", "。", "、", "；",
+]
--- a/app/models/exception.py
+++ b/app/models/exception.py
@@ -0,0 +1,26 @@
+import traceback
+from typing import Any
+
+from loguru import logger
+
+
+class HttpException(Exception):
+    def __init__(self, task_id: str, status_code: int, message: str = '', data: Any = None):
+        self.message = message
+        self.status_code = status_code
+        self.data = data
+        # 获取异常堆栈信息
+        tb_str = traceback.format_exc().strip()
+        if not tb_str or tb_str == "NoneType: None":
+            msg = f'HttpException: {status_code}, {task_id}, {message}'
+        else:
+            msg = f'HttpException: {status_code}, {task_id}, {message}\n{tb_str}'
+
+        if status_code == 400:
+            logger.warning(msg)
+        else:
+            logger.error(msg)
+
+
+class FileNotFoundException(Exception):
+    pass
--- a/app/models/schema.py
+++ b/app/models/schema.py
@@ -0,0 +1,119 @@
+from enum import Enum
+from typing import Any, Optional
+
+from pydantic import BaseModel
+import warnings
+
+# Ignore a specific Pydantic warning (field name shadowing a parent attribute)
+warnings.filterwarnings("ignore", category=UserWarning, message="Field name.*shadows an attribute in parent.*")
+
+
+class VideoAspect(str, Enum):
+    landscape = "16:9"
+    portrait = "9:16"
+    square = "1:1"
+
+    def to_resolution(self):
+        if self == VideoAspect.landscape.value:
+            return 1920, 1080
+        elif self == VideoAspect.portrait.value:
+            return 1080, 1920
+        elif self == VideoAspect.square.value:
+            return 1080, 1080
+        return 1080, 1920
+
+
+# Selectable voice names in the form "<gender>-<locale>[-<region>]-<voice>".
+# Order matters: the first entry is used as the default voice elsewhere.
+VoiceNames = [
+    # zh-CN voices
+    "female-zh-CN-XiaoxiaoNeural",
+    "female-zh-CN-XiaoyiNeural",
+    "female-zh-CN-liaoning-XiaobeiNeural",
+    "female-zh-CN-shaanxi-XiaoniNeural",
+
+    "male-zh-CN-YunjianNeural",
+    "male-zh-CN-YunxiNeural",
+    "male-zh-CN-YunxiaNeural",
+    "male-zh-CN-YunyangNeural",
+
+    # zh-HK / zh-TW voices, currently disabled:
+    # "female-zh-HK-HiuGaaiNeural",
+    # "female-zh-HK-HiuMaanNeural",
+    # "male-zh-HK-WanLungNeural",
+    #
+    # "female-zh-TW-HsiaoChenNeural",
+    # "female-zh-TW-HsiaoYuNeural",
+    # "male-zh-TW-YunJheNeural",
+
+    # en-US voices
+
+    "female-en-US-AnaNeural",
+    "female-en-US-AriaNeural",
+    "female-en-US-AvaNeural",
+    "female-en-US-EmmaNeural",
+    "female-en-US-JennyNeural",
+    "female-en-US-MichelleNeural",
+
+    "male-en-US-AndrewNeural",
+    "male-en-US-BrianNeural",
+    "male-en-US-ChristopherNeural",
+    "male-en-US-EricNeural",
+    "male-en-US-GuyNeural",
+    "male-en-US-RogerNeural",
+    "male-en-US-SteffanNeural",
+]
+
+
+class VideoParams:
+    """Parameters of a video generation request.
+
+    Example payload (every field except ``video_subject`` has a default):
+
+    {
+      "video_subject": "",
+      "video_aspect": "9:16",
+      "voice_name": "female-zh-CN-XiaoxiaoNeural",
+      "bgm_name": "random",
+      "font_name": "STHeitiMedium.ttc",
+      "text_fore_color": "#FFFFFF",
+      "font_size": 60,
+      "stroke_color": "#000000",
+      "stroke_width": 1.5,
+      "n_threads": 2,
+      "paragraph_number": 1
+    }
+    """
+    video_subject: str
+    # NOTE(review): the default is the raw string value ("9:16"), not the enum member.
+    video_aspect: Optional[VideoAspect] = VideoAspect.portrait.value
+    voice_name: Optional[str] = VoiceNames[0]
+    bgm_name: Optional[str] = "random"
+    font_name: Optional[str] = "STHeitiMedium.ttc"
+    text_fore_color: Optional[str] = "#FFFFFF"
+    font_size: int = 60
+    stroke_color: Optional[str] = "#000000"
+    stroke_width: float = 1.5
+    n_threads: Optional[int] = 2
+    paragraph_number: Optional[int] = 1
+
+
+# Common response envelope: numeric status, message and an arbitrary data payload.
+class BaseResponse(BaseModel):
+    status: int = 200
+    message: Optional[str] = 'success'
+    data: Any = None
+
+
+# Request body for creating a video; declares no fields of its own beyond
+# what it inherits from VideoParams and BaseModel.
+class TaskVideoRequest(VideoParams, BaseModel):
+    pass
+
+
+# Query parameters for the task status lookup; currently declares no fields.
+class TaskQueryRequest(BaseModel):
+    pass
+
+
+######################################################################################################
+######################################################################################################
+######################################################################################################
+######################################################################################################
+# Response for task creation: the BaseResponse envelope with a typed data payload.
+class TaskResponse(BaseResponse):
+    # Payload shape: required task_id plus an optional task_type (default "").
+    class TaskResponseData(BaseModel):
+        task_id: str
+        task_type: str = ""
+
+    data: TaskResponseData
+
+
+# Response for the task status lookup; adds nothing to the BaseResponse envelope.
+class TaskQueryResponse(BaseResponse):
+    pass
--- a/app/router.py
+++ b/app/router.py
@@ -0,0 +1,15 @@
+"""Application configuration - root APIRouter.
+
+Defines all FastAPI application endpoints.
+
+Resources:
+    1. https://fastapi.tiangolo.com/tutorial/bigger-applications
+
+"""
+from fastapi import APIRouter
+
+from app.controllers.v1 import video
+
+# Root router that aggregates the application's endpoint routers.
+root_api_router = APIRouter()
+# v1
+# NOTE(review): only the v1 video router is included here; the router defined
+# in app/controllers/ping.py is not - confirm whether that is intentional.
+root_api_router.include_router(video.router)
--- a/app/services/init.py
+++ b/app/services/init.py
--- a/app/services/gpt.py
+++ b/app/services/gpt.py
@@ -0,0 +1,152 @@
+import logging
+import re
+import json
+import openai
+from typing import List
+from loguru import logger
+
+from app.config import config
+
+openai_api_key = config.app.get("openai_api_key")
+if not openai_api_key:
+    raise ValueError("openai_api_key is not set, please set it in the config.toml file.")
+
+openai_model_name = config.app.get("openai_model_name")
+if not openai_model_name:
+    raise ValueError("openai_model_name is not set, please set it in the config.toml file.")
+
+openai_base_url = config.app.get("openai_base_url")
+
+openai.api_key = openai_api_key
+openai_model_name = openai_model_name
+if openai_base_url:
+    openai.base_url = openai_base_url
+
+
+def _generate_response(prompt: str) -> str:
+    model_name = openai_model_name
+
+    response = openai.chat.completions.create(
+        model=model_name,
+        messages=[{"role": "user", "content": prompt}],
+    ).choices[0].message.content
+    return response
+
+
+def generate_script(video_subject: str, language: str = "zh-CN", paragraph_number: int = 1) -> str:
+    prompt = f"""
+# Role: Video Script Generator
+
+## Goals:
+Generate a script for a video, depending on the subject of the video.
+
+## Constrains:
+1. the script is to be returned as a string with the specified number of paragraphs.
+2. do not under any circumstance reference this prompt in your response.
+3. get straight to the point, don't start with unnecessary things like, "welcome to this video".
+4. you must not include any type of markdown or formatting in the script, never use a title. 
+5. only return the raw content of the script. 
+6. do not include "voiceover", "narrator" or similar indicators of what should be spoken at the beginning of each paragraph or line. 
+7. you must not mention the prompt, or anything about the script itself. also, never talk about the amount of paragraphs or lines. just write the script.
+
+## Output Example:
+What is the meaning of life. This question has puzzled philosophers.
+
+# Initialization:
+- video subject: {video_subject}
+- output language: {language}
+- number of paragraphs: {paragraph_number}
+""".strip()
+
+    final_script = ""
+    logger.info(f"subject: {video_subject}")
+    logger.debug(f"prompt: \n{prompt}")
+    response = _generate_response(prompt=prompt)
+
+    # Return the generated script
+    if response:
+        # Clean the script
+        # Remove asterisks, hashes
+        response = response.replace("*", "")
+        response = response.replace("#", "")
+
+        # Remove markdown syntax
+        response = re.sub(r"\[.*\]", "", response)
+        response = re.sub(r"\(.*\)", "", response)
+
+        # Split the script into paragraphs
+        paragraphs = response.split("\n\n")
+
+        # Select the specified number of paragraphs
+        selected_paragraphs = paragraphs[:paragraph_number]
+
+        # Join the selected paragraphs into a single string
+        final_script = "\n\n".join(selected_paragraphs)
+
+        # Print to console the number of paragraphs used
+        # logger.info(f"number of paragraphs used: {len(selected_paragraphs)}")
+    else:
+        logging.error("gpt returned an empty response")
+
+    logger.success(f"completed: \n{final_script}")
+    return final_script
+
+
+def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
+    prompt = f"""
+# Role: Video Search Terms Generator
+
+## Goals:
+Generate {amount} search terms for stock videos, depending on the subject of a video.
+
+## Constrains:
+1. the search terms are to be returned as a json-array of strings.
+2. each search term should consist of 1-3 words, always add the main subject of the video.
+3. you must only return the json-array of strings. you must not return anything else. you must not return the script.
+4. the search terms must be related to the subject of the video.
+5. reply with english search terms only.
+
+## Output Example:
+["search term 1", "search term 2", "search term 3","search term 4","search term 5"]
+
+## Context:
+### Video Subject
+{video_subject}
+
+### Video Script
+{video_script}
+""".strip()
+
+    logger.info(f"subject: {video_subject}")
+    logger.debug(f"prompt: \n{prompt}")
+    response = _generate_response(prompt)
+    search_terms = []
+
+    try:
+        search_terms = json.loads(response)
+        if not isinstance(search_terms, list) or not all(isinstance(term, str) for term in search_terms):
+            raise ValueError("response is not a list of strings.")
+
+    except (json.JSONDecodeError, ValueError):
+        # logger.warning(f"gpt returned an unformatted response. attempting to clean...")
+        # Attempt to extract list-like string and convert to list
+        match = re.search(r'\["(?:[^"\\]|\\.)*"(?:,\s*"[^"\\]*")*\]', response)
+        if match:
+            try:
+                search_terms = json.loads(match.group())
+            except json.JSONDecodeError:
+                logger.error(f"could not parse response: {response}")
+                return []
+
+    logger.success(f"completed: \n{search_terms}")
+    return search_terms
+
+
+if __name__ == "__main__":
+    video_subject = "生命的意义是什么"
+    script = generate_script(video_subject=video_subject, language="zh-CN", paragraph_number=1)
+    # print("######################")
+    # print(script)
+    search_terms = generate_terms(video_subject=video_subject, video_script=script, amount=5)
+    # print("######################")
+    # print(search_terms)
--- a/app/services/material.py
+++ b/app/services/material.py
@@ -0,0 +1,112 @@
+import time
+
+import requests
+from typing import List
+from loguru import logger
+
+from app.config import config
+from app.models.schema import VideoAspect
+from app.utils import utils
+
+# Counter used by round_robin_api_key() to rotate through the API keys.
+requested_count = 0
+# Pexels API keys from the "app" config table; at least one is required.
+pexels_api_keys = config.app.get("pexels_api_keys")
+if not pexels_api_keys:
+    raise ValueError("pexels_api_keys is not set, please set it in the config.toml file.")
+
+
+def round_robin_api_key():
+    global requested_count
+    requested_count += 1
+    return pexels_api_keys[requested_count % len(pexels_api_keys)]
+
+
+def search_videos(search_term: str,
+                  wanted_count: int,
+                  minimum_duration: int,
+                  video_aspect: VideoAspect = VideoAspect.portrait,
+                  locale: str = "zh-CN"
+                  ) -> List[str]:
+    aspect = VideoAspect(video_aspect)
+    video_orientation = aspect.name
+    video_width, video_height = aspect.to_resolution()
+
+    headers = {
+        "Authorization": round_robin_api_key()
+    }
+
+    # Build URL
+    query_url = f"https://api.pexels.com/videos/search?query={search_term}&per_page=15&orientation={video_orientation}&locale={locale}"
+    logger.info(f"searching videos: {query_url}")
+    # Send the request
+    r = requests.get(query_url, headers=headers)
+
+    # Parse the response
+    response = r.json()
+    video_urls = []
+
+    try:
+        videos_count = min(len(response["videos"]), wanted_count)
+        # loop through each video in the result
+        for i in range(videos_count):
+            # check if video has desired minimum duration
+            if response["videos"][i]["duration"] < minimum_duration:
+                continue
+            video_files = response["videos"][i]["video_files"]
+            # loop through each url to determine the best quality
+            for video in video_files:
+                # Check if video has a valid download link
+                # if ".com/external" in video["link"]:
+                w = int(video["width"])
+                h = int(video["height"])
+                if w == video_width and h == video_height:
+                    video_urls.append(video["link"])
+                    break
+
+    except Exception as e:
+        logger.error(f"search videos failed: {e}")
+
+    return video_urls
+
+
+def save_video(video_url: str, save_dir: str) -> str:
+    video_id = f"vid-{str(int(time.time() * 1000))}"
+    video_path = f"{save_dir}/{video_id}.mp4"
+    with open(video_path, "wb") as f:
+        f.write(requests.get(video_url).content)
+
+    return video_path
+
+
+def download_videos(task_id: str,
+                    search_terms: List[str],
+                    video_aspect: VideoAspect = VideoAspect.portrait,
+                    wanted_count: int = 15,
+                    minimum_duration: int = 5
+                    ) -> List[str]:
+    valid_video_urls = []
+    for search_term in search_terms:
+        # logger.info(f"searching videos for '{search_term}'")
+        video_urls = search_videos(search_term=search_term,
+                                   wanted_count=wanted_count,
+                                   minimum_duration=minimum_duration,
+                                   video_aspect=video_aspect)
+        logger.info(f"found {len(video_urls)} videos for '{search_term}'")
+        i = 0
+        for url in video_urls:
+            if url not in valid_video_urls:
+                valid_video_urls.append(url)
+                i += 1
+                if i >= 3:
+                    break
+
+    logger.info(f"downloading videos: {len(valid_video_urls)}")
+    video_paths = []
+    save_dir = utils.task_dir(task_id)
+    for video_url in valid_video_urls:
+        try:
+            saved_video_path = save_video(video_url, save_dir)
+            video_paths.append(saved_video_path)
+        except Exception as e:
+            logger.error(f"failed to download video: {video_url}, {e}")
+    logger.success(f"downloaded {len(video_paths)} videos")
+    return video_paths
--- a/app/services/subtitle.py
+++ b/app/services/subtitle.py
@@ -0,0 +1,167 @@
+import json
+import re
+
+from faster_whisper import WhisperModel
+from timeit import default_timer as timer
+from loguru import logger
+
+from app.config import config
+from app.models import const
+from app.utils import utils
+
+# Whisper settings from the "whisper" config table, with their defaults.
+model_size = config.whisper.get("model_size", "large-v3")
+device = config.whisper.get("device", "cpu")
+compute_type = config.whisper.get("compute_type", "int8")
+
+# The model object is created once, when this module is imported.
+model = WhisperModel(model_size_or_path=model_size, device=device, compute_type=compute_type)
+
+
+def create(audio_file, subtitle_file: str = ""):
+    """Transcribe ``audio_file`` with the Whisper model and write a subtitle file.
+
+    Words are accumulated into a sentence until a word containing punctuation
+    (per ``utils.str_contains_punctuation``) is met; the sentence is then
+    recorded with the start time of its first word and the end time of its
+    last word. The recorded sentences are formatted with
+    ``utils.text_to_srt`` and written to ``subtitle_file``.
+
+    Args:
+        audio_file: Audio file passed to ``model.transcribe``.
+        subtitle_file: Output path; defaults to ``f"{audio_file}.srt"`` when empty.
+    """
+    # NOTE(review): this log line runs before the default path is applied, so
+    # it prints an empty output file when subtitle_file was not given.
+    logger.info(f"start, output file: {subtitle_file}")
+    if not subtitle_file:
+        subtitle_file = f"{audio_file}.srt"
+
+    segments, info = model.transcribe(
+        audio_file,
+        beam_size=5,
+        word_timestamps=True,
+        vad_filter=True,
+        vad_parameters=dict(min_silence_duration_ms=500),
+    )
+
+    logger.info(f"detected language: '{info.language}', probability: {info.language_probability:.2f}")
+
+    start = timer()
+    subtitles = []
+
+    def recognized(seg_text, seg_start, seg_end):
+        # Record one subtitle entry; whitespace-only text is ignored.
+        seg_text = seg_text.strip()
+        if not seg_text:
+            return
+
+        msg = "[%.2fs -> %.2fs] %s" % (seg_start, seg_end, seg_text)
+        logger.debug(msg)
+
+        subtitles.append({
+            "msg": seg_text,
+            "start_time": seg_start,
+            "end_time": seg_end
+        })
+
+    for segment in segments:
+        words_idx = 0
+        words_len = len(segment.words)
+
+        seg_start = 0
+        seg_end = 0
+        seg_text = ""
+
+        if segment.words:
+            # is_segmented is True while a sentence is being accumulated.
+            is_segmented = False
+            for word in segment.words:
+                if not is_segmented:
+                    seg_start = word.start
+                    is_segmented = True
+
+                seg_end = word.end
+                # If the word contains punctuation, the sentence ends here
+                seg_text += word.word
+
+                if utils.str_contains_punctuation(word.word):
+                    # remove last char
+                    # NOTE(review): assumes the punctuation is the last character of the word - confirm.
+                    seg_text = seg_text[:-1]
+                    if not seg_text:
+                        # NOTE(review): this continue skips the words_idx increment below.
+                        continue
+
+                    recognized(seg_text, seg_start, seg_end)
+
+                    is_segmented = False
+                    seg_text = ""
+
+                if words_idx == 0 and segment.start < word.start:
+                    seg_start = word.start
+                if words_idx == (words_len - 1) and segment.end > word.end:
+                    seg_end = word.end
+                words_idx += 1
+
+        # Flush the text left over after the last punctuation of the segment.
+        if not seg_text:
+            continue
+
+        recognized(seg_text, seg_start, seg_end)
+
+    end = timer()
+
+    diff = end - start
+    logger.info(f"complete, elapsed: {diff:.2f} s")
+
+    # Number the entries from 1 and format each one as a subtitle block.
+    idx = 1
+    lines = []
+    for subtitle in subtitles:
+        text = subtitle.get("msg")
+        if text:
+            lines.append(utils.text_to_srt(idx, text, subtitle.get("start_time"), subtitle.get("end_time")))
+            idx += 1
+
+    sub = "\n".join(lines)
+    with open(subtitle_file, "w") as f:
+        f.write(sub)
+    logger.info(f"subtitle file created: {subtitle_file}")
+
+
+def file_to_subtitles(filename):
+    times_texts = []
+    current_times = None
+    current_text = ""
+    index = 0
+    with open(filename, 'r') as f:
+        for line in f:
+            times = re.findall("([0-9]*:[0-9]*:[0-9]*,[0-9]*)", line)
+            if times:
+                current_times = line
+            elif line.strip() == '' and current_times:
+                index += 1
+                times_texts.append((index, current_times.strip(), current_text.strip()))
+                current_times, current_text = None, ""
+            elif current_times:
+                current_text += line
+    return times_texts
+
+
+def correct(subtitle_file, video_script):
+    subtitle_items = file_to_subtitles(subtitle_file)
+    script_lines = utils.split_string_by_punctuations(video_script)
+
+    corrected = False
+    if len(subtitle_items) == len(script_lines):
+        for i in range(len(script_lines)):
+            script_line = script_lines[i].strip()
+            subtitle_line = subtitle_items[i][2]
+            if script_line != subtitle_line:
+                logger.warning(f"line {i + 1}, script: {script_line}, subtitle: {subtitle_line}")
+                subtitle_items[i] = (subtitle_items[i][0], subtitle_items[i][1], script_line)
+                corrected = True
+
+    if corrected:
+        with open(subtitle_file, "w") as fd:
+            for item in subtitle_items:
+                fd.write(f"{item[0]}\n{item[1]}\n{item[2]}\n\n")
+        logger.info(f"subtitle corrected")
+    else:
+        logger.success(f"subtitle is correct")
+
+
+if __name__ == "__main__":
+    task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072"
+    task_dir = utils.task_dir(task_id)
+    subtitle_file = f"{task_dir}/subtitle.srt"
+
+    subtitles = file_to_subtitles(subtitle_file)
+    print(subtitles)
+
+    script_file = f"{task_dir}/script.json"
+    with open(script_file, "r") as f:
+        script_content = f.read()
+    s = json.loads(script_content)
+    script = s.get("script")
+
+    correct(subtitle_file, script)
--- a/app/services/task.py
+++ b/app/services/task.py
@@ -0,0 +1,113 @@
+from os import path
+
+from loguru import logger
+
+from app.config import config
+from app.models.schema import VideoParams, VoiceNames
+from app.services import gpt, material, voice, video, subtitle
+from app.utils import utils
+
+
+def _parse_voice(name: str):
+    # "female-zh-CN-XiaoxiaoNeural",
+    # remove first part split by "-"
+    if name not in VoiceNames:
+        name = VoiceNames[0]
+
+    parts = name.split("-")
+    _lang = f"{parts[1]}-{parts[2]}"
+    _voice = f"{_lang}-{parts[3]}"
+
+    return _voice, _lang
+
+
+def start(task_id, params: VideoParams):
+    """
+    {
+        "video_subject": "",
+        "video_aspect": "横屏 16:9（西瓜视频）",
+        "voice_name": "女生-晓晓",
+        "enable_bgm": false,
+        "font_name": "STHeitiMedium 黑体-中",
+        "text_color": "#FFFFFF",
+        "font_size": 60,
+        "stroke_color": "#000000",
+        "stroke_width": 1.5
+    }
+    """
+    logger.info(f"start task: {task_id}")
+    video_subject = params.video_subject
+    voice_name, language = _parse_voice(params.voice_name)
+    paragraph_number = params.paragraph_number
+    n_threads = params.n_threads
+
+    logger.info("\n\n## generating video script")
+    script = gpt.generate_script(video_subject=video_subject, language=language, paragraph_number=paragraph_number)
+
+    logger.info("\n\n## generating video terms")
+    search_terms = gpt.generate_terms(video_subject=video_subject, video_script=script, amount=5)
+
+    script_file = path.join(utils.task_dir(task_id), f"script.json")
+    script_data = {
+        "script": script,
+        "search_terms": search_terms
+    }
+
+    with open(script_file, "w") as f:
+        f.write(utils.to_json(script_data))
+
+    audio_file = path.join(utils.task_dir(task_id), f"audio.mp3")
+    subtitle_path = path.join(utils.task_dir(task_id), f"subtitle.srt")
+
+    logger.info("\n\n## generating audio")
+    sub_maker = voice.tts(text=script, voice_name=voice_name, voice_file=audio_file)
+
+    subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
+    logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
+    if subtitle_provider == "edge":
+        voice.create_subtitle(text=script, sub_maker=sub_maker, subtitle_file=subtitle_path)
+    if subtitle_provider == "whisper":
+        subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
+        logger.info("\n\n## correcting subtitle")
+        subtitle.correct(subtitle_file=subtitle_path, video_script=script)
+
+    logger.info("\n\n## downloading videos")
+    video_paths = material.download_videos(task_id=task_id, search_terms=search_terms, video_aspect=params.video_aspect,
+                                           wanted_count=20,
+                                           minimum_duration=5)
+
+    logger.info("\n\n## combining videos")
+    combined_video_path = path.join(utils.task_dir(task_id), f"combined.mp4")
+    video.combine_videos(combined_video_path=combined_video_path,
+                         video_paths=video_paths,
+                         audio_file=audio_file,
+                         video_aspect=params.video_aspect,
+                         max_clip_duration=5,
+                         threads=n_threads)
+
+    final_video_path = path.join(utils.task_dir(task_id), f"final.mp4")
+
+    bgm_file = video.get_bgm_file(bgm_name=params.bgm_name)
+    logger.info("\n\n## generating video")
+    # Put everything together
+    video.generate_video(video_path=combined_video_path,
+                         audio_path=audio_file,
+                         subtitle_path=subtitle_path,
+                         output_file=final_video_path,
+
+                         video_aspect=params.video_aspect,
+
+                         threads=n_threads,
+
+                         font_name=params.font_name,
+                         fontsize=params.font_size,
+                         text_fore_color=params.text_fore_color,
+                         stroke_color=params.stroke_color,
+                         stroke_width=params.stroke_width,
+
+                         bgm_file=bgm_file
+                         )
+    logger.start(f"task {task_id} finished")
+    return {
+        "video_file": final_video_path,
+    }
--- a/app/services/video.py
+++ b/app/services/video.py
@@ -0,0 +1,246 @@
+import glob
+import random
+from typing import List
+from PIL import ImageFont
+from loguru import logger
+from moviepy.editor import *
+from moviepy.video.fx.crop import crop
+from moviepy.video.tools.subtitles import SubtitlesClip
+
+from app.models.schema import VideoAspect
+from app.utils import utils
+
+
+def get_bgm_file(bgm_name: str = "random"):
+    """Resolve the background-music file to mix into the final video.
+
+    Args:
+        bgm_name: ``"random"`` picks any ``*.mp3`` file from the song
+            directory; any other non-empty value is treated as a file name
+            inside that directory; an empty value disables background music.
+
+    Returns:
+        The path of the selected file, or ``""`` when background music is
+        disabled, the named file does not exist, or the song directory
+        contains no ``*.mp3`` file.
+    """
+    if not bgm_name:
+        return ""
+
+    song_dir = utils.song_dir()
+    if bgm_name == "random":
+        candidates = glob.glob(os.path.join(song_dir, "*.mp3"))
+        # random.choice raises IndexError on an empty sequence, so an empty
+        # song directory is treated as "no background music".
+        if not candidates:
+            logger.warning(f"no mp3 files found in {song_dir}, background music disabled")
+            return ""
+        return random.choice(candidates)
+
+    file = os.path.join(song_dir, bgm_name)
+    if os.path.exists(file):
+        return file
+    return ""
+
+
+def combine_videos(combined_video_path: str,
+                   video_paths: List[str],
+                   audio_file: str,
+                   video_aspect: VideoAspect = VideoAspect.portrait,
+                   max_clip_duration: int = 5,
+                   threads: int = 2,
+                   ) -> str:
+    """Concatenate source clips into one silent video as long as the audio.
+
+    The source clips are cycled through, in order, until their accumulated
+    duration reaches the duration of ``audio_file``. Each clip is stripped of
+    its audio, trimmed, forced to 30 fps, center-cropped to the target aspect
+    ratio and resized to the target resolution.
+
+    Args:
+        combined_video_path: Output path of the concatenated video.
+        video_paths: Paths of the source clips; must not be empty.
+        audio_file: Audio file whose duration defines the output length.
+        video_aspect: Target aspect; its ``to_resolution()`` gives the size.
+        max_clip_duration: Upper bound, in seconds, for every single clip.
+        threads: Thread count passed to ``write_videofile``.
+
+    Returns:
+        ``combined_video_path``.
+
+    Raises:
+        ValueError: If ``video_paths`` is empty, or if a full pass over the
+            source clips adds no duration (which would otherwise loop forever).
+    """
+    if not video_paths:
+        raise ValueError("video_paths is empty, nothing to combine")
+
+    logger.info(f"combining {len(video_paths)} videos into one file: {combined_video_path}")
+    # The audio is only opened to read its duration; release it right away.
+    audio_clip = AudioFileClip(audio_file)
+    try:
+        max_duration = audio_clip.duration
+    finally:
+        audio_clip.close()
+    logger.info(f"max duration of audio: {max_duration} seconds")
+    # Required duration of each clip
+    req_dur = max_duration / len(video_paths)
+    logger.info(f"each clip will be maximum {req_dur} seconds long")
+
+    aspect = VideoAspect(video_aspect)
+    video_width, video_height = aspect.to_resolution()
+    # Crop towards the requested aspect ratio instead of a fixed 9:16, so
+    # non-portrait targets are not squeezed by the final resize.
+    target_ratio = video_width / video_height
+
+    clips = []
+    sources = []  # opened VideoFileClip objects, closed once writing is done
+    tot_dur = 0
+    try:
+        # Add downloaded clips over and over until the duration of the audio
+        # (max_duration) has been reached
+        while tot_dur < max_duration:
+            dur_before_pass = tot_dur
+            for video_path in video_paths:
+                remaining = max_duration - tot_dur
+                # Stop as soon as the audio is covered; otherwise the rest of
+                # this pass would append zero-length clips.
+                if remaining <= 0:
+                    break
+
+                source = VideoFileClip(video_path)
+                sources.append(source)
+                clip = source.without_audio()
+                # Check if clip is longer than the remaining audio
+                if remaining < clip.duration:
+                    clip = clip.subclip(0, remaining)
+                # Only shorten clips if the calculated clip length (req_dur)
+                # is shorter than the actual clip to prevent still image
+                elif req_dur < clip.duration:
+                    clip = clip.subclip(0, req_dur)
+                clip = clip.set_fps(30)
+
+                # Not all videos are same size, so we need to resize them
+                if clip.w != video_width or clip.h != video_height:
+                    if round(clip.w / clip.h, 4) < round(target_ratio, 4):
+                        # Source is narrower than the target: keep the width
+                        clip = crop(clip,
+                                    width=clip.w,
+                                    height=round(clip.w / target_ratio),
+                                    x_center=clip.w / 2,
+                                    y_center=clip.h / 2
+                                    )
+                    else:
+                        # Source is wider than the target: keep the height
+                        clip = crop(clip,
+                                    width=round(target_ratio * clip.h),
+                                    height=clip.h,
+                                    x_center=clip.w / 2,
+                                    y_center=clip.h / 2
+                                    )
+                    logger.info(f"resizing video to {video_width} x {video_height}")
+                    clip = clip.resize((video_width, video_height))
+
+                if clip.duration > max_clip_duration:
+                    clip = clip.subclip(0, max_clip_duration)
+
+                clips.append(clip)
+                tot_dur += clip.duration
+
+            if tot_dur <= dur_before_pass:
+                raise ValueError("source videos add no duration, cannot reach the audio length")
+
+        final_clip = concatenate_videoclips(clips)
+        final_clip = final_clip.set_fps(30)
+        logger.info("writing")
+        final_clip.write_videofile(combined_video_path, threads=threads)
+    finally:
+        # Release the readers held by every opened source clip
+        for source in sources:
+            source.close()
+    logger.success("completed")
+    return combined_video_path
+
+
+def wrap_text(text, max_width, font='Arial', fontsize=60):
+    """Insert line breaks so that no rendered line is wider than ``max_width``.
+
+    Width is measured with ``ImageFont.getbbox`` for the given font and size.
+    Breaking is done per character (no word boundaries are considered).
+
+    Args:
+        text: Text to wrap.
+        max_width: Maximum rendered line width, in the units of ``getbbox``.
+        font: Font name or path passed to ``ImageFont.truetype``.
+        fontsize: Font size passed to ``ImageFont.truetype``.
+
+    Returns:
+        ``text`` unchanged when it already fits, otherwise the wrapped lines
+        joined with ``"\\n"``. A single character wider than ``max_width``
+        still gets a line of its own.
+    """
+    # Create the font object used for measuring
+    font = ImageFont.truetype(font, fontsize)
+
+    def get_text_width(inner_text):
+        left, _top, right, _bottom = font.getbbox(inner_text)
+        return right - left
+
+    width = get_text_width(text)
+    if width <= max_width:
+        return text
+
+    logger.warning(f"wrapping text, max_width: {max_width}, text_width: {width}, text: {text}")
+    wrapped_lines = []
+    current_line = ''
+    for char in text:
+        candidate = current_line + char
+        # Break *before* the character that would overflow, so the finished
+        # line itself stays within max_width.
+        if current_line and get_text_width(candidate) > max_width:
+            wrapped_lines.append(current_line)
+            current_line = char
+        else:
+            current_line = candidate
+    # Only flush a non-empty remainder to avoid a trailing blank line
+    if current_line:
+        wrapped_lines.append(current_line)
+    return '\n'.join(wrapped_lines)
+
+
+def generate_video(video_path: str,
+                   audio_path: str,
+                   subtitle_path: str,
+                   output_file: str,
+                   video_aspect: VideoAspect = VideoAspect.portrait,
+
+                   threads: int = 2,
+
+                   font_name: str = "",
+                   fontsize: int = 60,
+                   stroke_color: str = "#000000",
+                   stroke_width: float = 1.5,
+                   text_fore_color: str = "white",
+                   text_background_color: str = "transparent",
+
+                   bgm_file: str = "",
+                   ):
+    """Render the final video: footage + narration + subtitles + optional music.
+
+    The video is produced in two passes. The first pass composites the
+    footage, the burned-in subtitles (when ``subtitle_path`` exists) and the
+    narration audio into ``<output_file>.temp.mp4``. The second pass optionally
+    mixes in background music and writes ``output_file`` with AAC audio. The
+    temp file is removed afterwards, also when rendering fails.
+
+    Args:
+        video_path: Footage to use as the base layer.
+        audio_path: Narration audio set as the video's audio track.
+        subtitle_path: Subtitle file; skipped when empty or missing.
+        output_file: Path of the final video.
+        video_aspect: Target aspect; decides resolution and subtitle position.
+        threads: Thread count for both encoding passes (falls back to 2).
+        font_name: Font file name inside the font directory; defaults to
+            ``STHeitiMedium.ttc`` when empty.
+        fontsize: Subtitle font size.
+        stroke_color: Subtitle stroke color.
+        stroke_width: Subtitle stroke width.
+        text_fore_color: Subtitle text color.
+        text_background_color: Subtitle background color.
+        bgm_file: Background music file; no music is mixed in when empty.
+    """
+    aspect = VideoAspect(video_aspect)
+    video_width, video_height = aspect.to_resolution()
+
+    logger.info(f"start, video size: {video_width} x {video_height}")
+    logger.info(f"  ① video: {video_path}")
+    logger.info(f"  ② audio: {audio_path}")
+    logger.info(f"  ③ subtitle: {subtitle_path}")
+    logger.info(f"  ④ output: {output_file}")
+
+    if not font_name:
+        font_name = "STHeitiMedium.ttc"
+    font_path = os.path.join(utils.font_dir(), font_name)
+    logger.info(f"using font: {font_path}")
+
+    # Custom subtitle generator that applies the line-wrapping logic
+    def generator(txt):
+        # Wrap automatically; max_width leaves a 100 px margin in total
+        wrapped_txt = wrap_text(txt, max_width=video_width - 100,
+                                font=font_path,
+                                fontsize=fontsize)
+        return TextClip(
+            wrapped_txt,
+            font=font_path,
+            fontsize=fontsize,
+            color=text_fore_color,
+            bg_color=text_background_color,
+            stroke_color=stroke_color,
+            stroke_width=stroke_width,
+            print_cmd=False,
+        )
+
+    # Compare the normalized enum so raw values passed by callers work too
+    position_height = video_height - 200
+    if aspect == VideoAspect.landscape:
+        position_height = video_height - 100
+
+    # Use the same fallback for both encoding passes
+    threads = threads or 2
+    temp_output_file = f"{output_file}.temp.mp4"
+
+    opened = []  # file-backed clips whose readers must be released
+    try:
+        base_video = VideoFileClip(video_path)
+        opened.append(base_video)
+        layers = [base_video]
+        # Burn the subtitles into the video
+        if subtitle_path and os.path.exists(subtitle_path):
+            subtitles = SubtitlesClip(subtitle_path, generator)
+            layers.append(subtitles.set_position(lambda _t: ('center', position_height)))
+
+        result = CompositeVideoClip(layers)
+
+        # Add the audio
+        audio = AudioFileClip(audio_path)
+        opened.append(audio)
+        result = result.set_audio(audio)
+
+        logger.info(f"writing to temp file: {temp_output_file}")
+        result.write_videofile(temp_output_file, threads=threads)
+
+        temp_video = VideoFileClip(temp_output_file)
+        opened.append(temp_video)
+        video_clip = temp_video
+        if bgm_file:
+            logger.info(f"adding background music: {bgm_file}")
+            original_duration = video_clip.duration
+            original_audio = video_clip.audio
+            bgm_source = AudioFileClip(bgm_file)
+            opened.append(bgm_source)
+            # Mix the song in at 20% of its original volume
+            song_clip = bgm_source.set_fps(44100).volumex(0.2).set_fps(44100)
+            comp_audio = CompositeAudioClip([original_audio, song_clip])
+            video_clip = video_clip.set_audio(comp_audio)
+            video_clip = video_clip.set_fps(30)
+            # Keep the video length even when the song is longer
+            video_clip = video_clip.set_duration(original_duration)
+        # Encode the audio as AAC; otherwise the video cannot be played on iPhone
+        logger.info("encoding audio codec to aac")
+        video_clip.write_videofile(output_file, audio_codec="aac", threads=threads)
+    finally:
+        # Close the readers first: a file that is still open cannot be
+        # deleted on some platforms.
+        for clip in opened:
+            clip.close()
+        # delete the temp file
+        if os.path.exists(temp_output_file):
+            os.remove(temp_output_file)
+    logger.success("completed")
+
+
+if __name__ == "__main__":
+    # Manual smoke test, part 1: wrap a mixed English/Chinese sample string.
+    sample_text = "hello 幸福经常被描述为最终人生目标和人类追求的核心 但它通常涉及对个人生活中意义和目的的深刻感悟"
+    sample_font = utils.resource_dir() + "/fonts/STHeitiMedium.ttc"
+    print(wrap_text(text=sample_text, max_width=1000, font=sample_font, fontsize=60))
+
+    # Manual smoke test, part 2: render the final video of an existing task
+    # directory (expects combined.mp4, audio.mp3 and subtitle.srt inside it).
+    demo_dir = utils.task_dir("c12fd1e6-4b0a-4d65-a075-c87abe35a072")
+    generate_video(
+        video_path=f"{demo_dir}/combined.mp4",
+        audio_path=f"{demo_dir}/audio.mp3",
+        subtitle_path=f"{demo_dir}/subtitle.srt",
+        output_file=f"{demo_dir}/final.mp4",
+        video_aspect=VideoAspect.portrait,
+        threads=2,
+        font_name="STHeitiMedium.ttc",
+        fontsize=60,
+        stroke_color="#000000",
+        stroke_width=1.5,
+        text_fore_color="white",
+        text_background_color="transparent",
+        bgm_file="",
+    )
--- a/app/services/voice.py
+++ b/app/services/voice.py
@@ -0,0 +1,101 @@
+import asyncio
+from xml.sax.saxutils import unescape
+from edge_tts.submaker import mktimestamp
+from loguru import logger
+from edge_tts import submaker, SubMaker
+import edge_tts
+from app.utils import utils
+
+
+def tts(text: str, voice_name: str, voice_file: str) -> SubMaker:
+    """Synthesize ``text`` with edge-tts and save the audio to ``voice_file``.
+
+    Audio chunks of the stream are written to ``voice_file``; every
+    ``WordBoundary`` chunk is recorded in a ``SubMaker`` (offset, duration
+    and text), which is returned for later subtitle generation.
+
+    Runs its own event loop through ``asyncio.run``.
+    """
+    logger.info(f"start, voice name: {voice_name}")
+
+    async def _synthesize() -> SubMaker:
+        communicate = edge_tts.Communicate(text, voice_name)
+        maker = edge_tts.SubMaker()
+        with open(voice_file, "wb") as out:
+            async for chunk in communicate.stream():
+                kind = chunk["type"]
+                if kind == "WordBoundary":
+                    timing = (chunk["offset"], chunk["duration"])
+                    maker.create_sub(timing, chunk["text"])
+                elif kind == "audio":
+                    out.write(chunk["data"])
+        return maker
+
+    recorded = asyncio.run(_synthesize())
+    logger.info(f"completed, output file: {voice_file}")
+    return recorded
+
+
+def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
+    """Write an SRT file whose cues follow the punctuation of the script.
+
+    1. Split ``text`` into lines at punctuation marks.
+    2. Accumulate the word-level entries of ``sub_maker`` until they equal
+       the next script line, and emit one cue spanning those words.
+    3. Write all cues to ``subtitle_file``.
+
+    Words that cannot be matched to a script line are not written; a warning
+    is logged for them.
+    """
+
+    def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
+        """Render one SRT cue, for example::
+
+            1
+            00:00:00,000 --> 00:00:02,360
+            Running is a simple and easy exercise
+        """
+        # SRT uses a comma before the milliseconds
+        start_t = mktimestamp(start_time).replace(".", ",")
+        end_t = mktimestamp(end_time).replace(".", ",")
+        return (
+            f"{idx}\n"
+            f"{start_t} --> {end_t}\n"
+            f"{sub_text}\n"
+        )
+
+    # Empty segments (e.g. produced by consecutive punctuation marks) can
+    # never equal accumulated words and would stall the matching below.
+    script_lines = [line for line in utils.split_string_by_punctuations(text) if line]
+
+    start_time = -1.0
+    sub_items = []
+    sub_index = 0
+    sub_line = ""
+    for offset, sub in zip(sub_maker.offset, sub_maker.subs):
+        _start_time, end_time = offset
+        # A negative start marks the beginning of a new cue
+        if start_time < 0:
+            start_time = _start_time
+
+        sub_line += unescape(sub)
+        # The bounds check prevents an IndexError when words remain after the
+        # last script line has been matched.
+        if sub_index < len(script_lines) and sub_line == script_lines[sub_index]:
+            sub_index += 1
+            sub_items.append(formatter(
+                idx=sub_index,
+                start_time=start_time,
+                end_time=end_time,
+                sub_text=sub_line,
+            ))
+            start_time = -1.0
+            sub_line = ""
+
+    if sub_line:
+        logger.warning(f"unmatched subtitle text left over: {sub_line}")
+
+    with open(subtitle_file, "w", encoding="utf-8") as file:
+        file.write("\n".join(sub_items))
+
+
+if __name__ == "__main__":
+    # Manual smoke test: synthesize the same sample text with several voices.
+    output_dir = utils.storage_dir("temp")
+    # Every voice writes its subtitles to this same path.
+    srt_path = f"{output_dir}/tts.mp3.srt"
+
+    text = """
+预计未来3天深圳冷空气活动频繁，未来两天持续阴天有小雨，出门带好雨具；
+10-11日持续阴天有小雨，日温差小，气温在13-17℃之间，体感阴凉；
+12日天气短暂好转，早晚清凉；
+    """
+
+    for name in (
+        # female
+        "zh-CN-XiaoxiaoNeural",
+        "zh-CN-XiaoyiNeural",
+        # male
+        "zh-CN-YunyangNeural",
+        "zh-CN-YunxiNeural",
+    ):
+        mp3_path = f"{output_dir}/tts-{name}.mp3"
+        maker = tts(text=text, voice_name=name, voice_file=mp3_path)
+        create_subtitle(sub_maker=maker, text=text, subtitle_file=srt_path)
--- a/app/utils/utils.py
+++ b/app/utils/utils.py
@@ -0,0 +1,167 @@
+import os
+import threading
+from typing import Any
+from loguru import logger
+import json
+from uuid import uuid4
+import urllib3
+
+from app.models import const
+
+urllib3.disable_warnings()
+
+
+def get_response(status: int, data: Any = None, message: str = ""):
+    """Build the response envelope returned by the API.
+
+    Args:
+        status: Status code stored under ``'status'``.
+        data: Optional payload stored under ``'data'``; omitted only when it
+            is ``None``.
+        message: Optional text stored under ``'message'``; omitted when empty.
+
+    Returns:
+        A dict with ``'status'`` and, when provided, ``'data'`` and
+        ``'message'``.
+    """
+    obj = {
+        'status': status,
+    }
+    # Compare against None so legitimate falsy payloads (0, False, [], {})
+    # are not silently dropped from the response.
+    if data is not None:
+        obj['data'] = data
+    if message:
+        obj['message'] = message
+    return obj
+
+
+def to_json(obj):
+    """Serialize ``obj`` to an indented JSON string (non-ASCII kept as is).
+
+    Values are first reduced to JSON-friendly structures:
+
+    * ``None``, ``int``, ``float``, ``bool`` and ``str`` are kept unchanged;
+    * ``bytes`` are replaced by the placeholder ``"*** binary data ***"``;
+    * dict values and list/tuple items are converted recursively
+      (tuples become lists);
+    * any other object with a ``__dict__`` is converted through it;
+    * everything else becomes ``None``.
+    """
+
+    def serialize(value):
+        # Scalars that JSON can represent directly
+        if value is None or isinstance(value, (int, float, bool, str)):
+            return value
+        # Binary payloads are not dumped, only marked with a placeholder
+        if isinstance(value, bytes):
+            return "*** binary data ***"
+        # Containers are walked recursively
+        if isinstance(value, dict):
+            return {key: serialize(item) for key, item in value.items()}
+        if isinstance(value, (list, tuple)):
+            return [serialize(item) for item in value]
+        # Custom objects are represented by their attribute dict
+        if hasattr(value, '__dict__'):
+            return serialize(value.__dict__)
+        # Anything else is not representable
+        return None
+
+    return json.dumps(serialize(obj), ensure_ascii=False, indent=4)
+
+
+def get_uuid(remove_hyphen: bool = False):
+    """Return a random UUID4 string, optionally without the hyphens."""
+    generated = uuid4()
+    # UUID.hex is the canonical string form with the hyphens removed
+    return generated.hex if remove_hyphen else str(generated)
+
+
+def root_dir():
+    """Return the directory three levels above this file's resolved path."""
+    location = os.path.realpath(__file__)
+    # file -> containing directory -> parent -> grandparent
+    for _ in range(3):
+        location = os.path.dirname(location)
+    return location
+
+
+def storage_dir(sub_dir: str = ""):
+    """Return ``<root>/storage``, or ``<root>/storage/<sub_dir>`` when given.
+
+    The directory is not created.
+    """
+    base = os.path.join(root_dir(), "storage")
+    return os.path.join(base, sub_dir) if sub_dir else base
+
+
+def resource_dir(sub_dir: str = ""):
+    """Return ``<root>/resource``, or ``<root>/resource/<sub_dir>`` when given.
+
+    The directory is not created.
+    """
+    base = os.path.join(root_dir(), "resource")
+    return os.path.join(base, sub_dir) if sub_dir else base
+
+
+def task_dir(sub_dir: str = ""):
+    """Return the tasks directory under storage, creating it when missing.
+
+    Args:
+        sub_dir: Optional sub-directory (e.g. a task id) appended to the path.
+
+    Returns:
+        The path of the (now existing) directory.
+    """
+    d = os.path.join(storage_dir(), "tasks")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the check-then-create race when the directory is
+    # created by another thread or process in between.
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def font_dir(sub_dir: str = ""):
+    """Return the ``fonts`` resource directory, creating it when missing.
+
+    Args:
+        sub_dir: Optional sub-directory appended to the path.
+
+    Returns:
+        The path of the (now existing) directory.
+    """
+    d = resource_dir("fonts")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the check-then-create race when the directory is
+    # created by another thread or process in between.
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def song_dir(sub_dir: str = ""):
+    """Return the ``songs`` resource directory, creating it when missing.
+
+    Args:
+        sub_dir: Optional sub-directory appended to the path.
+
+    Returns:
+        The path of the (now existing) directory.
+    """
+    d = resource_dir("songs")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the check-then-create race when the directory is
+    # created by another thread or process in between.
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def public_dir(sub_dir: str = ""):
+    """Return the ``public`` resource directory, creating it when missing.
+
+    Args:
+        sub_dir: Optional sub-directory appended to the path.
+
+    Returns:
+        The path of the (now existing) directory.
+    """
+    d = resource_dir("public")
+    if sub_dir:
+        d = os.path.join(d, sub_dir)
+    # exist_ok avoids the check-then-create race when the directory is
+    # created by another thread or process in between.
+    os.makedirs(d, exist_ok=True)
+    return d
+
+
+def run_in_background(func, *args, **kwargs):
+    """Run ``func(*args, **kwargs)`` in a new thread and return that thread.
+
+    Exceptions raised by ``func`` do not propagate to the caller; they are
+    logged together with their traceback. The return value of ``func`` is
+    discarded.
+
+    Returns:
+        The started ``threading.Thread``.
+    """
+    def run():
+        try:
+            func(*args, **kwargs)
+        except Exception as e:
+            # logger.exception records the traceback as well, which a plain
+            # error message would lose for failures in the background.
+            logger.exception(f"run_in_background error: {e}")
+
+    thread = threading.Thread(target=run)
+    thread.start()
+    return thread
+
+
+def time_convert_seconds_to_hmsm(seconds) -> str:
+    """Format a number of seconds as an SRT timestamp ``HH:MM:SS,mmm``.
+
+    Fractions below one millisecond are truncated, not rounded.
+    """
+    whole_hours, within_hour = divmod(seconds, 3600)
+    whole_minutes, within_minute = divmod(within_hour, 60)
+    millis = int(within_hour * 1000) % 1000
+    return f"{int(whole_hours):02d}:{int(whole_minutes):02d}:{int(within_minute):02d},{millis:03d}"
+
+
+def text_to_srt(idx: int, msg: str, start_time: float, end_time: float) -> str:
+    """Render one SRT cue.
+
+    Args:
+        idx: Sequence number of the cue.
+        msg: Subtitle text.
+        start_time: Cue start, in seconds.
+        end_time: Cue end, in seconds.
+
+    Returns:
+        Three lines (index, ``start --> end`` timestamps, text), each ended
+        by a newline.
+    """
+    start = time_convert_seconds_to_hmsm(start_time)
+    end = time_convert_seconds_to_hmsm(end_time)
+    # End with a bare newline: no source indentation may leak into the cue,
+    # so cues joined with "\n" are separated by a truly empty line.
+    return "%d\n%s --> %s\n%s\n" % (idx, start, end, msg)
+
+
+def str_contains_punctuation(word):
+    """Return True when ``word`` contains any entry of ``const.punctuations``."""
+    return any(mark in word for mark in const.punctuations)
+
+
+def split_string_by_punctuations(s):
+    """Split ``s`` into segments at every character in ``const.punctuations``.
+
+    The punctuation characters themselves are dropped and each segment is
+    stripped of surrounding whitespace.
+
+    Returns:
+        The non-empty segments in order, including the text that follows the
+        last punctuation mark.
+    """
+    result = []
+    txt = ""
+    for char in s:
+        if char not in const.punctuations:
+            txt += char
+            continue
+        result.append(txt.strip())
+        txt = ""
+    # Keep the text after the last punctuation mark instead of dropping it
+    result.append(txt.strip())
+    # Consecutive punctuation marks produce empty segments; filter them out
+    return [segment for segment in result if segment]