This commit is contained in:
harry
2024-03-11 16:37:49 +08:00
parent d4f7b53b84
commit 06df797234
71 changed files with 2725 additions and 1 deletion

0
app/services/__init__.py Normal file
View File

152
app/services/gpt.py Normal file
View File

@@ -0,0 +1,152 @@
import logging
import re
import json
import openai
from typing import List
from loguru import logger
from app.config import config
# --- OpenAI client configuration (import-time side effect) ---
# API key and model name are mandatory; fail fast at import time so a
# misconfigured config.toml is reported before any task starts.
openai_api_key = config.app.get("openai_api_key")
if not openai_api_key:
    raise ValueError("openai_api_key is not set, please set it in the config.toml file.")
openai_model_name = config.app.get("openai_model_name")
if not openai_model_name:
    raise ValueError("openai_model_name is not set, please set it in the config.toml file.")
# Optional custom endpoint (e.g. proxy / OpenAI-compatible gateway).
openai_base_url = config.app.get("openai_base_url")
openai.api_key = openai_api_key
# fix: removed the no-op self-assignment `openai_model_name = openai_model_name`
if openai_base_url:
    openai.base_url = openai_base_url
def _generate_response(prompt: str) -> str:
    """Send a single-turn chat completion request and return the reply text."""
    completion = openai.chat.completions.create(
        model=openai_model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content
def generate_script(video_subject: str, language: str = "zh-CN", paragraph_number: int = 1) -> str:
    """
    Generate a voice-over script for a video subject via the chat model.

    :param video_subject: topic the script should cover
    :param language: output language requested in the prompt (e.g. "zh-CN")
    :param paragraph_number: number of paragraphs to keep from the response
    :return: cleaned script text, or "" when the model returned nothing
    """
    prompt = f"""
# Role: Video Script Generator
## Goals:
Generate a script for a video, depending on the subject of the video.
## Constrains:
1. the script is to be returned as a string with the specified number of paragraphs.
2. do not under any circumstance reference this prompt in your response.
3. get straight to the point, don't start with unnecessary things like, "welcome to this video".
4. you must not include any type of markdown or formatting in the script, never use a title.
5. only return the raw content of the script.
6. do not include "voiceover", "narrator" or similar indicators of what should be spoken at the beginning of each paragraph or line.
7. you must not mention the prompt, or anything about the script itself. also, never talk about the amount of paragraphs or lines. just write the script.
## Output Example:
What is the meaning of life. This question has puzzled philosophers.
# Initialization:
- video subject: {video_subject}
- output language: {language}
- number of paragraphs: {paragraph_number}
""".strip()

    final_script = ""
    logger.info(f"subject: {video_subject}")
    logger.debug(f"prompt: \n{prompt}")
    response = _generate_response(prompt=prompt)

    if response:
        # Strip markdown artifacts the model sometimes emits despite the constraints:
        # asterisks/hashes, then bracketed/parenthesized annotations.
        response = response.replace("*", "")
        response = response.replace("#", "")
        response = re.sub(r"\[.*\]", "", response)
        response = re.sub(r"\(.*\)", "", response)
        # Keep only the requested number of paragraphs.
        paragraphs = response.split("\n\n")
        selected_paragraphs = paragraphs[:paragraph_number]
        final_script = "\n\n".join(selected_paragraphs)
    else:
        # fix: was `logging.error` — the rest of this module logs via loguru
        logger.error("gpt returned an empty response")

    logger.success(f"completed: \n{final_script}")
    return final_script
def generate_terms(video_subject: str, video_script: str, amount: int = 5) -> List[str]:
    """
    Ask the model for English stock-video search terms as a JSON array.

    :param video_subject: subject of the video (always included in the terms)
    :param video_script: full script, given to the model as context
    :param amount: number of terms requested
    :return: list of search-term strings; [] when the response is unusable
    """
    prompt = f"""
# Role: Video Search Terms Generator
## Goals:
Generate {amount} search terms for stock videos, depending on the subject of a video.
## Constrains:
1. the search terms are to be returned as a json-array of strings.
2. each search term should consist of 1-3 words, always add the main subject of the video.
3. you must only return the json-array of strings. you must not return anything else. you must not return the script.
4. the search terms must be related to the subject of the video.
5. reply with english search terms only.
## Output Example:
["search term 1", "search term 2", "search term 3","search term 4","search term 5"]
## Context:
### Video Subject
{video_subject}
### Video Script
{video_script}
""".strip()

    logger.info(f"subject: {video_subject}")
    logger.debug(f"prompt: \n{prompt}")
    response = _generate_response(prompt)

    search_terms = []
    try:
        search_terms = json.loads(response)
        if not isinstance(search_terms, list) or not all(isinstance(term, str) for term in search_terms):
            raise ValueError("response is not a list of strings.")
    except (json.JSONDecodeError, ValueError):
        # fix: discard the invalid parse result — previously a well-formed JSON
        # value that was NOT a list of strings leaked through to the caller
        # when the regex fallback below found no match.
        search_terms = []
        # Attempt to extract a list-like substring and convert it to a list.
        match = re.search(r'\["(?:[^"\\]|\\.)*"(?:,\s*"[^"\\]*")*\]', response)
        if match:
            try:
                search_terms = json.loads(match.group())
            except json.JSONDecodeError:
                logger.error(f"could not parse response: {response}")
                return []

    logger.success(f"completed: \n{search_terms}")
    return search_terms
if __name__ == "__main__":
    # Ad-hoc smoke test: generate a script and its search terms for one subject.
    video_subject = "生命的意义是什么"
    script = generate_script(video_subject=video_subject, language="zh-CN", paragraph_number=1)
    # print("######################")
    # print(script)
    search_terms = generate_terms(video_subject=video_subject, video_script=script, amount=5)
    # print("######################")
    # print(search_terms)

112
app/services/material.py Normal file
View File

@@ -0,0 +1,112 @@
import time
import requests
from typing import List
from loguru import logger
from app.config import config
from app.models.schema import VideoAspect
from app.utils import utils
# Monotonic request counter used by round_robin_api_key() to rotate keys.
requested_count = 0
# One or more Pexels API keys; mandatory — fail fast at import time.
pexels_api_keys = config.app.get("pexels_api_keys")
if not pexels_api_keys:
    raise ValueError("pexels_api_keys is not set, please set it in the config.toml file.")
def round_robin_api_key():
    """Rotate through the configured Pexels API keys to spread rate limits."""
    global requested_count
    requested_count += 1
    key_index = requested_count % len(pexels_api_keys)
    return pexels_api_keys[key_index]
def search_videos(search_term: str,
                  wanted_count: int,
                  minimum_duration: int,
                  video_aspect: VideoAspect = VideoAspect.portrait,
                  locale: str = "zh-CN"
                  ) -> List[str]:
    """
    Search Pexels for stock videos matching search_term.

    :param search_term: query string sent to the Pexels video search API
    :param wanted_count: cap on how many result entries to consider
    :param minimum_duration: skip videos shorter than this (seconds)
    :param video_aspect: target aspect; only files matching its exact
                         resolution are returned
    :param locale: Pexels locale parameter
    :return: list of direct download URLs (possibly empty on error)
    """
    aspect = VideoAspect(video_aspect)
    video_orientation = aspect.name
    video_width, video_height = aspect.to_resolution()
    headers = {
        "Authorization": round_robin_api_key()
    }
    query_url = f"https://api.pexels.com/videos/search?query={search_term}&per_page=15&orientation={video_orientation}&locale={locale}"
    logger.info(f"searching videos: {query_url}")

    video_urls = []
    try:
        # fix: request/parse moved inside try and given a timeout — previously a
        # network failure or stalled connection escaped this function and could
        # kill (or hang) the whole download loop.
        r = requests.get(query_url, headers=headers, timeout=30)
        response = r.json()
        videos_count = min(len(response["videos"]), wanted_count)
        for i in range(videos_count):
            # Skip videos that are shorter than the desired minimum duration.
            if response["videos"][i]["duration"] < minimum_duration:
                continue
            video_files = response["videos"][i]["video_files"]
            # Pick the first file whose resolution matches the target exactly.
            for video in video_files:
                w = int(video["width"])
                h = int(video["height"])
                if w == video_width and h == video_height:
                    video_urls.append(video["link"])
                    break
    except Exception as e:
        logger.error(f"search videos failed: {e}")
    return video_urls
def save_video(video_url: str, save_dir: str) -> str:
    """
    Download video_url into save_dir and return the local file path.
    The file name is a millisecond-timestamp id, e.g. "vid-1710000000000.mp4".
    """
    video_id = f"vid-{str(int(time.time() * 1000))}"
    video_path = f"{save_dir}/{video_id}.mp4"
    # fix: stream in chunks with a timeout instead of buffering the whole video
    # in memory with a bare requests.get(); also surface HTTP errors explicitly.
    with requests.get(video_url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(video_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    return video_path
def download_videos(task_id: str,
                    search_terms: List[str],
                    video_aspect: VideoAspect = VideoAspect.portrait,
                    wanted_count: int = 15,
                    minimum_duration: int = 5,
                    max_per_term: int = 3,
                    ) -> List[str]:
    """
    Search and download stock videos for each term into the task directory.

    :param task_id: id used to resolve the per-task save directory
    :param search_terms: list of query strings
    :param video_aspect: target aspect passed to search_videos
    :param wanted_count: per-search result cap passed to search_videos
    :param minimum_duration: minimum clip length (seconds)
    :param max_per_term: cap on how many new (deduplicated) urls each term may
                         contribute (generalized — was a hard-coded 3)
    :return: local paths of successfully downloaded videos
    """
    valid_video_urls = []
    for search_term in search_terms:
        video_urls = search_videos(search_term=search_term,
                                   wanted_count=wanted_count,
                                   minimum_duration=minimum_duration,
                                   video_aspect=video_aspect)
        logger.info(f"found {len(video_urls)} videos for '{search_term}'")
        added = 0
        for url in video_urls:
            # Deduplicate across search terms.
            if url not in valid_video_urls:
                valid_video_urls.append(url)
                added += 1
            if added >= max_per_term:
                break

    logger.info(f"downloading videos: {len(valid_video_urls)}")
    video_paths = []
    save_dir = utils.task_dir(task_id)
    for video_url in valid_video_urls:
        try:
            saved_video_path = save_video(video_url, save_dir)
            video_paths.append(saved_video_path)
        except Exception as e:
            # Best-effort: one failed download must not abort the task.
            logger.error(f"failed to download video: {video_url}, {e}")
    logger.success(f"downloaded {len(video_paths)} videos")
    return video_paths

167
app/services/subtitle.py Normal file
View File

@@ -0,0 +1,167 @@
import json
import re
from faster_whisper import WhisperModel
from timeit import default_timer as timer
from loguru import logger
from app.config import config
from app.models import const
from app.utils import utils
# Whisper configuration (config.toml, [whisper] section), with safe defaults.
model_size = config.whisper.get("model_size", "large-v3")
device = config.whisper.get("device", "cpu")
compute_type = config.whisper.get("compute_type", "int8")
# NOTE: heavy import-time side effect — loads (and may download) model weights.
model = WhisperModel(model_size_or_path=model_size, device=device, compute_type=compute_type)
def create(audio_file, subtitle_file: str = ""):
    """
    Transcribe audio_file with faster-whisper and write an SRT subtitle file.

    Segments are re-split at punctuation using word-level timestamps so each
    subtitle cue is one sentence-like chunk.

    :param audio_file: path to the audio to transcribe
    :param subtitle_file: output SRT path; defaults to "<audio_file>.srt"
    """
    logger.info(f"start, output file: {subtitle_file}")
    if not subtitle_file:
        subtitle_file = f"{audio_file}.srt"
    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        word_timestamps=True,  # needed for the punctuation-based re-splitting below
        vad_filter=True,
        vad_parameters=dict(min_silence_duration_ms=500),
    )
    logger.info(f"detected language: '{info.language}', probability: {info.language_probability:.2f}")
    start = timer()
    subtitles = []

    def recognized(seg_text, seg_start, seg_end):
        # Record one finished subtitle chunk (skips whitespace-only text).
        seg_text = seg_text.strip()
        if not seg_text:
            return
        msg = "[%.2fs -> %.2fs] %s" % (seg_start, seg_end, seg_text)
        logger.debug(msg)
        subtitles.append({
            "msg": seg_text,
            "start_time": seg_start,
            "end_time": seg_end
        })

    for segment in segments:
        words_idx = 0
        words_len = len(segment.words)
        seg_start = 0
        seg_end = 0
        seg_text = ""
        if segment.words:
            is_segmented = False
            for word in segment.words:
                if not is_segmented:
                    # First word of a new chunk: anchor the start time.
                    seg_start = word.start
                    is_segmented = True
                seg_end = word.end
                # Split the segment into a new cue whenever a word contains punctuation.
                seg_text += word.word
                if utils.str_contains_punctuation(word.word):
                    # Drop the punctuation character itself (last char).
                    seg_text = seg_text[:-1]
                    if not seg_text:
                        continue
                    recognized(seg_text, seg_start, seg_end)
                    is_segmented = False
                    seg_text = ""
                # Widen the chunk bounds with the segment bounds at the edges.
                if words_idx == 0 and segment.start < word.start:
                    seg_start = word.start
                if words_idx == (words_len - 1) and segment.end > word.end:
                    seg_end = word.end
                words_idx += 1
        if not seg_text:
            continue
        # Flush the trailing chunk that ended without punctuation.
        recognized(seg_text, seg_start, seg_end)

    end = timer()
    diff = end - start
    logger.info(f"complete, elapsed: {diff:.2f} s")

    idx = 1
    lines = []
    for subtitle in subtitles:
        text = subtitle.get("msg")
        if text:
            lines.append(utils.text_to_srt(idx, text, subtitle.get("start_time"), subtitle.get("end_time")))
            idx += 1
    sub = "\n".join(lines)
    with open(subtitle_file, "w") as f:
        f.write(sub)
    logger.info(f"subtitle file created: {subtitle_file}")
def file_to_subtitles(filename):
    """
    Parse an SRT file into a list of (index, timestamp_line, text) tuples.

    Index numbers are re-generated sequentially; the original cue numbers in
    the file are ignored (a bare number line has no timestamp and is skipped).

    :param filename: path to the SRT file
    :return: list of (int, str, str) tuples
    """
    times_texts = []
    current_times = None
    current_text = ""
    index = 0
    with open(filename, 'r', encoding="utf-8") as f:
        for line in f:
            times = re.findall("([0-9]*:[0-9]*:[0-9]*,[0-9]*)", line)
            if times:
                current_times = line
            elif line.strip() == '' and current_times:
                index += 1
                times_texts.append((index, current_times.strip(), current_text.strip()))
                current_times, current_text = None, ""
            elif current_times:
                current_text += line
    # fix: an SRT that does not end with a blank line used to drop its last cue
    if current_times:
        index += 1
        times_texts.append((index, current_times.strip(), current_text.strip()))
    return times_texts
def correct(subtitle_file, video_script):
    """
    Align subtitle text with the original script, line by line.

    Whisper occasionally mis-transcribes words; when the subtitle file has
    exactly one cue per script line, each cue's text is replaced by the
    corresponding script line and the file is rewritten.

    :param subtitle_file: path to the SRT file produced by create()
    :param video_script: the original narration script
    """
    subtitle_items = file_to_subtitles(subtitle_file)
    script_lines = utils.split_string_by_punctuations(video_script)

    # fix: a count mismatch used to fall through and be logged as
    # "subtitle is correct" — report it honestly and skip correction.
    if len(subtitle_items) != len(script_lines):
        logger.warning(
            f"line count mismatch, subtitle: {len(subtitle_items)}, script: {len(script_lines)}, skip correction")
        return

    corrected = False
    for i in range(len(script_lines)):
        script_line = script_lines[i].strip()
        subtitle_line = subtitle_items[i][2]
        if script_line != subtitle_line:
            logger.warning(f"line {i + 1}, script: {script_line}, subtitle: {subtitle_line}")
            subtitle_items[i] = (subtitle_items[i][0], subtitle_items[i][1], script_line)
            corrected = True

    if corrected:
        with open(subtitle_file, "w", encoding="utf-8") as fd:
            for item in subtitle_items:
                fd.write(f"{item[0]}\n{item[1]}\n{item[2]}\n\n")
        logger.info("subtitle corrected")
    else:
        logger.success("subtitle is correct")
if __name__ == "__main__":
    # Ad-hoc check: parse an existing task's subtitle and re-run correction
    # against its saved script.
    task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072"
    task_dir = utils.task_dir(task_id)
    subtitle_file = f"{task_dir}/subtitle.srt"

    subtitles = file_to_subtitles(subtitle_file)
    print(subtitles)

    script_file = f"{task_dir}/script.json"
    with open(script_file, "r") as f:
        script_content = f.read()
    s = json.loads(script_content)
    script = s.get("script")

    correct(subtitle_file, script)

113
app/services/task.py Normal file
View File

@@ -0,0 +1,113 @@
from os import path
from loguru import logger
from app.config import config
from app.models.schema import VideoParams, VoiceNames
from app.services import gpt, material, voice, video, subtitle
from app.utils import utils
def _parse_voice(name: str):
    """
    Split a UI voice name like "female-zh-CN-XiaoxiaoNeural" into
    (voice, lang) == ("zh-CN-XiaoxiaoNeural", "zh-CN").
    Unknown names fall back to the first entry in VoiceNames.
    """
    if name not in VoiceNames:
        name = VoiceNames[0]
    segments = name.split("-")
    # segments: [gender, lang_major, lang_minor, voice_id]
    _lang = f"{segments[1]}-{segments[2]}"
    _voice = f"{_lang}-{segments[3]}"
    return _voice, _lang
def start(task_id, params: VideoParams):
    """
    Run the full video-generation pipeline for one task:
    script -> search terms -> TTS audio -> subtitle -> stock-video download
    -> combine -> final render.

    Example params payload:
    {
        "video_subject": "",
        "video_aspect": "横屏 16:9西瓜视频",
        "voice_name": "女生-晓晓",
        "enable_bgm": false,
        "font_name": "STHeitiMedium 黑体-中",
        "text_color": "#FFFFFF",
        "font_size": 60,
        "stroke_color": "#000000",
        "stroke_width": 1.5
    }

    :return: dict with the final video file path
    """
    logger.info(f"start task: {task_id}")
    video_subject = params.video_subject
    voice_name, language = _parse_voice(params.voice_name)
    paragraph_number = params.paragraph_number
    n_threads = params.n_threads

    logger.info("\n\n## generating video script")
    script = gpt.generate_script(video_subject=video_subject, language=language, paragraph_number=paragraph_number)
    logger.info("\n\n## generating video terms")
    search_terms = gpt.generate_terms(video_subject=video_subject, video_script=script, amount=5)

    # Persist script + terms so the task can be inspected or resumed later.
    script_file = path.join(utils.task_dir(task_id), "script.json")
    script_data = {
        "script": script,
        "search_terms": search_terms
    }
    with open(script_file, "w") as f:
        f.write(utils.to_json(script_data))

    audio_file = path.join(utils.task_dir(task_id), "audio.mp3")
    subtitle_path = path.join(utils.task_dir(task_id), "subtitle.srt")

    logger.info("\n\n## generating audio")
    sub_maker = voice.tts(text=script, voice_name=voice_name, voice_file=audio_file)

    subtitle_provider = config.app.get("subtitle_provider", "").strip().lower()
    logger.info(f"\n\n## generating subtitle, provider: {subtitle_provider}")
    if subtitle_provider == "edge":
        # edge-tts already produced word timings; no transcription needed.
        voice.create_subtitle(text=script, sub_maker=sub_maker, subtitle_file=subtitle_path)
    if subtitle_provider == "whisper":
        subtitle.create(audio_file=audio_file, subtitle_file=subtitle_path)
        logger.info("\n\n## correcting subtitle")
        subtitle.correct(subtitle_file=subtitle_path, video_script=script)

    logger.info("\n\n## downloading videos")
    video_paths = material.download_videos(task_id=task_id, search_terms=search_terms, video_aspect=params.video_aspect,
                                           wanted_count=20,
                                           minimum_duration=5)

    logger.info("\n\n## combining videos")
    combined_video_path = path.join(utils.task_dir(task_id), "combined.mp4")
    video.combine_videos(combined_video_path=combined_video_path,
                         video_paths=video_paths,
                         audio_file=audio_file,
                         video_aspect=params.video_aspect,
                         max_clip_duration=5,
                         threads=n_threads)

    final_video_path = path.join(utils.task_dir(task_id), "final.mp4")
    bgm_file = video.get_bgm_file(bgm_name=params.bgm_name)

    logger.info("\n\n## generating video")
    # Put everything together
    video.generate_video(video_path=combined_video_path,
                         audio_path=audio_file,
                         subtitle_path=subtitle_path,
                         output_file=final_video_path,
                         video_aspect=params.video_aspect,
                         threads=n_threads,
                         font_name=params.font_name,
                         fontsize=params.font_size,
                         text_fore_color=params.text_fore_color,
                         stroke_color=params.stroke_color,
                         stroke_width=params.stroke_width,
                         bgm_file=bgm_file
                         )
    # fix: was `logger.start(...)` — loguru has no such logging method (it was
    # only ever a deprecated alias of add(), which would treat the message as a sink).
    logger.success(f"task {task_id} finished")
    return {
        "video_file": final_video_path,
    }

246
app/services/video.py Normal file
View File

@@ -0,0 +1,246 @@
import glob
import random
from typing import List
from PIL import ImageFont
from loguru import logger
from moviepy.editor import *
from moviepy.video.fx.crop import crop
from moviepy.video.tools.subtitles import SubtitlesClip
from app.models.schema import VideoAspect
from app.utils import utils
def get_bgm_file(bgm_name: str = "random"):
    """
    Resolve the background-music selection to a file path.

    :param bgm_name: "" disables bgm; "random" picks a random mp3 from the
                     song directory; anything else is treated as a file name
                     inside the song directory.
    :return: absolute path to an mp3, or "" when nothing applies
    """
    if not bgm_name:
        return ""
    if bgm_name == "random":
        suffix = "*.mp3"
        song_dir = utils.song_dir()
        files = glob.glob(os.path.join(song_dir, suffix))
        # fix: random.choice raises IndexError on an empty song directory
        if not files:
            logger.warning(f"no mp3 files found in {song_dir}, skip bgm")
            return ""
        return random.choice(files)
    file = os.path.join(utils.song_dir(), bgm_name)
    if os.path.exists(file):
        return file
    return ""
def combine_videos(combined_video_path: str,
                   video_paths: List[str],
                   audio_file: str,
                   video_aspect: VideoAspect = VideoAspect.portrait,
                   max_clip_duration: int = 5,
                   threads: int = 2,
                   ) -> str:
    """
    Concatenate the downloaded clips (muted, center-cropped and resized to the
    target aspect) until they cover the narration audio's duration.

    :param combined_video_path: output file path
    :param video_paths: local clip paths (cycled if their total is too short)
    :param audio_file: narration audio; its duration sets the target length
    :param video_aspect: target output aspect/resolution
    :param max_clip_duration: hard cap per clip, in seconds
    :param threads: ffmpeg writer threads
    :return: combined_video_path
    """
    logger.info(f"combining {len(video_paths)} videos into one file: {combined_video_path}")
    audio_clip = AudioFileClip(audio_file)
    max_duration = audio_clip.duration
    logger.info(f"max duration of audio: {max_duration} seconds")
    # Required duration of each clip
    req_dur = max_duration / len(video_paths)
    logger.info(f"each clip will be maximum {req_dur} seconds long")

    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()
    # fix: crop ratio was hard-coded to 0.5625 (9:16 portrait); derive it from
    # the requested aspect so landscape output is cropped correctly too.
    target_ratio = video_width / video_height

    clips = []
    tot_dur = 0
    # Add downloaded clips over and over until the duration of the audio (max_duration) has been reached
    while tot_dur < max_duration:
        for video_path in video_paths:
            clip = VideoFileClip(video_path)
            clip = clip.without_audio()
            # Check if clip is longer than the remaining audio
            if (max_duration - tot_dur) < clip.duration:
                clip = clip.subclip(0, (max_duration - tot_dur))
            # Only shorten clips if the calculated clip length (req_dur) is shorter than the actual clip to prevent still image
            elif req_dur < clip.duration:
                clip = clip.subclip(0, req_dur)
            clip = clip.set_fps(30)

            # Not all videos are same size, so we need to crop + resize them
            if clip.w != video_width or clip.h != video_height:
                if round((clip.w / clip.h), 4) < round(target_ratio, 4):
                    # clip is narrower than the target: keep width, crop height
                    clip = crop(clip,
                                width=clip.w,
                                height=round(clip.w / target_ratio),
                                x_center=clip.w / 2,
                                y_center=clip.h / 2
                                )
                else:
                    # clip is wider than the target: keep height, crop width
                    clip = crop(clip,
                                width=round(target_ratio * clip.h),
                                height=clip.h,
                                x_center=clip.w / 2,
                                y_center=clip.h / 2
                                )
                logger.info(f"resizing video to {video_width} x {video_height}")
                clip = clip.resize((video_width, video_height))

            if clip.duration > max_clip_duration:
                clip = clip.subclip(0, max_clip_duration)

            clips.append(clip)
            tot_dur += clip.duration

    final_clip = concatenate_videoclips(clips)
    final_clip = final_clip.set_fps(30)
    logger.info("writing")
    final_clip.write_videofile(combined_video_path, threads=threads)
    logger.success("completed")
    return combined_video_path
def wrap_text(text, max_width, font='Arial', fontsize=60):
    """
    Wrap text character-by-character so each rendered line fits max_width px.

    :param text: the string to wrap
    :param max_width: maximum rendered line width in pixels
    :param font: path to (or name of) a truetype font file
    :param fontsize: font size in points
    :return: text with '\n' inserted between wrapped lines
    """
    # Avoid shadowing the `font` parameter (the original rebound it).
    font_obj = ImageFont.truetype(font, fontsize)

    def get_text_size(inner_text):
        left, top, right, bottom = font_obj.getbbox(inner_text)
        return right - left, bottom - top

    width, height = get_text_size(text)
    if width <= max_width:
        return text

    logger.warning(f"wrapping text, max_width: {max_width}, text_width: {width}, text: {text}")
    wrapped_lines = []
    current = ''
    for char in text:
        candidate = current + char
        cand_width, _ = get_text_size(candidate)
        # fix: the overflowing character used to stay on the current line, so
        # wrapped lines could exceed max_width; now it starts the next line.
        # (`not current` keeps a single over-wide char from looping forever.)
        if cand_width <= max_width or not current:
            current = candidate
        else:
            wrapped_lines.append(current)
            current = char
    wrapped_lines.append(current)
    return '\n'.join(wrapped_lines)
def generate_video(video_path: str,
                   audio_path: str,
                   subtitle_path: str,
                   output_file: str,
                   video_aspect: VideoAspect = VideoAspect.portrait,
                   threads: int = 2,
                   font_name: str = "",
                   fontsize: int = 60,
                   stroke_color: str = "#000000",
                   stroke_width: float = 1.5,
                   text_fore_color: str = "white",
                   text_background_color: str = "transparent",
                   bgm_file: str = "",
                   ):
    """
    Render the final video: burn subtitles into video_path, attach the
    narration audio, optionally mix in background music, and write output_file.

    :param video_path: the combined (muted) stock-video file
    :param audio_path: narration audio track
    :param subtitle_path: SRT file; skipped if empty or missing
    :param output_file: final mp4 path
    :param video_aspect: used only to pick the subtitle baseline position here
    :param threads: ffmpeg writer threads
    :param font_name: font file name inside the fonts dir ("" -> default)
    :param fontsize / stroke_* / text_*: subtitle styling
    :param bgm_file: background-music mp3 ("" disables bgm)
    """
    aspect = VideoAspect(video_aspect)
    video_width, video_height = aspect.to_resolution()
    logger.info(f"start, video size: {video_width} x {video_height}")
    logger.info(f" ① video: {video_path}")
    logger.info(f" ② audio: {audio_path}")
    logger.info(f" ③ subtitle: {subtitle_path}")
    logger.info(f" ④ output: {output_file}")
    if not font_name:
        font_name = "STHeitiMedium.ttc"
    font_path = os.path.join(utils.font_dir(), font_name)
    logger.info(f"using font: {font_path}")

    # Custom TextClip generator that applies line-wrapping first.
    def generator(txt):
        # Wrap long lines to fit the video width, leaving a 100 px margin;
        # adjust max_width to fit your video.
        wrapped_txt = wrap_text(txt, max_width=video_width - 100,
                                font=font_path,
                                fontsize=fontsize)
        return TextClip(
            wrapped_txt,
            font=font_path,
            fontsize=fontsize,
            color=text_fore_color,
            bg_color=text_background_color,
            stroke_color=stroke_color,
            stroke_width=stroke_width,
            print_cmd=False,
        )

    # Subtitle baseline: 200 px above the bottom edge (100 px for landscape).
    position_height = video_height - 200
    if video_aspect == VideoAspect.landscape:
        position_height = video_height - 100

    clips = [
        VideoFileClip(video_path),
        # subtitles.set_position(lambda _t: ('center', position_height))
    ]
    # Burn the subtitles into the video
    if subtitle_path and os.path.exists(subtitle_path):
        subtitles = SubtitlesClip(subtitle_path, generator)
        clips.append(subtitles.set_position(lambda _t: ('center', position_height)))
    result = CompositeVideoClip(clips)

    # Add the audio
    audio = AudioFileClip(audio_path)
    result = result.set_audio(audio)

    temp_output_file = f"{output_file}.temp.mp4"
    logger.info(f"writing to temp file: {temp_output_file}")
    result.write_videofile(temp_output_file, threads=threads or 2)

    video_clip = VideoFileClip(temp_output_file)
    if bgm_file:
        logger.info(f"adding background music: {bgm_file}")
        original_duration = video_clip.duration
        original_audio = video_clip.audio
        song_clip = AudioFileClip(bgm_file).set_fps(44100)
        # Mix the song in at 20% volume under the narration.
        song_clip = song_clip.volumex(0.2).set_fps(44100)
        # Add the song to the video
        comp_audio = CompositeAudioClip([original_audio, song_clip])
        video_clip = video_clip.set_audio(comp_audio)
        video_clip = video_clip.set_fps(30)
        video_clip = video_clip.set_duration(original_duration)
    # Re-encode the audio as AAC, otherwise playback fails on iPhone.
    logger.info(f"encoding audio codec to aac")
    video_clip.write_videofile(output_file, audio_codec="aac", threads=threads)
    # delete the temp file
    os.remove(temp_output_file)
    logger.success(f"completed")
if __name__ == "__main__":
    # Ad-hoc checks: exercise text wrapping, then re-render an existing task.
    txt = "hello 幸福经常被描述为最终人生目标和人类追求的核心 但它通常涉及对个人生活中意义和目的的深刻感悟"
    font = utils.resource_dir() + "/fonts/STHeitiMedium.ttc"
    t = wrap_text(text=txt, max_width=1000, font=font, fontsize=60)
    print(t)

    task_id = "c12fd1e6-4b0a-4d65-a075-c87abe35a072"
    task_dir = utils.task_dir(task_id)
    video_file = f"{task_dir}/combined.mp4"
    audio_file = f"{task_dir}/audio.mp3"
    subtitle_file = f"{task_dir}/subtitle.srt"
    output_file = f"{task_dir}/final.mp4"
    generate_video(video_path=video_file,
                   audio_path=audio_file,
                   subtitle_path=subtitle_file,
                   output_file=output_file,
                   video_aspect=VideoAspect.portrait,
                   threads=2,
                   font_name="STHeitiMedium.ttc",
                   fontsize=60,
                   stroke_color="#000000",
                   stroke_width=1.5,
                   text_fore_color="white",
                   text_background_color="transparent",
                   bgm_file=""
                   )

101
app/services/voice.py Normal file
View File

@@ -0,0 +1,101 @@
import asyncio
from xml.sax.saxutils import unescape
from edge_tts.submaker import mktimestamp
from loguru import logger
from edge_tts import submaker, SubMaker
import edge_tts
from app.utils import utils
def tts(text: str, voice_name: str, voice_file: str) -> SubMaker:
    """
    Synthesize speech with edge-tts, write the mp3 to voice_file, and return
    a SubMaker holding the word-boundary timings for subtitle generation.
    """
    logger.info(f"start, voice name: {voice_name}")

    async def _synthesize() -> SubMaker:
        communicate = edge_tts.Communicate(text, voice_name)
        maker = edge_tts.SubMaker()
        with open(voice_file, "wb") as audio_out:
            async for chunk in communicate.stream():
                chunk_type = chunk["type"]
                if chunk_type == "audio":
                    audio_out.write(chunk["data"])
                elif chunk_type == "WordBoundary":
                    maker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
        return maker

    sub_maker = asyncio.run(_synthesize())
    logger.info(f"completed, output file: {voice_file}")
    return sub_maker
def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
    """
    Build an optimized SRT file from edge-tts word boundaries:
    1. split the script into lines at punctuation
    2. accumulate word boundaries until they match one script line
    3. write one SRT cue per matched line
    """
    def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
        """
        1
        00:00:00,000 --> 00:00:02,360
        跑步是一项简单易行的运动
        """
        start_t = mktimestamp(start_time).replace(".", ",")
        end_t = mktimestamp(end_time).replace(".", ",")
        return (
            f"{idx}\n"
            f"{start_t} --> {end_t}\n"
            f"{sub_text}\n"
        )

    start_time = -1.0
    sub_items = []
    sub_index = 0
    script_lines = utils.split_string_by_punctuations(text)
    sub_line = ""
    for _, (offset, sub) in enumerate(zip(sub_maker.offset, sub_maker.subs)):
        _start_time, end_time = offset
        if start_time < 0:
            start_time = _start_time
        sub = unescape(sub)
        sub_line += sub
        # fix: guard the index — when the TTS output and the script diverge,
        # sub_index could run past script_lines and raise IndexError.
        if sub_index < len(script_lines) and sub_line == script_lines[sub_index]:
            sub_index += 1
            sub_items.append(formatter(
                idx=sub_index,
                start_time=start_time,
                end_time=end_time,
                sub_text=sub_line,
            ))
            start_time = -1.0
            sub_line = ""
    with open(subtitle_file, "w", encoding="utf-8") as file:
        file.write("\n".join(sub_items))
if __name__ == "__main__":
    # Ad-hoc smoke test: synthesize the same text with several zh-CN voices
    # and generate a subtitle file for each run.
    temp_dir = utils.storage_dir("temp")
    voice_names = [
        # female voices
        "zh-CN-XiaoxiaoNeural",
        "zh-CN-XiaoyiNeural",
        # male voices
        "zh-CN-YunyangNeural",
        "zh-CN-YunxiNeural",
    ]
    text = """
    预计未来3天深圳冷空气活动频繁未来两天持续阴天有小雨出门带好雨具
    10-11日持续阴天有小雨日温差小气温在13-17℃之间体感阴凉
    12日天气短暂好转早晚清凉
    """
    for voice_name in voice_names:
        voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
        subtitle_file = f"{temp_dir}/tts.mp3.srt"
        sub_maker = tts(text=text, voice_name=voice_name, voice_file=voice_file)
        create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)