43 Commits

Author SHA1 Message Date
Harry
414bcb0621 Merge pull request #264 from harry0703/dev
support azure new speech voice and fix the bug where clips were not closed
2024-04-16 09:00:10 +08:00
harry
d4eb7bc333 optimize code 2024-04-15 17:47:10 +08:00
harry
1e96357f00 fix the bug where the last subtitle line was missing 2024-04-15 17:46:56 +08:00
harry
2e58d7ccf2 fix the bug where clips were not closed 2024-04-15 17:46:24 +08:00
harry
176660b442 support azure new speech voice 2024-04-15 17:45:05 +08:00
Harry
b9b9bea2a6 Merge pull request #261 from KevinZhang19870314/main
chore: add video download api endpoint
2024-04-15 17:23:29 +08:00
Kevin Zhang
17df9a1f27 Merge branch 'harry0703:main' into main 2024-04-15 14:49:36 +08:00
kevin.zhang
00052b4c50 chore: add video download api endpoint 2024-04-15 14:47:57 +08:00
Harry
b8369349ea Merge pull request #260 from harry0703/dev
optimize subtitle segmentation and code
2024-04-15 11:30:33 +08:00
Harry
3de3e19276 Merge pull request #259 from KevinZhang19870314/main
refactor: video stream api revise
2024-04-15 11:30:10 +08:00
harry
bd33419460 optimize subtitle segmentation
optimize code
2024-04-15 11:29:04 +08:00
kevin.zhang
d13a3cf6e9 refactor: Streaming MP4 files in the browser using video html element instead of waiting for the entire file to download before playing 2024-04-15 09:51:40 +08:00
Harry
3e4d5f52fd Merge pull request #258 from harry0703/dev
update readme
2024-04-14 21:18:11 +08:00
harry
9a1ee9abfb update readme 2024-04-14 21:16:51 +08:00
Harry
2c41e6be62 Merge pull request #254 from harry0703/dev
optimize segmentation
2024-04-13 21:51:33 +08:00
harry
a17d52c1ae optimize segmentation 2024-04-13 21:50:45 +08:00
Harry
b1506b9161 Merge pull request #253 from harry0703/dev
optimize segmentation
2024-04-13 21:04:22 +08:00
harry
53923e0d25 optimize segmentation 2024-04-13 21:03:55 +08:00
Harry
1a302a1791 Merge pull request #252 from harry0703/dev
fix some bugs
2024-04-13 20:26:52 +08:00
harry
ce0f557702 1. prioritize using the bgm_file.
2. optimize the logic for looping the BGM.
2024-04-13 20:24:09 +08:00
harry
a8d208bdc3 added validation for video files using moviepy to ensure a video file is valid before processing. 2024-04-13 20:19:08 +08:00
Harry
0cb71d6218 Merge pull request #251 from harry0703/dev
write_videofile set fps=30
2024-04-12 22:02:25 +08:00
harry
52b92d175d write_videofile set fps=30 2024-04-12 22:02:01 +08:00
Harry
76e1407d9b Merge pull request #250 from harry0703/dev
update readme
2024-04-12 18:54:51 +08:00
harry
26437a666c update readme 2024-04-12 18:54:25 +08:00
Harry
8907958fec Merge pull request #249 from KevinZhang19870314/main
add password support for redis state and stream api support for video
2024-04-12 17:54:35 +08:00
kevin.zhang
0550e433d1 Merge branch 'main' of https://github.com/KevinZhang19870314/MoneyPrinterTurbo 2024-04-12 17:47:14 +08:00
kevin.zhang
1fb3399b02 chore: add stream support for video 2024-04-12 17:43:21 +08:00
Harry
9ab13a74a2 Merge pull request #248 from elf-mouse/main
fix: response parsing bug for gemini
2024-04-12 17:03:01 +08:00
elf-mouse
1dbfcfadab Merge branch 'main' of github.com:elf-mouse/MoneyPrinterTurbo 2024-04-12 16:50:32 +08:00
elf-mouse
ee7306d216 fix: response parsing bug for gemini 2024-04-12 15:49:23 +08:00
Harry
a8b54415a5 Merge pull request #242 from harry0703/dev
update readme
2024-04-12 10:22:40 +08:00
harry
7a8e25dc36 update readme 2024-04-12 10:22:01 +08:00
Harry
c8adc453ae Merge pull request #241 from harry0703/dev
add qwen error logs
2024-04-12 10:08:08 +08:00
harry
24a9ca514e add qwen error logs 2024-04-12 10:05:14 +08:00
harry
a7466b2393 add qwen error logs 2024-04-12 10:04:52 +08:00
Harry
1f2b36a4f0 Merge pull request #240 from harry0703/dev
fix webui.bat and docker-compose.yml
2024-04-11 23:40:21 +08:00
harry
91218ecf95 fix webui.bat and docker-compose.yml 2024-04-11 23:39:49 +08:00
Harry
a0a5a4059f Merge pull request #238 from highkay/main
Add Cloudflare Workers AI as an LLM backend
2024-04-11 23:11:17 +08:00
highkay
90f0f560b2 Merge branch 'main' of https://github.com/harry0703/MoneyPrinterTurbo into main 2024-04-11 22:55:18 +08:00
highkay
05da4a3766 - Add Cloudflare Workers AI as an LLM backend
- Add some .gitignore entries
2024-04-11 22:55:08 +08:00
Harry
066e33def9 Merge pull request #237 from harry0703/dev
update readme
2024-04-11 22:03:17 +08:00
harry
bb66b7e10c update readme 2024-04-11 22:02:00 +08:00
20 changed files with 429 additions and 71 deletions

View File

@@ -21,3 +21,4 @@ __pycache__/
.svn/
storage/
config.toml

.gitignore (vendored): 2 lines changed
View File

@@ -10,3 +10,5 @@
/*/__pycache__/*
.vscode
/**/.streamlit
__pycache__
logs/

View File

@@ -66,6 +66,9 @@
- [ ] Support more speech synthesis providers, such as OpenAI TTS and Azure TTS
- [ ] Automatically upload to YouTube
## Discussion 💬
<img src="docs/wechat-03.jpg" width="300">
## Video Demos 📺
### Portrait 9:16
@@ -102,8 +105,17 @@
</tbody>
</table>
## System Requirements 📦
- Recommended minimum: 4 CPU cores or more and 8 GB of RAM or more; a GPU is not required
- Windows 10 or macOS 11.0 or later
## Installation & Deployment 📥
> If you don't want to deploy it yourself, you can download the installer package, unzip it, and use it directly
- **Windows** build download
- Baidu Netdisk: https://pan.baidu.com/s/1BB3SGtAFTytzFLS5t2d8Gg?pwd=5bry
### Prerequisites
- Avoid **paths containing Chinese characters** to prevent unpredictable problems
- Make sure your **network** connection is working; a VPN needs `global traffic` mode enabled
@@ -230,8 +242,8 @@ python main.py
Two subtitle generation methods are currently supported:
- edge: faster generation with better performance; no special hardware requirements, but quality may be unstable
- whisper: slower generation with worse performance; some hardware requirements, but more reliable quality.
- **edge**: `fast` generation with better performance; no special hardware requirements, but quality may be unstable
- **whisper**: `slow` generation with worse performance; some hardware requirements, but `more reliable quality`
Switch between them via the `subtitle_provider` key in the `config.toml` config file
@@ -241,6 +253,25 @@ python main.py
1. whisper mode requires downloading a model file (about 3 GB) from HuggingFace; make sure your network is stable
2. If left empty, no subtitles are generated.
> Since HuggingFace is not reachable from mainland China, the `whisper-large-v3` model files can be downloaded as follows
Download links:
- Baidu Netdisk: https://pan.baidu.com/s/11h3Q6tsDtjQKTjUu3sc5cA?pwd=xjs9
- Quark Netdisk: https://pan.quark.cn/s/3ee3d991d64b
After downloading, unzip the model and place the whole directory under `.\MoneyPrinterTurbo\models`,
so the final file path looks like this: `.\MoneyPrinterTurbo\models\whisper-large-v3`
```
MoneyPrinterTurbo
├─models
│ └─whisper-large-v3
│ config.json
│ model.bin
│ preprocessor_config.json
│ tokenizer.json
│ vocabulary.json
```
## Background Music 🎵
Background music for videos lives in the project's `resource/songs` directory.
@@ -375,14 +406,6 @@ pip install Pillow==8.4.0
- You can submit an [issue](https://github.com/harry0703/MoneyPrinterTurbo/issues)
  or a [pull request](https://github.com/harry0703/MoneyPrinterTurbo/pulls).
- You can also follow my **Douyin** and **WeChat Channels** account: `网旭哈瑞.AI`
- I post **tutorials** and **technical** content there.
- Updates and improvements are **announced there promptly**
- You can also **leave comments** there, and I will **reply as soon as possible**
| Douyin | | WeChat Channels |
|:---------------------------------------:|:------------:|:-------------------------------------------:|
| <img src="docs/douyin.jpg" width="180"> | | <img src="docs/shipinghao.jpg" width="200"> |
## Reference Projects 📚

View File

@@ -36,6 +36,8 @@ def save_config():
_cfg["app"] = app
_cfg["whisper"] = whisper
_cfg["pexels"] = pexels
_cfg["azure"] = azure
_cfg["ui"] = ui
f.write(toml.dumps(_cfg))
@@ -43,6 +45,7 @@ _cfg = load_config()
app = _cfg.get("app", {})
whisper = _cfg.get("whisper", {})
pexels = _cfg.get("pexels", {})
azure = _cfg.get("azure", {})
ui = _cfg.get("ui", {})
hostname = socket.gethostname()
@@ -53,7 +56,7 @@ listen_port = _cfg.get("listen_port", 8080)
project_name = _cfg.get("project_name", "MoneyPrinterTurbo")
project_description = _cfg.get("project_description",
"<a href='https://github.com/harry0703/MoneyPrinterTurbo'>https://github.com/harry0703/MoneyPrinterTurbo</a>")
project_version = _cfg.get("project_version", "1.1.0")
project_version = _cfg.get("project_version", "1.1.2")
reload_debug = False
imagemagick_path = app.get("imagemagick_path", "")
@@ -63,3 +66,5 @@ if imagemagick_path and os.path.isfile(imagemagick_path):
ffmpeg_path = app.get("ffmpeg_path", "")
if ffmpeg_path and os.path.isfile(ffmpeg_path):
os.environ["IMAGEIO_FFMPEG_EXE"] = ffmpeg_path
logger.info(f"{project_name} v{project_version}")

View File

@@ -1,8 +1,10 @@
import os
import glob
import pathlib
import shutil
from fastapi import Request, Depends, Path, BackgroundTasks, UploadFile
from fastapi.responses import FileResponse, StreamingResponse
from fastapi.params import File
from loguru import logger
@@ -78,7 +80,7 @@ def get_task(request: Request, task_id: str = Path(..., description="Task ID"),
@router.delete("/tasks/{task_id}", response_model=TaskDeletionResponse, summary="Delete a generated short video task")
def create_video(request: Request, task_id: str = Path(..., description="Task ID")):
def delete_video(request: Request, task_id: str = Path(..., description="Task ID")):
request_id = base.get_task_id(request)
task = sm.state.get_task(task_id)
if task:
@@ -89,7 +91,7 @@ def create_video(request: Request, task_id: str = Path(..., description="Task ID
sm.state.delete_task(task_id)
logger.success(f"video deleted: {utils.to_json(task)}")
return utils.get_response(200, task)
return utils.get_response(200)
raise HttpException(task_id=task_id, status_code=404, message=f"{request_id}: task not found")
@@ -130,3 +132,63 @@ def upload_bgm_file(request: Request, file: UploadFile = File(...)):
return utils.get_response(200, response)
raise HttpException('', status_code=400, message=f"{request_id}: Only *.mp3 files can be uploaded")
@router.get("/stream/{file_path:path}")
async def stream_video(request: Request, file_path: str):
    tasks_dir = utils.task_dir()
    video_path = os.path.join(tasks_dir, file_path)
    range_header = request.headers.get('Range')
    video_size = os.path.getsize(video_path)
    start, end = 0, video_size - 1

    length = video_size
    if range_header:
        range_ = range_header.split('bytes=')[1]
        start, end = [int(part) if part else None for part in range_.split('-')]
        if start is None:
            start = video_size - end
            end = video_size - 1
        if end is None:
            end = video_size - 1
        length = end - start + 1

    def file_iterator(file_path, offset=0, bytes_to_read=None):
        with open(file_path, 'rb') as f:
            f.seek(offset, os.SEEK_SET)
            remaining = bytes_to_read or video_size
            while remaining > 0:
                bytes_to_read = min(4096, remaining)
                data = f.read(bytes_to_read)
                if not data:
                    break
                remaining -= len(data)
                yield data

    response = StreamingResponse(file_iterator(video_path, start, length), media_type='video/mp4')
    response.headers['Content-Range'] = f'bytes {start}-{end}/{video_size}'
    response.headers['Accept-Ranges'] = 'bytes'
    response.headers['Content-Length'] = str(length)
    response.status_code = 206  # Partial Content

    return response
@router.get("/download/{file_path:path}")
async def download_video(_: Request, file_path: str):
    """
    download video
    :param _: Request request
    :param file_path: video file path, eg: /cd1727ed-3473-42a2-a7da-4faafafec72b/final-1.mp4
    :return: video file
    """
    tasks_dir = utils.task_dir()
    video_path = os.path.join(tasks_dir, file_path)
    file_path = pathlib.Path(video_path)
    filename = file_path.stem
    extension = file_path.suffix
    headers = {
        "Content-Disposition": f"attachment; filename={filename}{extension}"
    }
    return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}",
                        media_type=f'video/{extension[1:]}')
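The `stream_video` handler above hand-parses the `Range` header. The same parsing logic isolated as a pure helper (the name and signature are illustrative, not part of the patch):

```python
def parse_range_header(range_header, file_size):
    """Return (start, end, length) for an HTTP 'Range: bytes=...' header.

    Mirrors stream_video's logic: 'bytes=a-b', the open-ended 'bytes=a-',
    and the suffix form 'bytes=-n' (last n bytes) are all accepted.
    A missing header means the whole file.
    """
    start, end = 0, file_size - 1
    if range_header:
        range_ = range_header.split("bytes=")[1]
        start_s, end_s = range_.split("-")
        start = int(start_s) if start_s else None
        end = int(end_s) if end_s else None
        if start is None:        # suffix form: last `end` bytes
            start = file_size - end
            end = file_size - 1
        if end is None:          # open-ended form: to end of file
            end = file_size - 1
    return start, end, end - start + 1


print(parse_range_header("bytes=0-499", 1000))  # -> (0, 499, 500)
print(parse_range_header("bytes=-200", 1000))   # -> (800, 999, 200)
```

The handler then advertises the served slice via `Content-Range: bytes {start}-{end}/{size}` and returns status 206 (Partial Content).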

View File

@@ -59,6 +59,11 @@ def _generate_response(prompt: str) -> str:
api_key = config.app.get("qwen_api_key")
model_name = config.app.get("qwen_model_name")
base_url = "***"
elif llm_provider == "cloudflare":
api_key = config.app.get("cloudflare_api_key")
model_name = config.app.get("cloudflare_model_name")
account_id = config.app.get("cloudflare_account_id")
base_url = "***"
else:
raise ValueError("llm_provider is not set, please set it in the config.toml file.")
@@ -71,17 +76,31 @@ def _generate_response(prompt: str) -> str:
if llm_provider == "qwen":
import dashscope
from dashscope.api_entities.dashscope_response import GenerationResponse
dashscope.api_key = api_key
response = dashscope.Generation.call(
model=model_name,
messages=[{"role": "user", "content": prompt}]
)
content = response["output"]["text"]
return content.replace("\n", "")
if response:
if isinstance(response, GenerationResponse):
status_code = response.status_code
if status_code != 200:
raise Exception(
f"[{llm_provider}] returned an error response: \"{response}\"")
content = response["output"]["text"]
return content.replace("\n", "")
else:
raise Exception(
f"[{llm_provider}] returned an invalid response: \"{response}\"")
else:
raise Exception(
f"[{llm_provider}] returned an empty response")
if llm_provider == "gemini":
import google.generativeai as genai
genai.configure(api_key=api_key)
genai.configure(api_key=api_key, transport='rest')
generation_config = {
"temperature": 0.5,
@@ -113,10 +132,30 @@ def _generate_response(prompt: str) -> str:
generation_config=generation_config,
safety_settings=safety_settings)
convo = model.start_chat(history=[])
try:
response = model.generate_content(prompt)
candidates = response.candidates
generated_text = candidates[0].content.parts[0].text
except (AttributeError, IndexError) as e:
print("Gemini Error:", e)
convo.send_message(prompt)
return convo.last.text
return generated_text
if llm_provider == "cloudflare":
import requests
response = requests.post(
f"https://api.cloudflare.com/client/v4/accounts/{account_id}/ai/run/{model_name}",
headers={"Authorization": f"Bearer {api_key}"},
json={
"messages": [
{"role": "system", "content": "You are a friendly assistant"},
{"role": "user", "content": prompt}
]
}
)
result = response.json()
logger.info(result)
return result["result"]["response"]
if llm_provider == "azure":
client = AzureOpenAI(

View File

@@ -5,6 +5,7 @@ from urllib.parse import urlencode
import requests
from typing import List
from loguru import logger
from moviepy.video.io.VideoFileClip import VideoFileClip
from app.config import config
from app.models.schema import VideoAspect, VideoConcatMode, MaterialInfo
@@ -105,7 +106,19 @@ def save_video(video_url: str, save_dir: str = "") -> str:
f.write(requests.get(video_url, proxies=proxies, verify=False, timeout=(60, 240)).content)
if os.path.exists(video_path) and os.path.getsize(video_path) > 0:
return video_path
        try:
            clip = VideoFileClip(video_path)
            duration = clip.duration
            fps = clip.fps
            clip.close()
            if duration > 0 and fps > 0:
                return video_path
        except Exception as e:
            try:
                os.remove(video_path)
            except Exception:
                # deliberately unnamed: `except Exception as e` here would
                # shadow (and on exit unbind) the outer `e` used below
                pass
            logger.warning(f"invalid video file: {video_path} => {str(e)}")
        return ""
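The validation added to `save_video` above is a download-then-probe-or-delete pattern. A dependency-free sketch of that pattern, with `probe` standing in for moviepy's `VideoFileClip` (an assumption made so the example runs without moviepy):

```python
import os


def keep_if_valid(path, probe):
    """Return `path` if probe(path) yields a positive (duration, fps);
    otherwise delete the file and return "".

    `probe` is a stand-in for opening the clip with VideoFileClip and
    reading clip.duration / clip.fps; it may raise on a corrupt file.
    """
    try:
        duration, fps = probe(path)
        if duration > 0 and fps > 0:
            return path
    except Exception:
        pass
    try:
        os.remove(path)  # drop the corrupt or partial download
    except OSError:
        pass
    return ""
```

For example, `keep_if_valid("clip.mp4", lambda p: (12.0, 30.0))` keeps the file, while a probe that raises (or reports zero duration) causes the file to be deleted and `""` returned.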

View File

@@ -44,9 +44,9 @@ class MemoryState(BaseState):
# Redis state management
class RedisState(BaseState):
def __init__(self, host='localhost', port=6379, db=0):
def __init__(self, host='localhost', port=6379, db=0, password=None):
import redis
self._redis = redis.StrictRedis(host=host, port=port, db=db)
self._redis = redis.StrictRedis(host=host, port=port, db=db, password=password)
def update_task(self, task_id: str, state: int = const.TASK_STATE_PROCESSING, progress: int = 0, **kwargs):
progress = int(progress)
@@ -98,5 +98,6 @@ _enable_redis = config.app.get("enable_redis", False)
_redis_host = config.app.get("redis_host", "localhost")
_redis_port = config.app.get("redis_port", 6379)
_redis_db = config.app.get("redis_db", 0)
_redis_password = config.app.get("redis_password", None)
state = RedisState(host=_redis_host, port=_redis_port, db=_redis_db) if _enable_redis else MemoryState()
state = RedisState(host=_redis_host, port=_redis_port, db=_redis_db, password=_redis_password) if _enable_redis else MemoryState()

View File

@@ -13,15 +13,16 @@ from app.utils import utils
def get_bgm_file(bgm_type: str = "random", bgm_file: str = ""):
if not bgm_type:
return ""
if bgm_file and os.path.exists(bgm_file):
return bgm_file
if bgm_type == "random":
suffix = "*.mp3"
song_dir = utils.song_dir()
files = glob.glob(os.path.join(song_dir, suffix))
return random.choice(files)
if os.path.exists(bgm_file):
return bgm_file
return ""
@@ -99,16 +100,18 @@ def combine_videos(combined_video_path: str,
clips.append(clip)
video_duration += clip.duration
final_clip = concatenate_videoclips(clips)
final_clip = final_clip.set_fps(30)
video_clip = concatenate_videoclips(clips)
video_clip = video_clip.set_fps(30)
logger.info(f"writing")
# https://github.com/harry0703/MoneyPrinterTurbo/issues/111#issuecomment-2032354030
final_clip.write_videofile(filename=combined_video_path,
video_clip.write_videofile(filename=combined_video_path,
threads=threads,
logger=None,
temp_audiofile_path=output_dir,
audio_codec="aac",
fps=30,
)
video_clip.close()
logger.success(f"completed")
return combined_video_path
@@ -126,7 +129,7 @@ def wrap_text(text, max_width, font='Arial', fontsize=60):
if width <= max_width:
return text, height
logger.warning(f"wrapping text, max_width: {max_width}, text_width: {width}, text: {text}")
# logger.warning(f"wrapping text, max_width: {max_width}, text_width: {width}, text: {text}")
processed = True
@@ -150,7 +153,7 @@ def wrap_text(text, max_width, font='Arial', fontsize=60):
_wrapped_lines_ = [line.strip() for line in _wrapped_lines_]
result = '\n'.join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
logger.warning(f"wrapped text: {result}")
# logger.warning(f"wrapped text: {result}")
return result, height
_wrapped_lines_ = []
@@ -167,7 +170,7 @@ def wrap_text(text, max_width, font='Arial', fontsize=60):
_wrapped_lines_.append(_txt_)
result = '\n'.join(_wrapped_lines_).strip()
height = len(_wrapped_lines_) * height
logger.warning(f"wrapped text: {result}")
# logger.warning(f"wrapped text: {result}")
return result, height
@@ -244,19 +247,24 @@ def generate_video(video_path: str,
bgm_file = get_bgm_file(bgm_type=params.bgm_type, bgm_file=params.bgm_file)
if bgm_file:
bgm_clip = (AudioFileClip(bgm_file)
.set_duration(video_clip.duration)
.volumex(params.bgm_volume)
.audio_fadeout(3))
try:
bgm_clip = (AudioFileClip(bgm_file)
.volumex(params.bgm_volume)
.audio_fadeout(3))
bgm_clip = afx.audio_loop(bgm_clip, duration=video_clip.duration)
audio_clip = CompositeAudioClip([audio_clip, bgm_clip])
except Exception as e:
logger.error(f"failed to add bgm: {str(e)}")
audio_clip = CompositeAudioClip([audio_clip, bgm_clip])
video_clip = video_clip.set_audio(audio_clip)
video_clip.write_videofile(output_file,
audio_codec="aac",
temp_audiofile_path=output_dir,
threads=params.n_threads or 2,
logger=None)
logger=None,
fps=30,
)
video_clip.close()
logger.success(f"completed")
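The reordered `get_bgm_file` earlier in this file's diff gives an explicit, existing `bgm_file` priority over the random pick. The decision order as a self-contained sketch (`list_songs` is an illustrative stand-in for the `glob` over `resource/songs`; an empty-directory guard is added here that the patch itself does not have):

```python
import os
import random


def pick_bgm(bgm_type, bgm_file, list_songs):
    if not bgm_type:
        return ""                        # BGM disabled
    if bgm_file and os.path.exists(bgm_file):
        return bgm_file                  # 1) explicit existing file wins
    if bgm_type == "random":
        files = list_songs()             # 2) fall back to a random song
        if files:
            return random.choice(files)
    return ""                            # 3) nothing to play


print(pick_bgm("random", "", lambda: ["resource/songs/a.mp3"]))
# -> resource/songs/a.mp3
```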

View File

@@ -1,6 +1,7 @@
import asyncio
import os
import re
from datetime import datetime
from xml.sax.saxutils import unescape
from edge_tts.submaker import mktimestamp
from loguru import logger
@@ -8,10 +9,11 @@ from edge_tts import submaker, SubMaker
import edge_tts
from moviepy.video.tools import subtitles
from app.config import config
from app.utils import utils
def get_all_voices(filter_locals=None) -> list[str]:
def get_all_azure_voices(filter_locals=None) -> list[str]:
if filter_locals is None:
filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW"]
voices_str = """
@@ -956,6 +958,34 @@ Gender: Female
Name: zu-ZA-ThembaNeural
Gender: Male
Name: en-US-AvaMultilingualNeural-V2
Gender: Female
Name: en-US-AndrewMultilingualNeural-V2
Gender: Male
Name: en-US-EmmaMultilingualNeural-V2
Gender: Female
Name: en-US-BrianMultilingualNeural-V2
Gender: Male
Name: de-DE-FlorianMultilingualNeural-V2
Gender: Male
Name: de-DE-SeraphinaMultilingualNeural-V2
Gender: Female
Name: fr-FR-RemyMultilingualNeural-V2
Gender: Male
Name: fr-FR-VivienneMultilingualNeural-V2
Gender: Female
Name: zh-CN-XiaoxiaoMultilingualNeural-V2
Gender: Female
""".strip()
voices = []
name = ''
@@ -986,11 +1016,26 @@ Gender: Male
def parse_voice_name(name: str):
    # zh-CN-XiaoyiNeural-Female
    # zh-CN-YunxiNeural-Male
    # zh-CN-XiaoxiaoMultilingualNeural-V2-Female
    name = name.replace("-Female", "").replace("-Male", "").strip()
    return name


def is_azure_v2_voice(voice_name: str):
    voice_name = parse_voice_name(voice_name)
    print(voice_name)
    if voice_name.endswith("-V2"):
        return voice_name.replace("-V2", "").strip()
    return ""


def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
    if is_azure_v2_voice(voice_name):
        return azure_tts_v2(text, voice_name, voice_file)
    return azure_tts_v1(text, voice_name, voice_file)


def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
    text = text.strip()
    for i in range(3):
        try:
@@ -1019,14 +1064,82 @@ def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
return None
def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
    """
    Optimize the subtitle file:
    1. Split the subtitle text into multiple lines by punctuation
    2. Match each line against the text in the subtitle file
    3. Generate a new subtitle file
    """
    text = text.replace("\n", " ")
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
    voice_name = is_azure_v2_voice(voice_name)
    if not voice_name:
        logger.error(f"invalid voice name: {voice_name}")
        raise ValueError(f"invalid voice name: {voice_name}")
    text = text.strip()

    def _format_duration_to_offset(duration) -> int:
        if isinstance(duration, str):
            time_obj = datetime.strptime(duration, "%H:%M:%S.%f")
            milliseconds = (time_obj.hour * 3600000) + (time_obj.minute * 60000) + (time_obj.second * 1000) + (
                    time_obj.microsecond // 1000)
            return milliseconds * 10000

        if isinstance(duration, int):
            return duration

        return 0

    for i in range(3):
        try:
            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")

            import azure.cognitiveservices.speech as speechsdk

            sub_maker = SubMaker()

            def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
                # print('WordBoundary event:')
                # print('\tBoundaryType: {}'.format(evt.boundary_type))
                # print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000)))
                # print('\tDuration: {}'.format(evt.duration))
                # print('\tText: {}'.format(evt.text))
                # print('\tTextOffset: {}'.format(evt.text_offset))
                # print('\tWordLength: {}'.format(evt.word_length))
                duration = _format_duration_to_offset(str(evt.duration))
                offset = _format_duration_to_offset(evt.audio_offset)
                sub_maker.subs.append(evt.text)
                sub_maker.offset.append((offset, offset + duration))

            # Creates an instance of a speech config with specified subscription key and service region.
            speech_key = config.azure.get("speech_key", "")
            service_region = config.azure.get("speech_region", "")
            audio_config = speechsdk.audio.AudioOutputConfig(filename=voice_file, use_default_speaker=True)
            speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                                   region=service_region)
            speech_config.speech_synthesis_voice_name = voice_name
            # speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary,
            #                            value='true')
            speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
                                       value='true')
            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
            speech_synthesizer = speechsdk.SpeechSynthesizer(audio_config=audio_config,
                                                             speech_config=speech_config)
            speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)
            result = speech_synthesizer.speak_text_async(text).get()
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                logger.success(f"azure v2 speech synthesis succeeded: {voice_file}")
                return sub_maker
            elif result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                logger.error(f"azure v2 speech synthesis canceled: {cancellation_details.reason}")
                if cancellation_details.reason == speechsdk.CancellationReason.Error:
                    logger.error(f"azure v2 speech synthesis error: {cancellation_details.error_details}")
            logger.info(f"completed, output file: {voice_file}")
        except Exception as e:
            logger.error(f"failed, error: {str(e)}")
    return None
def _format_text(text: str) -> str:
    # text = text.replace("\n", " ")
    text = text.replace("[", " ")
    text = text.replace("]", " ")
    text = text.replace("(", " ")
@@ -1034,6 +1147,18 @@ def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str)
    text = text.replace("{", " ")
    text = text.replace("}", " ")
    text = text.strip()
    return text


def create_subtitle(sub_maker: submaker.SubMaker, text: str, subtitle_file: str):
    """
    Optimize the subtitle file:
    1. Split the subtitle text into multiple lines by punctuation
    2. Match each line against the text in the subtitle file
    3. Generate a new subtitle file
    """
    text = _format_text(text)
def formatter(idx: int, start_time: float, end_time: float, sub_text: str) -> str:
"""
@@ -1125,8 +1250,12 @@ def get_audio_duration(sub_maker: submaker.SubMaker):
if __name__ == "__main__":
voices = get_all_voices()
print(voices)
voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
voice_name = parse_voice_name(voice_name)
voice_name = is_azure_v2_voice(voice_name)
print(voice_name)
voices = get_all_azure_voices()
print(len(voices))
@@ -1134,6 +1263,7 @@ if __name__ == "__main__":
temp_dir = utils.storage_dir("temp")
voice_names = [
"zh-CN-XiaoxiaoMultilingualNeural",
# female voices
"zh-CN-XiaoxiaoNeural",
"zh-CN-XiaoyiNeural",
@@ -1156,10 +1286,28 @@ if __name__ == "__main__":
"""
text = "[Opening scene: A sunny day in a suburban neighborhood. A young boy named Alex, around 8 years old, is playing in his front yard with his loyal dog, Buddy.]\n\n[Camera zooms in on Alex as he throws a ball for Buddy to fetch. Buddy excitedly runs after it and brings it back to Alex.]\n\nAlex: Good boy, Buddy! You're the best dog ever!\n\n[Buddy barks happily and wags his tail.]\n\n[As Alex and Buddy continue playing, a series of potential dangers loom nearby, such as a stray dog approaching, a ball rolling towards the street, and a suspicious-looking stranger walking by.]\n\nAlex: Uh oh, Buddy, look out!\n\n[Buddy senses the danger and immediately springs into action. He barks loudly at the stray dog, scaring it away. Then, he rushes to retrieve the ball before it reaches the street and gently nudges it back towards Alex. Finally, he stands protectively between Alex and the stranger, growling softly to warn them away.]\n\nAlex: Wow, Buddy, you're like my superhero!\n\n[Just as Alex and Buddy are about to head inside, they hear a loud crash from a nearby construction site. They rush over to investigate and find a pile of rubble blocking the path of a kitten trapped underneath.]\n\nAlex: Oh no, Buddy, we have to help!\n\n[Buddy barks in agreement and together they work to carefully move the rubble aside, allowing the kitten to escape unharmed. The kitten gratefully nuzzles against Buddy, who responds with a friendly lick.]\n\nAlex: We did it, Buddy! We saved the day again!\n\n[As Alex and Buddy walk home together, the sun begins to set, casting a warm glow over the neighborhood.]\n\nAlex: Thanks for always being there to watch over me, Buddy. You're not just my dog, you're my best friend.\n\n[Buddy barks happily and nuzzles against Alex as they disappear into the sunset, ready to face whatever adventures tomorrow may bring.]\n\n[End scene.]"
text = "大家好,我是乔哥,一个想帮你把信用卡全部还清的家伙!\n今天我们要聊的是信用卡的取现功能。\n你是不是也曾经因为一时的资金紧张而拿着信用卡到ATM机取现如果是那你得好好看看这个视频了。\n现在都2024年了我以为现在不会再有人用信用卡取现功能了。前几天一个粉丝发来一张图片取现1万。\n信用卡取现有三个弊端。\n信用卡取现功能代价可不小。会先收取一个取现手续费比如这个粉丝取现1万按2.5%收取手续费收取了250元。\n信用卡正常消费有最长56天的免息期但取现不享受免息期。从取现那一天开始每天按照万5收取利息这个粉丝用了11天收取了55元利息。\n三,频繁的取现行为,银行会认为你资金紧张,会被标记为高风险用户,影响你的综合评分和额度。\n那么,如果你资金紧张了,该怎么办呢?\n乔哥给你支一招用破思机摩擦信用卡只需要少量的手续费而且还可以享受最长56天的免息期。\n最后,如果你对玩卡感兴趣,可以找乔哥领取一本《卡神秘籍》,用卡过程中遇到任何疑惑,也欢迎找乔哥交流。\n别忘了关注乔哥回复用卡技巧免费领取《2024用卡技巧》让我们一起成为用卡高手"
text = """
2023全年业绩速览
公司全年累计实现营业收入1476.94亿元同比增长19.01%归母净利润747.34亿元同比增长19.16%。EPS达到59.49元。第四季度单季营业收入444.25亿元同比增长20.26%环比增长31.86%归母净利润218.58亿元同比增长19.33%环比增长29.37%。这一阶段
的业绩表现不仅突显了公司的增长动力和盈利能力,也反映出公司在竞争激烈的市场环境中保持了良好的发展势头。
2023年Q4业绩速览
第四季度营业收入贡献主要增长点销售费用高增致盈利能力承压税金同比上升27%,扰动净利率表现。
业绩解读
利润方面2023全年贵州茅台>归母净利润增速为19%其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%)
"""
text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人"
text = _format_text(text)
lines = utils.split_string_by_punctuations(text)
print(lines)
for voice_name in voice_names:
voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
subtitle_file = f"{temp_dir}/tts.mp3.srt"
sub_maker = tts(text=text, voice_name=voice_name, voice_file=voice_file)
sub_maker = azure_tts_v2(text=text, voice_name=voice_name, voice_file=voice_file)
create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
audio_duration = get_audio_duration(sub_maker)
print(f"voice: {voice_name}, audio duration: {audio_duration}s")
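The `azure_tts_v2` function above converts the SDK's `HH:MM:SS.ffffff` duration strings into 100-nanosecond ticks, the unit `SubMaker` offsets are expressed in. The conversion on its own (same format assumptions as the diff; the function name here is illustrative):

```python
from datetime import datetime


def duration_to_offset(duration):
    """Convert 'H:M:S.micro' strings (or raw int ticks) to 100-ns ticks."""
    if isinstance(duration, str):
        t = datetime.strptime(duration, "%H:%M:%S.%f")
        milliseconds = (
            t.hour * 3_600_000 + t.minute * 60_000 + t.second * 1_000
            + t.microsecond // 1000
        )
        return milliseconds * 10_000  # 1 ms = 10,000 ticks of 100 ns
    if isinstance(duration, int):
        return duration
    return 0


print(duration_to_offset("00:00:01.500000"))  # -> 15000000
```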

View File

@@ -163,12 +163,34 @@ def str_contains_punctuation(word):
def split_string_by_punctuations(s):
    result = []
    txt = ""

    previous_char = ""
    next_char = ""
    for i in range(len(s)):
        char = s[i]
        if char == "\n":
            result.append(txt.strip())
            txt = ""
            continue

        if i > 0:
            previous_char = s[i - 1]
        if i < len(s) - 1:
            next_char = s[i + 1]
        if char == "." and previous_char.isdigit() and next_char.isdigit():
            # e.g. a 2.5% fee: the "." in 2.5 must not be treated as a line break
            txt += char
            continue

        if char not in const.PUNCTUATIONS:
            txt += char
        else:
            result.append(txt.strip())
            txt = ""
    result.append(txt.strip())
    # filter out empty strings
    result = list(filter(None, result))
    return result
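The decimal-point guard added to `split_string_by_punctuations` above keeps a `.` that sits between two digits (the 2.5 in a 2.5% fee) from being treated as a sentence break. A self-contained sketch with an illustrative punctuation set (`const.PUNCTUATIONS` in the repo holds the real one):

```python
PUNCTUATIONS = set(".。,,!!??;;::")  # illustrative; the repo defines its own list


def split_by_punctuations(s):
    result, txt = [], ""
    for i, char in enumerate(s):
        prev_c = s[i - 1] if i > 0 else ""
        next_c = s[i + 1] if i < len(s) - 1 else ""
        if char == "\n":
            result.append(txt.strip())
            txt = ""
            continue
        # a dot between two digits is a decimal point, not a break
        if char == "." and prev_c.isdigit() and next_c.isdigit():
            txt += char
            continue
        if char in PUNCTUATIONS:
            result.append(txt.strip())
            txt = ""
        else:
            txt += char
    result.append(txt.strip())
    return [r for r in result if r]  # drop empty segments


print(split_by_punctuations("按2.5%收取手续费,收取了250元。"))
# -> ['按2.5%收取手续费', '收取了250元']
```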

View File

@@ -162,3 +162,9 @@
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
# http = "http://10.10.1.10:3128"
# https = "http://10.10.1.10:1080"
[azure]
# Azure Speech API Key
# Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
speech_key=""
speech_region=""

View File

@@ -1,5 +1,3 @@
version: "3"
x-common-volumes: &common-volumes
- ./:/MoneyPrinterTurbo

BIN
docs/wechat-03.jpg (new binary file, 165 KiB; preview not shown)

View File

@@ -17,3 +17,9 @@ dashscope~=1.15.0
google.generativeai~=0.4.1
python-multipart~=0.0.9
redis==5.0.3
# if you use pillow~=10.3.0, you will get "PIL.Image' has no attribute 'ANTIALIAS'" error when resize video
# please install opencv-python to fix "PIL.Image' has no attribute 'ANTIALIAS'" error
opencv-python
# for azure speech
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
azure-cognitiveservices-speech~=1.37.0

View File

@@ -1,2 +1,7 @@
@echo off
set CURRENT_DIR=%CD%
echo ***** Current directory: %CURRENT_DIR% *****
set PYTHONPATH=%CURRENT_DIR%
rem set HF_ENDPOINT=https://hf-mirror.com
streamlit run .\webui\Main.py --browser.gatherUsageStats=False --server.enableCORS=True

View File

@@ -1,5 +1,6 @@
import sys
import os
import time
# Add the root directory of the project to the system path to allow importing modules from the project
root_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
@@ -62,6 +63,7 @@ def get_all_fonts():
for file in files:
if file.endswith(".ttf") or file.endswith(".ttc"):
fonts.append(file)
fonts.sort()
return fonts
@@ -164,7 +166,6 @@ with st.expander(tr("Basic Settings"), expanded=False):
code = selected_language.split(" - ")[0].strip()
st.session_state['ui_language'] = code
config.ui['language'] = code
config.save_config()
with middle_config_panel:
# openai
@@ -175,7 +176,7 @@ with st.expander(tr("Basic Settings"), expanded=False):
# qwen (通义千问)
# gemini
# ollama
llm_providers = ['OpenAI', 'Moonshot', 'Azure', 'Qwen', 'Gemini', 'Ollama', 'G4f', 'OneAPI']
llm_providers = ['OpenAI', 'Moonshot', 'Azure', 'Qwen', 'Gemini', 'Ollama', 'G4f', 'OneAPI', "Cloudflare"]
saved_llm_provider = config.app.get("llm_provider", "OpenAI").lower()
saved_llm_provider_index = 0
for i, provider in enumerate(llm_providers):
@@ -190,6 +191,7 @@ with st.expander(tr("Basic Settings"), expanded=False):
llm_api_key = config.app.get(f"{llm_provider}_api_key", "")
llm_base_url = config.app.get(f"{llm_provider}_base_url", "")
llm_model_name = config.app.get(f"{llm_provider}_model_name", "")
llm_account_id = config.app.get(f"{llm_provider}_account_id", "")
st_llm_api_key = st.text_input(tr("API Key"), value=llm_api_key, type="password")
st_llm_base_url = st.text_input(tr("Base Url"), value=llm_base_url)
st_llm_model_name = st.text_input(tr("Model Name"), value=llm_model_name)
@@ -200,7 +202,10 @@ with st.expander(tr("Basic Settings"), expanded=False):
if st_llm_model_name:
config.app[f"{llm_provider}_model_name"] = st_llm_model_name
config.save_config()
if llm_provider == 'cloudflare':
st_llm_account_id = st.text_input(tr("Account ID"), value=llm_account_id)
if st_llm_account_id:
config.app[f"{llm_provider}_account_id"] = st_llm_account_id
with right_config_panel:
pexels_api_keys = config.app.get("pexels_api_keys", [])
@@ -212,7 +217,6 @@ with st.expander(tr("Basic Settings"), expanded=False):
pexels_api_key = pexels_api_key.replace(" ", "")
if pexels_api_key:
config.app["pexels_api_keys"] = pexels_api_key.split(",")
config.save_config()
panel = st.columns(3)
left_panel = panel[0]
@@ -295,20 +299,20 @@ with middle_panel:
index=0)
with st.container(border=True):
st.write(tr("Audio Settings"))
-voices = voice.get_all_voices(filter_locals=["zh-CN", "zh-HK", "zh-TW", "de-DE", "en-US"])
+voices = voice.get_all_azure_voices(filter_locals=["zh-CN", "zh-HK", "zh-TW", "de-DE", "en-US", "fr-FR"])
friendly_names = {
-voice: voice.
+v: v.
replace("Female", tr("Female")).
replace("Male", tr("Male")).
replace("Neural", "") for
-voice in voices}
+v in voices}
saved_voice_name = config.ui.get("voice_name", "")
saved_voice_name_index = 0
if saved_voice_name in friendly_names:
saved_voice_name_index = list(friendly_names.keys()).index(saved_voice_name)
else:
-for i, voice in enumerate(voices):
-    if voice.lower().startswith(st.session_state['ui_language'].lower()):
+for i, v in enumerate(voices):
+    if v.lower().startswith(st.session_state['ui_language'].lower()):
saved_voice_name_index = i
break
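Renaming the loop variable from `voice` to `v` in this hunk avoids shadowing the imported `voice` module, which is still needed below for `voice.is_azure_v2_voice(...)`. The default-selection logic itself is small enough to sketch in isolation:

```python
def default_voice_index(voices, saved_name, ui_language):
    """Pick the saved voice if it still exists, else the first voice matching the UI locale."""
    if saved_name in voices:
        return voices.index(saved_name)
    for i, v in enumerate(voices):
        # Voice names start with their locale, e.g. "en-US-JennyNeural-Female"
        if v.lower().startswith(ui_language.lower()):
            return i
    return 0
```

This mirrors the hunk's behavior: an explicit saved choice always wins, and the locale match is only a fallback.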
@@ -319,7 +323,13 @@ with middle_panel:
voice_name = list(friendly_names.keys())[list(friendly_names.values()).index(selected_friendly_name)]
params.voice_name = voice_name
config.ui['voice_name'] = voice_name
config.save_config()
if voice.is_azure_v2_voice(voice_name):
    saved_azure_speech_region = config.azure.get("speech_region", "")
    saved_azure_speech_key = config.azure.get("speech_key", "")
    azure_speech_region = st.text_input(tr("Speech Region"), value=saved_azure_speech_region)
    azure_speech_key = st.text_input(tr("Speech Key"), value=saved_azure_speech_key, type="password")
    config.azure["speech_region"] = azure_speech_region
    config.azure["speech_key"] = azure_speech_key
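The `is_azure_v2_voice` check gates the extra Region/Key inputs, since Azure's newer (V2) voices require a paid Speech resource rather than the free edge-tts path. The exact detection rule lives in the project's `voice` module; a plausible sketch, assuming V2 voices are tagged with a `-V2` marker in their names:

```python
def is_azure_v2_voice(voice_name: str) -> bool:
    # Assumption: V2 voice names carry a "-V2" marker,
    # e.g. "zh-CN-XiaoxiaoMultilingualNeural-V2-Female".
    # The authoritative rule is defined in the project's voice module.
    return "-V2" in voice_name
```

Gating on the voice name (rather than a separate checkbox) keeps the UI simple: the credential fields only appear when the selected voice actually needs them.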
params.voice_volume = st.selectbox(tr("Speech Volume"),
options=[0.6, 0.8, 1.0, 1.2, 1.5, 2.0, 3.0, 4.0, 5.0], index=2)
@@ -356,7 +366,6 @@ with right_panel:
saved_font_name_index = font_names.index(saved_font_name)
params.font_name = st.selectbox(tr("Font"), font_names, index=saved_font_name_index)
config.ui['font_name'] = params.font_name
config.save_config()
subtitle_positions = [
(tr("Top"), "top"),
@@ -439,3 +448,5 @@ if start_button:
open_task_folder(task_id)
logger.info(tr("Video Generation Completed"))
scroll_to_bottom()
config.save_config()


@@ -23,6 +23,8 @@
"Number of Videos Generated Simultaneously": "Anzahl der parallel generierten Videos",
"Audio Settings": "**Audio Einstellungen**",
"Speech Synthesis": "Sprachausgabe",
"Speech Region": "Region (:red[Erforderlich, [Region abrufen](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key (:red[Erforderlich, [API Key abrufen](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Lautstärke der Sprachausgabe",
"Male": "Männlich",
"Female": "Weiblich",


@@ -23,6 +23,8 @@
"Number of Videos Generated Simultaneously": "Number of Videos Generated Simultaneously",
"Audio Settings": "**Audio Settings**",
"Speech Synthesis": "Speech Synthesis Voice",
"Speech Region": "Region (:red[Required, [Get Region](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key (:red[Required, [Get API Key](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "Speech Volume (1.0 represents 100%)",
"Male": "Male",
"Female": "Female",
@@ -55,6 +57,7 @@
"LLM Provider": "LLM Provider",
"API Key": "API Key (:red[Required])",
"Base Url": "Base Url",
"Account ID": "Account ID (Get from Cloudflare dashboard)",
"Model Name": "Model Name",
"Please Enter the LLM API Key": "Please Enter the **LLM API Key**",
"Please Enter the Pexels API Key": "Please Enter the **Pexels API Key**",
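These locale files are flat JSON maps keyed by the English source string. The `tr()` helper used throughout the UI can be sketched as a dictionary lookup that falls back to the key itself when no translation exists (the two-argument signature here is an illustration; the app resolves the active locale internally):

```python
import json

def load_locale(path: str) -> dict:
    """Load one locale JSON file (UTF-8, flat key/value map)."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def tr(key: str, locale: dict) -> str:
    # Fall back to the English key when the translation is missing,
    # so untranslated strings still render instead of erroring.
    return locale.get(key, key)
```

The fallback is why new UI strings (like "Speech Region" in this commit) must be added to every locale file, or they silently show up in English.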


@@ -23,6 +23,8 @@
"Number of Videos Generated Simultaneously": "同时生成视频数量",
"Audio Settings": "**音频设置**",
"Speech Synthesis": "朗读声音(:red[尽量与文案语言保持一致])",
"Speech Region": "服务区域(:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Key": "API Key(:red[必填,[点击获取](https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices)])",
"Speech Volume": "朗读音量(1.0表示100%)",
"Male": "男性",
"Female": "女性",
@@ -55,6 +57,7 @@
"LLM Provider": "大模型提供商",
"API Key": "API Key (:red[必填,需要到大模型提供商的后台申请])",
"Base Url": "Base Url (可选)",
"Account ID": "账户ID (Cloudflare的dash面板url中获取)",
"Model Name": "模型名称 (:blue[需要到大模型提供商的后台确认被授权的模型名称])",
"Please Enter the LLM API Key": "请先填写大模型 **API Key**",
"Please Enter the Pexels API Key": "请先填写 **Pexels API Key**",