using coqui.ai and python to convert a written conversation into spoken audio, with your own voice as one conversant

Install Coqui first (see my other post here: https://johnsaiblog.blogspot.com/2025/03/coqui-going-down-how-to-install-before.html)

Now save your discussion into a text file called full_conversation.txt, in this format:

speaker 1 blah blah blah

speaker 2 blah blah etc etc

(i.e. one speaker turn per line, with no sentence spanning multiple lines). You can then split the file up into pieces to convert to speech. Keep each sentence to approximately 250 characters or fewer; shorter is better.
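As a concrete (made-up) example, full_conversation.txt might contain:

How was the conference? I heard the keynote ran long.

It was good, thanks. The keynote did overrun, but the workshops made up for it.

The first line becomes speaker a (rendered in your cloned voice), the second becomes speaker b (the interlocutor), and so on, alternating.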

This script creates a folder called conversation_parts/ containing the conversation split up by speaker turn, with each turn further split into chunks of at most 250 characters.

import os
import re

# Input file and output folder
input_file = "full_conversation.txt"  # The full transcript
output_folder = "conversation_parts"
max_length = 250  # Maximum characters per chunk

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Split text into chunks of at most max_length characters,
# breaking only at sentence boundaries
def split_text(text, max_length=250):
    sentences = re.split(r'(?<=[.!?])\s+', text)  # Split at sentence-ending punctuation
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += " " + sentence if current_chunk else sentence
        else:
            if current_chunk:  # Don't store an empty first chunk
                chunks.append(current_chunk.strip())
            current_chunk = sentence  # Start a new chunk
    if current_chunk:
        chunks.append(current_chunk.strip())  # Add the last chunk
    return chunks

# Read the transcript file
with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Process each non-empty line as a speaker turn
file_count = 1
turn = 0  # Counts speaker turns only, so blank lines don't break the alternation
for line in lines:
    line = line.strip()
    if not line:
        continue  # Skip empty lines

    speaker = "a" if turn % 2 == 0 else "b"  # Alternating speakers
    is_interlocutor = speaker == "b"  # The interlocutor gets the British voice
    turn += 1

    # Split long lines into chunks
    chunks = split_text(line, max_length)

    # Save each chunk as a separate text file
    for j, chunk in enumerate(chunks):
        suffix = f"-{j+1}" if len(chunks) > 1 else ""  # Numbering for split parts
        filename = f"convo-{file_count:03}{speaker}{'-interloc' if is_interlocutor else ''}{suffix}.txt"
        output_path = os.path.join(output_folder, filename)

        with open(output_path, "w", encoding="utf-8") as out_f:
            out_f.write(chunk)

        print(f"Created: {output_path}")
        file_count += 1  # Increment numbering

print("Transcript successfully split into parts.")

This next script converts the text chunks into spoken WAV files using Coqui. The sound is a bit artificial, but it's fine for general use. For your own lines it clones your voice from a file called my_voice.wav, so record roughly one minute of yourself speaking, save it as a WAV, and name it my_voice.wav. Note that the cloned voice comes out with an American accent, sorry.
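Before running the whole batch, it's worth sanity-checking the voice cloning with a one-off command (test.wav here is just a throwaway output name):

tts --text "Testing one two three, this should sound like me." \
    --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --speaker_wav my_voice.wav \
    --language_idx en \
    --out_path test.wav

If that sounds acceptable, the script below does the same for every chunk, choosing a voice based on the file name: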

import os
import subprocess

# Folder containing the split text files
input_folder = "conversation_parts"

# Iterate through the files in sorted order
for file in sorted(os.listdir(input_folder)):
    if not file.endswith(".txt"):
        continue
    file_path = os.path.join(input_folder, file)

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read().strip()

    if not text:
        print(f"Skipping empty file: {file}")
        continue  # Skip empty files

    output_path = file_path.replace(".txt", ".wav")

    # Build the command as an argument list; this sidesteps
    # shell quoting issues with the text entirely
    if "interloc" in file:  # British accent
        tts_command = [
            "tts", "--text", text,
            "--model_name", "tts_models/en/vctk/vits",
            "--speaker_idx", "p234",
            "--out_path", output_path,
        ]
    else:  # Your cloned voice
        tts_command = [
            "tts", "--text", text,
            "--model_name", "tts_models/multilingual/multi-dataset/xtts_v2",
            "--speaker_wav", "my_voice.wav",
            "--language_idx", "en",
            "--out_path", output_path,
        ]

    print(f"Generating audio: {output_path}")
    try:
        subprocess.run(tts_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error generating {output_path}: {e}")

print("Audio files generated for all dialogue parts.")

Finally, this last script joins all the audio files into a single WAV file, inserting a short silence between parts. It normalizes all the parts first, for the sample-rate reason mentioned above.

#!/bin/bash

# Output file
OUTPUT="final_conversation.wav"

# Temporary silence file (0.2 seconds)
SILENCE="silence.wav"

# Generate the silence
ffmpeg -f lavfi -i anullsrc=r=22050:cl=mono -t 0.2 -y "$SILENCE"

# Normalize every part to the same sample rate, channel count and codec,
# since the two TTS models may not output identical formats and the
# concat demuxer needs all inputs to match
mkdir -p normalized
for file in conversation_parts/*.wav; do
    ffmpeg -i "$file" -ar 22050 -ac 1 -c:a pcm_s16le -y "normalized/$(basename "$file")"
done

# Create the file list for concatenation, with silence between parts
rm -f file_list.txt
for file in $(ls normalized/*.wav | sort -V); do
    echo "file '$file'" >> file_list.txt
    echo "file '$SILENCE'" >> file_list.txt
done

# Concatenate the audio files
ffmpeg -f concat -safe 0 -i file_list.txt -c copy "$OUTPUT"

# Clean up temporary files
rm "$SILENCE" file_list.txt
rm -r normalized

echo "Conversation audio created: $OUTPUT"
