Using coqui.ai and Python to convert a text conversation into spoken audio, with your own voice as one conversant
Install coqui (see other post here: https://johnsaiblog.blogspot.com/2025/03/coqui-going-down-how-to-install-before.html)
Now if you save your discussion into a text file, called full_conversation.txt, in this order:
speaker 1 blah blah blah
speaker 2 blah blah etc etc
(i.e. one speaker per line, no multi-line sentences), you can split the file up into sections of text to convert to speech. Make sure each sentence is approximately 250 characters or less. Shorter is better.
This script creates a folder called conversation_parts/ which contains the conversation split up by speaker and by sentence.
import os
import re

# Input transcript and output folder for the per-chunk text files
input_file = "full_conversation.txt" # The full transcript
output_folder = "conversation_parts"
max_length = 250 # Maximum characters per chunk (TTS works best on short inputs)

# Ensure output folder exists (no error if it already does)
os.makedirs(output_folder, exist_ok=True)
# Function to split text into manageable chunks
def split_text(text, max_length=250):
    """Split *text* into chunks of at most *max_length* characters.

    Splitting happens only at sentence boundaries (after ., ! or ?),
    so a single sentence longer than *max_length* still becomes its
    own (oversized) chunk rather than being cut mid-sentence.

    Returns a list of non-empty chunk strings ([] for empty input).
    """
    sentences = re.split(r'(?<=[.!?])\s+', text)  # split after end punctuation
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # The +1 accounts for the joining space added when the chunk is
        # non-empty; without it a chunk could end up max_length + 1 long.
        joiner = 1 if current_chunk else 0
        if len(current_chunk) + joiner + len(sentence) <= max_length:
            current_chunk += " " + sentence if current_chunk else sentence
        else:
            # Only emit a chunk if one was accumulated — avoids an empty
            # leading chunk when the very first sentence is oversized.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence  # start a new chunk
    if current_chunk:
        chunks.append(current_chunk.strip())  # add the final chunk
    return chunks
# Read the transcript file
with open(input_file, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Process each line as a speaker turn
file_count = 1  # 1-based turn number, used in the output filenames
for line in lines:
    line = line.strip()
    if not line:
        continue  # skip blank lines WITHOUT consuming a speaker turn

    # Alternate speakers by turn number, not by raw line index: a blank
    # line in the transcript would otherwise flip the a/b assignment
    # for every turn that follows it.
    speaker = "a" if file_count % 2 == 1 else "b"
    is_interlocutor = speaker == "b"  # interlocutor gets the British voice

    # Split long lines into chunks
    chunks = split_text(line, max_length)

    # Save each chunk as a separate text file
    for j, chunk in enumerate(chunks):
        suffix = f"-{j+1}" if len(chunks) > 1 else ""  # numbering for split parts
        filename = f"convo-{file_count:03}{speaker}{'-interloc' if is_interlocutor else ''}{suffix}.txt"
        output_path = os.path.join(output_folder, filename)
        with open(output_path, "w", encoding="utf-8") as out_f:
            out_f.write(chunk)
        print(f"Created: {output_path}")

    file_count += 1  # next speaker turn

print(" Transcript successfully split into parts.")
This next script uses Coqui to convert the sentences into spoken-word WAV audio files. The sound is a bit artificial, but it's OK for general use. It uses a file called my_voice.wav to generate audio in your own voice, so record a roughly one-minute sample of yourself speaking, save it as a WAV file, and name it my_voice.wav. Note that the cloned voice comes out with an American accent, sorry.
import os
import subprocess
import shlex  # no longer needed for quoting (list-based commands), kept for compatibility

# Folder containing split text files
input_folder = "conversation_parts"

# Iterate through sorted files
for file in sorted(os.listdir(input_folder)):
    if not file.endswith(".txt"):
        continue  # only the split transcript parts are of interest

    file_path = os.path.join(input_folder, file)
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read().strip()

    if not text:
        print(f"Skipping empty file: {file}")
        continue  # Skip empty files

    output_path = file_path.replace(".txt", ".wav")

    # Build the command as an argument list and run it WITHOUT a shell:
    # no quoting or injection issues even if the transcript text contains
    # quotes, $, backticks, etc.
    if "interloc" in file:  # British accent (fixed VCTK speaker)
        tts_command = [
            "tts",
            "--text", text,
            "--model_name", "tts_models/en/vctk/vits",
            "--speaker_idx", "p234",
            "--out_path", output_path,
        ]
    else:  # Your cloned voice (XTTS v2 with the my_voice.wav sample)
        tts_command = [
            "tts",
            "--text", text,
            "--model_name", "tts_models/multilingual/multi-dataset/xtts_v2",
            "--speaker_wav", "my_voice.wav",
            "--language_idx", "en",
            "--out_path", output_path,
        ]

    print(f"Generating audio: {output_path}")
    try:
        subprocess.run(tts_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f" Error generating {output_path}: {e}")

print(" Audio files generated for all dialogue parts.")
Finally, this last file joins all the audios into a single audio file.
#!/bin/bash
# Concatenate all generated WAV parts (with a short pause between them)
# into a single conversation audio file.
set -euo pipefail  # stop on the first error instead of producing a broken output

# Output file
OUTPUT="final_conversation.wav"
# Temporary silence file (0.2 seconds)
SILENCE="silence.wav"

# Generate silence matching the TTS output format (22050 Hz, mono)
ffmpeg -f lavfi -i anullsrc=r=22050:cl=mono -t 0.2 -y "$SILENCE"

# Create the file list for concatenation.
# NUL-delimited printf | sort -zV keeps the natural (version) ordering while
# handling filenames with spaces safely — parsing `ls` output does not.
rm -f file_list.txt
while IFS= read -r -d '' file; do
    printf "file '%s'\n" "$file" >> file_list.txt
    printf "file '%s'\n" "$SILENCE" >> file_list.txt
done < <(printf '%s\0' conversation_parts/*.wav | sort -zV)

# Concatenate audio files (-y: overwrite an existing output instead of prompting)
ffmpeg -f concat -safe 0 -i file_list.txt -c copy -y "$OUTPUT"

# Clean up temporary files
rm "$SILENCE" file_list.txt
echo " Conversation audio created: $OUTPUT"