@@ -53,21 +53,20 @@ export interface TTSOptions {
5353 baseURL : string ;
5454 encoding : TTSEncoding ;
5555 streamingLatency ?: number ;
56- wordTokenizer : tokenize . WordTokenizer ;
56+ wordTokenizer : tokenize . WordTokenizer | tokenize . SentenceTokenizer ;
5757 chunkLengthSchedule ?: number [ ] ;
5858 enableSsmlParsing : boolean ;
5959 inactivityTimeout : number ;
6060 syncAlignment : boolean ;
6161 autoMode ?: boolean ;
6262}
6363
64- const defaultTTSOptions : TTSOptions = {
64+ const defaultTTSOptionsBase = {
6565 apiKey : process . env . ELEVEN_API_KEY ,
6666 voice : DEFAULT_VOICE ,
6767 modelID : 'eleven_turbo_v2_5' ,
6868 baseURL : API_BASE_URL_V1 ,
69- encoding : 'pcm_22050' ,
70- wordTokenizer : new tokenize . basic . WordTokenizer ( false ) ,
69+ encoding : 'pcm_22050' as TTSEncoding ,
7170 enableSsmlParsing : false ,
7271 inactivityTimeout : DEFAULT_INACTIVITY_TIMEOUT ,
7372 syncAlignment : true ,
@@ -78,13 +77,33 @@ export class TTS extends tts.TTS {
7877 label = 'elevenlabs.TTS' ;
7978
8079 constructor ( opts : Partial < TTSOptions > = { } ) {
81- super ( sampleRateFromFormat ( opts . encoding || defaultTTSOptions . encoding ) , 1 , {
80+ super ( sampleRateFromFormat ( opts . encoding || defaultTTSOptionsBase . encoding ) , 1 , {
8281 streaming : true ,
8382 } ) ;
8483
84+ // The Python plugin defaults autoMode to true when not provided;
85+ // to keep this change non-breaking in TypeScript, we default it to false instead.
86+ const autoMode = opts . autoMode !== undefined ? opts . autoMode : false ;
87+
88+ // Set default tokenizer based on autoMode if not provided
89+ let wordTokenizer = opts . wordTokenizer ;
90+ if ( ! wordTokenizer ) {
91+ wordTokenizer = autoMode
92+ ? new tokenize . basic . SentenceTokenizer ( )
93+ : new tokenize . basic . WordTokenizer ( false ) ;
94+ } else if ( autoMode && ! ( wordTokenizer instanceof tokenize . SentenceTokenizer ) ) {
95+ // Warn if autoMode is enabled but a WordTokenizer was provided
96+ log ( ) . warn (
97+ 'autoMode is enabled, it expects full sentences or phrases. ' +
98+ 'Please provide a SentenceTokenizer instead of a WordTokenizer.' ,
99+ ) ;
100+ }
101+
85102 this . #opts = {
86- ...defaultTTSOptions ,
103+ ...defaultTTSOptionsBase ,
87104 ...opts ,
105+ autoMode,
106+ wordTokenizer,
88107 } ;
89108
90109 if ( this . #opts. apiKey === undefined ) {
@@ -156,10 +175,10 @@ export class SynthesizeStream extends tts.SynthesizeStream {
156175 }
157176
158177 protected async run ( ) {
159- const segments = new AsyncIterableQueue < tokenize . WordStream > ( ) ;
178+ const segments = new AsyncIterableQueue < tokenize . WordStream | tokenize . SentenceStream > ( ) ;
160179
161180 const tokenizeInput = async ( ) => {
162- let stream : tokenize . WordStream | null = null ;
181+ let stream : tokenize . WordStream | tokenize . SentenceStream | null = null ;
163182 for await ( const text of this . input ) {
164183 if ( this . abortController . signal . aborted ) {
165184 break ;
@@ -191,7 +210,7 @@ export class SynthesizeStream extends tts.SynthesizeStream {
191210 await Promise . all ( [ tokenizeInput ( ) , runStream ( ) ] ) ;
192211 }
193212
194- async #runWS( stream : tokenize . WordStream , maxRetry = 3 ) {
213+ async #runWS( stream : tokenize . WordStream | tokenize . SentenceStream , maxRetry = 3 ) {
195214 let retries = 0 ;
196215 let ws : WebSocket ;
197216 while ( true ) {
@@ -229,20 +248,40 @@ export class SynthesizeStream extends tts.SynthesizeStream {
229248 const requestId = shortuuid ( ) ;
230249 const segmentId = shortuuid ( ) ;
231250
232- ws . send (
233- JSON . stringify ( {
234- text : ' ' ,
235- voice_settings : this . #opts. voice . settings ,
236- ...( this . #opts. chunkLengthSchedule && {
237- generation_config : {
238- chunk_length_schedule : this . #opts. chunkLengthSchedule ,
239- } ,
240- } ) ,
251+ // typed helper that constrains the payload shape we pass to ws.send
252+ const wsSend = ( data : {
253+ // (mirrors the SynthesizeContent message shape from the Python plugin)
254+ text : string ;
255+ // setting flush somehow never finishes the current speech generation
256+ // https://github.com/livekit/agents-js/pull/820#issuecomment-3517138706
257+ // flush?: boolean;
258+ // initialization
259+ voice_settings ?: VoiceSettings ;
260+ generation_config ?: {
261+ chunk_length_schedule : number [ ] ;
262+ } ;
263+ } ) => {
264+ ws . send ( JSON . stringify ( data ) ) ;
265+ } ;
266+
267+ wsSend ( {
268+ text : ' ' ,
269+ voice_settings : this . #opts. voice . settings ,
270+ ...( this . #opts. chunkLengthSchedule && {
271+ generation_config : {
272+ chunk_length_schedule : this . #opts. chunkLengthSchedule ,
273+ } ,
241274 } ) ,
242- ) ;
275+ } ) ;
243276 let eosSent = false ;
244277
245278 const sendTask = async ( ) => {
279+ // Determine if we should flush on each chunk (sentence)
280+ /*const flushOnChunk =
281+ this.#opts.wordTokenizer instanceof tokenize.SentenceTokenizer &&
282+ this.#opts.autoMode !== undefined &&
283+ this.#opts.autoMode;*/
284+
246285 let xmlContent : string [ ] = [ ] ;
247286 for await ( const data of stream ) {
248287 if ( this . abortController . signal . aborted ) {
@@ -260,15 +299,20 @@ export class SynthesizeStream extends tts.SynthesizeStream {
260299 }
261300 }
262301
263- ws . send ( JSON . stringify ( { text : text + ' ' } ) ) ; // must always end with a space
302+ wsSend ( {
303+ text : text + ' ' , // must always end with a space
304+ // ...(flushOnChunk && { flush: true }),
305+ } ) ;
264306 }
265307
266308 if ( xmlContent . length ) {
267309 this . #logger. warn ( 'ElevenLabs stream ended with incomplete XML content' ) ;
268310 }
269311
270- // no more tokens, mark eos
271- ws . send ( JSON . stringify ( { text : '' } ) ) ;
312+ // no more tokens, mark eos with flush
313+ // setting flush somehow never finishes the current speech generation
314+ // wsSend({ text: '', flush: true });
315+ wsSend ( { text : '' } ) ;
272316 eosSent = true ;
273317 } ;
274318
0 commit comments