Code viewer for World: Audio Speech Synthesis Usi...

// // Cloned by duggano5 on 27 Nov 2023 from World "Chat with GPT model" by Starter user 
// // Please leave this clone trail here.

// global variables
let voice_id = 'XrExE9yKIg1WjnnlVkGX'; // initial voice id
let model_id = 'eleven_multilingual_v2'; // the model that all the voices use
let tokens = []; // where we store all the audio up to now (a buffer)
const audioContext = new AudioContext(); // create an instance of the audio context so we can process the buffers 
let api_key = ""; // for storing the user's API key
let exampleJSON = '{"metadata":{"transaction_key":"deprecated","request_id":"7d4e89cb-d8ff-4658-97df-ec69c22af844","sha256":"1cbc50afa05abbfd139c37e22ffaa1db485e962371a9be5602b5472e29fd7e2b","created":"2023-12-03T00:42:38.596Z","duration":23.431875,"channels":1,"models":["a375937a-9156-40cb-940d-6a1103040ef1"],"model_info":{"a375937a-9156-40cb-940d-6a1103040ef1":{"name":"2-general-nova","version":"2023-11-14.3290","arch":"nova-2"}}},"results":{"channels":[{"alternatives":[{"transcript":"Thisisasampleparagraphtodemonstratethisapplication.Ittakesastringoftextandthentranscribesthem.Sincethisisintendedfordubbing,itcanacceptadjacentinputthatspecifiesthestartandendtimesofwords.Ifthere\'sanoticeablepause,itseparatesthetranscriptions,insertsagap,andjoinsthemtogetheragain.Here\'sanexampleofapause.Isn\'tthatsomething?","confidence":0.99995744,"words":[{"word":"this","start":0,"end":0.16,"confidence":0.9999032,"punctuated_word":"This"},{"word":"is","start":0.16,"end":0.32,"confidence":0.99999285,"punctuated_word":"is"},{"word":"a","start":0.32,"end":0.48,"confidence":0.9999958,"punctuated_word":"a"},{"word":"sample","start":0.48,"end":0.88,"confidence":0.99998355,"punctuated_word":"sample"},{"word":"paragraph","start":0.88,"end":1.38,"confidence":0.99998975,"punctuated_word":"paragraph"},{"word":"to","start":1.4399999,"end":1.68,"confidence":0.9999887,"punctuated_word":"to"},{"word":"demonstrate","start":1.68,"end":2.1599998,"confidence":0.9999994,"punctuated_word":"demonstrate"},{"word":"this","start":2.1599998,"end":2.3999999,"confidence":0.99999034,"punctuated_word":"this"},{"word":"application","start":2.3999999,"end":2.8999999,"confidence":0.9999779,"punctuated_word":"application."},{"word":"it","start":3.6,"end":3.84,"confidence":0.9999716,"punctuated_word":"It"},{"word":"takes","start":3.84,"end":4.08,"confidence":0.9999993,"punctuated_word":"takes"},{"word":"a","start":4.08,"end":4.16,"confidence":0.99997735,"punctuated_word":"a"},{"word":"string","start":4.16,"end":4.48,"confidence":0.9999995,"punctuated_word":"string"},{"word":"of","start":4.48,"end":4.64,"confidence":0.9999926,"punctuated_word":"of"},{"word":"text","start":4.64,"end":5.12,"confidence":0.9999385,"punctuated_word":"text"},{"word":"and","start":5.12,"end":5.3599997,"confidence":0.98379934,"punctuated_word":"and"},{"word":"then","start":5.3599997,"end":5.6,"confidence":0.9999926,"punctuated_word":"then"},{"word":"transcribes","start":5.6,"end":6.1,"confidence":0.9998105,"punctuated_word":"transcribes"},{"word":"them","start":6.24,"end":6.74,"confidence":0.9998946,"punctuated_word":"them."},{"word":"since","start":7.04,"end":7.3599997,"confidence":0.9999672,"punctuated_word":"Since"},{"word":"this","start":7.3599997,"end":7.52,"confidence":0.99999535,"punctuated_word":"this"},{"word":"is","start":7.52,"end":7.68,"confidence":0.9999819,"punctuated_word":"is"},{"word":"intended","start":7.68,"end":8.16,"confidence":0.9999999,"punctuated_word":"intended"},{"word":"for","start":8.16,"end":8.32,"confidence":0.99996793,"punctuated_word":"for"},{"word":"dubbing","start":8.32,"end":8.82,"confidence":0.9999528,"punctuated_word":"dubbing,"},{"word":"it","start":8.955,"end":9.115,"confidence":0.9998977,"punctuated_word":"it"},{"word":"can","start":9.115,"end":9.275,"confidence":0.99999654,"punctuated_word":"can"},{"word":"accept","start":9.275,"end":9.755,"confidence":0.99998605,"punctuated_word":"accept"},{"word":"adjacent","start":9.755,"end":10.235,"confidence":0.90853304,"punctuated_word":"adjacent"},{"word":"input","start":10.235,"end":10.635,"confidence":0.9999794,"punctuated_word":"input"},{"word":"that","start":10.635,"end":10.875,"confidence":0.99990463,"punctuated_word":"that"},{"word":"specifies","start":10.875,"end":11.375,"confidence":0.99999106,"punctuated_word":"specifies"},{"word":"the","start":11.434999,"end":11.675,"confidence":0.9999901,"punctuated_word":"the"},{"word":"start","start":11.675,"end":11.995,"confidence":0.9999969,"punctuated_word":"start"},{"word":"and","start":11.995,"end":12.235,"confidence":0.99498194,"punctuated_word":"and"},{"word":"end","start":12.235,"end":12.395,"confidence":0.99995744,"punctuated_word":"end"},{"word":"times","start":12.395,"end":12.715,"confidence":0.9999367,"punctuated_word":"times"},{"word":"of","start":12.715,"end":12.875,"confidence":0.99999404,"punctuated_word":"of"},{"word":"words","start":12.875,"end":13.375,"confidence":0.9999747,"punctuated_word":"words."},{"word":"if","start":13.835,"end":13.995,"confidence":0.99998116,"punctuated_word":"If"},{"word":"there\'s","start":13.995,"end":14.235,"confidence":0.99998134,"punctuated_word":"there\'s"},{"word":"a","start":14.235,"end":14.315001,"confidence":0.99998367,"punctuated_word":"a"},{"word":"noticeable","start":14.315001,"end":14.815001,"confidence":0.9999558,"punctuated_word":"noticeable"},{"word":"pause","start":14.875,"end":15.375,"confidence":0.9999834,"punctuated_word":"pause,"},{"word":"it","start":15.514999,"end":15.754999,"confidence":0.9999238,"punctuated_word":"it"},{"word":"separates","start":15.754999,"end":16.154999,"confidence":0.9998172,"punctuated_word":"separates"},{"word":"the","start":16.154999,"end":16.395,"confidence":0.99997795,"punctuated_word":"the"},{"word":"transcriptions","start":16.395,"end":16.895,"confidence":0.9997569,"punctuated_word":"transcriptions,"},{"word":"inserts","start":17.369999,"end":17.689999,"confidence":0.99945074,"punctuated_word":"inserts"},{"word":"a","start":17.689999,"end":17.849998,"confidence":0.9995147,"punctuated_word":"a"},{"word":"gap","start":17.849998,"end":18.25,"confidence":0.75946546,"punctuated_word":"gap,"},{"word":"and","start":18.25,"end":18.41,"confidence":0.9996228,"punctuated_word":"and"},{"word":"joins","start":18.41,"end":18.73,"confidence":0.9998728,"punctuated_word":"joins"},{"word":"them","start":18.73,"end":18.89,"confidence":0.9995384,"punctuated_word":"them"},{"word":"together","start":18.89,"end":19.289999,"confidence":0.99980515,"punctuated_word":"together"},{"word":"again","start":19.289999,"end":19.789999,"confidence":0.97341293,"punctuated_word":"again."},{"word":"here\'s","start":20.169998,"end":20.49,"confidence":0.99876666,"punctuated_word":"Here\'s"},{"word":"an","start":20.49,"end":20.65,"confidence":0.99967766,"punctuated_word":"an"},{"word":"example","start":20.65,"end":21.05,"confidence":0.99976784,"punctuated_word":"example"},{"word":"of","start":21.05,"end":21.21,"confidence":0.999361,"punctuated_word":"of"},{"word":"a","start":21.21,"end":21.369999,"confidence":0.99836403,"punctuated_word":"a"},{"word":"pause","start":21.369999,"end":21.869999,"confidence":0.9982108,"punctuated_word":"pause."},{"word":"isn\'t","start":25.169998,"end":25.489998,"confidence":0.99689317,"punctuated_word":"Isn\'t"},{"word":"that","start":25.489998,"end":25.73,"confidence":0.9950251,"punctuated_word":"that"},{"word":"something","start":25.73,"end":26.23,"confidence":0.99172926,"punctuated_word":"something!"}],"paragraphs":{"transcript":"\nThisisasampleparagraphtodemonstratethisapplication.Ittakesastringoftextandthentranscribesthem.Sincethisisintendedfordubbing,itcanacceptadjacentinputthatspecifiesthestartandendtimesofwords.Ifthere\'sanoticeablepause,itseparatesthetranscriptions,insertsagap,andjoinsthemtogetheragain.Here\'sanexampleofapause.\n\nIsn\'tthatsomething?","paragraphs":[{"sentences":[{"text":"Thisisasampleparagraphtodemonstratethisapplication.","start":0,"end":2.8999999},{"text":"Ittakesastringoftextandthentranscribesthem.","start":3.6,"end":6.74},{"text":"Sincethisisintendedfordubbing,itcanacceptadjacentinputthatspecifiesthestartandendtimesofwords.","start":7.04,"end":13.375},{"text":"Ifthere\'sanoticeablepause,itseparatesthetranscriptions,insertsagap,andjoinsthemtogetheragain.","start":13.835,"end":19.789999},{"text":"Here\'sanexampleofapause.","start":20.169998,"end":21.869999}],"num_words":62,"start":0,"end":21.869999},{"sentences":[{"text":"Isn\'tthatsomething?","start":22.169998,"end":23.23}],"num_words":3,"start":22.169998,"end":23.23}]}}]}]}}';

const mainDiv = document.createElement('div'); // for appending elements to
mainDiv.className = 'grid gap-6 grid-cols-2 p-24 justify-center'; // basic tailwind styling

let voiceSettings = { // the default voice settings, max stability and similarity for consistency
    stability: 1.0,
    similarity_boost: 1.0,
};

/** 
 * Converts text to speech using the specified model, voice settings, and API key using the Eleven API.
 * @param {string} text - The text to convert to speech.
 * @returns {Promise<AudioBuffer>} - A promise that resolves to the audio buffer representing the converted speech.
 * @throws {Error} - If there is an HTTP error or an error occurs during audio decoding.
 */
async function convertTextToSpeech(text) {
    const body = { // the voice settings
        model_id: model_id,
        text: text,
        voice_settings: voiceSettings
    };
    
    const options = { // the request
        method: 'POST',
        headers: {
            'xi-api-key': api_key,
            'Content-Type': 'application/json'
        },
        body: JSON.stringify(body)
    };

    try { // try to get the audio from the api
        const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voice_id}?output_format=mp3_44100_64`, options);

        if (!response.ok) { // throw an error if we have trouble getting the audio
            throw new Error(`HTTP error! status: ${response.status}`);
        }

        const arrayBuffer = await response.arrayBuffer(); // turn the audio response into an array buffer
        const audioBuffer = await audioContext.decodeAudioData(arrayBuffer); // decode the audio buffer
        tokens.push(audioBuffer); // add the buffer to the array
        return audioBuffer; // return the buffer if we need it, as of now not used

    }
    catch (err) { // catch any errors, log them
        console.error(err);
        throw err;
    }
}

/**
 * Takes a JSON and transcribes all the words into speech, adding silence gaps where necessary.
 * @param {string} jsonText - The JSON string containing the words to transcribe.
 * @returns {Promise<void>} - A promise that resolves when the transcription is complete.
 */
async function transcribeWordsFromJson(jsonText) {
    const jsonData = JSON.parse(jsonText);
    const words = jsonData.results.channels[0].alternatives[0].words; // where the words are expected to be

    let currentText = words[0].word; // initialise the words and times
    let lastEndTime = words[0].end;

    for (let i = 1; i < words.length; i++) { // loop over all the words
        const wordInfo = words[i]; // get the data of the current word
        const timeGap = wordInfo.start - lastEndTime; // 

        if (timeGap > 0.9) { // we have a gap, transcribe what we have
            await convertTextToSpeech(currentText);
            await appendSilence(timeGap); // append the gap of silence
            currentText = wordInfo.word; // start new text
        }
        else currentText += ' ' + wordInfo.word; // otherwise keep building the text

        lastEndTime = wordInfo.end; // update the end time
    }

    if (currentText) await convertTextToSpeech(currentText); // transcribe any remaining text
}
    
/**
 * Appends silence of a specified duration to the tokens array.
 * @param {number} silenceDuration - The duration of the silence in seconds.
 * @returns {AudioBuffer} - The silence buffer.
 */
function appendSilence(silenceDuration) {
    console.log("Noticable found, duration: " + silenceDuration + " seconds."); // for demo 

    const sampleRate = 44100; // matching the rate the api returns
    const silenceBuffer = audioContext.createBuffer( // add a new buffer of silence
        1, sampleRate * silenceDuration, sampleRate
    );
    tokens.push(silenceBuffer); // add the buffer to the array
    return silenceBuffer; // return the buffer if we need it, as of now not used
}

/**
 * Takes all the audio buffers from the tokens array and turns them into one buffer.
 * @param {AudioBuffer[]} buffers - An array of audio buffers to join.
 * @param {AudioContext} audioContext - The audio context used to create the output buffer.
 * @returns {AudioBuffer} - The concatenated audio buffer.
 */
function joinBuffers(buffers, audioContext) {
    const totalLength = buffers.reduce((sum, buffer) => sum + buffer.length, 0);
    const outputBuffer = audioContext.createBuffer(
        buffers[0].numberOfChannels, 
        totalLength, 
        buffers[0].sampleRate
    );

    let offset = 0;
    buffers.forEach(buffer => {
        for (let channel = 0; channel < buffer.numberOfChannels; channel++) {
            outputBuffer.getChannelData(channel).set(buffer.getChannelData(channel), offset);
        }
        offset += buffer.length;
    });

    return outputBuffer;
}

/**
 * Creates a WAV file from an audio buffer and adds it to the html.
 * @param {AudioBuffer} audioBuffer - The audio buffer to convert to WAV.
 */
function createWAV(audioBuffer) {
    const blob = bufferToWav(audioBuffer, audioBuffer.length);
    const url = URL.createObjectURL(blob);
    const audio = new Audio(url);
    audio.controls = true;
    audio.className = 'block w-full p-4 text-gray-900 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500';
    mainDiv.appendChild(audio);
}

/**
 * Converts an AudioBuffer to a WAV file so we can embed it
 * barely modified from https://stackoverflow.com/questions/61264581/how-to-convert-audio-buffer-to-mp3-in-javascript
 * @param {AudioBuffer} abuffer - The AudioBuffer to convert.
 * @param {number} len - The length of the AudioBuffer in seconds.
 * @returns {Blob} - The converted WAV file as a Blob object.
 */
function bufferToWav(abuffer, len) {
    let numOfChan = abuffer.numberOfChannels,
        length = len * numOfChan * 2 + 44,
        buffer = new ArrayBuffer(length),
        view = new DataView(buffer),
        channels = [], i, sample,
        offset = 0,
        pos = 0;

    // write WAVE header
    setUint32(0x46464952); // "RIFF"
    setUint32(length - 8); // file length - 8
    setUint32(0x45564157); // "WAVE"

    setUint32(0x20746d66); // "fmt " chunk
    setUint32(16); // length = 16
    setUint16(1); // PCM (uncompressed)
    setUint16(numOfChan);
    setUint32(abuffer.sampleRate);
    setUint32(abuffer.sampleRate * 2 * numOfChan); // byte rate
    setUint16(numOfChan * 2); // block align
    setUint16(16); // bits per sample

    setUint32(0x61746164); // "data" - chunk
    setUint32(length - pos - 4); // chunk length

    // write interleaved data
    for(i = 0; i < abuffer.numberOfChannels; i++)
        channels.push(abuffer.getChannelData(i));

    while(pos < length) {
        for(i = 0; i < numOfChan; i++) { // interleave channels
            sample = Math.max(-1, Math.min(1, channels[i][offset])); // clamp
            sample = (0.5 + sample < 0 ? sample * 32768 : sample * 32767)|0; // scale to 16-bit signed int
            view.setInt16(pos, sample, true); // write 16-bit sample
            pos += 2;
        }
        offset++ // next source sample
    }

    // create Blob
    return new Blob([buffer], { type: "audio/wav" });

    function setUint16(data) {
        view.setUint16(pos, data, true);
        pos += 2;
    }

    function setUint32(data) {
        view.setUint32(pos, data, true);
        pos += 4;
    }
}

/**
 * Processes the input text and creates an audio player.
 * @param {string} text - The input text to be processed.
 * @param {string} api_key_value - The API key value.
 * @returns {Promise<void>} - A promise that resolves when the audio player is created.
 */
async function processAndCreateAudioPlayer(text, api_key_value) {
    api_key = api_key_value; // save the api key from the input
    const isJsonInput = jsonInputToggle.checked; // check if we want to process it as json
    tokens = []; // reset the tokens array
    await (isJsonInput ? transcribeWordsFromJson(text) : convertTextToSpeech(text)); // transcribe the text

    const combinedBuffer = joinBuffers(tokens, audioContext); // join all the buffers together
    createWAV(combinedBuffer); // for playback
}

/**
 * Adds a new element to a container.
 * 
 * @param {HTMLElement} container - The container element to append the new element to.
 * @param {string} element - The type of element to create (e.g., 'div', 'input', 'button').
 * @param {string} [placeholder=''] - The placeholder text for input elements.
 * @param {string} [id=''] - The ID attribute for the new element.
 * @param {string[]} [classes=[]] - An array of CSS classes to apply to the new element.
 * @returns {HTMLElement} - The newly created element.
 */
function addElement(container, element, placeholder='', id='', classes=[]) {
    const newElement = document.createElement(element);

    if (placeholder) newElement.placeholder = placeholder;
    if (id) newElement.id = id;
    if (classes.length) newElement.className = classes.join(' ');

    container.appendChild(newElement);
    return newElement;
}

/**
 * Creates and appends a label element to a container element.
 * 
 * @param {HTMLElement} container - The container element to append the label to.
 * @param {string} forElement - The ID of the element the label is associated with.
 * @param {string[]} [classes=[]] - An optional array of CSS classes to apply to the label.
 * @param {string} [text=''] - The text content of the label.
 * @returns {HTMLLabelElement} The created label element.
 */
function addLabel(container, forElement, classes=[], text='') {
    const newLabel = document.createElement('label');
    newLabel.htmlFor = forElement;
    newLabel.textContent = text;
    if (classes.length) newLabel.className = classes.join(' ');
    container.appendChild(newLabel);
    return newLabel;
}

/**
 * loads the ui and styles it (including the button that starts the process)
 */
function createUI()
{
    const apiKeyInput = addElement(mainDiv, 'input', 'Enter API key', 'apiKeyInput', ['mb-2', 'border', 'border', 'border-gray-300 text-gray-900 text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5']);
    addLabel(mainDiv, apiKeyInput, ['block', 'mb-1', 'text-xl', 'text-gray-900'], 'API Key')

    const textInput = addElement(mainDiv, 'input', 'Enter text to send to the API', 'textInput', ['block w-full p-4 text-gray-900 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500']);
    addLabel(mainDiv, textInput, ['block mb-2 text-xl text-gray-900'], 'Text to send to the API')

    const jsonInputToggle = addElement(mainDiv, 'input', '', 'jsonInputToggle', ['flex justify-center mb-1 w-4 h-4 border border-gray-300 rounded bg-gray-50 focus:ring-3 focus:ring-blue-300']);
    jsonInputToggle.type = 'checkbox';
    jsonInputToggle.id = 'jsonInputToggle';
    addLabel(mainDiv, jsonInputToggle, ['block text-xl text-gray-900'], 'JSON Input Mode');

    // describe what the json toggle does
    const jsonInputDescription = addElement(mainDiv, 'p', '', '', ['block text-gray-400', 'text-md']);
    jsonInputDescription.textContent = 'Set it to JSON mode to transcribe audio with gaps. It takes JSONS in the format of the example:';

    // copy json button
    const copyJsonButton = document.createElement('button');
    copyJsonButton.textContent = 'Copy example JSON';
    copyJsonButton.className = 'block w-full p-4 text-gray-600 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500';
    mainDiv.appendChild(copyJsonButton);

    
    copyJsonButton.addEventListener('click', () => {
        navigator.clipboard.writeText(exampleJSON);
    });

    var voices = { // voice id: voice name
        '2EiwWnXFnvU5JabPnv8n': 'Clyde (American Veteran)',
        'CYw3kZ02Hs0563khs1Fj': 'Dave (British-Essex)',
        'D38z5RcWu1voky8WS1ja': 'Finn (Irish Sailor)',
        'GBv7mTt0atIp3Br8iCZE': 'Thomas (Calm American)',
        'IKne3meq5aSn9XLyUdCD': 'Charlie (Casual Australian)',
        'LcfcDJNUP1GQjkzn1xUU': 'Emily (Young American)',
        'N2lVS1w4EtoT3dr4eOWO': 'Callum (Hoarse American)',
        'ODq5zmih8GrVes37Dizd': 'Patrick (Shouty American)',
        'SOYHLrjzK2X1ezoPC6cr': 'Harry (Anxious American)',
        'TX3LPaxmHKxFdv7VOQHJ': 'Liam (Young American)',
        'ThT5KcBeYPX3keUQqHPh': 'Dorothy (Young British)',
        'XB0fDUnXU5powFXDhCwa': 'Charlotte (British-Swedish)',
        'XrExE9yKIg1WjnnlVkGX': 'Matilda (Warm American)',
        'Yko7PKHZNXotIFUBG7I9': 'Matthew (Calm British)',
        'ZQe5CZNOzWyzPSCn5a3c': 'James (Calm Australian)',
        'Zlb1dXrM653N07WRdFW3': 'Joseph (British Reporter)',
        'bVMeCyTHy58xNoL34h3p': 'Jeremy (American-Irish)',
        'flq6f7yk4E4fJM5XTYuZ': 'Michael (Old American)',
        'g5CIjZEefAph4nQFvHAz': 'Ethan (American Whisper)',
        'jBpfuIE2acCO8z3wKNLl': 'Gigi (Young American)',
        'jsCqWAovK2LkecY7zXl4': 'Freya (Hyped American)',
        'oWAxZDx7w5VEj9dCyTzz': 'Grace (American-Southern)',
        'onwK4e9ZLuTAKqWW03F9': 'Daniel (British Presenter)',
        'pMsXgVXv3BLzUgSXRplE': 'Serena (Pleasant American)',
        'piTKgcLEGmPE4e6mEKli': 'Nicole (American Whisper)',
        't0jbNlBVZ17f02VDIeMI': 'Jessie (raspy American)',
        'wViXBPUzp2ZZixB1xQuM': 'Ryan (American Soldier)',
        'z9fAnlkpzviPz146aGWa': 'Glinda (Witchy American)',
        'zcAOhNBS3c14rBihAFp1': 'Giovanni (Italian-American)',
        'zrHiDhphv9ZnVXBqCLjz': 'Mimi (Swedish-American)'
    };

    // Create a select element
    const select = document.createElement('select');
    select.id = 'voiceSelect';
    select.onchange = () => voice_id = select.value; // update the variable

    // Populate the select element with options
    for (var id in voices) {
        var option = document.createElement('option');
        option.value = id;
        option.textContent = voices[id];
        select.appendChild(option);
    }
    select.className = 'block w-full p-4 text-gray-900 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500';
    mainDiv.appendChild(select); // add it to the main div
    addLabel(mainDiv, select, ['block mb-2 text-xl font-medium text-gray-900'], 'Voice');

    // slider for stability
    const stabilitySlider = document.createElement('input');
    stabilitySlider.type = 'range';
    stabilitySlider.min = 0;
    stabilitySlider.max = 1;
    stabilitySlider.step = 0.01;
    stabilitySlider.value = 1;
    stabilitySlider.id = 'stabilitySlider';
    mainDiv.appendChild(stabilitySlider);
    addLabel(mainDiv, stabilitySlider, ['block mb-2 text-xl font-medium text-gray-900'], 'Voice Stability');

    // update the variables on slider change
    stabilitySlider.onchange = () => voiceSettings.stability = parseFloat(stabilitySlider.value);

    // submit button
    const button = document.createElement('button');
    button.textContent = 'Synthesise';
    button.className = 'block w-full p-4 text-gray-900 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500';
    mainDiv.appendChild(button);
    button.addEventListener('click', () => { // button click event
        processAndCreateAudioPlayer(textInput.value, apiKeyInput.value); 
    });

    document.body.appendChild(mainDiv); // add the ui to the page
}

// load tailwind for styling
const script = document.createElement('script');
script.src = 'https://cdn.tailwindcss.com';
document.head.appendChild(script);

// load Google's Inter font for styling
const link = document.createElement('link');
link.href = 'https://fonts.googleapis.com/css2?family=Inter:wght@400;700&display=swap';
link.rel = 'stylesheet';
document.head.appendChild(link);
document.body.style.fontFamily = 'Inter, sans-serif';

window.addEventListener('load', createUI); // load the UI when the page loads