// World by Ethan Doyle.
// Student Number 22497082.
// ** IMPORTANT **
// The API keys are listed in the order that they were submitted.
// World that makes calls to multiple AI Models which give a caption/description of an image uploaded to the world.
// If code is taken from a website, it will be commented beside the code.
// Tutorials / code resources used: https://www.w3schools.com/howto/howto_html_file_upload_button.asp
// https://stackoverflow.com/questions/18694437/how-to-preview-image-before-uploading-in-jquery/62382964#62382964
// https://www.w3schools.com/jquery/jquery_intro.asp
// https://www.w3schools.com/jquery/jquery_ajax_intro.asp
// https://www.w3schools.com/howto/howto_html_file_upload_button.asp
// https://adamfard.com/blog/how-to-use-chatgpt-api
// https://community.openai.com/t/uploading-images-to-the-chatgpt-api/985494
// https://refine.dev/blog/css-rounded-corners/#what-is-css-border-radius
// https://www.w3schools.com/html/html_css.asp
// https://htmlcheatsheet.com/css/
// Everything else was done through trial & error.
// This world uses a couple of AI models.
// The first model it uses is OpenAI's ChatGPT model, GPT-4o-mini.
// The second and third models are from Hugging Face, a community-driven website that contains a plethora of AI models to choose from. It uses BLIP and VIT-GPT2.
// The last model it uses is grok-vision-beta (xAI) to compare the three prior models.
// Give the whole page some breathing room. Learnt how to modify margin and padding universal values
// from https://ancientbrain.com/world.php?world=2850716357's source code.
$('body').css({ "margin": "15px", "padding": "15px" });

// Once the DOM is ready, switch the page to white text on a black background. (From W3Schools jQuery tutorial)
$(document).ready(() => {
  $('body').css({ "background-color": "black", "color": "white" });
});
// The base HTML written into the document. Provides the API key form (calls getApi()), the file upload
// control (calls handleImageUpload(event)) and the caption button (calls onSubmit()).
// CSS elements gleaned from https://www.w3schools.com/html/html_css.asp and https://htmlcheatsheet.com/css/
// Notes on the markup:
//  - every <label for> matches the id of its <input>, so clicking a label focuses the right field;
//  - the file input uses a plain quoted onchange attribute (HTML has no {...} handler syntax);
//  - the file picker is limited to the PNG/JPEG types that the upload handler accepts.
document.write(`
<h1>Image Captions</h1>
This world is centered around uploading an image (PNG or JPEG, please) to the site. <br>
The AI models GPT-4o-mini, BLIP and VIT-GPT2 will give a brief caption to your uploaded image.<br>
The captions will then be evaluated and compared by Grok-Vision-Beta.<br>
<div>
<h1>API Keys</h1>
<p>Before moving on, you must enter three API keys.</p>
<p> Note that...<br>
API Key 1 is for ChatGPT.<br>
API Key 2 is for Hugging Face.<br>
API Key 3 is for xAI.
</p>
<form id="apiForm">
<label for="apiKey1">API Key 1: </label>
<input type="text" id="apiKey1" style="width:450px"><br>
<label for="apiKey2">API Key 2: </label>
<input type="text" id="apiKey2" style="width:450px"><br>
<label for="apiKey3">API Key 3: </label>
<input type="text" id="apiKey3" style="width:450px">
<button type="button" onclick="getApi()">Submit Keys</button>
</form>
<div id="apikeyout"> </div>
</div>
<div style="text-align: center; display: flex; flex-direction: column; align-items: center;">
<h1>Upload an Image</h1>
<p>Please upload an image for captioning below.</p>
<input type="file" id="imageUpload" accept="image/png, image/jpeg" onchange="handleImageUpload(event)"> <br>
<button onclick="onSubmit()">Get Caption</button>
<br>
<div id="errordiv"> </div>
<br>
<img id="imageUploaded" alt="Uploaded Image" style="display:none; max-height:750px; max-width:1000px">
</div>
<br>
<div id="output">
</div>
`);
// Base64 payload of the image the user uploaded (filled in by handleImageUpload).
let userimg = '';
// Last four characters of the uploaded file's name, used to check the file type.
let typechecker = '';

// **
// The API keys are NOT hard-coded here: they are filled in from the form by getApi().
// key1 is sent to OpenAI, key2 to Hugging Face and key3 to xAI.
// **
let key1 = '';
let key2 = '';
let key3 = '';
// Reads the three API keys from the key form into key1/key2/key3 and reports in
// #apikeyout whether all of them were supplied.
function getApi() {
  // Look the inputs up with explicit id selectors instead of relying on the browser
  // exposing elements as implicit globals, and trim stray whitespace from copy/paste
  // so it cannot end up inside the Authorization header.
  const readKey = function (selector) {
    return String($(selector).val() || "").trim();
  };

  key1 = readKey("#apiKey1");
  key2 = readKey("#apiKey2");
  key3 = readKey("#apiKey3");
  // The keys are secrets, so they are deliberately not written to the console.

  if (key1 === "" || key2 === "" || key3 === "") {
    $("#apikeyout").html("<font color=red><h2>Error: Please enter a value for all keys.</h2></font>");
  } else {
    $("#apikeyout").html("<p>API Keys set.</p>");
  }
}
// Handles the change event of the file input: validates the chosen file, previews it in
// #imageUploaded and stores its base64 payload in `userimg` for onSubmit().
// event: the change event fired by <input id="imageUpload">.
function handleImageUpload(event) {
  const file = event.target.files[0]; // Grabs the file from user input (undefined if the dialog was cancelled).

  // Whatever happens next, captions of a previous image no longer apply, and the old
  // image must not be submittable while the new one is rejected or still loading.
  $("#output").html("");
  userimg = "";
  typechecker = "";

  // Check for a missing file FIRST: reading file.name on undefined would throw.
  if (!file) {
    $("#errordiv").html("<font color=red><h2>Please enter an image! (Accepted types: PNG, JPEG/JPG)</h2></font>");
    $("#imageUploaded").attr("src", "").hide();
    console.log("Please upload a valid image.");
    return;
  }

  // Compare case-insensitively and require the dot, so "PHOTO.PNG" is accepted and
  // a name that merely ends in the letters "jpeg" is not.
  const lowerName = file.name.toLowerCase();
  const hasAcceptedExtension = [".png", ".jpg", ".jpeg"].some(function (extension) {
    return lowerName.endsWith(extension);
  });
  if (!hasAcceptedExtension) {
    $("#errordiv").html("<font color=red><h2>Please enter an image with the correct file extensions. (Accepted types: PNG, JPEG/JPG)</h2></font>"); // Puts a message on the screen telling the user to enter in an image.
    $("#imageUploaded").attr("src", "").hide();
    console.log("Please upload a .png or .jpeg file.");
    return;
  }

  // onSubmit() compares the last four characters against ".png", ".jpg" and "jpeg".
  typechecker = lowerName.slice(-4);
  $("#errordiv").html("<p></p>"); // Clears any earlier error message.

  const reader = new FileReader(); // Reads the file.
  reader.onload = function () {
    $("#imageUploaded").attr("src", reader.result).show(); // Shows the preview in the <img> tag.
    userimg = reader.result.split(",")[1]; // Keep only the base64 part after the "data:...;base64," prefix.
  };
  reader.readAsDataURL(file);
}
// Escapes text so that it can be placed inside HTML markup without being parsed as tags.
function escapeHtml(text) {
  return String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

// Click handler of the "Get Caption" button.
// Checks that a valid image and all three API keys are present, requests a caption from each
// of the three captioning models, asks Grok to compare them, and renders everything into #output.
// Problems are reported in #errordiv; nothing is returned.
async function onSubmit() {
  // Same check as in handleImageUpload. Kind of unnecessary, but it serves as a just-in-case scenario.
  if (!(typechecker === ".png" || typechecker === "jpeg" || typechecker === ".jpg")) {
    $("#errordiv").html("<font color=red><h2>Please enter an image! (Accepted types: PNG, JPEG/JPG)</h2></font>");
    console.log("Please upload a .png or .jpeg file.");
    return;
  }
  // If there is no image, tell the user there is none and prompt them to upload an image.
  if (!userimg) {
    $("#errordiv").html("<font color=red><h2>Please enter an image! (Accepted types: PNG, JPEG/JPG)</h2></font>");
    console.log("Please upload an image first.");
    return;
  }
  // If no keys are entered, do not submit the images.
  if (key1 === "" || key2 === "" || key3 === "") {
    $("#errordiv").html("<font color=red><h2>You haven't fully entered in any API keys yet.</h2></font>");
    return;
  }

  // Clear any old error message and replace old results with a progress note.
  $("#errordiv").html("<p></p>");
  $("#output").html("<h2 style='display:flex; justify-content: center'>Processing caption...</h2>");

  try {
    // The three caption requests do not depend on each other, so they run in parallel.
    const [caption1, caption2, caption3] = await Promise.all([
      getCaption1(userimg),
      getCaption2(userimg),
      getCaption3(userimg),
    ]);
    // Pull the texts out up front: an unexpected response shape (e.g. an API error object)
    // throws here and is reported to the user by the catch block below.
    const gptCaption = caption1.choices[0].message.content;
    const blipCaption = caption2[0].generated_text;
    const vitCaption = caption3[0].generated_text;

    const comparison = await getComparison(userimg, gptCaption, blipCaption, vitCaption);
    const comparisonText = comparison.choices[0].message.content;

    // Code in <style> tags was gleaned from https://stackoverflow.com/questions/50058483/how-to-display-divs-horizontally-in-line-in-html
    // Also gleaned from many CSS tutorials such as https://refine.dev/blog/css-rounded-corners/#what-is-css-border-radius and https://www.w3schools.com/html/html_css.asp
    // The model texts are escaped because they are untrusted output; the comparison keeps its
    // line breaks (white-space: pre-line) because the prompt asks for one ranking per line.
    $("#output").html(`
<div class="sideways-container" style="display:flex; justify-content: center; flex-direction:row">
<div class="item">
<h2>GPT-4o-mini's Caption:</h2>
<p>${escapeHtml(gptCaption)}</p>
</div>
<br>
<div class="item">
<h2>BLIP's Caption:</h2>
<p>${escapeHtml(blipCaption)}</p>
</div>
<br>
<div class="item">
<h2>VIT-GPT2's Caption:</h2>
<p>${escapeHtml(vitCaption)}</p>
</div>
</div>
<br>
<div class="sideways-container2" style="display:flex; justify-content: center; flex-direction:row">
<div style="width:60vw; padding: 20px; color: black; border: 1px solid black; background-color: lightblue; border-radius: 15px; max-width:810px">
<h2>Grok Vision's Comparison:</h2>
<p style="white-space: pre-line">${escapeHtml(comparisonText)}</p>
</div>
</div>
<style>
.item {
width:60vw;
color: black;
border: 1px solid black;
padding: 20px;
max-width:350px;
margin-right: 10px;
background-color: lightblue;
border-radius: 15px
}
</style>`
    );
  } catch (error) {
    console.log(error.message);
    // Do not leave the page stuck on "Processing caption..." when a request fails.
    $("#output").html("");
    $("#errordiv").html("<font color=red><h2>Sorry, the captions could not be generated. Please check your API keys and try again.</h2></font>");
  }
}
// Asks OpenAI's gpt-4o-mini for a brief, formal caption of the image.
// sendingImage: the base64-encoded image data, without the "data:...;base64," prefix.
// Returns the parsed JSON body of the response (it is also logged to the console).
// Throws when the request fails on the network level or the API answers with a non-2xx
// status, so the caller gets the real cause instead of a TypeError on a missing response.
async function getCaption1(sendingImage) {
  // Fetch request API reference at https://platform.openai.com/docs/api-reference/chat
  const response = await fetch("https://api.openai.com/v1/chat/completions", {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${key1}`
    },
    body: JSON.stringify({ // Convert body to a JSON string.
      model: 'gpt-4o-mini',
      messages: [
        {
          'role': 'user',
          'content': [
            {'type': 'text', 'text': 'Give this image a brief caption. Be formal.'},
            {
              'type': 'image_url',
              'image_url': {
                // The image is already base64 encoded; only the data-URL prefix is added here.
                'url': `data:image/*;base64,${sendingImage}`
              }
            }
          ]
        }
      ],
    })
  });

  const data = await response.json();
  console.log(data); // Logged for inspection; on errors this holds the API's error details.
  if (!response.ok) {
    throw new Error(`OpenAI caption request failed with HTTP status ${response.status}`);
  }
  return data;
}
// Asks the Hugging Face hosted model Salesforce/blip-image-captioning-large to caption the image.
// sendingImage: the base64-encoded image data, sent as the "inputs" field of the JSON body.
// Returns the parsed JSON body of the response (it is also logged to the console).
// Throws when the request fails on the network level or the API answers with a non-2xx
// status, so the caller gets the real cause instead of a TypeError on a missing response.
async function getCaption2(sendingImage) {
  // Fetch request API reference taken from https://huggingface.co/Salesforce/blip-image-captioning-base?inference_api=true
  const response = await fetch("https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large", {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${key2}`
    },
    body: JSON.stringify({
      'inputs': sendingImage
    })
  });

  const data = await response.json();
  console.log(data); // Logged for inspection; on errors this holds the API's error details.
  if (!response.ok) {
    throw new Error(`BLIP caption request failed with HTTP status ${response.status}`);
  }
  return data;
}
// Asks the Hugging Face hosted model nlpconnect/vit-gpt2-image-captioning to caption the image.
// sendingImage: the base64-encoded image data, sent as the "image" field of the JSON body.
// NOTE(review): the BLIP request in this file sends the image as "inputs" while this one uses
// "image" - confirm against the Hugging Face API which field this endpoint expects.
// Returns the parsed JSON body of the response (it is also logged to the console).
// Throws when the request fails on the network level or the API answers with a non-2xx
// status, so the caller gets the real cause instead of a TypeError on a missing response.
async function getCaption3(sendingImage) {
  // Fetch request snippet taken from https://huggingface.co/nlpconnect/vit-gpt2-image-captioning.
  const response = await fetch("https://api-inference.huggingface.co/models/nlpconnect/vit-gpt2-image-captioning", {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${key2}`,
    },
    body: JSON.stringify({
      'image': sendingImage
    })
  });

  const data = await response.json();
  console.log(data); // Logged for inspection; on errors this holds the API's error details.
  if (!response.ok) {
    throw new Error(`VIT-GPT2 caption request failed with HTTP status ${response.status}`);
  }
  return data;
}
// Asks xAI's grok-vision-beta to rank the three captions against the image.
// sendingImage: the base64-encoded image data, without the "data:...;base64," prefix.
// response1, response2, response3: the caption texts labelled ChatGPT, BLIP and VIT-GPT in the prompt.
// Returns the parsed JSON body of the response (it is also logged to the console).
// Throws when the request fails on the network level or the API answers with a non-2xx
// status, so the caller gets the real cause instead of a TypeError on a missing response.
async function getComparison(sendingImage, response1, response2, response3) {
  // Fetch request snippet taken from https://docs.x.ai/api#authentication
  const response = await fetch("https://api.x.ai/v1/chat/completions", {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${key3}`,
    },
    body: JSON.stringify({
      messages: [
        {
          'role': 'user',
          'content': [
            {'type': 'text', 'text': `I have three written captions from three different AI models for the attached image. They are as following: ChatGPT: ${response1} BLIP: ${response2} VIT-GPT: ${response3}. On a scale of 1-10, rank each caption and why. Separate each ranking by a new line.`},
            {
              'type': 'image_url',
              'image_url': {
                // The image is already base64 encoded; only the data-URL prefix is added here.
                'url': `data:image/*;base64,${sendingImage}`
              }
            }
          ]
        }
      ],
      model: "grok-vision-beta"
    })
  });

  const data = await response.json();
  console.log(data); // Logged for inspection; on errors this holds the API's error details.
  if (!response.ok) {
    throw new Error(`Grok comparison request failed with HTTP status ${response.status}`);
  }
  return data;
}
//---- setup -------------------------------------------------------
// Do NOT make a setup function.
// This is done for you in the API. The API setup just creates a canvas.
// Anything else you want to run at the start should go into the following two functions.
// Optional hook: anything that should run at the start BEFORE the canvas is created goes here.
function beforesetup() {
  // Intentionally empty - this world needs no work before the canvas exists.
}
// Optional hook: anything that should run at the start AFTER the canvas is created goes here.
function aftersetup() {
  // Intentionally empty - this world needs no work after the canvas exists.
}
//---- draw -------------------------------------------------------
// Optional hook: P5 instructions to be executed every step can go here, or in AB.world.nextStep().
function draw() {
  // Intentionally empty - this world draws nothing per step.
}