diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..727366b Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 9a4f3fa..c17cb15 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ screenshot* vimGPT/ .env __pycache__/ +node_modules/ \ No newline at end of file diff --git a/main.py b/main.py index 4375dea..fbdf8b0 100644 --- a/main.py +++ b/main.py @@ -1,12 +1,9 @@ import argparse import time - from whisper_mic import WhisperMic - import vision from vimbot import Vimbot - def main(voice_mode): print("Initializing the Vimbot driver...") driver = Vimbot() @@ -25,19 +22,18 @@ def main(voice_mode): print(f"Objective received: {objective}") else: objective = input("Please enter your objective: ") + model = input("Please enter a model to use (llava, gpt4v, cogvlm, claude): ") while True: time.sleep(1) print("Capturing the screen...") screenshot = driver.capture() - print("Getting actions for the given objective...") - action = vision.get_actions(screenshot, objective) + action = vision.get_actions(screenshot, objective, model) print(f"JSON Response: {action}") if driver.perform_action(action): # returns True if done break - def main_entry(): parser = argparse.ArgumentParser(description="Run the Vimbot with optional voice input.") parser.add_argument( @@ -48,7 +44,6 @@ def main_entry(): args = parser.parse_args() main(args.voice) - if __name__ == "__main__": try: main_entry() diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..79cc684 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,220 @@ +{ + "name": "vimgpt", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "vimgpt", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "playwright": "^1.42.1", + "replicate": "^0.29.1" + } + }, + "node_modules/abort-controller": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", + "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==", + "optional": true, + "dependencies": { + "event-target-shim": "^5.0.0" + }, + "engines": { + "node": ">=6.5" + } + }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "optional": true + }, + "node_modules/buffer": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-6.0.3.tgz", + "integrity": "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "optional": true, + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.2.1" + } + }, + "node_modules/event-target-shim": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", + "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==", + "optional": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/events": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", + "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==", + "optional": true, + "engines": { + "node": ">=0.8.x" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/ieee754": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", + "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "optional": true + }, + "node_modules/playwright": { + "version": "1.42.1", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.42.1.tgz", + "integrity": "sha512-PgwB03s2DZBcNRoW+1w9E+VkLBxweib6KTXM0M3tkiT4jVxKSi6PmVJ591J+0u10LUrgxB7dLRbiJqO5s2QPMg==", + "dependencies": { + "playwright-core": "1.42.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=16" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.42.1", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.42.1.tgz", + "integrity": "sha512-mxz6zclokgrke9p1vtdy/COWBH+eOZgYUVVU34C73M+4j4HLlQJHtfcqiqqxpP0o8HhMkflvfbquLX5dg6wlfA==", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=16" + } + }, + "node_modules/process": { + "version": "0.11.10", + "resolved": "https://registry.npmjs.org/process/-/process-0.11.10.tgz", + "integrity": "sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==", + "optional": true, + "engines": { + "node": ">= 0.6.0" + } + }, + "node_modules/readable-stream": { + "version": "4.5.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-4.5.2.tgz", + "integrity": "sha512-yjavECdqeZ3GLXNgRXgeQEdz9fvDDkNKyHnbHRFtOr7/LcfgBcmct7t/ET+HaCTqfh06OzoAxrkN/IfjJBVe+g==", + "optional": true, + "dependencies": { + "abort-controller": "^3.0.0", + "buffer": "^6.0.3", + "events": "^3.3.0", + "process": "^0.11.10", + "string_decoder": "^1.3.0" + }, + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" + } + }, + "node_modules/replicate": { + "version": "0.29.1", + "resolved": "https://registry.npmjs.org/replicate/-/replicate-0.29.1.tgz", + "integrity": "sha512-AezrONSwjYohugcxOd334A4zijdVQ4QyGZHysB9dg7auCng2vuXbi5EFkqTX+kZ+aihxJdhO1bURYmgrxOZg2w==", + "engines": { + "git": ">=2.11.0", + "node": ">=18.0.0", + "npm": ">=7.19.0", + "yarn": ">=1.7.0" + }, + "optionalDependencies": { + "readable-stream": ">=4.0.0" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "optional": true + }, + "node_modules/string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "optional": true, + "dependencies": { + "safe-buffer": "~5.2.0" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..2e9e4b8 --- /dev/null +++ b/package.json @@ -0,0 +1,16 @@ +{ + "name": "vimgpt", + "version": "1.0.0", + "description": "Giving multimodal models an interface to play with.", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [], + "author": "", + "license": "ISC", + "dependencies": { + "playwright": "^1.42.1", + "replicate": "^0.29.1" + } +} diff --git a/vision.py b/vision.py index 3b6b5ac..618ad6b 100644 --- a/vision.py +++ b/vision.py @@ -1,17 +1,17 @@ import base64 import json -import os from io import BytesIO - import openai from dotenv import load_dotenv from PIL import Image +import requests +import replicate +import ast +import anthropic load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") IMG_RES = 1080 - # Function to encode the image def encode_and_resize(image): W, H = image.size @@ -21,10 +21,63 @@ def encode_and_resize(image): encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") return encoded_image - -def get_actions(screenshot, objective): +def get_actions(screenshot, objective, model): encoded_screenshot = encode_and_resize(screenshot) - response = openai.chat.completions.create( + if model == 'claude': # Anthropic API + message = anthropic.Anthropic().messages.create( + model="claude-3-opus-20240229", + max_tokens=1024, + messages=[ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": encoded_screenshot, + } + }, + { + "type": "text", + "text": f"You need to choose which action to take to help a user do this task: {objective}. Your options are navigate, type, click, and done. Navigate should take you to the specified URL. Type and click take strings where if you want to click on an object, return the string with the yellow character sequence you want to click on, and to type just a string with the message you want to type. For clicks, please only respond with the 1-2 letter sequence in the yellow box, and if there are multiple valid options choose the one you think a user would select. For typing, please return a click to click on the box along with a type with the message to write. When the page seems satisfactory, return done as a key with no value. You must respond in JSON only with no other fluff or bad things will happen. The JSON keys must ONLY be one of navigate, type, or click. Do not return the JSON inside a code block. Your answer needs to be in this dictionary format: 'click': 'D', 'type': 'rick roll'. This is the correct response if you want to click on the yellow box D and type 'rick roll'. Please do this correctly, don't include key-value pairs if the value is empty. The 'click' key value pair must be a yellow box you can see in the screenshot. Do not return an empty string as a value to a key. Do not include the key-value pair of 'done' unless you believe the objective has been completed. If you want to click on something, PLEASE analyze the image and make sure the field you want to click on exists as a yellow highlighted box." + } + ] + } + ] + ) + return json.loads(message.content[0].text) + if model == 'llava': # Ollama Local API + url = 'http://localhost:11434/api/generate' + data = { + 'model': 'llava', + 'prompt': f"You need to choose which action to take to help a user do this task: {objective}. Your options are navigate, type, click, and done. Navigate should take you to the specified URL. Type and click take strings where if you want to click on an object, return the string with the yellow character sequence you want to click on, and to type just a string with the message you want to type. For clicks, please only respond with the 1-2 letter sequence in the yellow box, and if there are multiple valid options choose the one you think a user would select. For typing, please return a click to click on the box along with a type with the message to write. When the page seems satisfactory, return done as a key with no value. You must respond in JSON only with no other fluff or bad things will happen. The JSON keys must ONLY be one of navigate, type, or click. Do not return the JSON inside a code block. Your answer needs to be in this dictionary format: 'click': 'D', 'type': 'rick roll'. This is the correct response if you want to click on the yellow box D and type 'rick roll'. Please do this correctly, don't include key-value pairs if the value is empty. The 'click' key value pair must be a yellow box you can see in the screenshot. Do not return an empty string as a value to a key. Do not include the key-value pair of 'done' unless you believe the objective has been completed. If you want to click on something, PLEASE analyze the image and make sure the field you want to click on exists as a yellow highlighted box.", + 'images': [encoded_screenshot], + 'format': 'json' + } + response = requests.post(url, json=data) + jsonStrings = response.text.split('\n') + output = '' + for i in jsonStrings: + if i.strip(): + real = json.loads(i) + output += real['response'] + return json.loads(output) + if model == 'cogvlm': # NEEDS WORK, Replicate API + output = replicate.run("naklecha/cogvlm:ec3886f9ea85dd0aee216585be5e6d07b04c9650f7b8b08363a14eb89e207eb2", + input={ + "image": f"data:image/jpeg;base64,{encoded_screenshot}", + "prompt": f"You need to choose which action to take to help a user do this task: {objective}. Your options are navigate, type, click, and done. Navigate should take you to a specified URL. Type and click take strings where if you want to click on an object, return the string with the yellow character sequence you want to click on, and to type just a string with the message you want to type. For clicks, please only respond with the 1-2 letter sequence in the yellow box, and if there are multiple valid options choose the one you think a user would select. For typing, please return a click to click on the box along with a type with the message to write. When the page seems satisfactory, return done as a key with no value. You must respond in JSON only with no other fluff or bad things will happen. The JSON keys must ONLY be one of navigate, type, or click. Do not return the JSON inside a code block. Your answer needs to be in this dictionary format: 'click': 'D', 'type': 'rick roll'. This is the correct response if you want to click on the yellow box D and type 'rick roll'. Please do this correctly, don't include key-value pairs if the value is empty. The 'click' key value pair must be a yellow box you can see in the screenshot. Do not return an empty string as a value to a key. Do not include the key-value pair of 'done' unless you believe the objective has been completed. If you want to click on something, PLEASE analyze the image and make sure the field you want to click on exists as a yellow highlighted box. If you want to navigate, you must provide a link as the value to the key-value pair of navigate. YOU MUST RESPOND IN DICTIONARY FORMAT, WRAP YOUR ANSWERS WITH {{}} CHARACTERS PLEASE." + } + ) + print(output) + formatted_string = "{'" + output.replace(": ", "': ").replace(", ", ", '") + "}" # This does not always transform the output to a dictionary + dictionary = ast.literal_eval(formatted_string) + newJson = json.dumps(dictionary, indent=4) + return json.loads(newJson) + if model == 'gpt4v': #OpenAI API + response = openai.chat.completions.create( model="gpt-4-vision-preview", messages=[ { @@ -44,31 +97,29 @@ def get_actions(screenshot, objective): } ], max_tokens=100, - ) - - try: - json_response = json.loads(response.choices[0].message.content) - except json.JSONDecodeError: - print("Error: Invalid JSON response") - cleaned_response = openai.chat.completions.create( - model="gpt-3.5-turbo", - messages=[ - { - "role": "system", - "content": "You are a helpful assistant to fix an invalid JSON response. You need to fix the invalid JSON response to be valid JSON. You must respond in JSON only with no other fluff or bad things will happen. Do not return the JSON inside a code block.", - }, - {"role": "user", "content": f"The invalid JSON response is: {response.choices[0].message.content}"}, - ], ) try: - cleaned_json_response = json.loads(cleaned_response.choices[0].message.content) + json_response = json.loads(response.choices[0].message.content) except json.JSONDecodeError: print("Error: Invalid JSON response") - return {} - return cleaned_json_response - - return json_response - + cleaned_response = openai.chat.completions.create( + model="gpt-3.5-turbo", + messages=[ + { + "role": "system", + "content": "You are a helpful assistant to fix an invalid JSON response. You need to fix the invalid JSON response to be valid JSON. You must respond in JSON only with no other fluff or bad things will happen. Do not return the JSON inside a code block.", + }, + {"role": "user", "content": f"The invalid JSON response is: {response.choices[0].message.content}"}, + ], + ) + try: + cleaned_json_response = json.loads(cleaned_response.choices[0].message.content) + except json.JSONDecodeError: + print("Error: Invalid JSON response") + return {} + return cleaned_json_response + return json_response + return None if __name__ == "__main__": image = Image.open("image.png")