Language models with vision capabilities accept images as part of the prompt. The examples below show different formats you can use to supply an image as input, such as a URL or a binary buffer read from a file.
import { generateObject } from 'ai';import { openai } from '@ai-sdk/openai';import dotenv from 'dotenv';import { z } from 'zod';
// Load variables from a local .env file into process.env before the model is
// called (presumably this supplies the provider API key — confirm).
dotenv.config();

/**
 * Sends a text question together with an image referenced by URL and logs the
 * structured object the model returns.
 *
 * @returns {Promise<void>} Resolves after the generated object has been logged.
 */
async function main() {
  // Expected shape of the answer: a list of stamps, each with a country and a date.
  const stampsSchema = z.object({
    stamps: z.array(
      z.object({
        country: z.string(),
        date: z.string(),
      }),
    ),
  });

  const { object } = await generateObject({
    model: openai('gpt-4-turbo'),
    maxTokens: 512,
    schema: stampsSchema,
    messages: [
      {
        role: 'user',
        content: [
          {
            type: 'text',
            text: 'list all the stamps in these passport pages?',
          },
          {
            // The image is passed as a URL object rather than as raw bytes.
            type: 'image',
            image: new URL(
              'https://upload.wikimedia.org/wikipedia/commons/thumb/c/c5/WW2_Spanish_official_passport.jpg/1498px-WW2_Spanish_official_passport.jpg',
            ),
          },
        ],
      },
    ],
  });

  console.log(object);
}

// Do not leave the promise floating: report any failure and signal it to the
// calling shell through a non-zero exit code.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
import fs from 'node:fs';

import { generateObject } from 'ai';
import { openai } from '@ai-sdk/openai';
import dotenv from 'dotenv';
import { z } from 'zod';
// Load variables from a local .env file into process.env before the model is
// called (presumably this supplies the provider API key — confirm).
dotenv.config();

/**
 * Sends a text question together with an image read from the local file
 * system and logs the structured object the model returns.
 *
 * Requires `fs` (from `node:fs`) to be imported at the top of the script.
 *
 * @returns {Promise<void>} Resolves after the generated object has been logged.
 * @throws {Error} If the attachment file cannot be read.
 */
async function main() {
  // Expected shape of the answer: a list of stamps, each with a country and a date.
  const stampsSchema = z.object({
    stamps: z.array(
      z.object({
        country: z.string(),
        date: z.string(),
      }),
    ),
  });

  // The image is passed as the raw bytes of the file; the path is relative to
  // the current working directory.
  // NOTE(review): the prompt below asks about passport stamps while the file
  // is named eclipse.jpg — confirm the intended attachment.
  const imageBytes = fs.readFileSync('./node/attachments/eclipse.jpg');

  const { object } = await generateObject({
    model: openai('gpt-4-turbo'),
    maxTokens: 512,
    schema: stampsSchema,
    messages: [
      {
        role: 'user',
        content: [
          {
            type: 'text',
            text: 'list all the stamps in these passport pages?',
          },
          {
            type: 'image',
            image: imageBytes,
          },
        ],
      },
    ],
  });

  console.log(object);
}

// Do not leave the promise floating: report any failure and signal it to the
// calling shell through a non-zero exit code.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});