Try Gemini 1.5 models, our newest multimodal models in Vertex AI, and see what you can build with a 1M token context window. Try Gemini 1.5 models, our newest multimodal models in Vertex AI, and see what you can build with a 1M token context window.

Process images, video, audio, and text with Gemini 1.5 Pro

This sample shows you how to process images, video, audio, and text at the same time. This sample works with Gemini 1.5 Pro only.

Explore further

For detailed documentation that includes this code sample, see the following:

Send multimodal prompt requests

Code sample

C#

Before trying this sample, follow the C# setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI C# API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


using Google.Cloud.AIPlatform.V1;
using System;
using System.Threading.Tasks;

public class MultimodalAllInput
{
    public async Task<string> AnswerFromMultimodalInput(
        string projectId = "your-project-id",
        string location = "us-central1",
        string publisher = "google",
        string model = "gemini-1.5-pro-preview-0409")
    {

        var predictionServiceClient = new PredictionServiceClientBuilder
        {
            Endpoint = $"{location}-aiplatform.googleapis.com"
        }.Build();

        string prompt = "Watch each frame in the video carefully and answer the questions.\n"
                  + "Only base your answers strictly on what information is available in "
                  + "the video attached. Do not make up any information that is not part "
                  + "of the video and do not be too verbose, be to the point.\n\n"
                  + "Questions:\n"
                  + "- When is the moment in the image happening in the video? "
                  + "Provide a timestamp.\n"
                  + "- What is the context of the moment and what does the narrator say about it?";

        var generateContentRequest = new GenerateContentRequest
        {
            Model = $"projects/{projectId}/locations/{location}/publishers/{publisher}/models/{model}",
            Contents =
            {
                new Content
                {
                    Role = "USER",
                    Parts =
                    {
                        new Part { Text = prompt },
                        new Part { FileData = new() { MimeType = "video/mp4", FileUri = "gs://cloud-samples-data/generative-ai/video/behind_the_scenes_pixel.mp4" } },
                        new Part { FileData = new() { MimeType = "image/png", FileUri = "gs://cloud-samples-data/generative-ai/image/a-man-and-a-dog.png" } }
                    }
                }
            }
        };

        GenerateContentResponse response = await predictionServiceClient.GenerateContentAsync(generateContentRequest);

        string responseText = response.Candidates[0].Content.Parts[0].Text;
        Console.WriteLine(responseText);

        return responseText;
    }
}

Go

Before trying this sample, follow the Go setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Go API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

import (
	"context"
	"errors"
	"fmt"
	"io"
	"mime"
	"path/filepath"

	"cloud.go888ogle.com.fqhub.com/go/vertexai/genai"
)

// multimodalPrompt is a sample prompt type consisting of one video, one image, and a text question.
type multimodalPrompt struct {
	// video and image are Google Cloud Storage paths starting with "gs://"
	video, image string
	// question is the question asked to the model
	question string
}

// generateContentFromVideoWithAudio shows how to send a multi-modal prompt to a model, writing the response to
// the provided io.Writer.
func generateContentFromVideoWithAudio(w io.Writer, prompt multimodalPrompt, projectID, location, modelName string) error {
	// prompt := multimodalPrompt{
	// 	video: "gs://cloud-samples-data/generative-ai/video/behind_the_scenes_pixel.mp4",
	// 	image: "gs://cloud-samples-data/generative-ai/image/a-man-and-a-dog.png",
	// 	question: `
	// 		Watch each frame in the video carefully and answer the questions.
	// 		Only base your answers strictly on what information is available in the video attached.
	// 		Do not make up any information that is not part of the video and do not be too
	// 		verbose, be to the point.
	//
	// 		Questions:
	// 		- When is the moment in the image happening in the video? Provide a timestamp.
	// 		- What is the context of the moment and what does the narrator say about it?
	// `,
	// location := "us-central1"
	// modelName := "gemini-1.5-pro-preview-0409"
	ctx := context.Background()

	client, err := genai.NewClient(ctx, projectID, location)
	if err != nil {
		return fmt.Errorf("unable to create client: %w", err)
	}
	defer client.Close()

	model := client.GenerativeModel(modelName)

	vidPart := genai.FileData{
		MIMEType: mime.TypeByExtension(filepath.Ext(prompt.video)),
		FileURI:  prompt.video,
	}

	imgPart := genai.FileData{
		MIMEType: mime.TypeByExtension(filepath.Ext(prompt.image)),
		FileURI:  prompt.image,
	}

	res, err := model.GenerateContent(ctx, vidPart, imgPart, genai.Text(prompt.question))
	if err != nil {
		return fmt.Errorf("unable to generate contents: %w", err)
	}

	if len(res.Candidates) == 0 ||
		len(res.Candidates[0].Content.Parts) == 0 {
		return errors.New("empty response from model")
	}

	fmt.Fprintf(w, "generated response: %s\n", res.Candidates[0].Content.Parts[0])
	return nil
}

Java

Before trying this sample, follow the Java setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Java API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


import com.google.cloud.vertexai.VertexAI;
import com.google.cloud.vertexai.api.GenerateContentResponse;
import com.google.cloud.vertexai.generativeai.ContentMaker;
import com.google.cloud.vertexai.generativeai.GenerativeModel;
import com.google.cloud.vertexai.generativeai.PartMaker;
import com.google.cloud.vertexai.generativeai.ResponseHandler;
import java.io.IOException;

public class MultimodalAllInput {

  public static void main(String[] args) throws IOException {
    // TODO(developer): Replace these variables before running the sample.
    String projectId = "your-google-cloud-project-id";
    String location = "us-central1";
    String modelName = "gemini-1.5-pro-preview-0409";

    multimodalAllInput(projectId, location, modelName);
  }

  // A request containing a text prompt, a video, and a picture.
  public static String multimodalAllInput(String projectId, String location, String modelName)
      throws IOException {
    // Initialize client that will be used to send requests. This client only needs
    // to be created once, and can be reused for multiple requests.
    try (VertexAI vertexAI = new VertexAI(projectId, location)) {
      String videoUri = "gs://cloud-samples-data/generative-ai/video/behind_the_scenes_pixel.mp4";
      String imageUri = "gs://cloud-samples-data/generative-ai/image/a-man-and-a-dog.png";

      GenerativeModel model = new GenerativeModel(modelName, vertexAI);
      GenerateContentResponse response = model.generateContent(
          ContentMaker.fromMultiModalData(
              PartMaker.fromMimeTypeAndData("video/mp4", videoUri),
              PartMaker.fromMimeTypeAndData("image/png", imageUri),
              "Watch each frame in the video carefully and answer the questions.\n"
                  + "Only base your answers strictly on what information is available in "
                  + "the video attached. Do not make up any information that is not part "
                  + "of the video and do not be too verbose, be to the point.\n\n"
                  + "Questions:\n"
                  + "- When is the moment in the image happening in the video? "
                  + "Provide a timestamp.\n"
                  + "- What is the context of the moment and what does the narrator say about it?"
          ));

      String output = ResponseHandler.getText(response);
      System.out.println(output);

      return output;
    }
  }
}

Node.js

Before trying this sample, follow the Node.js setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Node.js API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.

const {VertexAI} = require('@google-cloud/vertexai');

/**
 * TODO(developer): Update these variables before running the sample.
 */
async function analyze_all_modalities(projectId = 'PROJECT_ID') {
  const vertexAI = new VertexAI({project: projectId, location: 'us-central1'});

  const generativeModel = vertexAI.getGenerativeModel({
    model: 'gemini-1.5-pro-preview-0409',
  });

  const videoFilePart = {
    file_data: {
      file_uri:
        'gs://cloud-samples-data/generative-ai/video/behind_the_scenes_pixel.mp4',
      mime_type: 'video/mp4',
    },
  };
  const imageFilePart = {
    file_data: {
      file_uri:
        'gs://cloud-samples-data/generative-ai/image/a-man-and-a-dog.png',
      mime_type: 'image/png',
    },
  };

  const textPart = {
    text: `
    Watch each frame in the video carefully and answer the questions.
    Only base your answers strictly on what information is available in the video attached.
    Do not make up any information that is not part of the video and do not be too
    verbose, be to the point.

    Questions:
    - When is the moment in the image happening in the video? Provide a timestamp.
    - What is the context of the moment and what does the narrator say about it?`,
  };

  const request = {
    contents: [{role: 'user', parts: [videoFilePart, imageFilePart, textPart]}],
  };

  const resp = await generativeModel.generateContent(request);
  const contentResponse = await resp.response;
  console.log(JSON.stringify(contentResponse));
}

Python

Before trying this sample, follow the Python setup instructions in the Vertex AI quickstart using client libraries. For more information, see the Vertex AI Python API reference documentation.

To authenticate to Vertex AI, set up Application Default Credentials. For more information, see Set up authentication for a local development environment.


  import vertexai
  from vertexai.generative_models import GenerativeModel, Part

  # TODO(developer): Update and un-comment below lines
  # project_id = "PROJECT_ID"

  vertexai.init(project=project_id, location="us-central1")

  model = GenerativeModel(model_name="gemini-1.5-flash-preview-0514")

  video_file_uri = (
      "gs://cloud-samples-data/generative-ai/video/behind_the_scenes_pixel.mp4"
  )
  video_file = Part.from_uri(video_file_uri, mime_type="video/mp4")

  image_file_uri = "gs://cloud-samples-data/generative-ai/image/a-man-and-a-dog.png"
  image_file = Part.from_uri(image_file_uri, mime_type="image/png")

  prompt = """
  Watch each frame in the video carefully and answer the questions.
  Only base your answers strictly on what information is available in the video attached.
  Do not make up any information that is not part of the video and do not be too
  verbose, be to the point.

  Questions:
  - When is the moment in the image happening in the video? Provide a timestamp.
  - What is the context of the moment and what does the narrator say about it?
"""

  contents = [
      video_file,
      image_file,
      prompt,
  ]

  response = model.generate_content(contents)
  print(response.text)

What's next

To search and filter code samples for other Google Cloud products, see the Google Cloud sample browser.