Understanding how to work with multiple modalities—images, audio, video, and text—is essential for building next-generation AI applications. In this chapter, we move from theory to practice, covering the technical implementation details that separate prototype demos from production-ready systems.
You'll learn how to properly encode media for different AI APIs, implement sophisticated cross-modal reasoning techniques, and leverage video AI capabilities that go far beyond simple transcription. These are the skills that enable you to build applications that truly understand multimedia content.
Part 1: Encoding Media for AI APIs
Multimodal LLMs can "see" and "hear," but they don't process raw file data directly. We must first convert media into a format the model's API can accept. Understanding the technical details of this process is crucial for building reliable, performant systems.
The Two Primary Encoding Methods
There are two standard ways to get media to a model: embed it inline as Base64-encoded data, or upload it to cloud storage and pass the model a URI. Which one you choose depends on file size, request volume, and latency requirements.
Encoding Images: Base64 Method
For quick, on-the-fly image analysis with smaller files, Base64 encoding offers simplicity and speed. Here's a production-ready implementation:
# Production-ready Base64 image encoding
import base64
from openai import OpenAI
from pathlib import Path

client = OpenAI(api_key="YOUR_OPENAI_API_KEY")

def encode_image_to_base64(image_path: str) -> str:
    """
    Encode an image file to Base64 string.

    Args:
        image_path: Path to the image file

    Returns:
        Base64-encoded string representation of the image
    """
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def analyze_image_with_gpt4(image_path: str, prompt: str) -> str:
    """
    Analyze an image using GPT-4 Vision.

    Args:
        image_path: Path to image file
        prompt: Analysis instructions

    Returns:
        Model's analysis of the image
    """
    # Validate file exists
    if not Path(image_path).exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Encode image
    base64_image = encode_image_to_base64(image_path)

    # Determine MIME type from extension
    suffix = Path(image_path).suffix.lower()
    mime_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.webp': 'image/webp'
    }
    mime_type = mime_types.get(suffix, 'image/jpeg')

    # Call API
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "high"  # Options: low, high, auto
                        }
                    }
                ]
            }
        ],
        max_tokens=1000
    )
    return response.choices[0].message.content

# Example usage
result = analyze_image_with_gpt4(
    "financial_chart.png",
    "Extract all numerical data from this chart and format as JSON"
)
print(result)
💡 Base64 Best Practices
- File Size Limits: Most APIs limit Base64 images to 20MB or less. Check your provider's documentation.
- MIME Types: Always specify the correct MIME type (image/jpeg, image/png, etc.) to ensure proper parsing.
- Detail Parameter: GPT-4 Vision supports a "detail" parameter—use "high" for detailed analysis, "low" for faster, cheaper processing.
- Cost Considerations: Base64 encoding increases payload size by ~33%. For high-volume applications, URIs are more cost-effective (a quick pre-flight size check is sketched after this list).
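A minimal pre-flight check along these lines can catch oversized payloads before you pay for a request. The 20 MB threshold and the low/high detail heuristic below are assumptions, not provider guarantees; adjust them to your provider's documented limits and your own cost targets.

# Pre-flight check before inlining an image as Base64 (thresholds are assumptions)
from pathlib import Path

MAX_INLINE_BYTES = 20 * 1024 * 1024  # assumed limit; check your provider's docs

def prepare_inline_image(image_path: str) -> dict:
    """Return encoding hints for an image, or raise if it is too large to inline."""
    size = Path(image_path).stat().st_size
    encoded_size = (size * 4) // 3  # Base64 inflates payloads by roughly 33%
    if encoded_size > MAX_INLINE_BYTES:
        raise ValueError(
            f"{image_path} would be ~{encoded_size / 1e6:.1f} MB after encoding; "
            "upload it to cloud storage and pass a URI instead."
        )
    # Heuristic: small images rarely need high-detail processing
    detail = "low" if size < 200 * 1024 else "high"
    return {"path": image_path, "detail": detail, "encoded_bytes": encoded_size}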
Encoding Images: Cloud Storage URI Method
For production systems, especially those handling large files or batch processing, cloud storage URIs offer better performance, reliability, and cost efficiency. This is the standard approach for Gemini models:
# Cloud Storage URI method (Google Cloud + Gemini)
import google.generativeai as genai
from google.cloud import storage

genai.configure(api_key="YOUR_GEMINI_API_KEY")

def upload_to_gcs(local_path: str, bucket_name: str, blob_name: str) -> str:
    """
    Upload a file to Google Cloud Storage.

    Args:
        local_path: Local file path
        bucket_name: GCS bucket name
        blob_name: Destination blob name

    Returns:
        gs:// URI of uploaded file
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(local_path)
    return f"gs://{bucket_name}/{blob_name}"

def analyze_image_from_gcs(gcs_uri: str, prompt: str) -> str:
    """
    Analyze an image stored in Google Cloud Storage using Gemini.

    Args:
        gcs_uri: gs:// URI of the image
        prompt: Analysis instructions

    Returns:
        Model's analysis
    """
    # Create the file reference part (this mirrors the API's Part.file_data
    # schema; on Vertex AI you can build the same reference with Part.from_uri)
    image_part = {
        "file_data": {
            "mime_type": "image/jpeg",  # Adjust based on your file
            "file_uri": gcs_uri
        }
    }

    # Generate content
    model = genai.GenerativeModel('gemini-1.5-pro-latest')
    response = model.generate_content([prompt, image_part])
    return response.text

# Example workflow
gcs_uri = upload_to_gcs(
    "large_diagram.png",
    "my-ai-bucket",
    "images/diagram_001.png"
)

analysis = analyze_image_from_gcs(
    gcs_uri,
    "Describe the architecture shown in this diagram and identify potential bottlenecks"
)
print(analysis)
Encoding Audio and Video
Due to their size, audio and video files are almost always handled via cloud storage URIs or a provider's file upload API. The process involves uploading the file, passing the resulting URI to the model, and handling asynchronous processing. Common MIME types include (a small helper for resolving these follows the list):
- Audio: audio/mpeg (MP3), audio/wav (WAV), audio/ogg (OGG)
- Video: video/mp4 (MP4), video/webm (WebM), video/quicktime (MOV)
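Here is one small helper for resolving these MIME types from file extensions. The explicit map mirrors the list above, with Python's standard mimetypes module as a fallback; the function name and the set of formats are assumptions, so extend it for whatever media you actually accept.

# Resolve a MIME type for audio/video uploads from the file extension
import mimetypes
from pathlib import Path

MEDIA_MIME_TYPES = {
    ".mp3": "audio/mpeg",
    ".wav": "audio/wav",
    ".ogg": "audio/ogg",
    ".mp4": "video/mp4",
    ".webm": "video/webm",
    ".mov": "video/quicktime",
}

def media_mime_type(path: str) -> str:
    """Return the MIME type for a media file, preferring the explicit map above."""
    suffix = Path(path).suffix.lower()
    if suffix in MEDIA_MIME_TYPES:
        return MEDIA_MIME_TYPES[suffix]
    guessed, _ = mimetypes.guess_type(path)
    if guessed is None:
        raise ValueError(f"Unrecognized media type for {path}")
    return guessed

With the encoding details sorted, the full upload-and-analyze workflow for video looks like this: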
# Production video analysis workflow (Gemini)
import google.generativeai as genai
import time
from pathlib import Path

genai.configure(api_key="YOUR_GEMINI_API_KEY")

def process_video_file(video_path: str, prompt: str, timeout: int = 600) -> str:
    """
    Upload and analyze a video file using Gemini.

    Args:
        video_path: Path to local video file
        prompt: Analysis instructions
        timeout: API timeout in seconds

    Returns:
        Model's video analysis
    """
    # 1. Upload file to Gemini File API (uses GCS backend)
    print(f"Uploading {Path(video_path).name}...")
    video_file = genai.upload_file(
        path=video_path,
        display_name=Path(video_path).stem
    )
    print(f"Upload complete: {video_file.uri}")

    # 2. Wait for file processing
    while video_file.state.name == "PROCESSING":
        print(".", end="", flush=True)
        time.sleep(10)
        video_file = genai.get_file(video_file.name)
    print()  # New line after dots

    if video_file.state.name == "FAILED":
        raise RuntimeError(f"Video processing failed: {video_file.state.name}")

    print("Processing complete. Making inference request...")

    # 3. Send to model for analysis
    model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest")
    response = model.generate_content(
        [video_file, prompt],
        request_options={"timeout": timeout}
    )

    # 4. Clean up
    genai.delete_file(video_file.name)
    print(f"Deleted temporary file: {video_file.uri}")

    return response.text

# Example: Analyze a conference talk
result = process_video_file(
    "conference_talk_2024.mp4",
    """Analyze this technical conference talk and provide:
    1. Main thesis and key arguments
    2. Timeline of important points (with timestamps)
    3. Technical concepts explained (with definitions)
    4. Questions addressed by the speaker
    5. Actionable takeaways for developers
    Be specific and cite timestamps."""
)
print(result)
⚠️ Video Processing Best Practices
- Long Timeouts: Video analysis can take 2-10 minutes. Set timeouts to at least 600 seconds (10 minutes).
- Polling Logic: Always implement polling to check if file processing is complete before making inference requests.
- Error Handling: Check for the FAILED state and implement retry logic for transient failures (a minimal retry sketch follows this list).
- File Cleanup: Delete uploaded files after processing to avoid storage costs.
- Rate Limits: Be aware of API rate limits for file uploads and inference requests.
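A minimal retry sketch for transient failures might look like the following. The bare Exception catch and the backoff constants are placeholders, not a recommendation; in practice, narrow the except clause to the transient error types your client library actually raises (rate limits, 5xx responses).

# Minimal retry wrapper with exponential backoff (placeholders noted in comments)
import time

def with_retries(fn, max_attempts: int = 3, base_delay: float = 5.0):
    """Call fn(), retrying with exponential backoff on failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception as exc:  # placeholder: catch only transient errors here
            if attempt == max_attempts:
                raise
            delay = base_delay * (2 ** (attempt - 1))
            print(f"Attempt {attempt} failed ({exc}); retrying in {delay:.0f}s...")
            time.sleep(delay)

# Usage sketch with the function defined earlier in this section:
# result = with_retries(lambda: process_video_file("conference_talk_2024.mp4", prompt))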
Part 2: Cross-Modal Reasoning Techniques
Cross-modal reasoning is where multimodal AI becomes truly powerful. It's not enough to process different media types—the model must synthesize information across modalities to generate insights that wouldn't be possible from any single source.
Understanding Cross-Modal Reasoning
In practice, cross-modal reasoning means the model grounds its conclusions in evidence from more than one modality, for example checking whether spoken claims match the data shown on screen. The prompting techniques below make that grounding explicit.
Multimodal Chain-of-Thought (CoT) Prompting
Chain-of-Thought prompting, adapted for multimodal tasks, forces the model to explicitly connect observations from different media before generating the final answer. This dramatically improves accuracy and reliability.
The Two-Stage CoT Process
- Rationale Generation: First, extract relevant information from each modality independently.
- Answer Inference: Then, synthesize across modalities to reach the final answer.
Example: Product Demo Video Analysis
# Bad approach: Direct question without CoT
bad_prompt = """
[video_file.mp4]
Create a user guide for the product shown in this video.
"""
# Result: May miss steps, misinterpret actions, or produce incomplete guide
# Good approach: Multimodal CoT
good_prompt = """
[video_file.mp4]
Your task is to create a comprehensive user guide for the product shown
in this video. To ensure accuracy, follow this process:
STEP 1 - VISUAL ANALYSIS:
Watch the video and identify every key action the user performs on screen.
For each action:
- Describe what they did visually
- Note the timestamp
- Identify which UI elements were interacted with
STEP 2 - AUDIO ANALYSIS:
Listen to the audio track and transcribe the spoken instructions that
correspond to each visual action you identified in Step 1.
STEP 3 - SYNTHESIS:
Using the information from Steps 1 and 2, create a step-by-step user guide.
Each step should:
- State the action clearly
- Include relevant verbal instructions
- Reference the timestamp
- Note any UI elements involved
Format the final guide as a numbered list with clear, actionable steps.
"""
# Implementation
import google.generativeai as genai
import time

genai.configure(api_key="YOUR_API_KEY")

video_file = genai.upload_file(path="product_demo.mp4")

# Wait for processing...
while video_file.state.name == "PROCESSING":
    time.sleep(10)
    video_file = genai.get_file(video_file.name)

model = genai.GenerativeModel('gemini-1.5-pro-latest')
response = model.generate_content(
    [video_file, good_prompt],
    request_options={"timeout": 600}
)
print(response.text)
# Result: Accurate, detailed, reliable guide with proper audio-visual correlation
Advanced Technique: In-Context Learning (Few-Shot)
Provide examples of the desired input-output format directly in the prompt. This is extremely effective for guiding cross-modal reasoning patterns.
# Few-shot prompting for chart + text analysis
few_shot_prompt = """
Here is an example of how to analyze financial reports:
EXAMPLE 1:
[chart_image_1.png showing Q1: $1.2M, Q2: $900K]
Text: "As you can see, our Q1 revenue was strong, but we faced headwinds in Q2."
Analysis:
- Chart data: Q1 revenue at $1.2M, Q2 revenue at $900K (25% decline)
- Text confirms: "Strong Q1" aligns with $1.2M figure
- Text confirms: "Headwinds in Q2" aligns with 25% drop
- Consistency: High - visual and verbal narratives match
- Concern level: Medium - decline acknowledged but framed as external factors
---
EXAMPLE 2:
[chart_image_2.png showing Q3: $1.5M, Q4: $1.8M]
Text: "Q3 was a breakout quarter for us, and Q4 continued that strong momentum."
Analysis:
- Chart data: Q3 revenue at $1.5M, Q4 revenue at $1.8M (20% growth)
- Text confirms: "Breakout quarter" aligns with 67% increase from Q2
- Text confirms: "Strong momentum" aligns with continued 20% growth in Q4
- Consistency: High - visual and verbal narratives match
- Concern level: Low - positive trend confirmed by both data and narrative
---
Now analyze the following report using the same methodology:
[new_chart.png]
Text: "[provided text]"
Analysis:
"""
# This teaches the model the exact cross-modal reasoning pattern you want
Task Decomposition for Complex Analysis
For highly complex requests, break the problem into even smaller parts. You can chain multiple LLM calls where each call's output becomes the next call's input.
# Multi-call pipeline for video ad analysis
import google.generativeai as genai
import json
genai.configure(api_key="YOUR_API_KEY")
model = genai.GenerativeModel('gemini-1.5-pro-latest')
# Upload video once
video_file = genai.upload_file(path="marketing_ad.mp4")
# Wait for processing...
import time
while video_file.state.name == "PROCESSING":
    time.sleep(10)
    video_file = genai.get_file(video_file.name)
# CALL 1: Extract visual scenes
scene_prompt = """
Analyze this video ad and provide a JSON array describing all scenes.
For each scene include:
- start_time: timestamp in seconds
- end_time: timestamp in seconds
- description: what's shown visually
- key_elements: list of important visual elements
Return ONLY valid JSON, no other text.
"""
scenes_response = model.generate_content([video_file, scene_prompt])
scenes = json.loads(scenes_response.text)
# CALL 2: Extract audio/voiceover
audio_prompt = """
Transcribe the complete voiceover from this video ad.
Include timestamps for when each sentence is spoken.
Format as JSON array with timestamp and text fields.
"""
audio_response = model.generate_content([video_file, audio_prompt])
transcript = json.loads(audio_response.text)
# CALL 3: Synthesize marketing analysis
synthesis_prompt = f"""
You are a marketing analyst. Given these inputs:
SCENE DESCRIPTIONS:
{json.dumps(scenes, indent=2)}
VOICEOVER TRANSCRIPT:
{json.dumps(transcript, indent=2)}
Provide a comprehensive analysis:
1. Primary message and value proposition
2. Target audience demographics and psychographics
3. Emotional arc and persuasion techniques used
4. Alignment between visual and verbal messaging
5. Recommendations for A/B testing variations
"""
final_response = model.generate_content(synthesis_prompt)
print(final_response.text)
# Cleanup
genai.delete_file(video_file.name)
✅ Cross-Modal Reasoning Best Practices
- Be Explicit: Clearly state what information should be extracted from which source.
- Use Structured Output: Request JSON, Markdown tables, or other parseable formats (a JSON-mode sketch follows this list).
- Provide Examples: Few-shot prompting dramatically improves cross-modal reasoning quality.
- Iterate: Your first prompt likely won't be perfect—analyze outputs and refine.
- Chain Calls: For complex tasks, multiple specialized calls often outperform one mega-prompt.
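For the structured-output point in particular, recent versions of the google-generativeai SDK let you request JSON directly from Gemini 1.5 models via response_mime_type, which avoids stripping Markdown fences before calling json.loads. A minimal sketch, assuming that option is available in your SDK version (the example prompt is illustrative only):

# Request JSON output explicitly instead of relying on "return ONLY JSON" prompts
import google.generativeai as genai
import json

genai.configure(api_key="YOUR_API_KEY")

model = genai.GenerativeModel(
    "gemini-1.5-pro-latest",
    generation_config={"response_mime_type": "application/json"},
)

response = model.generate_content(
    "List three persuasion techniques commonly used in video ads as a JSON array "
    "of objects with 'technique' and 'description' fields."
)
data = json.loads(response.text)  # no Markdown fences to strip
print(data)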
Part 3: Video AI Beyond Transcription
For years, "Video AI" meant speech-to-text transcription. Frontier models like Gemini 1.5 Pro have fundamentally changed this. They can now perceive and reason about visual narratives, on-screen text, audio cues, and spoken dialogue simultaneously—unlocking entirely new application categories.
The Paradigm Shift
- Traditional: Transcribe the audio to text, then analyze the transcript alone
- Modern: Understand visual narrative + on-screen text + audio + dialogue + temporal relationships simultaneously
Core Capabilities of Modern Video AI
1. Structured Data Extraction
One of the most powerful applications: turning unstructured video into queryable structured data.
# Use Case: UI interaction logging from screen recording
import google.generativeai as genai
import json
genai.configure(api_key="YOUR_API_KEY")
# Upload screen recording
video_file = genai.upload_file(path="user_session_recording.mp4")
# Wait for processing...
import time
while video_file.state.name == "PROCESSING":
    time.sleep(10)
    video_file = genai.get_file(video_file.name)
model = genai.GenerativeModel('gemini-1.5-pro-latest')
prompt = """
Analyze this screen recording of a user navigating our web application.
Your task: Produce a JSON array that logs every user interaction.
For each interaction, include:
- timestamp: Time in seconds from video start
- action_type: "click", "form_input", "scroll", "navigation", etc.
- element_name: Name or ID of the UI element
- element_type: "button", "input_field", "link", "dropdown", etc.
- value: For form inputs, what was entered
- page_url: URL visible in browser address bar
Return ONLY valid JSON, no other text.
"""
response = model.generate_content(
    [video_file, prompt],
    request_options={"timeout": 600}
)
interaction_log = json.loads(response.text)
# Now you can:
# - Reproduce bugs by replaying interactions
# - Analyze user behavior patterns
# - Generate automated test cases
# - Calculate time-on-task metrics
print(json.dumps(interaction_log, indent=2))
genai.delete_file(video_file.name)
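As a small follow-up to the uses listed above, here is one way to derive a time-on-page metric from the interaction log. The field names (timestamp, page_url) match the JSON schema requested in the prompt; adjust them if you change that schema, and treat this as an approximation since it only measures gaps between logged interactions.

# Approximate time spent on each page from the structured interaction log
from collections import defaultdict

def time_on_page(interactions):
    """Return approximate seconds spent per page, based on interaction timestamps."""
    events = sorted(interactions, key=lambda e: e["timestamp"])
    totals = defaultdict(float)
    for current, nxt in zip(events, events[1:]):
        totals[current["page_url"]] += nxt["timestamp"] - current["timestamp"]
    return dict(totals)

# print(time_on_page(interaction_log))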
2. Visual Narrative Summarization
Models can understand the sequence of events and generate high-level summaries of stories or processes.
# Use Case: University lecture summarization
import google.generativeai as genai
genai.configure(api_key="YOUR_API_KEY")
lecture_file = genai.upload_file(path="quantum_computing_lecture.mp4")
# Wait for processing...
import time
while lecture_file.state.name == "PROCESSING":
    time.sleep(10)
    lecture_file = genai.get_file(lecture_file.name)
model = genai.GenerativeModel('gemini-1.5-pro-latest')
prompt = """
Analyze this university lecture on quantum computing.
Provide:
1. EXECUTIVE SUMMARY (2-3 paragraphs)
High-level overview of core concepts discussed
2. DETAILED OUTLINE (timestamped)
Create a comprehensive outline with:
- Major topics and subtopics
- Timestamp when each topic is introduced
- Key definitions or formulas presented
3. KEY VISUALS
List important diagrams, equations, or slides shown
Include timestamps and descriptions
4. STUDY GUIDE
5-10 questions a student should be able to answer after watching
Format using Markdown for easy readability.
"""
response = model.generate_content(
    [lecture_file, prompt],
    request_options={"timeout": 600}
)
# Result: Complete study guide generated from 90-minute lecture
# Students can navigate directly to relevant timestamps
print(response.text)
genai.delete_file(lecture_file.name)
3. Object and Event Detection
Identify and track specific objects, people, or events within video—with context understanding.
# Use Case: Sports analytics
import google.generativeai as genai
import json
genai.configure(api_key="YOUR_API_KEY")
game_file = genai.upload_file(path="basketball_game.mp4")
# Wait for processing...
import time
while game_file.state.name == "PROCESSING":
    time.sleep(10)
    game_file = genai.get_file(game_file.name)
model = genai.GenerativeModel('gemini-1.5-pro-latest')
prompt = """
Analyze this basketball game footage focusing on Player #23.
Extract the following data as JSON:
{
"player_stats": {
"player_number": 23,
"three_point_attempts": [
{
"timestamp": seconds,
"made": boolean,
"location": "left_corner" | "right_corner" | "top_of_key" | "deep",
"defender_proximity": "open" | "contested" | "heavily_contested"
}
],
"two_point_attempts": [...],
"assists": [...],
"turnovers": [...]
}
}
Be precise with timestamps. Return ONLY valid JSON.
"""
response = model.generate_content(
    [game_file, prompt],
    request_options={"timeout": 600}
)
stats = json.loads(response.text)
# Use for:
# - Automated highlight reels
# - Player performance analysis
# - Coaching strategy development
genai.delete_file(game_file.name)
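As one example of the uses listed above, the following sketch turns the extracted stats into highlight-reel cut points. The field names follow the JSON schema requested in the prompt, and the five-second padding around each made shot is an arbitrary choice.

# Build (start, end) clip windows around made three-point attempts
def highlight_clips(stats: dict, padding: float = 5.0):
    """Return (start, end) windows around made three-point attempts."""
    clips = []
    for attempt in stats["player_stats"]["three_point_attempts"]:
        if attempt["made"]:
            start = max(0.0, attempt["timestamp"] - padding)
            clips.append((start, attempt["timestamp"] + padding))
    return clips

# print(highlight_clips(stats))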
4. Cross-Modal Question Answering
Answer specific questions requiring correlation between visual and audio tracks.
# Use Case: Business presentation analysis
import google.generativeai as genai
genai.configure(api_key="YOUR_API_KEY")
presentation_file = genai.upload_file(path="q3_earnings_call.mp4")
# Wait for processing...
import time
while presentation_file.state.name == "PROCESSING":
    time.sleep(10)
    presentation_file = genai.get_file(presentation_file.name)
model = genai.GenerativeModel('gemini-1.5-pro-latest')
# Ask specific, nuanced questions
questions = [
    "At what timestamp does the speaker begin discussing Q3 financial results, and what is the exact revenue figure shown on the slide at that moment?",
    "Identify any discrepancies between what the speaker says and what's shown in the charts. Provide timestamps and explain the discrepancy.",
    "What questions from the Q&A session caused visible discomfort or hesitation in the speaker? Cite timestamps and describe the visual cues.",
    "Extract all forward-looking statements made by the speaker. Include exact quotes, timestamps, and any visual data supporting or contradicting those statements."
]
for i, question in enumerate(questions, 1):
    response = model.generate_content([presentation_file, question])
    print(f"\n{'='*60}")
    print(f"QUESTION {i}: {question}")
    print(f"{'='*60}")
    print(response.text)
genai.delete_file(presentation_file.name)
🚀 Production Video AI System Architecture
# Complete production-ready video analysis system
import google.generativeai as genai
from google.cloud import storage
import time
import json
from typing import Dict, List, Optional
from dataclasses import dataclass
from enum import Enum
class VideoState(Enum):
    UPLOADING = "uploading"
    PROCESSING = "processing"
    READY = "ready"
    FAILED = "failed"

@dataclass
class VideoAnalysisResult:
    video_id: str
    duration_seconds: float
    analysis: Dict
    timestamp: str
    cost_estimate: float

class ProductionVideoAI:
    def __init__(self, api_key: str, gcs_bucket: Optional[str] = None):
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel('gemini-1.5-pro-latest')
        self.gcs_bucket = gcs_bucket

    def upload_and_wait(
        self,
        video_path: str,
        max_wait: int = 600
    ) -> genai.types.File:
        """Upload video and wait for processing."""
        print(f"Uploading {video_path}...")
        video_file = genai.upload_file(path=video_path)

        elapsed = 0
        while video_file.state.name == "PROCESSING":
            if elapsed >= max_wait:
                raise TimeoutError(f"Processing exceeded {max_wait}s")
            print(".", end="", flush=True)
            time.sleep(10)
            elapsed += 10
            video_file = genai.get_file(video_file.name)
        print("\nProcessing complete!")

        if video_file.state.name == "FAILED":
            raise RuntimeError("Video processing failed")

        return video_file

    def analyze(
        self,
        video_file: genai.types.File,
        prompt: str,
        timeout: int = 600
    ) -> str:
        """Run analysis on processed video."""
        response = self.model.generate_content(
            [video_file, prompt],
            request_options={"timeout": timeout}
        )
        return response.text

    def cleanup(self, video_file: genai.types.File):
        """Delete uploaded file."""
        genai.delete_file(video_file.name)
        print(f"Cleaned up {video_file.uri}")
# Usage
ai = ProductionVideoAI(api_key="YOUR_KEY")
video = ai.upload_and_wait("demo.mp4")
result = ai.analyze(video, "Summarize key features demonstrated")
print(result)
ai.cleanup(video)
✅ Key Takeaways
- Base64 encoding works for small files; cloud URIs are required for large media and production systems
- Always specify correct MIME types and implement proper error handling for file processing
- Multimodal Chain-of-Thought prompting dramatically improves cross-modal reasoning accuracy
- Few-shot examples teach models the exact reasoning patterns you want across modalities
- Modern video AI can extract structured data, understand narratives, and answer nuanced questions requiring audio-visual synthesis
- Production video systems require robust error handling, timeouts, polling logic, and cleanup procedures