#!/usr/bin/env python3
"""
VLLM Integration Demo for McRogueFace
=====================================

Demonstrates using a local Vision-Language Model (Gemma 3) with
McRogueFace headless rendering to create an AI-driven agent.

Requirements:
- Local VLLM running at http://192.168.1.100:8100
- McRogueFace built with headless mode support

This is a research-grade demo for issue #156.
"""

import mcrfpy
from mcrfpy import automation
import sys
import requests
import base64
import os
import random
# VLLM configuration
VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions"
SCREENSHOT_PATH = "/tmp/vllm_demo_screenshot.png"
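# The URL above points at vLLM's OpenAI-compatible chat-completions route
# (/v1/chat/completions); the host and port reflect the author's local setup
# and will need adjusting for other environments.
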
# Sprite constants from Crypt of Sokoban tileset
FLOOR_COMMON = 0 # 95% of floors
FLOOR_SPECKLE1 = 12 # 4% of floors
FLOOR_SPECKLE2 = 24 # 1% of floors
WALL_TILE = 40 # Wall sprite
PLAYER_SPRITE = 84 # Player character
RAT_SPRITE = 123 # Enemy/rat creature


def file_to_base64(file_path):
    """Convert any image file to base64 string."""
    with open(file_path, 'rb') as f:
        return base64.b64encode(f.read()).decode('utf-8')


def llm_chat_completion(messages: list):
    """Call the chat completion endpoint of the local LLM and return the parsed JSON."""
    try:
        response = requests.post(VLLM_URL, json={'messages': messages}, timeout=60)
        return response.json()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}
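
# Usage sketch for llm_chat_completion (assumes the VLLM server is reachable):
#   resp = llm_chat_completion([{"role": "user", "content": "Hello"}])
#   text = resp.get("choices", [{}])[0].get("message", {}).get("content", "")
# The request body sends only "messages"; this assumes the server was started
# with a single default model, since no "model" field is supplied.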


def message_with_image(text, image_path):
    """Create a message with an embedded image for vision models."""
    image_data = file_to_base64(image_path)
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": text},
            {"type": "image_url", "image_url": {"url": "data:image/png;base64," + image_data}}
        ]
    }
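
# The image is inlined as a base64 data URL ("data:image/png;base64,...") in an
# OpenAI-style vision message; this matches the image-input format accepted by
# vLLM's chat endpoint when it is serving a vision-language model.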


def get_floor_tile():
    """Return a floor tile sprite with realistic distribution."""
    roll = random.random()
    if roll < 0.95:
        return FLOOR_COMMON
    elif roll < 0.99:
        return FLOOR_SPECKLE1
    else:
        return FLOOR_SPECKLE2
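
# Distribution check: roll < 0.95 -> FLOOR_COMMON (95%), 0.95 <= roll < 0.99 ->
# FLOOR_SPECKLE1 (4%), otherwise FLOOR_SPECKLE2 (1%), matching the percentages
# noted beside the sprite constants above.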


def setup_scene():
    """Create a dungeon scene with player agent and NPC rat."""
    print("Setting up scene...")

    # Create and set scene
    mcrfpy.createScene("vllm_demo")
    mcrfpy.setScene("vllm_demo")
    ui = mcrfpy.sceneUI("vllm_demo")

    # Load the game texture (16x16 tiles from Crypt of Sokoban)
    texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16)

    # Create grid: 1014x700 px at position (5, 5)
    # Using a 20x15 grid for a reasonable dungeon size
    grid = mcrfpy.Grid(
        grid_size=(20, 15),
        texture=texture,
        pos=(5, 5),
        size=(1014, 700)
    )
    grid.fill_color = mcrfpy.Color(20, 20, 30)
    # Set zoom factor to 2.0 for better visibility
    grid.zoom = 2.0
    ui.append(grid)

    # Set up floor tiles and walls with proper sprite distribution
    for x in range(20):
        for y in range(15):
            point = grid.at(x, y)
            # Create walls around the edges
            if x == 0 or x == 19 or y == 0 or y == 14:
                point.tilesprite = WALL_TILE
                point.walkable = False
                point.transparent = False  # Walls block FOV
            else:
                # Floor inside with varied sprites
                point.tilesprite = get_floor_tile()
                point.walkable = True
                point.transparent = True  # Floors don't block FOV

    # Add some interior walls for interest - a room divider
    for y in range(5, 10):
        point = grid.at(10, y)
        point.tilesprite = WALL_TILE
        point.walkable = False
        point.transparent = False

    # Door opening
    door = grid.at(10, 7)
    door.tilesprite = get_floor_tile()
    door.walkable = True
    door.transparent = True

    # Create a ColorLayer for fog of war (z_index=10 to render on top)
    fov_layer = grid.add_layer('color', z_index=10)
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))  # Start all black (unknown)

    # Create the player entity ("The Agent")
    player = mcrfpy.Entity(grid_pos=(5, 7), texture=texture, sprite_index=PLAYER_SPRITE)
    grid.entities.append(player)

    # Create an NPC rat entity in the doorway (close enough to be visible in FOV)
    rat = mcrfpy.Entity(grid_pos=(10, 7), texture=texture, sprite_index=RAT_SPRITE)
    grid.entities.append(rat)

    # Bind the fog layer to the player's perspective:
    # visible = transparent, discovered = dim, unknown = black
    fov_layer.apply_perspective(
        entity=player,
        visible=mcrfpy.Color(0, 0, 0, 0),          # Transparent when visible
        discovered=mcrfpy.Color(40, 40, 60, 180),  # Dark overlay when discovered but not visible
        unknown=mcrfpy.Color(0, 0, 0, 255)         # Black when never seen
    )

    # Update visibility from the player's position
    player.update_visibility()

    # Center the camera on the agent entity (tile coords -> pixel coords, 16 px tiles)
    px, py = int(player.pos[0]), int(player.pos[1])
    grid.center = (px * 16 + 8, py * 16 + 8)

    return grid, player, rat


def check_entity_visible(grid, entity):
    """Check if an entity is within the current FOV."""
    ex, ey = int(entity.pos[0]), int(entity.pos[1])
    return grid.is_in_fov(ex, ey)
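
# This relies on grid.is_in_fov() reflecting the visibility computed by the
# most recent update_visibility() call (the player's perspective established
# in setup_scene()), so "visible" here means visible to the agent.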


def build_grounded_prompt(grid, player, rat):
    """Build a text prompt with visually grounded information."""
    observations = []

    # Check what the agent can see
    if check_entity_visible(grid, rat):
        observations.append("You see a rat to the east.")

    # Could add more observations here (see the sketch below):
    # - walls blocking path
    # - items on ground
    # - doors/exits

    if not observations:
        observations.append("The area appears clear.")

    return " ".join(observations)
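
# Sketch of one additional grounded observation, using only APIs already used
# above (grid.at(...).walkable); illustrative only, not wired into the demo:
#   px, py = int(player.pos[0]), int(player.pos[1])
#   if not grid.at(px + 1, py).walkable:
#       observations.append("A wall blocks the path to the east.")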


def run_demo():
    """Main demo function."""
    print("=" * 60)
    print("VLLM Integration Demo (Research Mode)")
    print("=" * 60)
    print()

    # Set up the scene
    grid, player, rat = setup_scene()

    # Advance simulation to ensure the scene is ready
    mcrfpy.step(0.016)

    # Take screenshot
    print(f"Taking screenshot: {SCREENSHOT_PATH}")
    result = automation.screenshot(SCREENSHOT_PATH)
    if not result:
        print("ERROR: Failed to take screenshot")
        return False
    file_size = os.path.getsize(SCREENSHOT_PATH)
    print(f"Screenshot saved: {file_size} bytes")
    print()

    # Build grounded observations
    grounded_text = build_grounded_prompt(grid, player, rat)
    print(f"Grounded observations: {grounded_text}")
    print()

    # Query 1: Ask the VLLM to describe what it sees
    print("-" * 40)
    print("Query 1: Describe what you see")
    print("-" * 40)

    system_prompt = """You are an AI agent in a roguelike dungeon game. You can see the game world through screenshots.
The view shows a top-down grid-based dungeon with tiles, walls, and creatures.
Your character is the humanoid figure. The dark areas are outside your field of vision.
Other creatures may be enemies or NPCs. Describe what you observe concisely."""

    user_prompt = f"""Look at this game screenshot. {grounded_text}
Describe what you see in the dungeon from your character's perspective.
Be specific about:
- Your position in the room
- Any creatures you can see
- The layout of walls and passages
- Areas obscured by fog of war (darkness)"""

    messages = [
        {"role": "system", "content": system_prompt},
        message_with_image(user_prompt, SCREENSHOT_PATH)
    ]
    resp = llm_chat_completion(messages)

    if "error" in resp:
        print(f"VLLM Error: {resp['error']}")
        print("\nNote: The VLLM server may not be running or accessible.")
        print("Screenshot is saved for manual inspection.")
        description = "I can see a dungeon scene."
    else:
        description = resp.get('choices', [{}])[0].get('message', {}).get('content', 'No response')
        print(f"\nVLLM Response:\n{description}")
    print()

    # Query 2: Ask what action the agent would like to take
    print("-" * 40)
    print("Query 2: What would you like to do?")
    print("-" * 40)

    messages.append({"role": "assistant", "content": description})
    messages.append({
        "role": "user",
        "content": f"""Based on what you see, what action would you like to take?
Available actions:
- GO NORTH / SOUTH / EAST / WEST - move in that direction
- WAIT - stay in place and observe
- LOOK - examine your surroundings more carefully
{grounded_text}
State your reasoning briefly, then declare your action clearly (e.g., "Action: GO EAST")."""
    })
    resp = llm_chat_completion(messages)

    if "error" in resp:
        print(f"VLLM Error: {resp['error']}")
    else:
        action = resp.get('choices', [{}])[0].get('message', {}).get('content', 'No response')
        print(f"\nVLLM Response:\n{action}")
    print()

    print("=" * 60)
    print("Demo Complete")
    print("=" * 60)
    print(f"\nScreenshot preserved at: {SCREENSHOT_PATH}")
    print("Grid settings: zoom=2.0, FOV radius=8, perspective rendering enabled")

    return True
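
# Note: this script imports mcrfpy and mcrfpy.automation, so it is expected to
# run under McRogueFace's embedded Python interpreter (headless mode, per the
# module docstring); the requests package must also be importable there for the
# VLLM calls to work.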


# Main execution
if __name__ == "__main__":
    try:
        success = run_demo()
        if success:
            print("\nPASS")
            sys.exit(0)
        else:
            print("\nFAIL")
            sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)