feat: Add VLLM integration demos for multi-agent research (#156)
- 0_basic_vllm_demo.py: Single agent with FOV, grounded text, VLLM query
- 1_multi_agent_demo.py: Three agents with perspective cycling

Features demonstrated:
- Headless step() + screenshot() for AI-driven gameplay
- ColorLayer.apply_perspective() for per-agent fog of war
- Grounded text generation based on entity visibility
- Sequential VLLM queries with vision model support
- Proper FOV reset between perspective switches

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
f2f8d6422f
commit
4713b62535
|
|
@ -0,0 +1,293 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
VLLM Integration Demo for McRogueFace
|
||||
=====================================
|
||||
|
||||
Demonstrates using a local Vision-Language Model (Gemma 3) with
|
||||
McRogueFace headless rendering to create an AI-driven agent.
|
||||
|
||||
Requirements:
|
||||
- Local VLLM running at http://192.168.1.100:8100
|
||||
- McRogueFace built with headless mode support
|
||||
|
||||
This is a research-grade demo for issue #156.
|
||||
"""
|
||||
|
||||
import mcrfpy
|
||||
from mcrfpy import automation
|
||||
import sys
|
||||
import requests
|
||||
import base64
|
||||
import os
|
||||
import random
|
||||
|
||||
# VLLM configuration
# NOTE(review): hard-coded LAN address — confirm this matches your local VLLM host.
VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions"
SCREENSHOT_PATH = "/tmp/vllm_demo_screenshot.png"

# Sprite constants from Crypt of Sokoban tileset
# (tile indices into the 16x16 texture loaded in setup_scene)
FLOOR_COMMON = 0  # 95% of floors
FLOOR_SPECKLE1 = 12  # 4% of floors
FLOOR_SPECKLE2 = 24  # 1% of floors
WALL_TILE = 40  # Wall sprite
PLAYER_SPRITE = 84  # Player character
RAT_SPRITE = 123  # Enemy/rat creature
|
||||
|
||||
def file_to_base64(file_path):
    """Read an image file and return its contents as a base64-encoded string."""
    with open(file_path, 'rb') as handle:
        raw = handle.read()
    return base64.b64encode(raw).decode('utf-8')
|
||||
|
||||
def llm_chat_completion(messages: list):
    """POST a chat-completion request to the local VLLM server.

    Parameters
    ----------
    messages : list
        OpenAI-style chat messages (dicts with "role" and "content").

    Returns
    -------
    dict
        Decoded JSON response on success, or {"error": <message>} on any
        network, HTTP, or decoding failure so callers can branch on "error".
    """
    try:
        response = requests.post(VLLM_URL, json={'messages': messages}, timeout=60)
        # Surface HTTP 4xx/5xx instead of trying to parse an error page as
        # JSON; HTTPError subclasses RequestException, so it is caught below.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}
    except ValueError as e:
        # response.json() raises ValueError when the body is not valid JSON.
        return {"error": f"invalid JSON response: {e}"}
|
||||
|
||||
def message_with_image(text, image_path):
    """Build an OpenAI-style user message pairing *text* with an inline PNG."""
    encoded = file_to_base64(image_path)
    data_url = "data:image/png;base64," + encoded
    content = [
        {"type": "text", "text": text},
        {"type": "image_url", "image_url": {"url": data_url}},
    ]
    return {"role": "user", "content": content}
|
||||
|
||||
def get_floor_tile():
    """Pick a floor sprite using the tileset's 95%/4%/1% distribution."""
    roll = random.random()
    if roll >= 0.99:
        return FLOOR_SPECKLE2
    return FLOOR_SPECKLE1 if roll >= 0.95 else FLOOR_COMMON
|
||||
|
||||
def setup_scene():
    """Create a dungeon scene with player agent and NPC rat.

    Builds a 20x15 walled room with a vertical divider and a door, adds a
    fog-of-war color layer bound to the player's perspective, and returns
    the (grid, player, rat) tuple used by the rest of the demo.
    """
    print("Setting up scene...")

    # Create and set scene
    mcrfpy.createScene("vllm_demo")
    mcrfpy.setScene("vllm_demo")
    ui = mcrfpy.sceneUI("vllm_demo")

    # Load the game texture (16x16 tiles from Crypt of Sokoban)
    texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16)

    # Create grid: 1014px wide at position (5,5)
    # Using 20x15 grid for a reasonable dungeon size
    grid = mcrfpy.Grid(
        grid_size=(20, 15),
        texture=texture,
        pos=(5, 5),
        size=(1014, 700)
    )
    grid.fill_color = mcrfpy.Color(20, 20, 30)

    # Set zoom factor to 2.0 for better visibility
    grid.zoom = 2.0

    ui.append(grid)

    # Set up floor tiles and walls with proper sprite distribution
    for x in range(20):
        for y in range(15):
            point = grid.at(x, y)
            # Create walls around the edges
            if x == 0 or x == 19 or y == 0 or y == 14:
                point.tilesprite = WALL_TILE
                point.walkable = False
                point.transparent = False  # Walls block FOV
            else:
                # Floor inside with varied sprites
                point.tilesprite = get_floor_tile()
                point.walkable = True
                point.transparent = True  # Floors don't block FOV

    # Add some interior walls for interest - a room divider
    for y in range(5, 10):
        point = grid.at(10, y)
        point.tilesprite = WALL_TILE
        point.walkable = False
        point.transparent = False
    # Door opening (punched through the divider at its midpoint)
    door = grid.at(10, 7)
    door.tilesprite = get_floor_tile()
    door.walkable = True
    door.transparent = True

    # Create a ColorLayer for fog of war (z_index=10 to render on top)
    fov_layer = grid.add_layer('color', z_index=10)
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))  # Start all black (unknown)

    # Create the player entity ("The Agent")
    player = mcrfpy.Entity(grid_pos=(5, 7), texture=texture, sprite_index=PLAYER_SPRITE)
    grid.entities.append(player)

    # Create an NPC rat entity (closer so it's visible in FOV)
    rat = mcrfpy.Entity(grid_pos=(10, 7), texture=texture, sprite_index=RAT_SPRITE)
    grid.entities.append(rat)

    # Bind the fog layer to player's perspective
    # visible = transparent, discovered = dim, unknown = black
    fov_layer.apply_perspective(
        entity=player,
        visible=mcrfpy.Color(0, 0, 0, 0),  # Transparent when visible
        discovered=mcrfpy.Color(40, 40, 60, 180),  # Dark overlay when discovered but not visible
        unknown=mcrfpy.Color(0, 0, 0, 255)  # Black when never seen
    )

    # Update visibility from player's position
    player.update_visibility()

    # Center the camera on the agent entity
    # NOTE(review): assumes 16px tiles; +8 targets the tile center — confirm
    # how grid.center interacts with grid.zoom in mcrfpy.
    px, py = int(player.pos[0]), int(player.pos[1])
    grid.center = (px * 16 + 8, py * 16 + 8)

    return grid, player, rat
|
||||
|
||||
def check_entity_visible(grid, entity):
    """Return whether *entity*'s tile lies inside the grid's current FOV."""
    tile_x = int(entity.pos[0])
    tile_y = int(entity.pos[1])
    return grid.is_in_fov(tile_x, tile_y)
|
||||
|
||||
def build_grounded_prompt(grid, player, rat):
    """Compose grounded observation text from what the agent can currently see."""
    # Future grounding hooks: blocking walls, ground items, doors/exits.
    rat_seen = check_entity_visible(grid, rat)
    observations = ["You see a rat to the east."] if rat_seen else []
    if not observations:
        return "The area appears clear."
    return " ".join(observations)
|
||||
|
||||
def run_demo():
    """Main demo function.

    Drives the single-agent demo end to end: build the scene, take a
    headless screenshot, then run two sequential VLLM queries (describe
    the view; choose an action). Returns True on success, False if the
    screenshot could not be captured.
    """
    print("=" * 60)
    print("VLLM Integration Demo (Research Mode)")
    print("=" * 60)
    print()

    # Setup the scene
    grid, player, rat = setup_scene()

    # Advance simulation to ensure scene is ready
    mcrfpy.step(0.016)

    # Take screenshot
    print(f"Taking screenshot: {SCREENSHOT_PATH}")
    result = automation.screenshot(SCREENSHOT_PATH)
    if not result:
        print("ERROR: Failed to take screenshot")
        return False

    file_size = os.path.getsize(SCREENSHOT_PATH)
    print(f"Screenshot saved: {file_size} bytes")
    print()

    # Build grounded observations (text derived from actual FOV state)
    grounded_text = build_grounded_prompt(grid, player, rat)
    print(f"Grounded observations: {grounded_text}")
    print()

    # Query 1: Ask VLLM to describe what it sees
    print("-" * 40)
    print("Query 1: Describe what you see")
    print("-" * 40)

    system_prompt = """You are an AI agent in a roguelike dungeon game. You can see the game world through screenshots.
The view shows a top-down grid-based dungeon with tiles, walls, and creatures.
Your character is the humanoid figure. The dark areas are outside your field of vision.
Other creatures may be enemies or NPCs. Describe what you observe concisely."""

    user_prompt = f"""Look at this game screenshot. {grounded_text}

Describe what you see in the dungeon from your character's perspective.
Be specific about:
- Your position in the room
- Any creatures you can see
- The layout of walls and passages
- Areas obscured by fog of war (darkness)"""

    messages = [
        {"role": "system", "content": system_prompt},
        message_with_image(user_prompt, SCREENSHOT_PATH)
    ]

    resp = llm_chat_completion(messages)

    # llm_chat_completion signals failure with an {"error": ...} dict.
    if "error" in resp:
        print(f"VLLM Error: {resp['error']}")
        print("\nNote: The VLLM server may not be running or accessible.")
        print("Screenshot is saved for manual inspection.")
        # Fallback so Query 2 still has an assistant turn to build on.
        description = "I can see a dungeon scene."
    else:
        description = resp.get('choices', [{}])[0].get('message', {}).get('content', 'No response')
        print(f"\nVLLM Response:\n{description}")
    print()

    # Query 2: Ask what action the agent would like to take
    print("-" * 40)
    print("Query 2: What would you like to do?")
    print("-" * 40)

    # Extend the running conversation with the model's own description.
    messages.append({"role": "assistant", "content": description})
    messages.append({
        "role": "user",
        "content": f"""Based on what you see, what action would you like to take?

Available actions:
- GO NORTH / SOUTH / EAST / WEST - move in that direction
- WAIT - stay in place and observe
- LOOK - examine your surroundings more carefully

{grounded_text}

State your reasoning briefly, then declare your action clearly (e.g., "Action: GO EAST")."""
    })

    resp = llm_chat_completion(messages)

    if "error" in resp:
        print(f"VLLM Error: {resp['error']}")
    else:
        action = resp.get('choices', [{}])[0].get('message', {}).get('content', 'No response')
        print(f"\nVLLM Response:\n{action}")
    print()

    print("=" * 60)
    print("Demo Complete")
    print("=" * 60)
    print(f"\nScreenshot preserved at: {SCREENSHOT_PATH}")
    print("Grid settings: zoom=2.0, FOV radius=8, perspective rendering enabled")

    return True
|
||||
|
||||
# Main execution: run the demo and exit 0 on PASS, 1 on FAIL or crash.
if __name__ == "__main__":
    try:
        success = run_demo()
        if success:
            print("\nPASS")
            sys.exit(0)
        else:
            print("\nFAIL")
            sys.exit(1)
    except Exception as e:
        # Broad catch is intentional at the script boundary: report the
        # traceback and exit non-zero rather than dying silently.
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
|
|
@ -0,0 +1,346 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Multi-Agent VLLM Demo for McRogueFace
|
||||
=====================================
|
||||
|
||||
Demonstrates cycling through multiple agent perspectives,
|
||||
each with their own FOV and grounded observations.
|
||||
|
||||
Three agents:
|
||||
- Wizard (left side) - can see the rat but not the other agents
|
||||
- Blacksmith (right side) - can see the knight, rat, and the wall
|
||||
- Knight (right side) - can see the blacksmith, rat, and the wall
|
||||
|
||||
Each agent gets their own screenshot and VLLM query.
|
||||
"""
|
||||
|
||||
import mcrfpy
|
||||
from mcrfpy import automation
|
||||
import sys
|
||||
import requests
|
||||
import base64
|
||||
import os
|
||||
import random
|
||||
|
||||
# VLLM configuration
# NOTE(review): hard-coded LAN address — confirm this matches your local VLLM host.
VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions"
SCREENSHOT_DIR = "/tmp/vllm_multi_agent"

# Sprite constants (tile indices into the 16x16 texture loaded in setup_scene)
FLOOR_COMMON = 0
FLOOR_SPECKLE1 = 12
FLOOR_SPECKLE2 = 24
WALL_TILE = 40

# Agent sprites
WIZARD_SPRITE = 84
BLACKSMITH_SPRITE = 86
KNIGHT_SPRITE = 96
RAT_SPRITE = 123
|
||||
|
||||
|
||||
def file_to_base64(file_path):
    """Return the file at *file_path* as a base64-encoded UTF-8 string."""
    with open(file_path, 'rb') as handle:
        return base64.b64encode(handle.read()).decode('utf-8')
|
||||
|
||||
|
||||
def llm_chat_completion(messages: list):
    """POST a chat-completion request to the local VLLM server.

    Parameters
    ----------
    messages : list
        OpenAI-style chat messages (dicts with "role" and "content").

    Returns
    -------
    dict
        Decoded JSON response on success, or {"error": <message>} on any
        network, HTTP, or decoding failure so callers can branch on "error".
    """
    try:
        response = requests.post(VLLM_URL, json={'messages': messages}, timeout=60)
        # Surface HTTP 4xx/5xx instead of trying to parse an error page as
        # JSON; HTTPError subclasses RequestException, so it is caught below.
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}
    except ValueError as e:
        # response.json() raises ValueError when the body is not valid JSON.
        return {"error": f"invalid JSON response: {e}"}
|
||||
|
||||
|
||||
def message_with_image(text, image_path):
    """Build an OpenAI-style user message pairing *text* with an inline PNG."""
    data_url = "data:image/png;base64," + file_to_base64(image_path)
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": text},
            {"type": "image_url", "image_url": {"url": data_url}},
        ],
    }
|
||||
|
||||
|
||||
def get_floor_tile():
    """Pick a floor sprite using the tileset's 95%/4%/1% distribution."""
    roll = random.random()
    if roll >= 0.99:
        return FLOOR_SPECKLE2
    return FLOOR_SPECKLE1 if roll >= 0.95 else FLOOR_COMMON
|
||||
|
||||
|
||||
class Agent:
    """Wrapper for an agent entity with metadata."""

    def __init__(self, name, entity, description):
        self.name = name
        self.entity = entity
        # Human-readable description, e.g. "a wizard", "a blacksmith".
        self.description = description

    @property
    def pos(self):
        """Current grid position as an (int, int) tuple."""
        x, y = self.entity.pos[0], self.entity.pos[1]
        return (int(x), int(y))
|
||||
|
||||
|
||||
def setup_scene():
    """Create a dungeon scene with multiple agents.

    Builds a 25x15 walled room split by a vertical divider with one door,
    adds a fog-of-war color layer, and places three agents plus a rat.
    Returns (grid, fov_layer, agents, rat_entity).
    """
    print("Setting up multi-agent scene...")

    # Create and set scene
    mcrfpy.createScene("multi_agent_demo")
    mcrfpy.setScene("multi_agent_demo")
    ui = mcrfpy.sceneUI("multi_agent_demo")

    # Load the game texture
    texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16)

    # Create grid
    grid = mcrfpy.Grid(
        grid_size=(25, 15),
        texture=texture,
        pos=(5, 5),
        size=(1014, 700)
    )
    grid.fill_color = mcrfpy.Color(20, 20, 30)
    grid.zoom = 2.0
    ui.append(grid)

    # Set up floor tiles and walls (solid border, varied floor inside)
    for x in range(25):
        for y in range(15):
            point = grid.at(x, y)
            if x == 0 or x == 24 or y == 0 or y == 14:
                point.tilesprite = WALL_TILE
                point.walkable = False
                point.transparent = False
            else:
                point.tilesprite = get_floor_tile()
                point.walkable = True
                point.transparent = True

    # Add a wall divider in the middle (blocks wizard's view of right side)
    for y in range(3, 12):
        point = grid.at(10, y)
        point.tilesprite = WALL_TILE
        point.walkable = False
        point.transparent = False

    # Door opening in the wall
    door = grid.at(10, 7)
    door.tilesprite = get_floor_tile()
    door.walkable = True
    door.transparent = True

    # Create FOV layer for fog of war (starts fully black = unknown)
    fov_layer = grid.add_layer('color', z_index=10)
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))

    # Create agents
    agents = []

    # Wizard on the left side
    wizard_entity = mcrfpy.Entity(grid_pos=(4, 7), texture=texture, sprite_index=WIZARD_SPRITE)
    grid.entities.append(wizard_entity)
    agents.append(Agent("Wizard", wizard_entity, "a wizard"))

    # Blacksmith on the right side (upper)
    blacksmith_entity = mcrfpy.Entity(grid_pos=(18, 5), texture=texture, sprite_index=BLACKSMITH_SPRITE)
    grid.entities.append(blacksmith_entity)
    agents.append(Agent("Blacksmith", blacksmith_entity, "a blacksmith"))

    # Knight on the right side (lower)
    knight_entity = mcrfpy.Entity(grid_pos=(18, 10), texture=texture, sprite_index=KNIGHT_SPRITE)
    grid.entities.append(knight_entity)
    agents.append(Agent("Knight", knight_entity, "a knight"))

    # Rat in the middle-right area (visible to blacksmith and knight, maybe wizard through door)
    rat_entity = mcrfpy.Entity(grid_pos=(14, 7), texture=texture, sprite_index=RAT_SPRITE)
    grid.entities.append(rat_entity)

    return grid, fov_layer, agents, rat_entity
|
||||
|
||||
|
||||
def switch_perspective(grid, fov_layer, agent):
    """Switch the grid view to an agent's perspective.

    Resets the fog layer, re-binds the perspective to *agent*, refreshes
    visibility from the agent's position, and recenters the camera.
    """
    # Reset fog layer to all unknown (black) before switching
    # This prevents discovered tiles from one agent carrying over to another
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))

    # Apply this agent's perspective
    # (visible = transparent, discovered = dim overlay, unknown = black)
    fov_layer.apply_perspective(
        entity=agent.entity,
        visible=mcrfpy.Color(0, 0, 0, 0),
        discovered=mcrfpy.Color(40, 40, 60, 180),
        unknown=mcrfpy.Color(0, 0, 0, 255)
    )

    # Update visibility from agent's position
    agent.entity.update_visibility()

    # Center camera on this agent
    # NOTE(review): assumes 16px tiles; +8 targets the tile center.
    px, py = agent.pos
    grid.center = (px * 16 + 8, py * 16 + 8)
|
||||
|
||||
|
||||
def get_visible_entities(grid, observer, all_agents, rat):
    """Return human-readable descriptions of entities the observer can see."""
    ox, oy = observer.pos
    sightings = []

    # The rat first.
    rat_x, rat_y = int(rat.pos[0]), int(rat.pos[1])
    if grid.is_in_fov(rat_x, rat_y):
        sightings.append(f"a rat to the {get_direction(ox, oy, rat_x, rat_y)}")

    # Then every other agent (the observer itself is matched by name and skipped).
    for other in all_agents:
        if other.name == observer.name:
            continue
        ax, ay = other.pos
        if grid.is_in_fov(ax, ay):
            sightings.append(f"{other.description} to the {get_direction(ox, oy, ax, ay)}")

    return sightings
|
||||
|
||||
|
||||
def get_direction(from_x, from_y, to_x, to_y):
    """Return the cardinal (or intercardinal) direction from one tile to another."""
    dx, dy = to_x - from_x, to_y - from_y
    ew = "east" if dx > 0 else "west"
    ns = "south" if dy > 0 else "north"

    # Dominant axis wins; equal magnitudes report the compound diagonal.
    if abs(dx) > abs(dy):
        return ew
    if abs(dy) > abs(dx):
        return ns
    return f"{ns}{ew}"
|
||||
|
||||
|
||||
def build_grounded_prompt(visible_entities):
    """Render a list of sightings as a single grounded sentence."""
    if not visible_entities:
        return "The area appears clear."
    *head, tail = visible_entities
    if not head:
        return f"You see {tail}."
    return f"You see {', '.join(head)} and {tail}."
|
||||
|
||||
|
||||
def query_agent(agent, screenshot_path, grounded_text):
    """Query VLLM for a single agent's perspective.

    Builds a system+user message pair (with the agent's screenshot attached
    via message_with_image) and returns the model's response text, or an
    error string if the request failed.
    """
    system_prompt = f"""You are {agent.description} in a roguelike dungeon game. You can see the game world through screenshots.
The view shows a top-down grid-based dungeon. Your character is centered in the view.
The dark areas are outside your field of vision. Other figures may be allies, enemies, or NPCs.
Describe what you observe concisely and decide on an action."""

    user_prompt = f"""Look at this game screenshot from your perspective as {agent.description}. {grounded_text}

Describe what you see briefly, then choose an action:
- GO NORTH / SOUTH / EAST / WEST
- WAIT
- LOOK

State your reasoning in 1-2 sentences, then declare: "Action: <YOUR_ACTION>" """

    messages = [
        {"role": "system", "content": system_prompt},
        message_with_image(user_prompt, screenshot_path)
    ]

    resp = llm_chat_completion(messages)

    # llm_chat_completion signals failure with an {"error": ...} dict.
    if "error" in resp:
        return f"VLLM Error: {resp['error']}"
    else:
        return resp.get('choices', [{}])[0].get('message', {}).get('content', 'No response')
|
||||
|
||||
|
||||
def run_demo():
    """Main demo function.

    Cycles through each agent's perspective: reset/apply FOV, advance the
    simulation, capture a per-agent screenshot, build grounded observations,
    and run one VLLM query per agent. Returns True when the loop completes
    (agents whose screenshot fails are skipped, not fatal).
    """
    print("=" * 70)
    print("Multi-Agent VLLM Demo")
    print("=" * 70)
    print()

    # Create screenshot directory
    os.makedirs(SCREENSHOT_DIR, exist_ok=True)

    # Setup scene
    grid, fov_layer, agents, rat = setup_scene()

    # Cycle through each agent's perspective
    for i, agent in enumerate(agents):
        print(f"\n{'='*70}")
        print(f"Agent {i+1}/3: {agent.name} ({agent.description})")
        print(f"Position: {agent.pos}")
        print("=" * 70)

        # Switch to this agent's perspective (resets fog between agents)
        switch_perspective(grid, fov_layer, agent)

        # Advance simulation so the perspective change is rendered
        mcrfpy.step(0.016)

        # Take screenshot
        screenshot_path = os.path.join(SCREENSHOT_DIR, f"{i}_{agent.name.lower()}_view.png")
        result = automation.screenshot(screenshot_path)
        if not result:
            print(f"ERROR: Failed to take screenshot for {agent.name}")
            continue

        file_size = os.path.getsize(screenshot_path)
        print(f"Screenshot: {screenshot_path} ({file_size} bytes)")

        # Get visible entities for this agent
        visible = get_visible_entities(grid, agent, agents, rat)
        grounded_text = build_grounded_prompt(visible)
        print(f"Grounded observations: {grounded_text}")

        # Query VLLM
        print(f"\nQuerying VLLM for {agent.name}...")
        print("-" * 50)
        response = query_agent(agent, screenshot_path, grounded_text)
        print(f"\n{agent.name}'s Response:\n{response}")
        print()

    print("\n" + "=" * 70)
    print("Multi-Agent Demo Complete")
    print("=" * 70)
    print(f"\nScreenshots saved to: {SCREENSHOT_DIR}/")
    for i, agent in enumerate(agents):
        print(f" - {i}_{agent.name.lower()}_view.png")

    return True
|
||||
|
||||
|
||||
# Main execution: run the demo and exit 0 on PASS, 1 on FAIL or crash.
if __name__ == "__main__":
    try:
        success = run_demo()
        if success:
            print("\nPASS")
            sys.exit(0)
        else:
            print("\nFAIL")
            sys.exit(1)
    except Exception as e:
        # Broad catch is intentional at the script boundary: report the
        # traceback and exit non-zero rather than dying silently.
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||
Loading…
Reference in New Issue