feat: Add VLLM integration demos for multi-agent research (#156)

- 0_basic_vllm_demo.py: Single agent with FOV, grounded text, VLLM query
- 1_multi_agent_demo.py: Three agents with perspective cycling

Features demonstrated:
- Headless step() + screenshot() for AI-driven gameplay
- ColorLayer.apply_perspective() for per-agent fog of war
- Grounded text generation based on entity visibility
- Sequential VLLM queries with vision model support
- Proper FOV reset between perspective switches

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-02 09:21:25 -05:00 · 2025-12-02 09:21:25 -05:00 · 4713b62535
parent f2f8d6422f
commit 4713b62535
2 changed files with 639 additions and 0 deletions
--- a/tests/vllm_demo/0_basic_vllm_demo.py
+++ b/tests/vllm_demo/0_basic_vllm_demo.py
@ -0,0 +1,293 @@
 #!/usr/bin/env python3
 """
 VLLM Integration Demo for McRogueFace
 =====================================
 Demonstrates using a local Vision-Language Model (Gemma 3) with
 McRogueFace headless rendering to create an AI-driven agent.
 Requirements:
 - Local VLLM running at http://192.168.1.100:8100
 - McRogueFace built with headless mode support
 This is a research-grade demo for issue #156.
 """
 import mcrfpy
 from mcrfpy import automation
 import sys
 import requests
 import base64
 import os
 import random
 # VLLM configuration
 # Chat-completions endpoint that llm_chat_completion() POSTs to.
 VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions"
 # File run_demo() saves the screenshot to; it is read back and base64-encoded
 # when the vision prompt is built.
 SCREENSHOT_PATH = "/tmp/vllm_demo_screenshot.png"
 # Sprite constants from Crypt of Sokoban tileset
 # The floor percentages below are the odds applied by get_floor_tile().
 FLOOR_COMMON = 0      # 95% of floors
 FLOOR_SPECKLE1 = 12   # 4% of floors
 FLOOR_SPECKLE2 = 24   # 1% of floors
 WALL_TILE = 40        # Wall sprite
 PLAYER_SPRITE = 84    # Player character
 RAT_SPRITE = 123      # Enemy/rat creature
 def file_to_base64(file_path):
    """Read the file at *file_path* as bytes and return them base64-encoded as text."""
    with open(file_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
 def llm_chat_completion(messages: list):
    """Send *messages* to the local chat-completion endpoint and return the reply.
    Args:
        messages: Chat messages (dicts with "role" and "content"), forwarded
            unchanged as the "messages" field of the JSON request body.
    Returns:
        The decoded JSON response on success, or ``{"error": "<reason>"}`` when
        the request fails or the response body is not valid JSON.
    """
    try:
        response = requests.post(VLLM_URL, json={'messages': messages}, timeout=60)
        return response.json()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) from .json() when the body is not JSON, e.g. an
        # HTML error page. Report it like any other failed request.
        return {"error": f"Invalid JSON in response: {e}"}
 def message_with_image(text, image_path):
    """Build a "user" chat message pairing *text* with the image at *image_path*.
    The image file is base64-encoded and embedded as a PNG data URL.
    """
    data_url = "data:image/png;base64," + file_to_base64(image_path)
    text_part = {"type": "text", "text": text}
    image_part = {"type": "image_url", "image_url": {"url": data_url}}
    return {"role": "user", "content": [text_part, image_part]}
 def get_floor_tile():
    """Pick a random floor sprite: 95% common, 4% speckle 1, 1% speckle 2."""
    sample = random.random()
    # Thresholds are cumulative probabilities, tested from the rarest band down.
    if sample >= 0.99:
        return FLOOR_SPECKLE2
    if sample >= 0.95:
        return FLOOR_SPECKLE1
    return FLOOR_COMMON
 def setup_scene():
    """Build the single-agent dungeon scene used by the demo.
    Creates the "vllm_demo" scene with a 20x15 walled grid, a short interior
    wall at x=10 with a door gap at (10, 7), a fog-of-war colour layer bound
    to the player, and a rat entity placed on the door tile.
    Returns:
        Tuple ``(grid, player, rat)``.
    """
    print("Setting up scene...")
    scene_name = "vllm_demo"
    mcrfpy.createScene(scene_name)
    mcrfpy.setScene(scene_name)
    ui = mcrfpy.sceneUI(scene_name)
    # Tile sheet cut into 16x16 sprites (Crypt of Sokoban)
    texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16)
    # 20x15 tiles shown in a 1014x700 px widget at (5, 5)
    cols, rows = 20, 15
    grid = mcrfpy.Grid(
        grid_size=(cols, rows),
        texture=texture,
        pos=(5, 5),
        size=(1014, 700)
    )
    grid.fill_color = mcrfpy.Color(20, 20, 30)
    grid.zoom = 2.0  # doubled for better visibility
    ui.append(grid)
    # Outer ring is wall; every interior tile is randomly varied floor.
    for x in range(cols):
        for y in range(rows):
            cell = grid.at(x, y)
            is_border = x in (0, cols - 1) or y in (0, rows - 1)
            cell.tilesprite = WALL_TILE if is_border else get_floor_tile()
            cell.walkable = not is_border
            cell.transparent = not is_border  # walls block FOV, floors do not
    # Room divider: a vertical wall segment at x=10 ...
    divider_x = 10
    for y in range(5, 10):
        segment = grid.at(divider_x, y)
        segment.tilesprite = WALL_TILE
        segment.walkable = False
        segment.transparent = False
    # ... with one tile turned back into floor to act as the door.
    doorway = grid.at(divider_x, 7)
    doorway.tilesprite = get_floor_tile()
    doorway.walkable = True
    doorway.transparent = True
    # Fog-of-war overlay: z_index=10 draws it on top; start fully black (unknown).
    fov_layer = grid.add_layer('color', z_index=10)
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))
    # The agent the model plays as
    player = mcrfpy.Entity(grid_pos=(5, 7), texture=texture, sprite_index=PLAYER_SPRITE)
    grid.entities.append(player)
    # NPC rat, placed close enough to be inside the player's FOV
    rat = mcrfpy.Entity(grid_pos=(10, 7), texture=texture, sprite_index=RAT_SPRITE)
    grid.entities.append(rat)
    # Tie the fog colours to what the player can see:
    # visible = fully transparent, discovered = dim overlay, unknown = opaque black.
    fog_colors = {
        "visible": mcrfpy.Color(0, 0, 0, 0),
        "discovered": mcrfpy.Color(40, 40, 60, 180),
        "unknown": mcrfpy.Color(0, 0, 0, 255),
    }
    fov_layer.apply_perspective(entity=player, **fog_colors)
    # Recompute visibility from the player's position
    player.update_visibility()
    # Point the camera at the middle of the player's 16px tile
    center_x = int(player.pos[0]) * 16 + 8
    center_y = int(player.pos[1]) * 16 + 8
    grid.center = (center_x, center_y)
    return grid, player, rat
 def check_entity_visible(grid, entity):
    """Return whether the tile under *entity* is in the grid's current FOV.
    The entity position is truncated to integer tile coordinates before it is
    passed to ``grid.is_in_fov``.
    """
    position = entity.pos
    tile_x = int(position[0])
    tile_y = int(position[1])
    return grid.is_in_fov(tile_x, tile_y)
 def _compass_direction(dx, dy):
    """Name the compass direction for a tile offset, or "" for a zero offset.
    Assumes grid y increases toward the south (bottom of the screen) - TODO
    confirm against the engine's coordinate convention.
    """
    if dx == 0 and dy == 0:
        return ""
    horizontal = "east" if dx > 0 else "west"
    vertical = "south" if dy > 0 else "north"
    if abs(dx) > abs(dy):
        return horizontal
    if abs(dy) > abs(dx):
        return vertical
    # Exactly diagonal: combine both axes, e.g. "southeast"
    return vertical + horizontal
 def build_grounded_prompt(grid, player, rat):
    """Build a text prompt with visually grounded information.
    Args:
        grid: Grid whose current FOV decides what counts as visible.
        player: The observing entity; its position is the origin for directions.
        rat: The entity to report when it is inside the FOV.
    Returns:
        A sentence describing the visible rat and where it is relative to the
        player, or "The area appears clear." when nothing is visible.
    """
    observations = []
    # Check what the agent can see
    if check_entity_visible(grid, rat):
        # Derive the direction from the actual positions so the text stays
        # truthful if either entity is placed somewhere else.
        dx = int(rat.pos[0]) - int(player.pos[0])
        dy = int(rat.pos[1]) - int(player.pos[1])
        direction = _compass_direction(dx, dy)
        if direction:
            observations.append(f"You see a rat to the {direction}.")
        else:
            observations.append("You see a rat on your tile.")
    # Could add more observations here:
    # - walls blocking path
    # - items on ground
    # - doors/exits
    if not observations:
        observations.append("The area appears clear.")
    return " ".join(observations)
 def run_demo():
    """Run the single-agent demo: render, screenshot, then query the model twice.
    Builds the scene, advances one frame, saves a screenshot, and sends two
    chat requests (describe the view, then choose an action). A failed model
    request is printed but does not fail the demo.
    Returns:
        False if the screenshot could not be taken, True otherwise.
    """
    print("=" * 60)
    print("VLLM Integration Demo (Research Mode)")
    print("=" * 60)
    print()
    # Setup the scene
    grid, player, rat = setup_scene()
    # Advance simulation to ensure scene is ready
    mcrfpy.step(0.016)
    # Take screenshot
    print(f"Taking screenshot: {SCREENSHOT_PATH}")
    result = automation.screenshot(SCREENSHOT_PATH)
    if not result:
        print("ERROR: Failed to take screenshot")
        return False
    file_size = os.path.getsize(SCREENSHOT_PATH)
    print(f"Screenshot saved: {file_size} bytes")
    print()
    # Build grounded observations
    grounded_text = build_grounded_prompt(grid, player, rat)
    print(f"Grounded observations: {grounded_text}")
    print()
    # Query 1: Ask VLLM to describe what it sees
    print("-" * 40)
    print("Query 1: Describe what you see")
    print("-" * 40)
    system_prompt = """You are an AI agent in a roguelike dungeon game. You can see the game world through screenshots.
 The view shows a top-down grid-based dungeon with tiles, walls, and creatures.
 Your character is the humanoid figure. The dark areas are outside your field of vision.
 Other creatures may be enemies or NPCs. Describe what you observe concisely."""
    user_prompt = f"""Look at this game screenshot. {grounded_text}
 Describe what you see in the dungeon from your character's perspective.
 Be specific about:
 - Your position in the room
 - Any creatures you can see
 - The layout of walls and passages
 - Areas obscured by fog of war (darkness)"""
    messages = [
        {"role": "system", "content": system_prompt},
        message_with_image(user_prompt, SCREENSHOT_PATH)
    ]
    resp = llm_chat_completion(messages)
    if "error" in resp:
        print(f"VLLM Error: {resp['error']}")
        print("\nNote: The VLLM server may not be running or accessible.")
        print("Screenshot is saved for manual inspection.")
        # Placeholder so the second query still has an assistant turn to follow
        description = "I can see a dungeon scene."
    else:
        # `or [{}]` also covers an empty "choices" list, which would
        # otherwise raise IndexError on [0].
        first_choice = (resp.get('choices') or [{}])[0]
        description = first_choice.get('message', {}).get('content', 'No response')
        print(f"\nVLLM Response:\n{description}")
    print()
    # Query 2: Ask what action the agent would like to take
    print("-" * 40)
    print("Query 2: What would you like to do?")
    print("-" * 40)
    # Continue the same conversation: prior answer, then the follow-up question
    messages.append({"role": "assistant", "content": description})
    messages.append({
        "role": "user",
        "content": f"""Based on what you see, what action would you like to take?
 Available actions:
 - GO NORTH / SOUTH / EAST / WEST - move in that direction
 - WAIT - stay in place and observe
 - LOOK - examine your surroundings more carefully
 {grounded_text}
 State your reasoning briefly, then declare your action clearly (e.g., "Action: GO EAST")."""
    })
    resp = llm_chat_completion(messages)
    if "error" in resp:
        print(f"VLLM Error: {resp['error']}")
    else:
        first_choice = (resp.get('choices') or [{}])[0]
        action = first_choice.get('message', {}).get('content', 'No response')
        print(f"\nVLLM Response:\n{action}")
    print()
    print("=" * 60)
    print("Demo Complete")
    print("=" * 60)
    print(f"\nScreenshot preserved at: {SCREENSHOT_PATH}")
    print("Grid settings: zoom=2.0, FOV radius=8, perspective rendering enabled")
    return True
 # Main execution
 if __name__ == "__main__":
    # Exit status mirrors the demo result: 0 on PASS, 1 on FAIL or any error.
    try:
        passed = run_demo()
        print("\nPASS" if passed else "\nFAIL")
        sys.exit(0 if passed else 1)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit() above
        # passes straight through this handler.
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
--- a/tests/vllm_demo/1_multi_agent_demo.py
+++ b/tests/vllm_demo/1_multi_agent_demo.py
@ -0,0 +1,346 @@
 #!/usr/bin/env python3
 """
 Multi-Agent VLLM Demo for McRogueFace
 =====================================
 Demonstrates cycling through multiple agent perspectives,
 each with their own FOV and grounded observations.
 Three agents:
 - Wizard (left side) - can see the rat but not the other agents
 - Blacksmith (right side) - can see the knight, rat, and the wall
 - Knight (right side) - can see the blacksmith, rat, and the wall
 Each agent gets their own screenshot and VLLM query.
 """
 import mcrfpy
 from mcrfpy import automation
 import sys
 import requests
 import base64
 import os
 import random
 # VLLM configuration
 # Chat-completions endpoint that llm_chat_completion() POSTs to.
 VLLM_URL = "http://192.168.1.100:8100/v1/chat/completions"
 # Directory run_demo() creates and writes one screenshot per agent into.
 SCREENSHOT_DIR = "/tmp/vllm_multi_agent"
 # Sprite constants
 # Floor variants chosen at random by get_floor_tile() (95% / 4% / 1%).
 FLOOR_COMMON = 0
 FLOOR_SPECKLE1 = 12
 FLOOR_SPECKLE2 = 24
 # Sprite used for the border and the interior dividing wall.
 WALL_TILE = 40
 # Agent sprites
 WIZARD_SPRITE = 84
 BLACKSMITH_SPRITE = 86
 KNIGHT_SPRITE = 96
 # Sprite for the non-agent rat entity.
 RAT_SPRITE = 123
 def file_to_base64(file_path):
    """Read the file at *file_path* as bytes and return them base64-encoded as text."""
    with open(file_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
 def llm_chat_completion(messages: list):
    """Send *messages* to the local chat-completion endpoint and return the reply.
    Args:
        messages: Chat messages (dicts with "role" and "content"), forwarded
            unchanged as the "messages" field of the JSON request body.
    Returns:
        The decoded JSON response on success, or ``{"error": "<reason>"}`` when
        the request fails or the response body is not valid JSON.
    """
    try:
        response = requests.post(VLLM_URL, json={'messages': messages}, timeout=60)
        return response.json()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) from .json() when the body is not JSON, e.g. an
        # HTML error page. Report it like any other failed request.
        return {"error": f"Invalid JSON in response: {e}"}
 def message_with_image(text, image_path):
    """Build a "user" chat message pairing *text* with the image at *image_path*.
    The image file is base64-encoded and embedded as a PNG data URL.
    """
    data_url = "data:image/png;base64," + file_to_base64(image_path)
    text_part = {"type": "text", "text": text}
    image_part = {"type": "image_url", "image_url": {"url": data_url}}
    return {"role": "user", "content": [text_part, image_part]}
 def get_floor_tile():
    """Pick a random floor sprite: 95% common, 4% speckle 1, 1% speckle 2."""
    sample = random.random()
    # Thresholds are cumulative probabilities, tested from the rarest band down.
    if sample >= 0.99:
        return FLOOR_SPECKLE2
    if sample >= 0.95:
        return FLOOR_SPECKLE1
    return FLOOR_COMMON
 class Agent:
    """Pairs a grid entity with a display name and a prose description."""
    def __init__(self, name, entity, description):
        self.name = name
        self.entity = entity
        # Phrase dropped into prompts and observations, e.g. "a wizard"
        self.description = description
    @property
    def pos(self):
        """Entity position truncated to integer ``(x, y)`` tile coordinates."""
        where = self.entity.pos
        return int(where[0]), int(where[1])
 def setup_scene():
    """Build the multi-agent dungeon scene.
    Creates the "multi_agent_demo" scene with a 25x15 walled grid, a dividing
    wall at x=10 (rows 3-11) with a door gap at (10, 7), a fog-of-war colour
    layer, three agents (wizard, blacksmith, knight) and a rat.
    Returns:
        Tuple ``(grid, fov_layer, agents, rat_entity)`` where ``agents`` is a
        list of ``Agent`` wrappers in wizard, blacksmith, knight order.
    """
    print("Setting up multi-agent scene...")
    scene_name = "multi_agent_demo"
    mcrfpy.createScene(scene_name)
    mcrfpy.setScene(scene_name)
    ui = mcrfpy.sceneUI(scene_name)
    # Tile sheet cut into 16x16 sprites
    texture = mcrfpy.Texture("assets/kenney_TD_MR_IP.png", 16, 16)
    cols, rows = 25, 15
    grid = mcrfpy.Grid(
        grid_size=(cols, rows),
        texture=texture,
        pos=(5, 5),
        size=(1014, 700)
    )
    grid.fill_color = mcrfpy.Color(20, 20, 30)
    grid.zoom = 2.0
    ui.append(grid)
    # Outer ring is wall; every interior tile is randomly varied floor.
    for x in range(cols):
        for y in range(rows):
            cell = grid.at(x, y)
            is_border = x in (0, cols - 1) or y in (0, rows - 1)
            cell.tilesprite = WALL_TILE if is_border else get_floor_tile()
            cell.walkable = not is_border
            cell.transparent = not is_border
    # Dividing wall that blocks the wizard's view of the right-hand side ...
    divider_x = 10
    for y in range(3, 12):
        segment = grid.at(divider_x, y)
        segment.tilesprite = WALL_TILE
        segment.walkable = False
        segment.transparent = False
    # ... with one tile turned back into floor to act as the door.
    doorway = grid.at(divider_x, 7)
    doorway.tilesprite = get_floor_tile()
    doorway.walkable = True
    doorway.transparent = True
    # Fog-of-war overlay, initially fully black
    fov_layer = grid.add_layer('color', z_index=10)
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))
    # (name, start tile, sprite, description): wizard on the left, blacksmith
    # upper right, knight lower right.
    roster = (
        ("Wizard", (4, 7), WIZARD_SPRITE, "a wizard"),
        ("Blacksmith", (18, 5), BLACKSMITH_SPRITE, "a blacksmith"),
        ("Knight", (18, 10), KNIGHT_SPRITE, "a knight"),
    )
    agents = []
    for name, start, sprite, description in roster:
        entity = mcrfpy.Entity(grid_pos=start, texture=texture, sprite_index=sprite)
        grid.entities.append(entity)
        agents.append(Agent(name, entity, description))
    # Rat in the middle-right area (visible to blacksmith and knight, maybe
    # to the wizard through the door)
    rat_entity = mcrfpy.Entity(grid_pos=(14, 7), texture=texture, sprite_index=RAT_SPRITE)
    grid.entities.append(rat_entity)
    return grid, fov_layer, agents, rat_entity
 def switch_perspective(grid, fov_layer, agent):
    """Re-point the fog-of-war layer and the camera at *agent*.
    The layer is blanked first so tiles discovered under a previous agent's
    perspective do not carry over to this one.
    """
    # Wipe to "unknown" (opaque black) before rebinding
    fov_layer.fill(mcrfpy.Color(0, 0, 0, 255))
    # visible = fully transparent, discovered = dim overlay, unknown = opaque black
    fog_colors = {
        "visible": mcrfpy.Color(0, 0, 0, 0),
        "discovered": mcrfpy.Color(40, 40, 60, 180),
        "unknown": mcrfpy.Color(0, 0, 0, 255),
    }
    fov_layer.apply_perspective(entity=agent.entity, **fog_colors)
    # Recompute visibility from the agent's position
    agent.entity.update_visibility()
    # Aim the camera at the middle of the agent's 16px tile
    tile_x, tile_y = agent.pos
    grid.center = (tile_x * 16 + 8, tile_y * 16 + 8)
 def get_visible_entities(grid, observer, all_agents, rat):
    """List what *observer* can currently see, as short descriptive phrases.
    Visibility is read from ``grid.is_in_fov`` as-is, so the grid is expected
    to already be switched to the observer's perspective.
    Args:
        grid: Grid queried for field-of-view membership.
        observer: The Agent doing the looking; skipped in the output.
        all_agents: Every Agent in the scene (may include the observer).
        rat: The rat entity (raw entity, not an Agent wrapper).
    Returns:
        A list of phrases such as "a rat to the east"; the rat comes first,
        then the other agents in *all_agents* order. Empty if nothing is seen.
    """
    ox, oy = observer.pos
    visible = []
    def note(label, x, y):
        # Record one sighting if the tile is inside the current FOV.
        if not grid.is_in_fov(x, y):
            return
        if (x, y) == (ox, oy):
            # Sharing the observer's tile has no meaningful compass direction
            visible.append(f"{label} at your position")
        else:
            visible.append(f"{label} to the {get_direction(ox, oy, x, y)}")
    # Check rat visibility
    note("a rat", int(rat.pos[0]), int(rat.pos[1]))
    # Check other agents
    for agent in all_agents:
        if agent.name == observer.name:
            continue
        ax, ay = agent.pos
        note(agent.description, ax, ay)
    return visible
 def get_direction(from_x, from_y, to_x, to_y):
    """Return the compass direction from one grid point to another.
    The dominant axis wins ("east", "west", "south", "north"); exactly diagonal
    offsets give a compound such as "southeast". Positive y counts as south.
    Returns:
        The direction name, or "here" when both points are the same tile.
        Callers that embed the result in "to the ..." should handle that case.
    """
    dx = to_x - from_x
    dy = to_y - from_y
    if dx == 0 and dy == 0:
        # No offset at all: previously fell through to the diagonal branch
        # and was misreported as "northwest".
        return "here"
    ew = "east" if dx > 0 else "west"
    ns = "south" if dy > 0 else "north"
    if abs(dx) > abs(dy):
        return ew
    if abs(dy) > abs(dx):
        return ns
    # Exactly diagonal: combine both axes
    return f"{ns}{ew}"
 def build_grounded_prompt(visible_entities):
    """Turn a list of sighting phrases into one "You see ..." sentence.
    An empty list yields "The area appears clear."; several phrases are joined
    with commas and a final "and".
    """
    if not visible_entities:
        return "The area appears clear."
    *leading, last = visible_entities
    if not leading:
        return f"You see {last}."
    joined = ", ".join(leading)
    return f"You see {joined} and {last}."
 def query_agent(agent, screenshot_path, grounded_text):
    """Ask the vision model what *agent* observes and which action it picks.
    Args:
        agent: Agent whose ``description`` is woven into both prompts.
        screenshot_path: Image file attached to the user message.
        grounded_text: Observation sentence appended to the user prompt.
    Returns:
        The model's reply text, 'No response' when the reply carries no
        content, or a "VLLM Error: ..." string when the request failed.
    """
    system_prompt = f"""You are {agent.description} in a roguelike dungeon game. You can see the game world through screenshots.
 The view shows a top-down grid-based dungeon. Your character is centered in the view.
 The dark areas are outside your field of vision. Other figures may be allies, enemies, or NPCs.
 Describe what you observe concisely and decide on an action."""
    user_prompt = f"""Look at this game screenshot from your perspective as {agent.description}. {grounded_text}
 Describe what you see briefly, then choose an action:
 - GO NORTH / SOUTH / EAST / WEST
 - WAIT
 - LOOK
 State your reasoning in 1-2 sentences, then declare: "Action: <YOUR_ACTION>" """
    messages = [
        {"role": "system", "content": system_prompt},
        message_with_image(user_prompt, screenshot_path)
    ]
    resp = llm_chat_completion(messages)
    if "error" in resp:
        return f"VLLM Error: {resp['error']}"
    # `or [{}]` also covers an empty "choices" list, which would otherwise
    # raise IndexError on [0].
    first_choice = (resp.get('choices') or [{}])[0]
    return first_choice.get('message', {}).get('content', 'No response')
 def run_demo():
    """Cycle through every agent: switch perspective, screenshot, query the model.
    For each agent the grid is switched to that agent's view, one frame is
    stepped, a screenshot is written under SCREENSHOT_DIR, and the model is
    queried with the screenshot plus grounded observations. An agent whose
    screenshot fails is skipped.
    Returns:
        True when a screenshot was saved for every agent, False otherwise.
    """
    print("=" * 70)
    print("Multi-Agent VLLM Demo")
    print("=" * 70)
    print()
    # Create screenshot directory
    os.makedirs(SCREENSHOT_DIR, exist_ok=True)
    # Setup scene
    grid, fov_layer, agents, rat = setup_scene()
    # Only screenshots that were actually written are reported in the summary
    saved_screenshots = []
    # Cycle through each agent's perspective
    for i, agent in enumerate(agents):
        print(f"\n{'='*70}")
        print(f"Agent {i+1}/{len(agents)}: {agent.name} ({agent.description})")
        print(f"Position: {agent.pos}")
        print("=" * 70)
        # Switch to this agent's perspective
        switch_perspective(grid, fov_layer, agent)
        # Advance simulation
        mcrfpy.step(0.016)
        # Take screenshot
        screenshot_path = os.path.join(SCREENSHOT_DIR, f"{i}_{agent.name.lower()}_view.png")
        result = automation.screenshot(screenshot_path)
        if not result:
            print(f"ERROR: Failed to take screenshot for {agent.name}")
            continue
        saved_screenshots.append(screenshot_path)
        file_size = os.path.getsize(screenshot_path)
        print(f"Screenshot: {screenshot_path} ({file_size} bytes)")
        # Get visible entities for this agent
        visible = get_visible_entities(grid, agent, agents, rat)
        grounded_text = build_grounded_prompt(visible)
        print(f"Grounded observations: {grounded_text}")
        # Query VLLM
        print(f"\nQuerying VLLM for {agent.name}...")
        print("-" * 50)
        response = query_agent(agent, screenshot_path, grounded_text)
        print(f"\n{agent.name}'s Response:\n{response}")
        print()
    print("\n" + "=" * 70)
    print("Multi-Agent Demo Complete")
    print("=" * 70)
    print(f"\nScreenshots saved to: {SCREENSHOT_DIR}/")
    for saved_path in saved_screenshots:
        print(f"  - {os.path.basename(saved_path)}")
    # A missing screenshot is a failure, matching the single-agent demo
    return len(saved_screenshots) == len(agents)
 # Main execution
 if __name__ == "__main__":
    # Exit status mirrors the demo result: 0 on PASS, 1 on FAIL or any error.
    try:
        passed = run_demo()
        print("\nPASS" if passed else "\nFAIL")
        sys.exit(0 if passed else 1)
    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit() above
        # passes straight through this handler.
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)