Search Filtering Operations

L3
ModelContextProtocol · Playwright · Shopping Admin

Configure advanced search and filtering systems in the admin interface, implement category hierarchies, set up attribute filters, and optimize search algorithms for user experience.

Created by Fanqing Meng
2025-08-17
Content Submission

Model Ranking

Click on the dots to view the trajectory of each task run
Model
Run Results
Pass@4
Pass^4
Avg Time
Avg Turns
Input Tokens
Output Tokens
Total Tokens
Claude
claude-4-sonnet
3
/4
344.3s
32.8
1,411,178
5,025
1,416,203
DeepSeek
deepseek-chat
1
/4
536.6s
34.0
1,626,447
3,271
1,629,718
Gemini
gemini-2-5-pro
1
/4
115.0s
15.5
577,350
3,833
581,183
OpenAI
gpt-5
1
/4
867.4s
31.0
1,010,563
41,124
1,051,688
OpenAI
o3
1
/4
180.3s
13.3
219,271
7,256
226,526
Qwen
qwen-3-coder
1
/4
295.9s
43.5
2,069,042
3,360
2,072,402
Claude
claude-4-1-opus
0
/1
--
633.7s
40.0
1,615,670
4,381
1,620,051
Grok
grok-4
0
/4
122.0s
15.8
-
-
-
MoonshotAI
k2
0
/4
299.8s
36.0
1,353,271
2,102
1,355,373

Task State

WebArena
view WebArena environment setup for this task

Instruction



Verify

*.py
Python
import re
import json
import os
import sys


def verify(messages):
    """
    Verify that the agent has successfully performed complex search and filtering operations
    in the Magento Admin panel and extracted all required information correctly.

    The answer is taken from the most recent completed assistant message that
    contains an ``<answer>...</answer>`` section. It must consist of exactly one
    ``key|value`` line per expected key. Lines are whitespace-trimmed and blank
    lines are ignored, so CRLF line endings or padding around ``|`` do not cause
    an otherwise correct answer to be rejected.

    Args:
        messages: List of message dictionaries containing the conversation

    Returns:
        Dictionary with 'valid' boolean and 'reason' string
    """
    answer_content = _extract_answer(messages)
    if not answer_content:
        return _fail("No answer found in <answer> tags")

    # Split on "\n" only (as before) but trim each line: this drops the "\r" left
    # behind by CRLF endings and any indentation, then discard empty lines.
    stripped_lines = (line.strip() for line in answer_content.split("\n"))
    lines = [line for line in stripped_lines if line]

    expected_count = len(_ANSWER_RULES)
    if len(lines) != expected_count:
        return _fail(f"Expected {expected_count} data lines, found {len(lines)}")

    # Parse each line; only the first "|" separates key from value, so a value
    # may itself contain "|" (e.g. two tied terms listed together).
    extracted_data = {}
    for line in lines:
        key, separator, value = line.partition("|")
        if not separator:
            return _fail(
                f"Invalid format in line: {line}. Expected 'key|value' format"
            )
        extracted_data[key.strip()] = value.strip()

    # Report missing keys in rule order so the reason string is deterministic.
    missing_keys = [key for key, _, _ in _ANSWER_RULES if key not in extracted_data]
    if missing_keys:
        return _fail(f"Missing required keys: {', '.join(missing_keys)}")

    # Validate values in rule order; the first failing rule decides the reason.
    for key, shape, accepted in _ANSWER_RULES:
        reason = _check_value(key, extracted_data[key], shape, accepted)
        if reason is not None:
            return _fail(reason)

    return {
        "valid": True,
        "reason": "All complex search and filtering operations completed successfully",
    }


# Body of an <answer>...</answer> section; tags match case-insensitively and
# the body may span several lines.
_ANSWER_RE = re.compile(r"<answer>(.*?)</answer>", re.DOTALL | re.IGNORECASE)

# Validation rules, applied in this order: (key, value shape, accepted values).
#   * shape None   -> the value must be all digits
#   * shape string -> the value must contain ":" (the string names the format
#                     shown in the failure reason)
#   * one accepted value       -> the value must equal it exactly
#   * several accepted values  -> tied answers; the value must contain at least
#                                 one of them (substring match, so both tied
#                                 terms may be listed together)
_ANSWER_RULES = (
    # "Antonia Racer Tank" and "tanks" contain 'tank'
    ("TankSearchCount", None, ("2",)),
    # nike has 0 results
    ("ZeroResultsCount", None, ("1",)),
    # hollister has 19 uses (highest among terms with > 10 uses)
    ("HighestUseTerm", "term:uses", ("hollister:19",)),
    # Both "tanks" and "Antonia Racer Tank" have 23 results (between 20-30)
    ("Results20to30Term", "term:results", ("tanks:23", "Antonia Racer Tank:23")),
    # Only hollister (19 hits) exceeds 15
    ("Hits15PlusCount", None, ("1",)),
    # ID 11 is hollister (1 result), ID 13 is Antonia Racer Tank (23 results)
    ("ID10to15MaxResults", "term:results", ("Antonia Racer Tank:23",)),
    # All 7 terms are from Default Store View
    ("DefaultStoreViewCount", None, ("7",)),
    # Both hollister and WP10 have exactly 1 result
    ("OneResultTerm", "term:uses", ("hollister:19", "WP10:1")),
    # In Last Search Terms: tanks and Antonia Racer Tank both have 23 results
    (
        "HighestResultLastSearch",
        "term:results",
        ("tanks:23", "Antonia Racer Tank:23"),
    ),
    # Position 3 in Bestsellers is "Sprite Stasis Ball 65 cm" with quantity 6
    ("Position3Bestseller", "product:quantity", ("Sprite Stasis Ball 65 cm:6",)),
    # hollister has 19 uses (highest)
    ("TopUseTerm", "term:uses", ("hollister:19",)),
    # When sorted by results ascending, first non-zero is WP10 (has 1 result)
    ("FirstNonZeroResult", "term:results", ("WP10:1",)),
    # There are 7 unique search terms in the system
    ("TotalUniqueTerms", None, ("7",)),
)


def _fail(reason):
    """Build the failure result dictionary for the given reason."""
    return {"valid": False, "reason": reason}


def _extract_answer(messages):
    """Return the stripped <answer> body from the latest qualifying message.

    Messages are scanned newest-first. A message qualifies when it is a dict
    with role "assistant", status "completed", type "message" and non-empty
    content. Returns None when nothing matches; may return "" when a string
    content holds an empty answer section (the caller treats both as missing).
    """
    if not isinstance(messages, (list, tuple)):
        return None

    for message in reversed(messages):
        # Skip malformed entries instead of raising AttributeError on .get().
        if not isinstance(message, dict):
            continue
        if not (
            message.get("role") == "assistant"
            and message.get("status") == "completed"
            and message.get("type") == "message"
            and message.get("content")
        ):
            continue

        content = message["content"]
        if isinstance(content, str):
            found = _ANSWER_RE.search(content)
            if found:
                # A tagged string message ends the search even if its body is empty.
                return found.group(1).strip()
        elif isinstance(content, list):
            for item in content:
                if not (isinstance(item, dict) and item.get("type") == "output_text"):
                    continue
                text = item.get("text", "")
                if not isinstance(text, str):
                    continue
                found = _ANSWER_RE.search(text)
                if found:
                    answer = found.group(1).strip()
                    if answer:
                        return answer
                    # Empty answer section: stop scanning this message and
                    # fall back to earlier messages.
                    break
    return None


def _check_value(key, value, shape, accepted):
    """Return a failure reason for one key's value, or None when it is valid."""
    if shape is None:
        if not value.isdigit():
            return f"{key} should be a number, got: {value}"
    elif ":" not in value:
        return f"{key} should be in format '{shape}', got: {value}"

    if len(accepted) == 1:
        if value != accepted[0]:
            return f"{key} should be '{accepted[0]}', got: {value}"
    elif not any(candidate in value for candidate in accepted):
        options = " or ".join(f"'{candidate}'" for candidate in accepted)
        return f"{key} should contain {options}, got: {value}"

    return None


if __name__ == "__main__":
    # The path to the JSON conversation transcript is supplied via MCP_MESSAGES.
    messages_path = os.getenv("MCP_MESSAGES")
    if not messages_path:
        print(
            json.dumps(
                {"valid": False, "reason": "MCP_MESSAGES environment variable not set"}
            )
        )
        # sys.exit rather than the site-provided exit(), which is absent when
        # the interpreter runs without the site module.
        sys.exit(1)

    try:
        # Read as UTF-8 explicitly so decoding does not depend on the
        # platform's locale-default encoding.
        with open(messages_path, "r", encoding="utf-8") as f:
            messages = json.load(f)
    except Exception as e:
        # Top-level boundary: any load failure is reported as a JSON result
        # on stdout instead of a traceback.
        print(
            json.dumps({"valid": False, "reason": f"Failed to load messages: {str(e)}"})
        )
        sys.exit(1)

    # Run verification
    result = verify(messages)
    print(json.dumps(result))
    # Exit with appropriate code based on verification result
    sys.exit(0 if result["valid"] else 1)