DBA Vector Analysis

L3
Model Context Protocol · Postgres · Vectors

Analyze pgvector database storage, identify vector columns, assess space utilization and performance for RAG applications.

Created by Fanshi Zhang
2025-08-18
Performance Optimization · Audit And Compliance · Statistical Aggregation

Model Ranking

Model                     Run Results  Avg Time  Avg Turns  Input Tokens  Output Tokens  Total Tokens
OpenAI gpt-5              4/4          753.8s    26.0       607,676       46,510         654,186
MoonshotAI k2             4/4          216.2s    31.3       227,560       4,688          232,248
Qwen qwen-3-coder         4/4          181.5s    34.5       360,679       4,086          364,765
Grok grok-4               3/4          239.3s    34.3       -             -              -
OpenAI o3                 3/4          104.8s    25.0       117,575       5,932          123,507
Claude claude-4-sonnet    2/4          227.3s    18.8       259,394       5,080          264,474
Gemini gemini-2-5-pro     1/4          32.1s     2.5        4,981         2,439          7,420
Claude claude-4-1-opus    0/1          352.9s    27.0       441,117       8,585          449,702
DeepSeek deepseek-chat    0/4          12.9s     1.0        2,808         58             2,866

(Pass@4 and Pass^4 appear as per-run dot plots in the interactive view.)

Task State

Table "documents" { "id" int4 [pk, not null, increment] "title" text [not null] "content" text [not null] "source_url" text "document_type" varchar(50) [default: 'article'] "created_at" timestamp [default: `CURRENT_TIMESTAMP`] "updated_at" timestamp [default: `CURRENT_TIMESTAMP`] "word_count" int4 "embedding" public.vector Indexes { created_at [type: btree, name: "documents_created_idx"] embedding [type: hnsw, name: "documents_embedding_idx"] title [type: btree, name: "documents_title_idx"] document_type [type: btree, name: "documents_type_idx"] } } Table "document_chunks" { "id" int4 [pk, not null, increment] "document_id" int4 "chunk_index" int4 [not null] "chunk_text" text [not null] "chunk_size" int4 "overlap_size" int4 [default: 0] "created_at" timestamp [default: `CURRENT_TIMESTAMP`] "embedding" public.vector Indexes { document_id [type: btree, name: "chunks_doc_id_idx"] embedding [type: hnsw, name: "chunks_embedding_idx"] chunk_index [type: btree, name: "chunks_index_idx"] } } Table "user_queries" { "id" int4 [pk, not null, increment] "query_text" text [not null] "user_id" varchar(100) "session_id" varchar(100) "created_at" timestamp [default: `CURRENT_TIMESTAMP`] "response_time_ms" int4 "embedding" public.vector Indexes { created_at [type: btree, name: "queries_created_idx"] embedding [type: hnsw, name: "queries_embedding_idx"] user_id [type: btree, name: "queries_user_idx"] } } Table "embedding_models" { "id" int4 [pk, not null, increment] "model_name" varchar(100) [unique, not null] "provider" varchar(50) [not null] "dimensions" int4 [not null] "max_tokens" int4 "cost_per_token" numeric(10,8) "created_at" timestamp [default: `CURRENT_TIMESTAMP`] "is_active" bool [default: true] } Table "knowledge_base" { "id" int4 [pk, not null, increment] "kb_name" varchar(100) [not null] "description" text "domain" varchar(50) "language" varchar(10) [default: 'en'] "total_documents" int4 [default: 0] "total_chunks" int4 [default: 0] "total_storage_mb" numeric(10,2) "created_at" timestamp [default: `CURRENT_TIMESTAMP`] "updated_at" timestamp [default: `CURRENT_TIMESTAMP`] } Table "search_cache" { "id" int4 [pk, not null, increment] "query_hash" varchar(64) [not null] "query_text" text [not null] "results_json" jsonb "result_count" int4 "search_time_ms" int4 "similarity_threshold" numeric(4,3) "created_at" timestamp [default: `CURRENT_TIMESTAMP`] "expires_at" timestamp Indexes { expires_at [type: btree, name: "cache_expires_idx"] query_hash [type: btree, name: "cache_hash_idx"] } } Ref "document_chunks_document_id_fkey":"documents"."id" < "document_chunks"."document_id" [delete: cascade]

Instruction



Verify

*.py
Python
"""
Verification script for Vector Database DBA Analysis task.

This script verifies that the candidate has properly analyzed the vector database
and stored their findings in appropriate result tables.
"""

import os
import sys
from typing import Any, Dict

import psycopg2


def get_connection_params():
    """Get database connection parameters from environment variables."""
    return {
        "host": os.getenv("POSTGRES_HOST", "localhost"),
        "port": int(os.getenv("POSTGRES_PORT", 5432)),
        "database": os.getenv("POSTGRES_DATABASE"),
        "user": os.getenv("POSTGRES_USERNAME"),
        "password": os.getenv("POSTGRES_PASSWORD"),
    }


def verify_vector_analysis_columns(conn) -> Dict[str, Any]:
    """Verify the vector_analysis_columns table exists, has correct columns, and contains actual vector columns from the database."""
    results = {'passed': False, 'issues': []}
    expected_columns = [
        'schema', 'table_name', 'column_name', 'dimensions', 'data_type', 'has_constraints', 'rows'
    ]
    try:
        with conn.cursor() as cur:
            # Check if table exists
            cur.execute("""
                SELECT EXISTS (
                    SELECT FROM information_schema.tables
                    WHERE table_name = 'vector_analysis_columns'
                );
            """)
            if not cur.fetchone()[0]:
                results['issues'].append("vector_analysis_columns table not found")
                return results

            # Check columns
            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_name = 'vector_analysis_columns'
                ORDER BY column_name;
            """)
            actual_columns = {row[0] for row in cur.fetchall()}
            missing = set(expected_columns) - actual_columns
            extra = actual_columns - set(expected_columns)
            if missing:
                results['issues'].append(f"Missing columns: {missing}")
            if extra:
                results['issues'].append(f"Unexpected columns: {extra}")

            # Check for data
            cur.execute("SELECT COUNT(*) FROM vector_analysis_columns;")
            count = cur.fetchone()[0]
            if count == 0:
                results['issues'].append("No rows found in vector_analysis_columns")
                return results

            # Get actual vector columns from the database
            cur.execute("""
                SELECT table_name, column_name
                FROM information_schema.columns
                WHERE data_type = 'USER-DEFINED'
                AND udt_name = 'vector'
                ORDER BY table_name, column_name;
            """)
            actual_vector_columns = set(cur.fetchall())

            # Get what the agent found
            cur.execute("""
                SELECT table_name, column_name
                FROM vector_analysis_columns
                ORDER BY table_name, column_name;
            """)
            found_vector_columns = set(cur.fetchall())

            # Check if agent found the actual vector columns
            missing_vectors = actual_vector_columns - found_vector_columns
            extra_vectors = found_vector_columns - actual_vector_columns

            if missing_vectors:
                results['issues'].append(f"Agent missed vector columns: {missing_vectors}")
            if extra_vectors:
                results['issues'].append(f"Agent reported non-existent vector columns: {extra_vectors}")

            if not missing and not extra and count > 0 and not missing_vectors and not extra_vectors:
                results['passed'] = True

    except psycopg2.Error as e:
        results['issues'].append(f"Database error: {e}")
    except Exception as e:
        results['issues'].append(f"Verification error: {e}")
    return results


def verify_vector_analysis_storage_consumption(conn) -> Dict[str, Any]:
    """Verify the vector_analysis_storage_consumption table exists, has correct columns, and analyzes actual vector tables."""
    results = {'passed': False, 'issues': []}
    expected_columns = [
        'schema', 'table_name', 'total_size_bytes', 'vector_data_bytes', 'regular_data_bytes', 'vector_storage_pct', 'row_count'
    ]
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT EXISTS (
                    SELECT FROM information_schema.tables
                    WHERE table_name = 'vector_analysis_storage_consumption'
                );
            """)
            if not cur.fetchone()[0]:
                results['issues'].append("vector_analysis_storage_consumption table not found")
                return results

            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_name = 'vector_analysis_storage_consumption'
                ORDER BY column_name;
            """)
            actual_columns = {row[0] for row in cur.fetchall()}
            missing = set(expected_columns) - actual_columns
            extra = actual_columns - set(expected_columns)
            if missing:
                results['issues'].append(f"Missing columns: {missing}")
            if extra:
                results['issues'].append(f"Unexpected columns: {extra}")

            cur.execute("SELECT COUNT(*) FROM vector_analysis_storage_consumption;")
            count = cur.fetchone()[0]
            if count == 0:
                results['issues'].append("No rows found in vector_analysis_storage_consumption")
                return results

            # Get actual tables with vector columns
            cur.execute("""
                SELECT DISTINCT table_name
                FROM information_schema.columns
                WHERE data_type = 'USER-DEFINED'
                AND udt_name = 'vector'
                ORDER BY table_name;
            """)
            actual_vector_tables = {row[0] for row in cur.fetchall()}

            # Get what the agent analyzed
            cur.execute("""
                SELECT DISTINCT table_name
                FROM vector_analysis_storage_consumption
                ORDER BY table_name;
            """)
            analyzed_tables = {row[0] for row in cur.fetchall()}

            # Check if agent analyzed the actual vector tables
            missing_tables = actual_vector_tables - analyzed_tables
            if missing_tables:
                results['issues'].append(f"Agent missed analyzing vector tables: {missing_tables}")

            # Check that analyzed tables actually have vector columns
            extra_tables = analyzed_tables - actual_vector_tables
            if extra_tables:
                results['issues'].append(f"Agent analyzed non-vector tables: {extra_tables}")

            if not missing and not extra and count > 0 and not missing_tables and not extra_tables:
                results['passed'] = True

    except psycopg2.Error as e:
        results['issues'].append(f"Database error: {e}")
    except Exception as e:
        results['issues'].append(f"Verification error: {e}")
    return results


def verify_vector_analysis_indices(conn) -> Dict[str, Any]:
    """Verify the vector_analysis_indices table exists, has correct columns, and identifies actual vector indexes."""
    results = {'passed': False, 'issues': []}
    expected_columns = [
        'schema', 'table_name', 'column_name', 'index_name', 'index_type', 'index_size_bytes'
    ]
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT EXISTS (
                    SELECT FROM information_schema.tables
                    WHERE table_name = 'vector_analysis_indices'
                );
            """)
            if not cur.fetchone()[0]:
                results['issues'].append("vector_analysis_indices table not found")
                return results

            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_name = 'vector_analysis_indices'
                ORDER BY column_name;
            """)
            actual_columns = {row[0] for row in cur.fetchall()}
            missing = set(expected_columns) - actual_columns
            extra = actual_columns - set(expected_columns)
            if missing:
                results['issues'].append(f"Missing columns: {missing}")
            if extra:
                results['issues'].append(f"Unexpected columns: {extra}")

            cur.execute("SELECT COUNT(*) FROM vector_analysis_indices;")
            count = cur.fetchone()[0]
            if count == 0:
                results['issues'].append("No rows found in vector_analysis_indices")
                return results

            # Get actual vector indexes from the database (exclude ground truth table indexes)
            cur.execute("""
                SELECT schemaname, tablename, indexname
                FROM pg_indexes
                WHERE (indexdef ILIKE '%hnsw%' OR indexdef ILIKE '%ivfflat%')
                AND tablename NOT LIKE '%analysis%'
                ORDER BY tablename, indexname;
            """)
            actual_vector_indexes = set(cur.fetchall())

            # Get what the agent found
            cur.execute("""
                SELECT schema, table_name, index_name
                FROM vector_analysis_indices
                ORDER BY table_name, index_name;
            """)
            found_indexes = set(cur.fetchall())

            # Check if agent found the actual vector indexes
            missing_indexes = actual_vector_indexes - found_indexes
            if missing_indexes:
                results['issues'].append(f"Agent missed vector indexes: {missing_indexes}")

            # Allow agent to find more indexes than just vector ones (they might include related indexes)
            # but at least they should find the vector-specific ones

            if not missing and not extra and count > 0 and not missing_indexes:
                results['passed'] = True

    except psycopg2.Error as e:
        results['issues'].append(f"Database error: {e}")
    except Exception as e:
        results['issues'].append(f"Verification error: {e}")
    return results


def verify_no_extra_analysis_tables(conn) -> Dict[str, Any]:
    """Check that only the required analysis tables exist (no legacy/extra analysis tables)."""
    results = {'passed': True, 'issues': []}  # Start with passed=True, more lenient
    required = {
        'vector_analysis_columns',
        'vector_analysis_storage_consumption',
        'vector_analysis_indices',
    }
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT table_name FROM information_schema.tables
                WHERE table_schema = 'public'
                AND table_name LIKE 'vector_analysis_%';
            """)
            analysis_tables = {row[0] for row in cur.fetchall()}

            # Only flag as issue if there are analysis tables that don't match our required set
            # Exclude ground truth tables from this check
            analysis_tables_filtered = {t for t in analysis_tables if not t.startswith('expected_') and not t.startswith('vector_analysis_results')}
            extra = analysis_tables_filtered - required
            if extra:
                results['issues'].append(f"Found unexpected analysis tables: {extra}")
                results['passed'] = False

    except Exception as e:
        results['issues'].append(f"Verification error: {e}")
        results['passed'] = False
    return results



def main():
    """Main verification function for vector analysis deliverables."""

    conn_params = get_connection_params()
    if not conn_params["database"]:
        print("No database specified")
        sys.exit(1)
    try:
        conn = psycopg2.connect(**conn_params)
        checks = [
            ("vector_analysis_columns", verify_vector_analysis_columns),
            ("vector_analysis_storage_consumption", verify_vector_analysis_storage_consumption),
            ("vector_analysis_indices", verify_vector_analysis_indices),
            ("no_extra_analysis_tables", verify_no_extra_analysis_tables),
        ]
        passed_checks = 0
        for desc, check_func in checks:
            result = check_func(conn)
            if result['passed']:
                print(f"{desc}: PASSED")
                passed_checks += 1
            else:
                print(f"{desc}: FAILED")
                for issue in result['issues']:
                    print(f"    - {issue}")
            print()
        conn.close()
        total_checks = len(checks)
        print(f"Results: {passed_checks}/{total_checks} checks passed")
        # Passing at least 75% of the checks counts as overall success.
        sys.exit(0 if passed_checks >= total_checks * 0.75 else 1)
    except psycopg2.Error as e:
        print(f"Database connection error: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Verification error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
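
The checks above compare column names only, so the concrete types of the deliverable tables are left to the analyst. A minimal SQL sketch of the three result tables plus two hypothetical population queries follows; the column types, the hard-coded 'embedding' column, and the per-table INSERT are assumptions tied to this particular schema:

-- Deliverable tables mirroring the verifier's expected_columns lists.
-- Types are assumptions; only the column names are verified.
-- "schema" and "rows" are quoted because they collide with keywords.
CREATE TABLE vector_analysis_columns (
    "schema"        text,
    table_name      text,
    column_name     text,
    dimensions      int,
    data_type       text,
    has_constraints boolean,
    "rows"          bigint
);

CREATE TABLE vector_analysis_storage_consumption (
    "schema"           text,
    table_name         text,
    total_size_bytes   bigint,
    vector_data_bytes  bigint,
    regular_data_bytes bigint,
    vector_storage_pct numeric(5,2),
    row_count          bigint
);

CREATE TABLE vector_analysis_indices (
    "schema"         text,
    table_name       text,
    column_name      text,
    index_name       text,
    index_type       text,
    index_size_bytes bigint
);

-- Vector indexes are recognized the same way the verifier recognizes
-- them: by matching hnsw/ivfflat in the index definition.
INSERT INTO vector_analysis_indices
SELECT schemaname,
       tablename,
       'embedding',  -- the indexed column in this schema (assumption)
       indexname,
       CASE WHEN indexdef ILIKE '%hnsw%' THEN 'hnsw' ELSE 'ivfflat' END,
       pg_relation_size(format('%I.%I', schemaname, indexname)::regclass)
FROM pg_indexes
WHERE indexdef ILIKE '%hnsw%' OR indexdef ILIKE '%ivfflat%';

-- Storage breakdown for one table: per-row vector bytes come from
-- pg_column_size, the table total from pg_total_relation_size.
INSERT INTO vector_analysis_storage_consumption
SELECT 'public',
       'documents',
       pg_total_relation_size('public.documents'),
       COALESCE(SUM(pg_column_size(embedding)), 0),
       pg_total_relation_size('public.documents')
           - COALESCE(SUM(pg_column_size(embedding)), 0),
       ROUND(100.0 * COALESCE(SUM(pg_column_size(embedding)), 0)
             / NULLIF(pg_total_relation_size('public.documents'), 0), 2),
       COUNT(*)
FROM documents;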