ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (25489B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Investigating the Vulnerability of LLM-as-a-Judge Architectures to Prompt-Injection Attacks",
      6     "authors": [
      7       "Narek Maloyan",
      8       "Bislan Ashinov",
      9       "Dmitry Namiot"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2505.13348",
     14     "doi": "10.48550/arXiv.2505.13348"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims CUA achieves >30% ASR and JMA shows notable effectiveness. Table I reports 31.2-32.4% for CUA and 15.2-16.7% for JMA, directly supporting abstract claims.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Paper claims adversarial suffixes cause verdict flips and tests this with controlled experiments comparing attacked vs baseline conditions with multiple control conditions (random-suffix, token-shuffle).",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Abstract and conclusion claim vulnerabilities in 'current LLM-as-a-Judge systems' broadly, but experiments only test 2 open-source 3B models. No testing of the actual large models (GPT-4, Claude) that are used as judges in practice.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Paper mechanistically explains why attacks work (token ordering, direct decision optimization) but does not discuss alternative hypotheses for why judges are vulnerable (training, architecture, attention patterns).",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "ASR (Attack Success Rate = verdict flip percentage) is the measured outcome; claims are about 'vulnerability' and 'susceptibility' which are reasonable interpretations of this metric.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations section. Conclusion mentions one limitation: 'This work did not explore the impact of permuting the order of the attacked and genuinely superior answers,' but lacks comprehensive threats-to-validity discussion.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Limitations mentioned are vague (permutation order). Missing discussions of: generalization to large closed-source models, sample size justification, whether findings hold with defenses applied, dataset representativeness.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Paper does not explicitly state what findings do NOT show. Implicit scope limitations (small models, no defense evaluation, MT-Bench only) are never articulated as boundaries.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding statement or acknowledgments section provided in the paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "Email (maloyan.narek@gmail.com) is provided but no institutional affiliations are stated in the paper.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding source disclosed, so cannot assess independence.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) provided.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined: LLM-as-a-Judge, prompt injection, Comparative Undermining Attack, Justification Manipulation Attack, Attack Success Rate, adversarial suffix.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Introduction clearly states the work investigates vulnerabilities of LLM-as-a-Judge systems to prompt-injection attacks and develops optimization-based attack methods.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section II engages with prior work on LLM security, LLM-as-a-Judge paradigm, and prior attacks (JudgeDeceiver, GCG), positioning this work as extending GCG methods to judge architectures.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "Paper does not mention releasing code, attack generation scripts, or evaluation code.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "MT-Bench dataset is public, but the paper does not release adversarial suffixes generated, attack results, or judge model outputs.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No specification of Python version, PyTorch version, CUDA version, hardware, or other computational environment details.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "GCG algorithm described mathematically but without implementable details (exact suffix length L, number of iterations, candidate set size, evaluation function specifics).",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Table I reports single ASR percentages per method per model with no confidence intervals, standard deviations, or error bars.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests, p-values, or hypothesis tests comparing methods (e.g., CUA vs JMA).",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "ASR percentages (e.g., 31.2% CUA vs 5.1% Hard Prompt) represent effect sizes comparing attack methods.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Paper uses MT-Bench but exact number of examples tested per method is not reported. No justification for sample size or power analysis provided.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Single ASR value per method per model in Table I. No variance across runs, no standard deviation, no evidence of multiple evaluation passes.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Paper includes Hard Prompt Attack baseline and two control conditions (Random-Suffix Control, Token-Shuffle Control).",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Hard Prompt Attack is a reasonable simple baseline; JudgeDeceiver [41] (2024) is cited and compared as contemporary prior work.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": false,
    192           "justification": "Paper compares different attack methods (CUA, JMA, Hard Prompt) and controls but does not ablate components within a single method.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "Only Attack Success Rate (ASR) is used as an evaluation metric. No measurement of attack transferability, robustness, or other dimensions.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "Paper uses MT-Bench which contains human-judged ground truth, but does not conduct human evaluation of attack success or realism.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "Paper uses MT-Bench but does not specify train/validation/test split. Unclear if held-out test set was used or if all MT-Bench data was used for evaluation.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": false,
    216           "justification": "Results in Table I show per-model breakdown but not per-question-type, per-answer-quality, or per-difficulty-level breakdowns.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "Paper does not discuss when attacks failed, what types of examples resisted attacks, or failure case analysis.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Random-Suffix Control (1.2-1.5% ASR) and Token-Shuffle Control (2.8-3.1% ASR) demonstrate that simple perturbations are ineffective.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Model versions explicitly stated: 'Qwen2.5-3B-Instruct' and 'Falcon3-3B-Instruct' with references to technical reports [52, 53].",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Paper describes pairwise comparison task but does not provide the actual prompt template used to query the judge models.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "GCG algorithm is described at high level but key hyperparameters are not specified numerically: suffix length L is not stated as a concrete number, number of GCG iterations not specified.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Pairwise comparison scaffolding is described: judge model receives (query x, answer a, answer b) and outputs preference. Adversarial suffix δ is appended to b.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "Paper states MT-Bench examples are 'formed into triplets (x, a, b)' but does not document preprocessing steps, filtering criteria, or data cleaning.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "MT-Bench is publicly available from LMSYS, but attack-generated outputs, adversarial suffixes, and evaluation results are not released.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "MT-Bench collection is referenced [35] but not described in this paper. This paper's selection criteria for which MT-Bench examples to attack is not documented.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "N/A — MT-Bench is a standard public dataset not newly collected by these authors.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "High-level pipeline is described (MT-Bench → GCG optimization → judge evaluation) but detailed pipeline from raw data to final results is not fully documented.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Models tested are Qwen2.5 and Falcon3 but training data cutoff dates are not stated in the paper.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "MT-Bench examples could potentially be in the training data of the judge models, but paper does not discuss this potential overlap.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "MT-Bench was created in 2023 [35]; judge models (Qwen2.5, Falcon3) are from 2024-2025, so the benchmark predates their training and contamination is possible, but paper does not explicitly address this.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "N/A — no human participants in the study.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "N/A — no human participants in the study.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "N/A — no human participants in the study.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "N/A — no human participants in the study.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "N/A — no human participants in the study.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "N/A — no human participants in the study.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "N/A — no human participants in the study.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference cost, latency, or computational time reported. Paper uses 3B models but no details on inference speed or cost per attack.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total computational budget, GPU hours, training time, or inference time budgets are reported.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Comparative Undermining Attack (CUA) achieves Attack Success Rate exceeding 30%",
    373       "evidence": "Table I reports CUA ASR of 31.2% on Qwen2.5-3B and 32.4% on Falcon3-3B",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Justification Manipulation Attack (JMA) shows notable effectiveness around 15-17%",
    378       "evidence": "Table I reports JMA ASR of 15.2% on Qwen2.5-3B and 16.7% on Falcon3-3B",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Token ordering in adversarial suffixes is more important than token presence",
    383       "evidence": "Token-Shuffle Control achieved only 2.8-3.1% ASR vs 31.2-32.4% for the optimized CUA suffix (Random-Suffix Control: 1.2-1.5%), showing structure matters",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Simple heuristic prompt injections have limited effectiveness (5% ASR) compared to optimization-based attacks",
    388       "evidence": "Hard Prompt Attack baseline achieved ~5% ASR vs CUA 31%+ and JMA 15%+",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Current LLM-as-a-Judge systems are significantly vulnerable to prompt injection attacks",
    393       "evidence": "30%+ success rates on tested models, but only demonstrated on 2 small open-source models, not on large commercial judges actually deployed",
    394       "supported": "weak"
    395     },
    396     {
    397       "claim": "Direct decision-token optimization (CUA) is more effective than reasoning manipulation (JMA)",
    398       "evidence": "CUA achieved 31-32% vs JMA 15-17% ASR; paper attributes this to more direct optimization objective",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "empirical"
    405   ],
    406   "key_findings": "LLM-as-a-Judge systems are vulnerable to prompt injection attacks via adversarial suffix optimization. Comparative Undermining Attack achieves 30%+ success rate in flipping judge verdicts on two open-source 3B models (Qwen2.5-3B-Instruct, Falcon3-3B-Instruct). Justification Manipulation Attack achieves 15-17% success, and control conditions (random text, token shuffling) show that attack effectiveness depends on specific token content and ordering, not just the presence of additional text.",
    407   "red_flags": [
    408     {
    409       "flag": "Limited model scope",
    410       "detail": "Only tested on 2 open-source 3B models; results do not demonstrate whether large/closed-source models (GPT-4, Claude) that are actually used as judges in production are similarly vulnerable"
    411     },
    412     {
    413       "flag": "No statistical significance testing",
    414       "detail": "Single ASR percentage reported per method per model with no confidence intervals, error bars, or hypothesis tests"
    415     },
    416     {
    417       "flag": "No defense evaluation",
    418       "detail": "Paper identifies vulnerabilities but does not evaluate or test any mitigation strategies, defense mechanisms, or robustness improvements"
    419     },
    420     {
    421       "flag": "Incomplete methodological reporting",
    422       "detail": "Critical hyperparameters missing: suffix length L never specified numerically, number of GCG iterations not stated, candidate set size not reported"
    423     },
    424     {
    425       "flag": "Overgeneralization in claims",
    426       "detail": "Abstract and conclusions claim vulnerabilities in 'current LLM-as-a-Judge systems' broadly, but experiments only cover 2 small models"
    427     },
    428     {
    429       "flag": "No variance or uncertainty reporting",
    430       "detail": "Table I shows only point estimates; unclear if results are from single run or averaged across multiple attacks"
    431     },
    432     {
    433       "flag": "Missing details on MT-Bench subset",
    434       "detail": "Exact number of MT-Bench examples used for attacks not reported; unclear what fraction of the full dataset was evaluated"
    435     },
    436     {
    437       "flag": "No actual prompts provided",
    438       "detail": "Pairwise comparison prompt template not included; evaluation setup cannot be fully reproduced"
    439     }
    440   ],
    441   "cited_papers": [
    442     {
    443       "title": "Quantifying and understanding adversarial prompting",
    444       "authors": "Carlini et al.",
    445       "year": 2023,
    446       "relevance": "Foundational work on adversarial attacks against LLMs; categorizes attack classes relevant to this work's methodology"
    447     },
    448     {
    449       "title": "Universal and transferable adversarial attacks on aligned language models",
    450       "authors": "Zou et al.",
    451       "year": 2023,
    452       "relevance": "Introduces GCG (Greedy Coordinate Gradient) method used in this paper's attack implementation"
    453     },
    454     {
    455       "title": "JudgeDeceiver: Prompt injection attacks to manipulate LLM-as-a-judge",
    456       "authors": "Shi et al.",
    457       "year": 2024,
    458       "relevance": "Directly prior work on attacking judge models; demonstrates universal templates achieve 22-24% ASR"
    459     },
    460     {
    461       "title": "Judging llm-as-a-judge with mt-bench and chatbot arena",
    462       "authors": "Zheng et al.",
    463       "year": 2023,
    464       "relevance": "Seminal work establishing LLM-as-a-Judge paradigm and MT-Bench dataset used in this evaluation"
    465     },
    466     {
    467       "title": "Bad-Judge: Backdoor vulnerabilities of LLM-as-a-judge",
    468       "authors": "Wang et al.",
    469       "year": 2024,
    470       "relevance": "Complementary work on backdoor attacks against judge models during training"
    471     },
    472     {
    473       "title": "SmoothLLM: Defending large language models against jailbreaking attacks",
    474       "authors": "Robey et al.",
    475       "year": 2023,
    476       "relevance": "Proposes defense mechanisms against adversarial attacks; cited as future direction for judge robustness"
    477     },
    478     {
    479       "title": "How helpful is ChatGPT as a judge?",
    480       "authors": "Gu et al.",
    481       "year": 2024,
    482       "relevance": "Empirical study of reliability and limitations of LLM judges; contextualizes why vulnerabilities matter"
    483     },
    484     {
    485       "title": "Attention tracker: Detecting prompt injection attacks in LLMs",
    486       "authors": "Zhang et al.",
    487       "year": 2024,
    488       "relevance": "Defense mechanism for detecting prompt injection; cited as approach to mitigate vulnerabilities demonstrated here"
    489     }
    490   ],
    491   "engagement_factors": {
    492     "practical_relevance": {
    493       "score": 2,
    494       "justification": "Identifies real vulnerability in deployed evaluation systems (LLM judges used in RLHF, model evaluation), but provides no defenses or mitigation strategies."
    495     },
    496     "surprise_contrarian": {
    497       "score": 1,
    498       "justification": "LLM vulnerability to adversarial attacks is well-established; applying known techniques to judge models is incremental rather than surprising."
    499     },
    500     "fear_safety": {
    501       "score": 2,
    502       "justification": "Raises concerns about evaluation system integrity that could compromise model safety pipelines (RLHF relies on judge correctness), but doesn't frame as existential risk."
    503     },
    504     "demo_ability": {
    505       "score": 3,
    506       "justification": "GCG-based attacks are an established technique and the paper uses publicly available models (Qwen2.5-3B, Falcon3-3B), making demonstration feasible, although no code or prompt template is released and key hyperparameters (suffix length, iterations) are unspecified."
    507     },
    508     "brand_recognition": {
    509       "score": 1,
    510       "justification": "No institutional affiliation is stated in the paper; authors are possibly from Moscow State University (unverified inference, not confirmed by the paper). No affiliation with well-known AI labs. Limited institutional prestige."
    511     },
    512     "drama_conflict": {
    513       "score": 2,
    514       "justification": "Has controversy potential (security vulnerability in widely-used evaluation systems) but presented as dry technical research without sensationalism."
    515     }
    516   },
    517   "hn_data": {
    518     "threads": [
    519       {
    520         "hn_id": "36038868",
    521         "title": "RWKV: Reinventing RNNs for the Transformer Era",
    522         "points": 358,
    523         "comments": 171,
    524         "url": "https://news.ycombinator.com/item?id=36038868"
    525       },
    526       {
    527         "hn_id": "45341511",
    528         "title": "Learn Your Way: Towards an AI-Augmented Textbook, Google Research",
    529         "points": 3,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=45341511"
    532       },
    533       {
    534         "hn_id": "44619169",
    535         "title": "Palatable Conceptions of Disembodied Being",
    536         "points": 3,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=44619169"
    539       },
    540       {
    541         "hn_id": "43411379",
    542         "title": "New Computer with intergrated Brain Computer interface",
    543         "points": 3,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=43411379"
    546       },
    547       {
    548         "hn_id": "43417925",
    549         "title": "Bioscience Lab in home for your Brain and Body, control laptop via mind",
    550         "points": 1,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=43417925"
    553       },
    554       {
    555         "hn_id": "42898154",
    556         "title": "Building a Verifiable Logical Clock for P2P Networks",
    557         "points": 1,
    558         "comments": 0,
    559         "url": "https://news.ycombinator.com/item?id=42898154"
    560       },
    561       {
    562         "hn_id": "27932480",
    563         "title": "Shining Light on Quantum Transport in Fractal Networks",
    564         "points": 1,
    565         "comments": 0,
    566         "url": "https://news.ycombinator.com/item?id=27932480"
    567       }
    568     ],
    569     "top_points": 358,
    570     "total_points": 370,
    571     "total_comments": 171
    572   }
    573 }

Impressum · Datenschutz