scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (29637B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Towards Engineering Multi-Agent LLMs: A Protocol-Driven Approach",
      6     "authors": [
      7       "Zhenyu Mao",
      8       "Jacky W. Keung",
      9       "Fengji Zhang",
     10       "Shuo Liu",
     11       "Yifei Wang",
     12       "Jialong Li"
     13     ],
     14     "year": 2025,
     15     "venue": "Asia-Pacific Software Engineering Conference",
     16     "arxiv_id": "2510.12120",
     17     "doi": "10.1109/APSEC66846.2025.00100"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims ('up to 69.6% reduction', '56.7%', '47.4%', '28.2%') are supported by Tables I and II.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper claims SEMAP 'reduces failures' and 'mitigates' issues (causal language), but the comparison is only against one baseline (MetaGPT) without controlling for confounds like the A2A infrastructure itself or the additional prompt engineering in SEMAP.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title claims a general 'protocol-driven approach' for 'multi-agent LLMs' but results are only from two models (DeepSeek-V3-0324 and gpt-4.1-nano) on four specific datasets. The paper does not bound its generalizations to these settings.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No discussion of alternative explanations. The improvements could be due to more detailed prompting, the A2A infrastructure, or simply more structured agent instructions rather than the three SE principles claimed.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper measures failure counts (as judged by an LLM) and frames this as 'system robustness' and 'effectiveness' without discussing the gap between LLM-judged failures and actual system quality.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No dedicated limitations section. The conclusion mentions future work items that implicitly acknowledge limitations, but there is no substantive limitations discussion.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No threats to validity discussed. The paper does not address concerns like LLM-as-a-Judge reliability, single-baseline comparison, or single-run results.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The conclusion mentions scaling to 'larger datasets, agent populations, and longer workflows' as future work, implicitly acknowledging current scope, but does not explicitly state what the results do NOT show.",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding or acknowledgments section present.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations (City University of Hong Kong, Waseda University) are clearly stated.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No funding information disclosed, so independence cannot be assessed.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests statement present.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Behavioral contracts, structured messaging, and lifecycle are formally defined in Section III. However, critical terms like 'agent', 'failure', 'protocol', and 'multi-agent system' are used informally without precise definitions upfront.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Abstract explicitly states the contribution: SEMAP, a protocol methodology implementing three SE design principles, with empirical evaluation on SE tasks showing failure reduction.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section II surveys multi-agent LLM systems for SE, protocol designs, and cites the MAST failure taxonomy. Related work includes MetaGPT, AutoGen, ChatDev. Positioning is adequate though not deeply comparative.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No repository URL or code release mentioned. The conclusion mentions 'releasing artifacts for reproducibility' as future work.",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The paper uses publicly available datasets: HumanEval, ProgramDev, Devign (devign100 subset), and CVEFixes (vudenc100 subset). However, the specific 100-sample subsets (devign100, vudenc100) are constructed by the authors and not released.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No environment specifications, dependency lists, or hardware details provided.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No reproduction instructions provided. Artifacts release is listed as future work.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Tables I and II report only point estimates (raw counts and percentage changes) with no confidence intervals or error bars.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Claims of failure reduction (e.g., '69.6% reduction') are made by comparing raw counts without any statistical significance tests.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Percentage reductions are reported with baseline context (e.g., 'from 256 to 92' = 64.1%), providing enough information to assess magnitude.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No justification for why 100-sample subsets were chosen for vulnerability detection, or why HumanEval's 164 problems are sufficient.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance, standard deviation, or spread measures reported. Results appear to be from single runs.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "MetaGPT framework is used as the baseline system for both development and vulnerability detection tasks.",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "Only one baseline (MetaGPT) is used. The conclusion acknowledges the need to 'compare against more baselines, including single-agent LLMs and domain-specific detectors' as future work.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "SEMAP has three components (contracts, messaging, lifecycle verification) but no ablation study isolates their individual contributions. The conclusion lists 'ablation studies' as future work.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Failures are categorized into three types (under-specification, inter-agent misalignment, task verification) and reported separately, plus total counts and round-by-round trends.",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": false,
    207           "justification": "Failure categorization is done entirely by LLM-as-a-Judge (gpt-4o-2024-08-06), not human evaluation. No human review of outputs.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Standard benchmark test sets (HumanEval, ProgramDev, devign100, vudenc100) are used for evaluation, not for tuning.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Tables I and II provide per-failure-category breakdowns across all tasks and models.",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "No qualitative failure analysis or specific failure examples are discussed. Results are purely quantitative.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Some weak results are reported: e.g., ProgramDev with GPT-4.1-nano shows only 12.6% total reduction, and 0.0% reduction in inter-agent misalignment; devign100 with DeepSeek shows only 8.3% reduction.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Exact model versions are specified: 'DeepSeek-V3-0324' and 'gpt-4.1-nano-2025-04-14', plus 'gpt-4o-2024-08-06' for the judge.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": false,
    245           "justification": "No actual prompt text is provided. The paper describes the roles and contracts conceptually but does not provide the actual prompts or system instructions given to agents.",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the models used.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "The SEMAP protocol is described in detail in Section III, including behavioral contracts, structured messaging format, and lifecycle FSM. The architecture (centralized 5-agent for dev, decentralized 3-agent for vuln detection) is specified.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Dataset construction is documented: devign100 is '100-sample subset...constructed by randomly selecting 50 vulnerable and 50 safe C/C++ functions' from Devign; vudenc100 similarly from CVEFixes with labeling criteria described.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "No raw data (agent outputs, failure logs, LLM judge outputs) is made available.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Dataset construction procedure is described: random sampling from Devign and CVEFixes with balanced classes (50/50), labeling criteria for vulnerability defined.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants. All evaluation is automated using benchmarks.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": false,
    289           "justification": "The pipeline from raw benchmark problems to failure counts is not fully documented. How the LLM-as-a-Judge categorizes failures, what its prompt looks like, and how ambiguous cases are handled is not described beyond citing [19].",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No training data cutoff dates stated for DeepSeek-V3-0324 or gpt-4.1-nano. HumanEval (2021) is likely in both models' training data.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No discussion of whether HumanEval or other benchmark problems appeared in the models' training data.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "HumanEval was published in 2021 and is widely known to be contaminated in modern LLMs. This is not discussed.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in this study.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference cost, API costs, or latency reported. The conclusion mentions 'measuring resource overhead' as future work.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No computational budget stated (API spend, GPU hours, wall-clock time).",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "No mention of multiple seeds or seed sensitivity. Results appear to be single-run.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "Number of runs per experiment is not stated.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "No hyperparameter search budget reported. The number of collaboration rounds (5) is stated but no tuning process described.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": false,
    393           "justification": "No discussion of how the SEMAP configuration was selected or whether alternatives were tried.",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "Multiple comparisons across 4 datasets × 2 models × 3 failure categories with no statistical tests at all, let alone corrections.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The authors implement SEMAP and compare against MetaGPT baseline without acknowledging author-evaluation bias.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "SEMAP adds protocol overhead (contracts, structured messaging, lifecycle FSM) compared to MetaGPT baseline but compute cost differences are not discussed.",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "No discussion of whether HumanEval function-level tasks and 100-sample subsets are valid measures of multi-agent system quality.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": true,
    422           "answer": false,
    423           "justification": "SEMAP uses A2A infrastructure while baseline uses MetaGPT — different scaffolds. Performance differences could be due to the scaffold rather than the SE principles. This confound is not addressed.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "HumanEval (2021) predates both models' training. ProgramDev and devign100/vudenc100 temporal relationships to model training not discussed.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of whether the evaluation setup provides information not available in realistic usage.",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of independence between training and test data.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No leakage detection or prevention methods applied.",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "SEMAP reduces failure counts by 64.1% on HumanEval with ChatGPT (256→92 failures)",
    458       "evidence": "Table I, HumanEval row, ChatGPT columns: baseline 256, SEMAP 92, delta 64.1%",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "SEMAP reduces failure counts by 69.6% on HumanEval with DeepSeek (112→34 failures)",
    463       "evidence": "Table I, HumanEval row, DeepSeek columns: baseline 112, SEMAP 34, delta 69.6%",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Under-specification failures show the largest percentage reduction under SEMAP (71.5-73%)",
    468       "evidence": "Table I, under-specification rows show largest percentage reductions compared to other categories",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "SEMAP reduces failure counts across all tasks including vulnerability detection (28.2-47.4% reduction)",
    473       "evidence": "Table II shows reductions on both vulnerability-detection datasets with ChatGPT: devign100 28.2%, vudenc100 47.4%",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Failure reduction is stable across multiple development rounds",
    478       "evidence": "Figure 2 shows downward trends in failure counts across rounds for both SEMAP and baseline, with SEMAP consistently lower",
    479       "supported": "moderate"
    480     },
    481     {
    482       "claim": "Three SE design principles (contracts, messaging, lifecycle) cause failure reduction",
    483       "evidence": "SEMAP vs. baseline comparison shows improvement, but no ablation study isolates which principles contribute",
    484       "supported": "weak"
    485     },
    486     {
    487       "claim": "Multi-agent LLM failures stem from under-specification, coordination misalignment, and verification issues",
    488       "evidence": "Adopted from Cemri et al. MAST framework (ref. 19); paper does not independently validate this taxonomy",
    489       "supported": "moderate"
    490     },
    491     {
    492       "claim": "SEMAP can be implemented atop Google's Agent-to-Agent (A2A) infrastructure",
    493       "evidence": "Sections II and III describe A2A as the implementation basis; no actual A2A integration is shown",
    494       "supported": "weak"
    495     }
    496   ],
    497   "methodology_tags": [
    498     "benchmark-eval",
    499     "case-study",
    500     "observational"
    501   ],
    502   "key_findings": "SEMAP, a protocol-layer methodology instantiating explicit behavioral contracts, structured messaging, and lifecycle-guided execution, consistently reduces failure counts in multi-agent LLM systems for SE tasks, with the largest improvements on function-level code development (up to 69.6% failure reduction) and notable gains on vulnerability detection (up to 47.4%). Under-specification failures show the most dramatic improvement (up to 73%), while failure reductions stabilize across multiple collaboration rounds. However, the paper lacks ablation studies to isolate which of the three design principles drives improvements, and statistical significance is not assessed.",
    503   "red_flags": [
    504     {
    505       "flag": "No ablation study",
    506       "detail": "The three principles (contracts, messaging, lifecycle) are evaluated as a bundle. Which principle(s) actually drive improvement is unknown. Conclusion defers ablation to future work."
    507     },
    508     {
    509       "flag": "Small vulnerability detection samples",
    510       "detail": "devign100 and vudenc100 are only 100 samples each (50 vulnerable, 50 safe). For classification tasks, this sample size is small and no power analysis justifies it."
    511     },
    512     {
    513       "flag": "LLM-as-Judge validation unclear",
    514       "detail": "Failures are categorized using gpt-4o-2024-08-06 as judge. No ground truth validation of judge reliability or agreement with human labeling."
    515     },
    516     {
    517       "flag": "No statistical significance testing",
    518       "detail": "Results presented as raw failure counts without confidence intervals, p-values, or significance tests. Differences could be within noise."
    519     },
    520     {
    521       "flag": "No code or reproducibility artifacts",
    522       "detail": "Paper not reproducible. No source code, actual prompts, or detailed instructions released. 'Releasing artifacts' deferred to future work."
    523     },
    524     {
    525       "flag": "Missing hyperparameter details",
    526       "detail": "Temperature, top-p, token limits, timeout values not specified. LLM behavior is sensitive to these settings."
    527     },
    528     {
    529       "flag": "Limited baseline comparison",
    530       "detail": "Only MetaGPT compared. Other frameworks (AutoGen, ChatDev, single-agent baselines) not evaluated."
    531     },
    532     {
    533       "flag": "No explicit limitations section",
    534       "detail": "Paper lacks dedicated Limitations or Threats-to-Validity section. Scope and assumptions not thoroughly discussed."
    535     },
    536     {
    537       "flag": "Generalizability unclear",
    538       "detail": "Only four datasets spanning two SE task types (code development and vulnerability detection) were tested. Unclear whether findings generalize beyond these tasks."
    539     },
    540     {
    541       "flag": "Section IV is titled 'Preliminary Evaluation'",
    542       "detail": "Section IV header labels results as preliminary, suggesting findings are not final and should be interpreted cautiously."
    543     }
    544   ],
    545   "cited_papers": [
    546     {
    547       "title": "LLM-based multi-agent systems for software engineering: Literature review, vision and the road ahead",
    548       "relevance": "Directly surveys multi-agent LLM systems for SE; foundational related work for this paper's domain"
    549     },
    550     {
    551       "title": "Why do multi-agent llm systems fail?",
    552       "relevance": "Introduces MAST failure taxonomy (under-specification, misalignment, verification) that this paper builds upon"
    553     },
    554     {
    555       "title": "A survey of ai agent protocols",
    556       "relevance": "Comprehensive survey of agent protocols; directly relevant to SEMAP's protocol-layer approach"
    557     },
    558     {
    559       "title": "Evaluating large language models trained on code",
    560       "relevance": "HumanEval benchmark paper; used as primary evaluation task in this study"
    561     },
    562     {
    563       "title": "MARE: Multi-agents collaboration framework for requirements engineering",
    564       "relevance": "Related multi-agent framework for SE; shows alternative approaches to agent coordination"
    565     },
    566     {
    567       "title": "A pair programming framework for code generation via multi-plan exploration and feedback-driven refinement",
    568       "relevance": "Related multi-agent code generation approach; demonstrates similar ideas in different form"
    569     },
    570     {
    571       "title": "CODES: Natural language to code repository via multi-layer sketch",
    572       "relevance": "Multi-agent code generation system; relevant baseline for comparison"
    573     },
    574     {
    575       "title": "Devign: Effective vulnerability identification by learning comprehensive program semantics via graph neural networks",
    576       "relevance": "Vulnerability detection benchmark; one of the evaluation datasets used (devign100)"
    577     }
    578   ],
    579   "engagement_factors": {
    580     "practical_relevance": {
    581       "score": 1,
    582       "justification": "Protocol methodology is conceptually practical but no implementation/framework released. Practitioners cannot yet adopt SEMAP."
    583     },
    584     "surprise_contrarian": {
    585       "score": 2,
    586       "justification": "Applying classical SE principles (contracts, formal interfaces, state machines) to multi-agent LLMs is somewhat novel given the field's informal prompt-based approaches, but SE abstractions are not new."
    587     },
    588     "fear_safety": {
    589       "score": 0,
    590       "justification": "Paper does not address AI safety, alignment, security risks, or control problems. No safety angle."
    591     },
    592     "drama_conflict": {
    593       "score": 0,
    594       "justification": "Straightforward engineering methodology paper with no controversy, competition, or conflict angle."
    595     },
    596     "demo_ability": {
    597       "score": 1,
    598       "justification": "Conceptual examples possible, but no interactive demo, released code, or hands-on artifact available. Limited demo ability."
    599     },
    600     "brand_recognition": {
    601       "score": 1,
    602       "justification": "Authors from City University of Hong Kong and Waseda University (mid-tier institutions, not top-tier labs). Uses Google A2A but no clear Google affiliation or endorsement."
    603     }
    604   },
    605   "hn_data": {
    606     "threads": [
    607       {
    608         "hn_id": "39285499",
    609         "title": "Show HN: DynamiCrafter: Animating Open-Domain Images with Video Diffusion Priors",
    610         "points": 6,
    611         "comments": 2,
    612         "url": "https://news.ycombinator.com/item?id=39285499",
    613         "created_at": "2024-02-07T07:12:57Z"
    614       },
    615       {
    616         "hn_id": "42793447",
    617         "title": "Can LLMs demonstrate behavioral self-awareness?",
    618         "points": 3,
    619         "comments": 1,
    620         "url": "https://news.ycombinator.com/item?id=42793447",
    621         "created_at": "2025-01-22T14:54:07Z"
    622       },
    623       {
    624         "hn_id": "42815497",
    625         "title": "Tell me about yourself: LLMs are aware of their learned behaviors",
    626         "points": 2,
    627         "comments": 0,
    628         "url": "https://news.ycombinator.com/item?id=42815497",
    629         "created_at": "2025-01-24T17:44:03Z"
    630       },
    631       {
    632         "hn_id": "38011661",
    633         "title": "Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture",
    634         "points": 1,
    635         "comments": 0,
    636         "url": "https://news.ycombinator.com/item?id=38011661",
    637         "created_at": "2023-10-25T11:29:10Z"
    638       },
    639       {
    640         "hn_id": "37939342",
    641         "title": "Can Large Language Models Explain Themselves? A Study",
    642         "points": 1,
    643         "comments": 0,
    644         "url": "https://news.ycombinator.com/item?id=37939342",
    645         "created_at": "2023-10-19T06:41:38Z"
    646       }
    647     ],
    648     "top_points": 6,
    649     "total_points": 13,
    650     "total_comments": 3
    651   }
    652 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs