scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (29058B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing LLM Code Generation: A Systematic Evaluation of Multi-Agent Collaboration and Runtime Debugging for Improved Accuracy, Reliability, and Latency",
      6     "authors": [
      7       "Nazmus Ashrafi",
      8       "Salah Bouktif",
      9       "Mohammed Mediani"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2505.02133",
     14     "doi": "10.48550/arXiv.2505.02133"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims about 19 LLMs, two benchmarks, and the combined approach are all confirmed in the paper body with corresponding tables (Table 2) and statistical tests.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Within-subject paired t-tests compare each of 19 models under all three conditions (ACT, Debug, ACT+Debug), which is a reasonable design for causal inference in controlled benchmark evaluation.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper evaluates only on HumanEval and HumanEval+ (Python programming tasks from 2021) but makes broad claims about 'organizations seeking robust AI-driven coding solutions' and 'real-world AI applications' without bounding to Python code completion specifically.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper discusses why debugging outperforms agentic workflows (context-rich execution feedback), why complex agentic interactions hurt (introducing fragility), and why specific models respond differently to combination approaches.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Pass@1 is clearly described as measuring functional correctness; code rigor is operationalized as the accuracy drop from HumanEval to HumanEval+ (80× more tests); latency is wall-clock time — each claim is matched to its measurement.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations or threats-to-validity section; the paper goes directly from results to conclusion. Scattered remarks in methodology (e.g., reliance on visible test cases) do not constitute a section.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No formal threats-to-validity discussion exists. Comments like 'same prompts for all models may not be ideal' and 'LDB does not fully replicate real-world debugging' are isolated remarks, not a systematic treatment.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not state what results do NOT show; conclusions are framed broadly without bounding to HumanEval-style Python tasks, specific model families, or the particular iteration limits chosen.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source is mentioned anywhere in the paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are identified as being from the Department of Computer Science and Software Engineering, United Arab Emirates University, Al Ain, UAE.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence of funder cannot be assessed.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests statement is present in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Multi-agent collaboration is defined through its components (Analyst, Coder, Tester) and workflow; runtime debugging is explained via the LDB-based block-level approach; pass@k is formally described with its formula.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper clearly states it empirically evaluates the combination of multi-agent collaboration and runtime debugging across 19 LLMs on two benchmarks, contributing insights into when and how combination strategies are beneficial.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 systematically reviews related frameworks (AgentCoder, MapCoder, LDB, CYCLE, self-collaboration, RGD, MGDebugger) and Section 3 explicitly positions the proposed approach as combining and extending these prior methods.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper links to a GitHub repository (https://github.com/nazmus-ashrafi/multiagent_vs_debugger) explicitly for agent prompts and code.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Both HumanEval and HumanEval+ are publicly available standard benchmarks requiring no separate release.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No requirements.txt, Dockerfile, or dependency specification is provided; only the API access month (December 2024) is noted, which is insufficient for reproduction.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided in the paper; the GitHub reference is specifically for prompts, not a complete reproduction guide.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Table 2 reports only point estimates (pass@1 percentages); no confidence intervals or error bars are reported for any model-approach combination.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "One-tailed paired t-tests are conducted comparing ACT+Debug vs ACT alone and ACT+Debug vs Debug alone, with t-statistics, degrees of freedom, and significance levels explicitly reported.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Percentage differences are reported throughout (e.g., 0.68% mean accuracy improvement for AC+Debug over Debug alone, 6.7% gap between Debug and ACT on HumanEval) providing practical effect size context.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 19 LLMs are chosen for diversity, but the paper provides no power analysis or other formal justification that 19 models are sufficient for the statistical tests performed.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Only mean accuracy values are reported; no standard deviation or variance is provided. Because evaluation uses a single sample per problem (n=1), run-to-run variance cannot be estimated, and inter-run reproducibility is not assessed.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Five baselines are included: Basic (single prompt), AC, ACT, Debugger Only, and AC+Debugger, enabling comprehensive comparison against the proposed ACT+Debugger approach.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "LDB (2024) and self-collaboration framework (2023) are contemporary; models include GPT-4o, Claude 3.5 Sonnet, DeepSeek-V3 — all state-of-the-art at time of experiments (December 2024).",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The six approaches (Basic→AC→ACT→Debugger→AC+Debug→ACT+Debug) form a systematic ablation isolating the contribution of analyst, tester, and debugger modules individually and in combination.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Three metrics are used: functional accuracy (pass@1 on HumanEval), code rigor (accuracy drop on HumanEval+ with 80× more tests), and latency (execution time in minutes per Table 3).",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Human evaluation is not applicable for automated code generation evaluated against unit tests; functional correctness is measured programmatically.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "HumanEval's hidden test cases are reserved for final evaluation while visible test cases are used for in-pipeline execution feedback, ensuring final evaluation is on held-out data.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down per model (all 19 LLMs in Table 2, Figures 4-5 per provider family) and per approach, with per-model analysis of which configurations help or hurt specific architectures.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Failure cases are explicitly discussed: QwQ-Preview's severe degradation with agentic approaches, GPT-4o underperforming with ACT+Debug on HumanEval+, and Llama/DeepSeek models gaining nothing from ACT.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper explicitly reports that ACT+Debug does NOT significantly improve over Debug alone (H0,2 not rejected), that more complex agentic workflows reduce code rigor, and that adding ACT hurts several models.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Table 1 lists specific model names, versions, and API endpoints with the note that 'All APIs were accessed in the month of December 2024.'",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "The paper explicitly states all agent prompts are available in the GitHub repository (https://github.com/nazmus-ashrafi/multiagent_vs_debugger), covering role-specific instructions for all agents in both phases.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Iteration limits (retriesCT=3, retriesD=4 for combined, retriesD=10 for standalone) are reported, but temperature, top-p, and other LLM sampling hyperparameters are never mentioned.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The multi-agent pipeline (ACT phases, debugging phase, CFG analysis, iteration limits, agent handoff conditions) is described in detail in Section 3 with an architecture diagram in Figure 1.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The split of HumanEval into task description, visible test cases, and hidden test cases is clearly described; benchmarks are used as-is with the split rationale explained.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "Aggregated pass@1 scores are in Table 2 but raw per-problem results (which specific problems each model/approach passed or failed) are not released.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "The use of the HumanEval and HumanEval+ benchmarks, the specific model API endpoints in Table 1, and the December 2024 access period are documented.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants; standard benchmarks are used with no recruitment.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The pipeline from benchmark problem input through agent collaboration and debugging phases to final pass@1 evaluation is described in Section 3 and illustrated in Figure 1.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Training cutoffs are not stated for any of the 19 LLMs; only API access dates (December 2024) are noted, not when training data was collected.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "HumanEval (2021) predates all tested models' training data, making contamination highly likely, yet the paper never discusses potential training data overlap with the benchmark.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "HumanEval has been publicly available since 2021 and is almost certainly in the training data of all 19 LLMs tested (some achieving >90% pass@1); this is never acknowledged or addressed.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Latency per approach is reported in Table 3 and Figure 13 (ranging from 7.68 to 68.42 minutes average); Figure 4 caption qualitatively ranks models by token cost.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Total computational budget (API costs, total tokens consumed across 19 models × 2 datasets × 6 approaches × 164 problems) is not stated.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "ACT+Debug significantly outperforms ACT alone at α=0.15 significance level",
    373       "evidence": "Paired t-test on 19 LLMs: mean accuracy 64.82% (ACT+Debug) vs 57.16% (ACT) on HumanEval; H0,1 rejected at α=0.15",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "ACT+Debug does NOT significantly outperform Debug alone",
    378       "evidence": "Only 0.96% mean accuracy difference (64.82% vs 63.86%); H0,2 not rejected at α=0.15; explicitly stated as non-significant",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "AC+Debugger achieves the optimal balance of accuracy, rigor, and latency",
    383       "evidence": "AC+Debug yields 0.68% mean accuracy improvement over Debug alone at 38.42 min vs 31.11 min, while ACT+Debug takes 68.42 min with lower HumanEval+ accuracy (-1.22% vs AC+Debug)",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Debugging-based approaches generally outperform agentic workflows",
    388       "evidence": "Debug alone achieves 61.02% mean accuracy across both datasets vs 54.04% for ACT; 6.7% gap on HumanEval and 7.36% on HumanEval+",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Increased agentic complexity reduces code rigor under stringent testing",
    393       "evidence": "ACT+Debug shows the largest accuracy drop sum (137.74 across all models) on HumanEval+ vs Basic approach (90.83); AC+Debug drop is 110.41",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "The benefit of combining approaches diminishes when the Debug-ACT performance gap is large",
    398       "evidence": "Figures 6-7 show inverse correlation between the Debug-ACT gap and improvement from combining approaches across 38 data points (19 models × 2 datasets)",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "OpenAI models consistently benefit from combinatorial approaches while open-source models (Llama, DeepSeek) generally do not",
    403       "evidence": "Figure 4: GPT-4o-mini improves from 80.45% to 92.07% with ACT+Debug; Table 2 shows Llama 3.3 70B, DeepSeek-V3, and others gain nothing or regress from adding ACT to debugging",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval"
    409   ],
    410   "key_findings": "Across 19 LLMs on HumanEval and HumanEval+, runtime debugging alone outperforms multi-agent agentic workflows by ~7%, while combining a simple Analyst-Coder pipeline with debugging (AC+Debug) yields a modest 0.68% additional accuracy gain with comparable latency — a difference that is not statistically significant even at the non-standard α=0.15 threshold. The benefit of combining approaches inversely correlates with the performance gap between the individual techniques: combination helps most when both strategies perform similarly for a given model. Counter-intuitively, more complex agentic configurations (three-agent ACT) reduce code rigor under stringent testing (HumanEval+) and increase latency without improving accuracy, suggesting simpler agentic workflows paired with debugging represent the practical optimum.",
    411   "red_flags": [
    412     {
    413       "flag": "Non-standard α=0.15 significance threshold",
    414       "detail": "The paper uses α=0.15 for all statistical tests, substantially more permissive than conventional α=0.05. The justification ('even marginal improvements matter in production') is post-hoc and not pre-registered. The main positive finding (ACT+Debug > ACT alone) may not hold at standard thresholds."
    415     },
    416     {
    417       "flag": "HumanEval contamination unaddressed",
    418       "detail": "HumanEval (2021) is widely present in LLM training corpora; some tested models achieve >90% pass@1. The paper never discusses contamination despite evaluating models trained years after the benchmark was published — results may reflect memorization rather than reasoning."
    419     },
    420     {
    421       "flag": "No confidence intervals on main results",
    422       "detail": "Table 2 reports only point estimates for pass@1 scores across 19 models × 6 approaches × 2 datasets. No CIs or error bars are provided, making it impossible to assess uncertainty in individual model comparisons."
    423     },
    424     {
    425       "flag": "Single sample per problem hides run-level variance",
    426       "detail": "Using n=1 sample per problem means results cannot be verified for reproducibility across runs with different random seeds; LLM outputs are stochastic and single-sample estimates are unreliable for fine-grained comparisons like 0.68% differences."
    427     },
    428     {
    429       "flag": "No dedicated limitations section",
    430       "detail": "The paper lacks any formal limitations or threats-to-validity section. Generalization to non-HumanEval benchmarks, other programming languages, or real-world coding tasks is never addressed."
    431     },
    432     {
    433       "flag": "Marginal 0.68% improvement framed as optimal",
    434       "detail": "The paper's central practical recommendation (AC+Debug as 'optimal') rests on a 0.68% mean accuracy improvement that is itself not statistically significant, with no discussion of minimum practically meaningful differences."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and Optimisation",
    440       "relevance": "Core multi-agent code generation framework this paper builds upon and compares against"
    441     },
    442     {
    443       "title": "Debug like a Human: A Large Language Model Debugger via Verifying Runtime Execution Step-by-step (LDB)",
    444       "relevance": "The debugging component adopted in this paper; authors implement a variant of LDB as the debugging phase"
    445     },
    446     {
    447       "title": "Self-collaboration Code Generation via ChatGPT",
    448       "relevance": "The Analyst-Coder-Tester framework the paper's multi-agent collaboration phase is directly based on"
    449     },
    450     {
    451       "title": "MapCoder: Multi-Agent Code Generation for Competitive Problem Solving",
    452       "relevance": "Related multi-agent code generation approach reviewed in literature"
    453     },
    454     {
    455       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    456       "relevance": "Primary evaluation benchmark; defines the pass@k metric used throughout the paper"
    457     },
    458     {
    459       "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation (HumanEval+)",
    460       "relevance": "Secondary benchmark with 80× more tests used to measure code rigor throughout the study"
    461     },
    462     {
    463       "title": "RGD: Multi-LLM Based Agent Debugger via Refinement and Generation Guidance",
    464       "relevance": "Related multi-agent debugging framework combining guide, debug, and feedback agents"
    465     },
    466     {
    467       "title": "Reflexion: Language Agents with Verbal Reinforcement Learning",
    468       "relevance": "State-of-the-art approach combined with LDB achieving 98.2% on HumanEval, motivating the study of LDB integration"
    469     },
    470     {
    471       "title": "Teaching Large Language Models to Self-Debug",
    472       "relevance": "Foundational self-debugging framework using execution feedback for iterative code improvement"
    473     },
    474     {
    475       "title": "From Code to Correctness: Closing the Last Mile of Code Generation with Hierarchical Debugging (MGDebugger)",
    476       "relevance": "Related hierarchical debugging approach for code generation reviewed in literature"
    477     }
    478   ],
    479   "engagement_factors": {
    480     "practical_relevance": {
    481       "score": 3,
    482       "justification": "Provides direct, actionable guidance for organizations choosing between multi-agent and debugging strategies across 19 diverse LLMs with latency and accuracy trade-offs explicitly quantified."
    483     },
    484     "surprise_contrarian": {
    485       "score": 2,
    486       "justification": "Counterintuitive finding that simpler agentic workflows outperform complex ones and that adding a tester agent can reduce code rigor — challenges the 'more agents = better' assumption prevalent in agentic AI research."
    487     },
    488     "fear_safety": {
    489       "score": 0,
    490       "justification": "No AI safety or risk concerns are raised; the paper is purely about code generation accuracy and efficiency."
    491     },
    492     "drama_conflict": {
    493       "score": 1,
    494       "justification": "Mild tension between prevailing enthusiasm for complex multi-agent systems and the finding that they often underperform simpler debugging approaches, without framing this as a controversy."
    495     },
    496     "demo_ability": {
    497       "score": 2,
    498       "justification": "GitHub repository linked with prompts; readers could implement AC+Debugger with API access to any of the 19 models tested using the described pipeline."
    499     },
    500     "brand_recognition": {
    501       "score": 1,
    502       "justification": "Tests well-known models (GPT-4o, Claude 3.5 Sonnet, DeepSeek-V3, Llama) but authors are from UAE University, not a recognized AI research lab."
    503     }
    504   },
    505   "hn_data": {
    506     "threads": [
    507       {
    508         "hn_id": "43390400",
    509         "title": "Deep Learning Is Not So Mysterious or Different",
    510         "points": 485,
    511         "comments": 126,
    512         "url": "https://news.ycombinator.com/item?id=43390400",
    513         "created_at": "2025-03-17T16:47:02Z"
    514       },
    515       {
    516         "hn_id": "45291024",
    517         "title": "Launch HN: Cactus (YC S25) – AI inference on smartphones",
    518         "points": 123,
    519         "comments": 63,
    520         "url": "https://news.ycombinator.com/item?id=45291024",
    521         "created_at": "2025-09-18T15:40:29Z"
    522       },
    523       {
    524         "hn_id": "44430311",
    525         "title": "Small language models are the future of agentic AI",
    526         "points": 113,
    527         "comments": 45,
    528         "url": "https://news.ycombinator.com/item?id=44430311",
    529         "created_at": "2025-07-01T03:33:49Z"
    530       },
    531       {
    532         "hn_id": "44659764",
    533         "title": "Mitigating Tool Squatting and Rug Pull Attacks in Model Context Protocol (MCP)",
    534         "points": 5,
    535         "comments": 0,
    536         "url": "https://news.ycombinator.com/item?id=44659764",
    537         "created_at": "2025-07-23T14:42:26Z"
    538       },
    539       {
    540         "hn_id": "44246361",
    541         "title": "Small Language Models Are the Future of Agentic AI",
    542         "points": 5,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=44246361",
    545         "created_at": "2025-06-11T11:16:33Z"
    546       },
    547       {
    548         "hn_id": "44003454",
    549         "title": "Twist: Teleoperated Whole-Body Imitation System",
    550         "points": 2,
    551         "comments": 0,
    552         "url": "https://news.ycombinator.com/item?id=44003454",
    553         "created_at": "2025-05-16T09:44:32Z"
    554       },
    555       {
    556         "hn_id": "23087191",
    557         "title": "A Survey on Dialog Management: Recent Advances and Challenges",
    558         "points": 2,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=23087191",
    561         "created_at": "2020-05-06T01:52:26Z"
    562       },
    563       {
    564         "hn_id": "45549900",
    565         "title": "Agentic web browsing can't scale with cloud LLMs",
    566         "points": 1,
    567         "comments": 0,
    568         "url": "https://news.ycombinator.com/item?id=45549900",
    569         "created_at": "2025-10-11T15:29:17Z"
    570       },
    571       {
    572         "hn_id": "43291939",
    573         "title": "Deep Learning Is Not So Mysterious or Different",
    574         "points": 1,
    575         "comments": 0,
    576         "url": "https://news.ycombinator.com/item?id=43291939",
    577         "created_at": "2025-03-07T17:11:27Z"
    578       }
    579     ],
    580     "top_points": 485,
    581     "total_points": 737,
    582     "total_comments": 234
    583   }
    584 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs