scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28401B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "HyperAgent: Generalist Software Engineering Agents to Solve Coding Tasks at Scale",
      6     "authors": [
      7       "H. N. Phan",
      8       "Phong X. Nguyen",
      9       "Nghi D. Q. Bui"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2409.16299",
     14     "doi": "10.48550/arXiv.2409.16299"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "The abstract claims HYPERAGENT 'surpasses strong baselines' on SWE-Bench, but the Full-1 configuration achieves 26% on Lite vs Agentless 24.3% (1.7pp margin) and 33% on Verified vs SWE-Agent+Claude-3.5-Sonnet 33.6% (actually losing). The 'state-of-the-art' framing overstates thin margins with no statistical testing.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Ablation studies in Table 5 systematically remove each agent role (Navigator, Editor, Executor) and measure performance/cost impact, providing adequate evidence for causal attribution of each component's contribution.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper claims to be 'the first system designed to work off-the-shelf across diverse SE tasks in multiple programming languages' but only evaluates on three benchmarks (two Python, one Java). The 'generalist' and 'at scale' framing is not bounded to the tested settings.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Different baselines use different underlying LLMs (e.g., Agentless uses GPT-4o, SWE-Agent uses Claude-3.5-Sonnet) while HYPERAGENT uses Claude-3-Sonnet across agents; the paper does not discuss whether performance differences are attributable to architecture vs model choice.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper distinguishes plausible patches (pass all tests) from correct patches (AST match with developer fix) in program repair, and uses acc@k for fault localization rather than conflating metrics with broader claims.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations or threats-to-validity section; Section 6.4 briefly discusses failure categories (early exit caused by hallucinated task completion, edit-failed loops, and exit timeouts), but this is framed as error analysis rather than as limitations.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No threats-to-validity discussion. The paper does not address confounds such as model version differences across compared systems, benchmark contamination in LLM training data, or the reliability of single-run results with stochastic LLMs.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper does not explicitly state what results do not show; the conclusion mentions future work in new domains but does not delineate the current system's boundaries or failure conditions.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment or grant information appears anywhere in the paper; authors are affiliated with FPT Software AI Center and UT Dallas but no external funding is disclosed.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly stated on the title page: FPT Software AI Center, Vietnam and The University of Texas at Dallas, USA.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed, making this criterion not applicable.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The term 'generalist' is used throughout but never formally defined; 'at scale' in the title is not defined in contrast to prior approaches; 'agent' is used loosely without specifying what capabilities are required.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper explicitly lists three contributions as bullet points: the HYPERAGENT system, extensive benchmark evaluations, and design insights for scalable multi-agent SE systems.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The related work section covers deep learning for automated programming, SE benchmarks, and autonomous coding agents, positioning HYPERAGENT relative to SWE-Agent, AutoCodeRover, Agentless, MetaGPT, and AgileCoder with substantive comparisons.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "GitHub repository is linked on the first page: https://github.com/FSoft-AI4Code/HyperAgent.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "All three evaluation benchmarks (SWE-Bench, RepoExec, Defects4J) are standard public benchmarks used without modification.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper mentions Docker for the Executor's interactive bash shell and Zoekt for code search, but provides no requirements.txt, Dockerfile, or dependency specifications in the paper text.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Agent prompts and configurations are provided in the appendix, but no step-by-step instructions for reproducing the experimental results (e.g., how to run the benchmark evaluation pipeline) are included.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Tables 1-4 are point estimates only; no confidence intervals or error bars are reported despite LLM inference being stochastic.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are applied to any comparative claims; margins as small as 1.7pp (SWE-Bench Lite) are presented as outperforming without significance testing.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Percentage-point improvements are consistently reported (e.g., +8.7pp over AutoFL on fault localization, +2.1pp over RepairAgent on Defects4J v1.2) with baseline context.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Benchmark sizes are fixed by external sources; the ablation study uses 'SWE-bench Tiny' (100 random instances) without justification for why this subset is sufficient to generalize findings.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or multiple-run statistics are reported for any result, even though the system relies on stochastic LLMs whose temperature settings are also undisclosed.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Multiple baselines are included for each task: SWE-Agent, AutoCodeRover, Agentless for issue resolution; RAG-based systems for code generation; DeepFL, AutoFL, Grace, DStar, Ochiai for fault localization; RepairAgent, ITER, SelfAPR for program repair.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines for issue resolution (SWE-Agent, Agentless, AutoCodeRover) are contemporary 2024 systems; fault localization includes both traditional techniques (Ochiai, DStar) and recent learning-based methods (Grace and the LLM-based AutoFL); program repair includes the recent LLM-based RepairAgent.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table 5 ablates each agent role by replacing it with the Planner; Table 6 ablates individual tool design choices (go-to-definition, open-file, code-search, auto-repair-editor) on SWE-bench Tiny.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "SWE-Bench uses resolved %, average time, and average cost; RepoExec uses pass@1 and pass@5; Defects4J uses acc@1 for localization and both plausible and correct patch counts for repair.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "No human evaluation of system outputs is performed; error analysis in Section 6.4 uses Claude-3.5-Sonnet to categorize trajectories rather than human raters.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "SWE-Bench Lite (300), SWE-Bench Verified (500), RepoExec (355 samples), and Defects4J are all established held-out benchmarks used for evaluation.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 10 in Appendix A.6 provides per-project breakdowns for Defects4J across 13 projects; Figure 3 breaks down error types for SWE-Bench.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 6.4 discusses three failure categories identified via LLM-based trajectory analysis: early exit (hallucination of task completion), edit failed loop, and exit timeout.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Open-source configurations (HYPERAGENT-Lite-2 with Llama-3, Full-3 with Llama-3-70B) achieve substantially lower performance (11-12% on SWE-Bench Lite) than baselines, and these results are reported without suppression.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Models are referred to by marketing names only ('Claude-3-Sonnet', 'Claude-3-Haiku', 'GPT-4o') without API snapshot dates or version identifiers; Claude-3-Sonnet had multiple revisions during 2024.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full system prompts for Planner, Navigator, Editor, and Executor are provided verbatim in Appendix A.7, including guidelines and available function descriptions.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "No temperature, top-p, max-tokens, or other LLM hyperparameters are reported for any of the models used in the experiments.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The multi-agent architecture, asynchronous message queue communication, agent roles, and tool suite (Zoekt search, LSP go-to-definition, interactive Docker shell, repair editor) are described in Sections 3-4 and Appendix A.3.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The paper documents how SWE-Bench Lite and Verified subsets differ (heuristic filtering vs manual annotation) and explains that RepoExec's gold contexts are excluded to test autonomous navigation.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "Agent trajectories, intermediate outputs, and raw results for all benchmark runs are not released in the paper; only aggregate summary statistics are provided.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "The benchmarks' provenance is cited and described (SWE-Bench: 2,294 instances from 12 Python repos; RepoExec: 355 samples with 96.25% test coverage; Defects4J: 353/395/440 active bugs).",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participant recruitment; standard automated benchmarks are used.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "The system architecture pipeline is described, but the experimental data pipeline (number of independent runs, compute infrastructure, how results were aggregated) is not documented.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Training data cutoffs for Claude-3-Sonnet, GPT-4o, and Llama-3 are not stated in the paper.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether SWE-Bench, RepoExec, or Defects4J instances may have appeared in the training data of the evaluated LLMs.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "SWE-Bench was published in 2023 and is likely within training windows of models used; this is not discussed. The paper cites LiveCodeBench (contamination-free benchmark) in related work but does not evaluate on it.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants involved.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants involved.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants involved.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants involved.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants involved.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants involved.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants involved.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Table 1 reports average per-instance API cost in USD for all HYPERAGENT configurations and most baselines (e.g., Lite-1: $0.45, Full-1: $1.82, SWE-Agent+GPT-4o: $2.55).",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Only per-instance API costs are reported; total compute budget across all experiments, number of runs, and wall-clock time for the full evaluation suite are not stated.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "HYPERAGENT-Full-1 achieves 26% on SWE-Bench Lite and 33% on SWE-Bench Verified, outperforming Agentless+GPT-4o and SWE-Agent+Claude-3.5-Sonnet.",
    373       "evidence": "Table 1: Full-1 gets 26%/33% vs Agentless 24.3%/33.2% and SWE-Agent 23%/33.6%. Full-1 wins on Lite (+1.7pp over Agentless) but trails both Agentless (-0.2pp) and SWE-Agent (-0.6pp) on Verified.",
    374       "supported": "weak"
    375     },
    376     {
    377       "claim": "HYPERAGENT-Lite-3 achieves 53.33% Pass@5 on RepoExec, outperforming all RAG baselines and matching models given full context.",
    378       "evidence": "Table 2: Lite-3 at 53.33% vs WizardLM2+Sparse RAG at 51.23% (next best). CodeLlama-34b with full context achieves 49.54% Pass@5.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "HYPERAGENT achieves 59.7% Acc@1 on Defects4J fault localization, surpassing AutoFL by 8.7 percentage points.",
    383       "evidence": "Table 3: HYPERAGENT-Lite-1 at 59.7%, AutoFL at 51.0%, with substantial margin over traditional methods (Ochiai 20.25%).",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "HYPERAGENT achieves 20.8% correct fixes on Defects4J v1.2, outperforming RepairAgent (18.7%), ITER (14.4%), and SelfAPR (16.2%).",
    388       "evidence": "Table 4: HYPERAGENT 82/395 (20.8%) vs RepairAgent 74 (18.7%); Table 10 provides per-project breakdown confirming HYPERAGENT leads on most projects.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "The Navigator agent is the most critical component; removing it causes the largest performance drop.",
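    393       "evidence": "Table 5: Removing Navigator drops Full-1 from 27% to 19% resolved on SWE-bench Tiny; removing Editor drops to 12%; removing Executor drops to 22%. As transcribed, the Editor removal (-15pp) is a larger drop than the Navigator removal (-8pp), which conflicts with the claim; these figures should be re-checked against Table 5.",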
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "HYPERAGENT-Lite-1 offers competitive performance at dramatically lower cost than SWE-Agent+GPT-4o ($0.45 vs $2.55).",
    398       "evidence": "Table 1: Lite-1 achieves 25.33%/30.2% on Lite/Verified at $0.45 while SWE-Agent+GPT-4o achieves 18.33%/23.2% at $2.55.",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "HYPERAGENT is the first system designed to work off-the-shelf across diverse SE tasks in multiple programming languages without task-specific adaptations.",
    403       "evidence": "Asserted as a contribution bullet point; no systematic comparison of adaptability or off-the-shelf usability across the field is performed.",
    404       "supported": "unsupported"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval"
    409   ],
    410   "key_findings": "HYPERAGENT is a multi-agent system with four roles (Planner, Navigator, Code Editor, Executor) that achieves competitive performance across three SE benchmarks: SWE-Bench (26% on Lite, 33% on Verified), RepoExec (53.33% Pass@5), and Defects4J (59.7% fault localization Acc@1, 20.8% correct program repair). The Navigator agent is the single most impactful component; its removal causes the largest performance degradation in ablation studies. HYPERAGENT-Lite-1 offers a cost-efficient configuration ($0.45/task) that outperforms SWE-Agent+GPT-4o ($2.55/task) on SWE-Bench Lite and Verified (25.33%/30.2% vs 18.33%/23.2%). The main failure modes are early exit due to hallucination and edit-failed loops.",
    411   "red_flags": [
    412     {
    413       "flag": "Thin margins without significance tests",
    414       "detail": "The headline SWE-Bench Lite result (26% vs 24.3%) is a 1.7pp margin with no statistical testing, no variance across runs, and stochastic LLMs — the claimed outperformance is not statistically established."
    415     },
    416     {
    417       "flag": "Model versions unspecified",
    418       "detail": "Results are reported for 'Claude-3-Sonnet' and 'GPT-4o' without snapshot dates or API version IDs; Claude-3-Sonnet had multiple revisions in 2024 with significant capability differences."
    419     },
    420     {
    421       "flag": "No variance across runs",
    422       "detail": "LLM inference is stochastic but no multiple-run statistics, standard deviations, or confidence intervals are reported for any benchmark results."
    423     },
    424     {
    425       "flag": "Unfair baseline comparison confound",
    426       "detail": "HYPERAGENT configurations use different LLMs than baselines (e.g., Full-1 uses Claude-3-Sonnet, while SWE-Agent baseline uses Claude-3.5-Sonnet which is stronger); model quality differences are not controlled for or discussed."
    427     },
    428     {
    429       "flag": "Ablations on different dataset than main results",
    430       "detail": "Ablation studies are conducted on 'SWE-bench Tiny' (100 instances) while main results use 300-500 instances; the representativeness of this subset is not validated."
    431     },
    432     {
    433       "flag": "Circular error analysis",
    434       "detail": "Error categorization in Section 6.4 is performed by feeding trajectories to Claude-3.5-Sonnet, which is related to the Claude-3-Sonnet model used in the system being analyzed."
    435     },
    436     {
    437       "flag": "Benchmark contamination unaddressed",
    438       "detail": "SWE-Bench and Defects4J were available before the likely training cutoffs of the evaluated LLMs (the paper does not state those cutoffs); potential memorization of benchmark solutions is not discussed."
    439     }
    440   ],
    441   "cited_papers": [
    442     {
    443       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    444       "relevance": "Primary benchmark for GitHub issue resolution evaluation; defines the task and evaluation protocol used throughout the paper."
    445     },
    446     {
    447       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    448       "relevance": "Primary baseline system for issue resolution comparison; represents the interactive bash-based agent paradigm."
    449     },
    450     {
    451       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    452       "relevance": "Key baseline that shows simplified non-agent approaches can compete with complex agent systems on SWE-Bench."
    453     },
    454     {
    455       "title": "AutoCodeRover: Autonomous Program Improvement",
    456       "relevance": "Baseline two-stage agent pipeline for bug fixing; represents structured decomposition approach."
    457     },
    458     {
    459       "title": "RepoExec: Evaluate Code Generation with a Repository-Level Executable Benchmark",
    460       "relevance": "Second benchmark used for evaluation; tests repository-level code generation with executability emphasis."
    461     },
    462     {
    463       "title": "A Quantitative and Qualitative Evaluation of LLM-based Explainable Fault Localization (AutoFL)",
    464       "relevance": "Primary baseline for fault localization; nearest competitor on Defects4J at 51% Acc@1."
    465     },
    466     {
    467       "title": "RepairAgent: An Autonomous, LLM-based Agent for Program Repair",
    468       "relevance": "Primary baseline for program repair; an autonomous LLM-based repair agent used for direct comparison."
    469     },
    470     {
    471       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    472       "relevance": "Related multi-agent SE system focused on software development from requirements."
    473     },
    474     {
    475       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    476       "relevance": "Core benchmark for fault localization and program repair evaluation; used in third experimental task."
    477     },
    478     {
    479       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    480       "relevance": "Cited as benchmark addressing contamination concerns; relevant to data integrity issues not addressed by this paper."
    481     }
    482   ],
    483   "engagement_factors": {
    484     "practical_relevance": {
    485       "score": 3,
    486       "justification": "Open-source multi-agent framework with cost-performance tradeoffs explicitly reported; directly usable by practitioners for automated SE tasks."
    487     },
    488     "surprise_contrarian": {
    489       "score": 1,
    490       "justification": "Expected result that a generalist multi-agent system can match specialized systems; the cost-efficiency finding (Lite-1 at $0.45) is the most novel practical insight."
    491     },
    492     "fear_safety": {
    493       "score": 0,
    494       "justification": "No AI safety or risk concerns raised; paper focuses on SE productivity improvements."
    495     },
    496     "drama_conflict": {
    497       "score": 1,
    498       "justification": "Competitive positioning against SWE-Agent and Agentless with thin margins creates some implicit tension; no overt controversy."
    499     },
    500     "demo_ability": {
    501       "score": 3,
    502       "justification": "GitHub repo released with full code; system runs on public SWE-Bench/Defects4J benchmarks that others can replicate."
    503     },
    504     "brand_recognition": {
    505       "score": 1,
    506       "justification": "FPT Software AI Center (Vietnam) and UT Dallas are not top-tier AI labs; limited brand pull in the competitive SE agent space."
    507     }
    508   },
    509   "hn_data": {
    510     "threads": [
    511       {
    512         "hn_id": "41783122",
    513         "title": "INT-FlashAttention: Enabling Flash Attention for INT8 Quantization",
    514         "points": 6,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=41783122",
    517         "created_at": "2024-10-09T00:04:36Z"
    518       },
    519       {
    520         "hn_id": "39868006",
    521         "title": "Applied Category Theory in the Wolfram Language Using Categorica I",
    522         "points": 4,
    523         "comments": 1,
    524         "url": "https://news.ycombinator.com/item?id=39868006",
    525         "created_at": "2024-03-29T19:26:54Z"
    526       },
    527       {
    528         "hn_id": "41760024",
    529         "title": "Irrelevant Alternatives Bias Large Language Model Hiring Decisions",
    530         "points": 3,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=41760024",
    533         "created_at": "2024-10-06T20:27:12Z"
    534       },
    535       {
    536         "hn_id": "41798265",
    537         "title": "INT8 FlashAttention",
    538         "points": 2,
    539         "comments": 0,
    540         "url": "https://news.ycombinator.com/item?id=41798265",
    541         "created_at": "2024-10-10T12:47:46Z"
    542       },
    543       {
    544         "hn_id": "33121142",
    545         "title": "Device Tracking via Linux TCP Source Port Selection Algorithm",
    546         "points": 2,
    547         "comments": 0,
    548         "url": "https://news.ycombinator.com/item?id=33121142",
    549         "created_at": "2022-10-07T13:41:38Z"
    550       },
    551       {
    552         "hn_id": "37610935",
    553         "title": "Quantum Confusions, Cleared Up (or so I hope)",
    554         "points": 1,
    555         "comments": 0,
    556         "url": "https://news.ycombinator.com/item?id=37610935",
    557         "created_at": "2023-09-22T12:09:32Z"
    558       }
    559     ],
    560     "top_points": 6,
    561     "total_points": 18,
    562     "total_comments": 1
    563   }
    564 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs