scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27061B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "EXPEREPAIR: Dual-Memory Enhanced LLM-based Repository-Level Program Repair",
      6     "authors": [
      7       "Fangwen Mu",
      8       "Junjie Wang",
      9       "Lin Shi",
     10       "Song Wang",
     11       "Shoubin Li",
     12       "Qing Wang"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2506.10484",
     17     "doi": "10.48550/arXiv.2506.10484"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims that EXPEREPAIR achieves 49.3% pass@1 with Claude 3.7 Sonnet (Table 1 confirms), uses dual-memory systems (Section 3.3), and outperforms open-source methods (verified in Table 1 comparison).",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Paper claims dual-memory systems improve repair; ablation study (Table 2) demonstrates this causally—removing experience module drops performance from 47.7% to 41.3%, establishing the causal link.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Claims are explicitly bounded to SWE-Bench Lite (300 issues from 12 Python projects on GitHub). No overgeneralization to all software repair tasks.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Paper compares to baselines but does not discuss alternative explanations for improvements (e.g., whether newer Claude models alone explain gains, or if prompting strategy is the primary driver). Ablation removes components but doesn't isolate model version effects.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Paper acknowledges that pass@1 is passing SWE-Bench tests, not true repair quality. Section 3.2.3 addresses false positives via validation tests. Introduces ESR and RSR metrics to capture test generation/reproduction quality separately.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6 titled 'Limitations' discusses the specific challenge of optimizing bug localization—the lack of automated oracle for verifying localization correctness.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section 6 identifies specific threat: bug localization correctness cannot be verified via execution outcomes alone. However, paper omits threats from small sample (300 issues), Python-only scope, GitHub-only source, and model dependency.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Scope is explicitly stated: 'SWE-Bench Lite benchmark...300 GitHub issues...12 diverse real-world software projects written in Python.' No claims about applicability beyond this domain.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding source disclosed anywhere in the paper. Affiliations listed (Chinese Academy of Sciences, Beihang, York University) but no funding acknowledgment.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations listed at top: State Key Laboratory of Intelligent Game/Institute of Software at CAS, Beihang University, York University. However, no disclosure of whether authors have conflicts with evaluated baseline tools.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No funding disclosed, so criterion not applicable.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement present. No disclosure of patents, equity, or consulting relationships.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms defined: 'episodic memory' (concrete repair demonstrations), 'semantic memory' (abstract insights), 'repository-level repair', 'ReAct algorithm'. However, 'pass@1' not explicitly defined in paper body.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Introduction (Section 1) clearly states two limitations of prior work and three bullet-pointed contributions: dual-memory accumulation, dynamic prompt generation, and comprehensive SWE-Bench evaluation.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 'Related Work' discusses APR history, contrasts agentic vs procedural approaches, and explicitly differentiates EXPEREPAIR (accumulates historical experience) from prior work (treats issues in isolation).",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "Section 7 states 'We release our code and data to support further research' with reference [10] pointing to https://github.com/ExpeRepair/ExpeRepair.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Uses SWE-Bench Lite, a publicly available benchmark (300 issues from GitHub). No new proprietary dataset created.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Appendix A.1 gives iteration limits and hyperparameters but no requirements.txt, Dockerfile, or Python/package version specs. Only states 'Claude-3.5-Sonnet V2' and 'DeepSeek-R1' as LLM versions.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "Section 4 and Appendix A describe the method and setup, but paper itself contains no step-by-step instructions to reproduce results. Code repository may include them, but not in the paper.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Table 1 shows single pass@1 value per method (47.7% for EXPEREPAIR) with no confidence intervals, standard error, or error bars. No mention of multiple runs.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests reported. Improvements over DARS (47.7% vs 47.0%) and PatchPilot (47.7% vs 45.3%) are small but untested for significance.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Ablation study (Table 2) reports effect sizes: removing experience module causes 6.4pp drop (47.7→41.3%). However, no effect sizes reported for baseline comparisons.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "Paper states '300 GitHub issues' but provides no justification, power analysis, or sample size calculation. Adequacy of 300 issues for subgroup comparisons not discussed.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance, standard deviation, or multiple runs reported. Single pass@1 result per method only. No confidence intervals or run-to-run variance.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Table 1 compares to open-source baselines including SWE-Agent, Moatless Tools, AutoCodeRover, Agentless, OpenHands, PatchPilot, and DARS.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "All baselines are recent (2024–2025). Most use Claude 3.5 Sonnet V2 for fair comparison; some earlier rows use GPT-4o but contemporary at time of baseline publication.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Table 2 provides ablation study removing: (1) Experience Module (41.3%), (2) Demonstrations (43.7%), (3) Insights (46.0%) from full system (47.7%).",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Multiple metrics reported: pass@1 (main metric), average cost (Table 1), ESR and RSR in ablations (Table 2), pass@1 across model variants (Figure 3), intersection analysis (Figure 2).",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Paper mentions 'manually verified by human annotators' for RSR metric, but provides no inter-annotator agreement, annotation guidelines, or sample size of manual verification.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "SWE-Bench Lite is a held-out benchmark created independently (2023) before this work. Results are on this external benchmark, not a paper-created test set.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": false,
    219           "justification": "No breakdown by issue type, severity, project, or bug category. No analysis showing which types of bugs are fixed vs. failed. Intersection analysis (Figure 2) only shows which methods overlap, not why.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "Paper reports that EXPEREPAIR uniquely resolves 9 issues (Figure 2) but does not discuss or show examples of failure cases, bugs that could not be fixed, or categories where method struggles.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": false,
    231           "justification": "Only positive results presented. No discussion of when dual-memory fails, when retrieval hurts performance, or when semantic insights are misleading.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Main experiments use 'Claude-3.5-Sonnet V2' (versioned). Section 5.3 tests 'Claude 3.7 Sonnet' (marketing name, no snapshot date), 'o1-mini', 'DeepSeek-R1' (versioned). Mostly clear.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Appendix A.2 provides actual system prompts used in test generation (Figure 4), patch generation (Figure 6), patch refinement (Figure 7), validation test generation (Figure 5), and insight summarization (Figure 8).",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Appendix A.1 reports: iteration limits (3 for test/patch), patch candidates per iteration (4), validation test samples (3), augmented patches (4), top-k retrieval (5), max insights per agent (15), retrieval method (BM25). Temperature settings not reported.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "Section 3.2 describes test agent and patch agent architecture, ReAct algorithm, iterative refinement with feedback. Section 3.3 describes dual-memory module. Workflow illustrated in Figure 1.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Uses SWE-Bench Lite as-is; no custom preprocessing steps applied. Standard benchmark used without modification.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "Evaluation uses SWE-Bench Lite, a publicly available benchmark. Raw data (GitHub issues) accessible from SWE-Bench.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": false,
    276           "answer": false,
    277           "justification": "No custom data collection; uses existing SWE-Bench Lite benchmark. Not applicable to this paper.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human recruitment. Uses GitHub issues from public repositories. Not applicable.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Uses SWE-Bench Lite without custom pipeline; no data transformations documented. Standard benchmark used directly.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "Paper does not state training cutoff for Claude 3.5/3.7 models. SWE-Bench was created in 2023 with issues from 2022–2023; 2025 models were likely trained on internet data that includes material published after SWE-Bench's release, creating a contamination risk that is not discussed.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "No discussion of whether Claude proprietary models may have seen SWE-Bench issues during training. Significant gap given closed-source training data.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No discussion of potential contamination. No analysis of whether Claude models' training data overlaps with SWE-Bench or GitHub repositories used.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human subjects study; not applicable.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human subjects study; not applicable.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human subjects study; not applicable.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human subjects study; not applicable.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human subjects study; not applicable.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human subjects study; not applicable.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human subjects study; not applicable.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "Table 1 reports average cost per issue: EXPEREPAIR $2.07, vs. DARS $12.24. Cost analysis demonstrates practical efficiency.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Table 1 shows average cost per instance ($2.07). Total budget estimable (300 issues × $2.07 ≈ $620) but not explicitly stated. Inference iterations (up to 3×4 for patches) documented.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "EXPEREPAIR achieves 47.7% pass@1 on SWE-Bench Lite with Claude 3.5 Sonnet V2, outperforming all open-source baselines",
    376       "evidence": "Table 1 comparison shows EXPEREPAIR 47.7% vs next-best DARS 47.0% (PatchPilot 45.3%)",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Removing the experience module causes a 6.4pp drop in performance (47.7→41.3%)",
    381       "evidence": "Table 2 ablation study shows performance degradation when experience module removed",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Episodic and semantic memories both contribute to performance, with demonstrations more critical than insights",
    386       "evidence": "Table 2 ablations: w/o demonstrations 43.7%, w/o insights 46.0%. Removing demonstrations has larger effect (4pp vs 1.7pp)",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "EXPEREPAIR uniquely resolves 9 issues that no other open-source baseline can fix",
    391       "evidence": "Figure 2 intersection analysis shows 9-issue unique set",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Stronger LLM models lead to better repair performance",
    396       "evidence": "Figure 3 shows Claude 3.7 (49.3%) > Claude 3.5 (47.7%) > DeepSeek-R1 (45%) > o1-mini (41.7%)",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "EXPEREPAIR is 6x more cost-efficient than DARS while achieving similar resolution rates",
    401       "evidence": "Table 1: EXPEREPAIR $2.07 vs DARS $12.24 per issue; 47.7% vs 47.0% pass@1",
    402       "supported": "strong"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval",
    407     "observational"
    408   ],
    409   "key_findings": "EXPEREPAIR demonstrates that accumulating and reusing historical repair experiences via dual-memory systems (episodic and semantic) improves LLM-based program repair. The method achieves 47.7% pass@1 on SWE-Bench Lite, matching or exceeding all open-source baselines while maintaining favorable cost. Ablation studies confirm both memory types contribute, with concrete demonstrations having larger impact than abstract insights.",
    410   "red_flags": [
    411     {
    412       "flag": "No confidence intervals or variance",
    413       "detail": "All results are single-run; no error bars, confidence intervals, or statistical significance tests. Improvements over DARS (47.7% vs 47.0%) and PatchPilot (47.7% vs 45.3%) are marginal and untested for significance."
    414     },
    415     {
    416       "flag": "Model contamination not addressed",
    417       "detail": "Paper evaluates 2025 Claude models on 2023 SWE-Bench issues sourced from GitHub repositories. Training data cutoff for proprietary models not disclosed; high contamination risk not discussed."
    418     },
    419     {
    420       "flag": "No failure case analysis",
    421       "detail": "Paper reports unique successes but provides no examples or analysis of failure modes, bug types where method struggles, or categories of issues that resist repair."
    422     },
    423     {
    424       "flag": "Memory growth unbounded",
    425       "detail": "Episodic memory stores all successful and failed demonstrations; paper does not discuss memory size growth, storage costs, or retrieval performance at scale."
    426     },
    427     {
    428       "flag": "Limited scope without justification",
    429       "detail": "Evaluation limited to 300 Python issues from 12 GitHub repositories. No power analysis justifying sample size; generalization to non-GitHub, non-Python, or cross-project repair not assessed."
    430     },
    431     {
    432       "flag": "Hyperparameter selection not justified",
    433       "detail": "Top-5 demonstrations retrieved, max 15 insights per agent, 3 iterations per task chosen without ablation or justification. Sensitivity to these choices unexplored."
    434     },
    435     {
    436       "flag": "Per-category breakdown missing",
    437       "detail": "No analysis of performance by issue type, severity, project, or bug category. Intersection analysis (Figure 2) only shows method overlap, not diagnostic breakdown."
    438     },
    439     {
    440       "flag": "No funding or COI disclosure",
    441       "detail": "No funding source disclosed. No competing interests statement or disclosure of potential conflicts with evaluated baselines."
    442     }
    443   ],
    444   "cited_papers": [
    445     {
    446       "title": "SWE-Bench: Can Language Models Resolve Real-World GitHub Issues?",
    447       "relevance": "Defines the evaluation benchmark (SWE-Bench Lite) and motivates repository-level program repair task"
    448     },
    449     {
    450       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    451       "relevance": "Key baseline method; agentic approach to repository-level repair using tool interaction"
    452     },
    453     {
    454       "title": "Agentless: Demystifying LLM-Based Software Engineering Agents",
    455       "relevance": "Procedural alternative to agentic methods; key baseline for comparison"
    456     },
    457     {
    458       "title": "AutoCodeRover: Autonomous Program Improvement",
    459       "relevance": "Agent-based baseline for repository-level repair; recent prior art"
    460     },
    461     {
    462       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    463       "relevance": "Algorithm for iterative reasoning and action; used in EXPEREPAIR's test and patch agents"
    464     },
    465     {
    466       "title": "Dual-Process and Dual-System Theories of Reasoning",
    467       "relevance": "Theoretical foundation for dual-memory system analogy from cognitive science"
    468     },
    469     {
    470       "title": "Automatic Software Repair: A Survey",
    471       "relevance": "Comprehensive background on APR techniques and historical context"
    472     },
    473     {
    474       "title": "Dynamine: Finding Common Error Patterns by Mining Software Revision Histories",
    475       "relevance": "Prior work on recurring bug patterns in software evolution; motivates historical repair experience reuse"
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 3,
    481       "justification": "Directly applicable tool for automated code repair on real GitHub issues; practitioners can deploy for maintenance automation. Cost-efficient (avg $2.07/issue) makes deployment feasible."
    482     },
    483     "surprise_contrarian": {
    484       "score": 1,
    485       "justification": "Applies well-established dual-memory concept from cognitive science to known problem (APR); incremental improvement on existing agent-based methods rather than novel insight."
    486     },
    487     "fear_safety": {
    488       "score": 0,
    489       "justification": "No AI safety or risk concerns raised. Improves code quality through better bug fixes; no misalignment or harmful capability demonstrated."
    490     },
    491     "drama_conflict": {
    492       "score": 0,
    493       "justification": "Straightforward engineering paper; no controversy, competing claims, or methodological disputes prominent in narrative."
    494     },
    495     "demo_ability": {
    496       "score": 2,
    497       "justification": "Code released on GitHub (per Section 7); requires SWE-Bench setup and API access to Claude/DeepSeek models. Not instantly demoable in browser but reproducible with effort."
    498     },
    499     "brand_recognition": {
    500       "score": 1,
    501       "justification": "Chinese Academy of Sciences and Beihang University are respectable but not top-tier research brands in the APR/SE community. No major tech company affiliation."
    502     }
    503   },
    504   "hn_data": {
    505     "threads": [
    506       {
    507         "hn_id": "46728063",
    508         "title": "New York Times games are hard: A computational perspective",
    509         "points": 73,
    510         "comments": 33,
    511         "url": "https://news.ycombinator.com/item?id=46728063"
    512       },
    513       {
    514         "hn_id": "43695562",
    515         "title": "M1: Towards Scalable Test-Time Compute with Mamba Reasoning Models",
    516         "points": 33,
    517         "comments": 3,
    518         "url": "https://news.ycombinator.com/item?id=43695562"
    519       },
    520       {
    521         "hn_id": "44024987",
    522         "title": "Can You Trust Code Copilots? Evaluating LLMs from a Code Security Perspec",
    523         "points": 11,
    524         "comments": 2,
    525         "url": "https://news.ycombinator.com/item?id=44024987"
    526       },
    527       {
    528         "hn_id": "31833716",
    529         "title": "What does it take to solve the measurement problem?",
    530         "points": 5,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=31833716"
    533       },
    534       {
    535         "hn_id": "43116772",
    536         "title": "AI Alignment at Your Discretion",
    537         "points": 3,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=43116772"
    540       },
    541       {
    542         "hn_id": "44276478",
    543         "title": "Getting Explicit Instruction Right",
    544         "points": 2,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=44276478"
    547       },
    548       {
    549         "hn_id": "45284415",
    550         "title": "Is In-Context Learning Learning?",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=45284415"
    554       },
    555       {
    556         "hn_id": "31840313",
    557         "title": "What does it take to solve the measurement problem?",
    558         "points": 2,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=31840313"
    561       },
    562       {
    563         "hn_id": "46345690",
    564         "title": "Computational complexity of New York Times games",
    565         "points": 1,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=46345690"
    568       },
    569       {
    570         "hn_id": "45467729",
    571         "title": "AegisShield: Democratizing Cyber Threat Modeling with Generative AI",
    572         "points": 1,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=45467729"
    575       }
    576     ],
    577     "top_points": 73,
    578     "total_points": 133,
    579     "total_comments": 38
    580   }
    581 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs