scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26409B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Automated Program Repair via Faulty Token Localization and Quality-Aware Patch Refinement",
      6     "authors": [
      7       "Jiaolong Kong",
      8       "Xiaofei Xie",
      9       "Yiheng Xiong",
     10       "Yuekun Wang",
     11       "Jian Wang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2511.18001",
     16     "doi": "10.48550/arXiv.2511.18001"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The 88 (Defects4J 1.2) and 139 (HumanEval-Java) correct-fix counts and the 8.2%–34.9% (Defects4J) and 3.3%–16.1% (HumanEval-Java) improvement ranges are directly traceable to Table 4 and the Venn diagrams in Fig. 4; per-model baseline comparisons verify the claimed ranges.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Ablation studies (RQ3, Table 5) systematically remove each component and show performance drops of up to 20.6%, supporting the causal attribution of gains to the proposed modules.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The abstract and conclusion claim 'state-of-the-art in automated program repair' without noting the restriction to single-hunk Java bugs and 7B–8B parameter models; the evaluation scope is stated in the setup but not bounded in the main conclusions.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes performance gains solely to token-level uncertainty without seriously considering whether gains could be explained by the increased effective sampling diversity introduced by the refinement loop.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes #Plausible (passes test suites) from #Correct (manually verified as semantically equivalent to ground truth), with three independent reviewers spending 10+ hours each.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 'Threats to Validity' addresses manual verification bias and experimental reproducibility threats, constituting a dedicated section.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Threats are specific: manual verification is mitigated by three independent SE researchers each spending 10+ hours; reproducibility threat is attributed to floating-point non-determinism in LLM inference.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state in conclusions or limitations that results are bounded to Java, single-hunk bugs, or small (7B–8B) open-source models; the single-hunk restriction appears only in the setup section.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment or grant information appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors list Singapore Management University as their affiliation in the paper header.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'Plausible patch' and 'correct patch' are formally defined in Section 4.1.4; token-level uncertainty is formally defined via the probability-difference metric in Eq. 1; APR is explained through prior work context.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 explicitly lists three bulleted contributions: first incorporation of internal reflection into LLM-based repair, the TokenRepair framework itself, and the comprehensive evaluation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6 situates TokenRepair relative to conversation-based (ChatRepair, ContrastRepair, CigaR, RepairAgent) and fine-tuning-based APR methods, and explains how this work differs by exploiting internal uncertainty signals.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The paper states 'we have made our patches open-source for public evaluation' but provides no repository URL or link; patch outputs are not the same as source code.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Both Defects4J 1.2 and HumanEval-Java are standard public benchmarks used unmodified and are publicly accessible.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "HuggingFace model links are provided but no requirements file, Docker container, or dependency list is included.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided; the methodology is described algorithmically but the operational pipeline for running experiments is absent.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Table 4 and Table 5 are reported as single point estimates with no confidence intervals or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to comparative claims; differences in bug-fix counts are reported without hypothesis testing.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Percentage improvement over the best baseline is explicitly reported (e.g., 8.2%–34.9% on Defects4J) providing effect sizes in context.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The benchmarks are used as-is (154 and 163 bugs) with no sample size justification or power analysis.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Results are single-run counts; no variance or standard deviation across repeated runs is reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Three baselines are included: Base Sampling, CoT-Decoding, and ChatRepair, covering the main competing paradigms.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "ChatRepair (2024), CoT-Decoding (2024), and Base Sampling (2025) are contemporary and directly competitive with the proposed approach.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 5.3 presents a full ablation with three variants (w/o Majority, w/o Localize, w/o Quality) evaluated on both benchmarks across all five models.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Three metrics are used: #Plausible (test-passing patches), #Correct (manually verified), and #Gen (efficiency: patches per correct fix).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Three SE researchers independently manually verified plausible patches, each spending 10+ hours, with disagreements resolved by consensus (Section 7).",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Defects4J provides ground-truth tests separate from the development process; patches must pass predefined test suites not used in generation.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by model (5 models) and benchmark (2 datasets); Table 6 further breaks down by hyperparameter configuration per model.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 5.2 discusses DeepSeek's marginal underperformance on HumanEval-Java and explains it via weaker localization accuracy; Section 5.4 discusses why m=9 consistently underperforms.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that TokenRepair slightly underperforms Base Sampling for DeepSeek on HumanEval-Java (98 vs 99) and that m=9 never achieves best performance across any configuration.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Exact model names (e.g., 'Qwen2.5-Coder-7B-Instruct', 'Llama-3.1-8B-Instruct') are provided with HuggingFace repository links in references [5,6,7,11,23,25].",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper references ConstructPrompt as an algorithm step and describes inputs conceptually, but no actual prompt templates or examples are shown.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature (t=1), budget (50), TopK (3), decay factor α (0.5), n∈{2,5}, and m∈{3,6,9} are all explicitly reported in Section 4.1.1.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Algorithm 1 provides a complete pseudocode description of the full TokenRepair pipeline including the BFS loop, quality filtering, and internal/external feedback phases.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 4.1.2 specifies the benchmark construction process: 154 single-hunk bugs from Defects4J 1.2 with buggy hunk location provided from ground truth, following prior work [19,36].",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Generated patches are claimed open-source but no URL is provided; the raw LLM outputs, uncertainty scores, and intermediate results are not publicly released.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Both benchmarks are established public datasets with documented origins; the subset selection criterion (single-hunk bugs) is explicitly stated.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "Standard benchmarks are used; no participant recruitment is involved.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Algorithm 1 documents the full pipeline from bug input through patch generation, evaluation, quality filtering, and output; the flow from benchmark loading to results is traceable.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs are not stated for any of the five evaluated models (including Qwen2.5-Coder, Llama-3.1, DeepSeek-Coder, and CodeGemma) despite evaluating on public benchmarks.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Defects4J and HumanEval-Java are widely published benchmarks likely present in LLM training corpora; the paper does not discuss this potential contamination.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "No discussion of whether Defects4J bugs or HumanEval-Java solutions appeared in the training data of any of the five evaluated models.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants; NA.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants; NA.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants; NA.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants; NA.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants; NA.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants; NA.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants; NA.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "#Gen metric (average patches generated per correct fix) is reported in Table 4 as a computational cost proxy; lower values indicate higher efficiency.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "A per-bug patch budget cap of 50 is stated, but total GPU hours, wall-clock time, or hardware specification for the full experimental suite is not reported.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "TokenRepair achieves 88 correct fixes on Defects4J 1.2 across all five models, a 7.3% improvement over the best baseline (ChatRepair at 82).",
    375       "evidence": "Fig. 4a Venn diagram and Table 4 per-model results summed and verified against ChatRepair totals.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "TokenRepair achieves 139 correct fixes on HumanEval-Java, a 6.1% improvement over ChatRepair (131).",
    380       "evidence": "Fig. 4b Venn diagram and Table 4 HumanEval-Java results.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Per-model improvements over the best baseline range from 8.2% to 34.9% on Defects4J 1.2.",
    385       "evidence": "Table 4: Llama (53 vs 49 ChatRepair = 8.2%), CodeGemma (58 vs 43 ChatRepair = 34.9%).",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Uncertainty-guided faulty token localization achieves average Top-3 accuracy of 0.589–0.695 across models and benchmarks.",
    390       "evidence": "Table 1 reports Avg. column for α=0.5, TopK=3 across all five models on both benchmarks.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Majority voting for first-token identification is strongly correlated with actual first-token correctness (F1 scores 0.624–0.928).",
    395       "evidence": "Table 2 reports precision, recall, and F1 for all models on both benchmarks.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Uncertainty decrease during iterative repair is predictive of successful patch trajectories, with plausible paths showing 55.8%–80.5% decreasing uncertainty transitions vs. balanced distributions for incorrect paths.",
    400       "evidence": "Table 3 shows clear disparity between plausible and incorrect paths across all models and benchmarks.",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "All three components (majority voting, uncertainty localization, quality filtering) independently contribute to performance, with localization being most critical (up to 20.6% drop on removal).",
    405       "evidence": "Table 5 ablation study across both benchmarks and all five models.",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval"
    411   ],
    412   "key_findings": "TokenRepair achieves new state-of-the-art automated program repair by combining token-level uncertainty-guided fault localization (Top-3 accuracy 0.589–0.695) with quality-aware patch filtering, correctly fixing 88 bugs on Defects4J 1.2 and 139 on HumanEval-Java using five 7B–8B open-source LLMs. Per-model improvements over the best baseline (ChatRepair) range from 8.2% to 34.9% on Defects4J and 3.3% to 16.1% on HumanEval-Java; the one reported exception is DeepSeek on HumanEval-Java, where TokenRepair slightly trails Base Sampling (98 vs 99). Ablation confirms uncertainty-guided token localization is the dominant component (up to 20.6% performance drop on removal), while excessive refinement budget allocation (m=9) consistently underperforms due to localization accuracy bounds and model distribution bias. All results are bounded to single-hunk Java bugs; contamination of public benchmarks in LLM training data is unaddressed.",
    413   "red_flags": [
    414     {
    415       "flag": "No statistical significance tests",
    416       "detail": "All comparative claims between TokenRepair and baselines are based on raw bug-fix counts with no hypothesis testing or confidence intervals, making it impossible to assess whether differences are statistically meaningful given the small benchmark sizes (154 and 163 bugs)."
    417     },
    418     {
    419       "flag": "Benchmark contamination unaddressed",
    420       "detail": "Defects4J and HumanEval-Java are widely published benchmarks likely present in the training corpora of all five evaluated models; training data cutoffs are not stated and overlap is not discussed."
    421     },
    422     {
    423       "flag": "Single-run results only",
    424       "detail": "With temperature=1 and non-deterministic LLM inference, results are reported as single-run counts with no variance across multiple runs, making reported improvements potentially unstable."
    425     },
    426     {
    427       "flag": "Scope overclaim in abstract and conclusions",
    428       "detail": "The paper claims 'state-of-the-art in automated program repair' without noting the restriction to single-hunk Java bugs with small open-source models; results may not transfer to multi-hunk, non-Java, or larger proprietary models."
    429     },
    430     {
    431       "flag": "Prompts not disclosed",
    432       "detail": "The ConstructPrompt function is referenced algorithmically but actual prompt templates are never shown, preventing verification of whether prompt design artifacts drive the improvements."
    433     },
    434     {
    435       "flag": "No code repository URL",
    436       "detail": "The claim of open-source patch release has no accompanying URL, making independent verification or reproduction infeasible."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    442       "relevance": "Primary baseline (ChatRepair); TokenRepair directly extends and compares against this conversational APR paradigm."
    443     },
    444     {
    445       "title": "Chain-of-thought reasoning without prompting",
    446       "relevance": "CoT-Decoding is a direct baseline and TokenRepair's token-guided CoT-Decoding is a core component adapted from this work."
    447     },
    448     {
    449       "title": "Demystifying Memorization in LLM-Based Program Repair via a General Hypothesis Testing Framework",
    450       "relevance": "Provides the Base Sampling baseline and benchmark construction methodology used by TokenRepair."
    451     },
    452     {
    453       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    454       "relevance": "Primary evaluation benchmark providing 154 single-hunk Java bugs."
    455     },
    456     {
    457       "title": "Impact of code language models on automated program repair",
    458       "relevance": "Introduces HumanEval-Java benchmark used as the second evaluation dataset."
    459     },
    460     {
    461       "title": "Calibration and correctness of language models for code",
    462       "relevance": "Establishes that token-level uncertainty correlates with code correctness, providing empirical foundation for TokenRepair's uncertainty-guided localization."
    463     },
    464     {
    465       "title": "Uncertainty-guided chain-of-thought for code generation with LLMs",
    466       "relevance": "Shows first token uncertainty as proxy for generation quality; motivates TokenRepair's trace quality measurement component."
    467     },
    468     {
    469       "title": "ContrastRepair: Enhancing Conversation-Based Automated Program Repair via Contrastive Test Case Pairs",
    470       "relevance": "Prior work co-authored by this paper's first and second authors (Jiaolong Kong and Xiaofei Xie); represents the conversational APR baseline class that TokenRepair extends."
    471     },
    472     {
    473       "title": "A survey of confidence estimation and calibration in large language models",
    474       "relevance": "Provides the probability-difference uncertainty metric (Eq. 1) adopted by TokenRepair for token-level uncertainty computation."
    475     },
    476     {
    477       "title": "Self-consistency improves chain of thought reasoning in language models",
    478       "relevance": "Motivates the majority voting strategy for first-token identification via self-consistency decoding principles."
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 2,
    484       "justification": "APR tools directly address developer debugging time, though the restriction to single-hunk Java bugs with small open-source LLMs limits immediate practitioner applicability."
    485     },
    486     "surprise_contrarian": {
    487       "score": 1,
    488       "justification": "Applying token-level uncertainty for fault localization in APR is a novel angle, but the finding that targeted refinement beats coarse-grained feedback is expected rather than surprising."
    489     },
    490     "fear_safety": {
    491       "score": 0,
    492       "justification": "No AI safety or risk concerns; automated bug fixing is a constructive application."
    493     },
    494     "drama_conflict": {
    495       "score": 0,
    496       "justification": "No controversy or adversarial framing; straightforward systems paper."
    497     },
    498     "demo_ability": {
    499       "score": 1,
    500       "justification": "Uses public benchmarks (Defects4J, HumanEval-Java) that practitioners could re-run, but no live demo, public code repository, or tool release is provided."
    501     },
    502     "brand_recognition": {
    503       "score": 0,
    504       "justification": "Singapore Management University is a reputable institution but not a top-tier AI lab; no famous models or products involved."
    505     }
    506   },
    507   "hn_data": {
    508     "threads": [
    509       {
    510         "hn_id": "42889052",
    511         "title": "Large language models think too fast to explore effectively",
    512         "points": 118,
    513         "comments": 41,
    514         "url": "https://news.ycombinator.com/item?id=42889052"
    515       },
    516       {
    517         "hn_id": "46664297",
    518         "title": "VaultGemma: A Differentially Private LLM",
    519         "points": 3,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=46664297"
    522       },
    523       {
    524         "hn_id": "42968402",
    525         "title": "Fault Localization via Fine-Tuning LLMs with Mutation Generated Stack Traces",
    526         "points": 3,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=42968402"
    529       },
    530       {
    531         "hn_id": "46555313",
    532         "title": "Name That Part: 3D Part Segmentation and Naming",
    533         "points": 2,
    534         "comments": 1,
    535         "url": "https://news.ycombinator.com/item?id=46555313"
    536       },
    537       {
    538         "hn_id": "46838079",
    539         "title": "VaultGemma: A Differentially Private LLM",
    540         "points": 1,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=46838079"
    543       }
    544     ],
    545     "top_points": 118,
    546     "total_points": 127,
    547     "total_comments": 42
    548   }
    549 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs