scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27145B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "On the Effectiveness of LLM-as-a-Judge for Code Generation and Summarization",
      6     "authors": [
      7       "Giuseppe Crupi",
      8       "Rosalia Tufano",
      9       "Alejandro Velasco",
     10       "Antonio Mastropaolo",
     11       "Denys Poshyvanyk",
     12       "Gabriele Bavota"
     13     ],
     14     "year": 2025,
     15     "venue": "IEEE Transactions on Software Engineering",
     16     "arxiv_id": "2507.16587",
     17     "doi": "10.1109/TSE.2025.3586082"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract's claims that GPT-4-turbo is the best judge and that smaller LLMs struggle are directly supported by Cohen's Kappa in Table 2 and Krippendorff's α in Table 5; the claim that even the best LLM frequently misjudges is supported by confusion matrices showing a 50% false-positive rate for wrong Java implementations.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": false,
     29         "answer": false,
     30         "justification": "The paper makes comparative observational claims about LLM judging performance, not causal claims; the study is descriptive and evaluative rather than interventional.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "External validity section explicitly bounds results to two tasks (code generation and code summarization) and two languages (Java and Python), with a call for differentiated replications.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper explicitly tests and dismisses 'lack of coding context' as a major factor by rerunning analysis on self-contained functions; the false positive/negative qualitative analysis identifies specific alternative reasons for misjudgments.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The Construct Validity section explicitly acknowledges that test execution is a proxy for code correctness and documents quality checks excluding unreliable test cases; human judgment as oracle for summarization is similarly discussed with inter-rater agreement measured.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 4 'Threats to Validity' covers construct, internal, and external validity as distinct subsections with specific discussion under each.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats include: CoderEval test suite quality (67 problems excluded with documented criteria), subjectivity in manual analysis mitigated by multi-author labeling with conflict resolution, and explicit restriction to Java/Python and two SE tasks.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "External validity explicitly states results 'are capped by (i) the two code-related tasks subject of the study and (ii) the focus on the Java and Python programming languages.'",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding source is mentioned anywhere in the paper.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All author affiliations are clearly disclosed in the paper header (SEART @ Università della Svizzera italiana, William & Mary).",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No funding is disclosed, making this criterion not applicable.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or financial interest declaration appears in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "LLM-as-a-judge is defined in the introduction; code generation and code summarization are precisely defined with their evaluation challenges and existing metric limitations described.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper explicitly states its goal is to 'assess the effectiveness of LLMs-as-a-judge for software-related tasks' with a single focused research question stated in Section 2.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 5 explicitly positions this work relative to ICE-Score, CodeJudge, Weyssow et al., and Koutcheme et al., explaining methodological differences and how this study extends or improves on each.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "A replication package is referenced at GitHub [1] (https://github.com/crupig/LLMs-as-a-judge-for-SE-tse RP) containing prompts, extraction scripts, and data.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The paper explicitly states they 'build (and make publicly available [1]) our own dataset' of 1,163 summaries with human judgments; CoderEval is also publicly available.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No requirements.txt, Dockerfile, or dependency specification is mentioned; HuggingFace inference endpoints are referenced but no environment specification is provided.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper references a replication package repeatedly but provides no step-by-step reproduction instructions within the paper itself.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Main results (Kappa scores, Krippendorff's α, bias coefficients) are reported as point estimates without confidence intervals or error bars.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Mann-Whitney tests with Benjamini-Hochberg correction for multiple testing are used for self-bias analysis; Krippendorff's α and Cohen's Kappa are used as agreement metrics with explicit interpretation thresholds.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Cliff's δ effect sizes are reported for all Mann-Whitney tests in Tables 3 and 6 with explicit interpretation thresholds (negligible/small/medium/large).",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No power analysis or formal sample size justification is provided; sample sizes are determined by benchmark availability and resource constraints rather than statistical considerations.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Agreement scores and bias coefficients are reported as point estimates without variance or standard deviation across repeated runs.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Eight LLMs of different sizes are compared against each other and against oracle ground truths (test execution for code generation, human judgments for summarization).",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "GPT-4-turbo, GPT-3.5-turbo, CodeLlama, and DeepSeek Coder were contemporary state-of-the-art models at time of study.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Four different prompting strategies (zero-shot, zero-shot W/O rationale, automated CoT, slow-thinking) are systematically compared for both tasks across all LLMs.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Code generation uses Cohen's Kappa, confusion matrices, bias coefficients, accuracy, and mutation testing; code summarization uses Krippendorff's α across three quality dimensions.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Nine human judges independently evaluated 1,163 code summaries across three quality dimensions, with each summary rated by three judges and inter-rater agreement measured.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "This is a benchmarking evaluation study, not a machine learning training/prediction task requiring train/test splits.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by language (Java vs Python), by LLM, by quality criterion (content adequacy, conciseness, fluency), and by prompt type.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "A dedicated qualitative analysis identifies reasons for false positives (uncaught wrong behavior 37%, coding context 32%, ambiguous requirements 27%) and false negatives (hallucination 33%, misunderstanding 19%).",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The primary finding is negative: most LLMs cannot reliably judge code correctness; GPT-4-turbo achieves only 'fair' Kappa (0.21 Java, 0.10 Python) and smaller models completely fail.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": false,
    239           "justification": "Open-source models have specific size variants (DeepSeek 1.3B/6.7B/33B, CodeLlama 7B/13B/34B), but GPT-3.5-turbo and GPT-4-turbo lack snapshot dates, which is critical given OpenAI's silent model updates.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Actual prompts are reproduced verbatim in the paper for zero-shot and automated CoT strategies for both code generation and code summarization tasks.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "Temperature, top-p, and other generation hyperparameters are not reported for any of the eight models.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used; this is direct prompt-based evaluation without orchestration frameworks.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Detailed quality assurance for CoderEval is documented (67 problems excluded with specific criteria); dataset construction for code summarization including function selection, LLM generation, and human annotation is described step-by-step.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "The replication package [1] is stated to contain all collected judgments; the code summarization dataset with human ratings is explicitly made publicly available.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 2.3 describes in detail how 80,556 code generation judgments and 22,304 summarization judgments were collected, extracted (via scripts and manual verification), and cleaned.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": false,
    283           "justification": "Nine judges are described by qualifications (Master's/PhD, years of Java/Python experience) but the actual recruitment method (lab members, external, volunteer) is not stated.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline from benchmark selection through quality assurance, code generation, judgment collection, manual cleaning, and statistical analysis is documented across Sections 2.1–2.4.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "Training data cutoffs are not stated for any of the eight models; only vague descriptions like 'trained on a corpus of 2 trillion tokens' are provided without dates.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The paper does not discuss whether CoderEval problems (from ICSE'24) or the code summarization functions may have appeared in the training data of the evaluated LLMs.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No discussion of whether GPT-4-turbo or other models saw CoderEval problems during training, which could distort judging performance for familiar code patterns.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned for the human evaluation study involving nine judges assessing 1,163 summaries.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No IRB or ethics approval is mentioned despite the study involving nine human participants as paid/volunteer judges.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Judges' education (Master's or PhD in Informatics/CS), specialization (four with PhD in SE), and programming experience (avg 5.8 years Java, 6.9 years Python, with min/max) are reported.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": true,
    335           "justification": "Judges required to have 'code summarization background' and Master's or PhD degree in Informatics or Computer Science.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": false,
    341           "justification": "The paper states summaries were split among judges ensuring each assessed by three, but the randomization procedure for assignment is not described.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "No blinding is described; judges could potentially identify human-written vs LLM-generated summaries, which could introduce evaluation bias.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "All nine judges appear to have completed their assignments with no mention of dropouts; attrition not applicable.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "Despite citing cost as a key motivation for the LLM-as-a-judge paradigm, no actual API costs or inference costs are reported for running 80,556+ judgments.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Total computational budget for running all experiments across eight models, four prompts, and two tasks is not reported.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "GPT-4-turbo is the best LLM judge for both code generation and code summarization among all eight evaluated models.",
    376       "evidence": "Cohen's Kappa of 0.21 (Java) and 0.10 (Python) for code generation is highest among all models; Krippendorff's α of 0.58–0.63 for content adequacy in summarization outperforms all others.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Smaller LLMs (DeepSeek Coder 1.3B/6.7B, CodeLlama 7B) are essentially unable to perform code correctness judgment, showing near-zero or negative Kappa scores.",
    381       "evidence": "Table 2 shows DeepSeek Coder 1.3B and 6.7B achieve Kappa values near 0 or negative across all prompts and both languages; CodeLlama 7B shows similar results.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Even GPT-4-turbo frequently misjudges code correctness, classifying 50% of wrong Java implementations as correct.",
    386       "evidence": "Confusion matrices in Fig. 1 show GPT-4 has a 50% false positive rate for failing Java implementations despite being the best-performing model.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "All LLMs systematically underestimate the correctness of human-written code relative to LLM-generated code.",
    391       "evidence": "Table 3 shows negative bias coefficients for human-written code for all judge models, statistically significant with large Cliff's δ effect sizes across all comparisons.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "GPT-4-turbo achieves moderate-to-substantial agreement with human judges for code summary content adequacy.",
    396       "evidence": "Krippendorff's α of 0.58 (Java) and 0.63 (Python) for content adequacy with zero-shot prompt, compared to human inter-rater agreement of α=0.81 and 0.69.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Prompt choice has limited impact on overall findings; model size is the dominant factor in judging capability.",
    401       "evidence": "Table 2 shows Kappa scores for each LLM are relatively stable across four prompt variants; GPT-4 remains best-in-class regardless of prompt used.",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "Lack of visible coding context (external dependencies) is not a major cause of LLM judging failures.",
    406       "evidence": "Analysis restricted to 80 Java and 58 Python self-contained functions (no external deps) showed no change in judging effectiveness or model rankings.",
    407       "supported": "moderate"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval",
    412     "observational",
    413     "qualitative"
    414   ],
    415   "key_findings": "GPT-4-turbo is the best available LLM judge for code-related tasks but remains unreliable for code correctness assessment, misjudging 50% of incorrect Java implementations as correct (Cohen's Kappa = 0.21 Java, 0.10 Python). For code summarization, GPT-4-turbo achieves moderate-to-substantial agreement with human judges on content adequacy (Krippendorff's α ≈ 0.58–0.63), suggesting LLM-as-a-judge is more viable for natural language quality evaluation than for code correctness verification. A systematic anti-human bias was identified: all LLMs significantly underestimate the correctness of human-written code relative to LLM-generated code, with large effect sizes. Smaller LLMs (the CodeLlama and DeepSeek Coder families, 1.3B–34B parameters) largely fail at both judging tasks.",
    416   "red_flags": [
    417     {
    418       "flag": "Contamination unaddressed",
    419       "detail": "The paper does not discuss whether GPT-4-turbo or other closed-source models may have seen CoderEval problems (published ICSE'24) during training, which could inflate or distort judging performance for familiar code patterns."
    420     },
    421     {
    422       "flag": "Hyperparameters not reported",
    423       "detail": "Temperature, top-p, and other generation hyperparameters are not disclosed for any of the eight models, limiting reproducibility of results."
    424     },
    425     {
    426       "flag": "No confidence intervals on agreement metrics",
    427       "detail": "Main results (Cohen's Kappa, Krippendorff's α, bias coefficients) are reported as point estimates without any uncertainty quantification."
    428     },
    429     {
    430       "flag": "OpenAI model snapshots undefined",
    431       "detail": "GPT-3.5-turbo and GPT-4-turbo lack snapshot dates; these models are silently updated over time, undermining exact reproducibility of the key results."
    432     },
    433     {
    434       "flag": "Human study not pre-registered or ethics-reviewed",
    435       "detail": "The study involving nine human judges evaluating 1,163 summaries was not pre-registered and no IRB/ethics approval is mentioned."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    441       "relevance": "Primary benchmark used for both code generation evaluation and as source of functions for the summarization dataset"
    442     },
    443     {
    444       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    445       "relevance": "Foundational work proposing LLM-as-a-judge concept and identifying positional bias, verbosity, and self-enhancement bias"
    446     },
    447     {
    448       "title": "CodeJudge: Evaluating Code Generation with Large Language Models",
    449       "relevance": "Most closely related prior work applying GPT-3.5 as judge for code correctness with slow-thinking prompts; this paper adopts and extends CodeJudge's prompt"
    450     },
    451     {
    452       "title": "ICE-Score: Instructing Large Language Models to Evaluate Code",
    453       "relevance": "Prior work using GPT-3.5-turbo as judge for code implementations on HumanEval-X; this paper uses a harder benchmark and more LLMs"
    454     },
    455     {
    456       "title": "CodeUltraFeedback: An LLM-as-a-Judge Dataset for Aligning Large Language Models to Coding Preferences",
    457       "relevance": "Related work exploiting LLM-as-a-judge for SE evaluation, source of the zero-shot prompt design"
    458     },
    459     {
    460       "title": "Large Language Models are Zero-Shot Reasoners",
    461       "relevance": "Source of the automated chain-of-thought prompting strategy tested in this study"
    462     },
    463     {
    464       "title": "Reassessing Automatic Evaluation Metrics for Code Summarization Tasks",
    465       "relevance": "Demonstrates shortcomings of BLEU/ROUGE/METEOR for code summarization, motivating LLM-as-a-judge as an alternative"
    466     },
    467     {
    468       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    469       "relevance": "One of the two open-source LLM families evaluated as judges in this study"
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 3,
    475       "justification": "Directly answers whether practitioners can replace human evaluation with LLMs for automated code review and summarization assessment, a question with immediate industry relevance."
    476     },
    477     "surprise_contrarian": {
    478       "score": 2,
    479       "justification": "Finding that LLMs systematically underestimate the correctness of human-written code and that even GPT-4-turbo judges 50% of wrong Java implementations as correct challenges widespread enthusiasm for LLM-as-a-judge in SE research."
    480     },
    481     "fear_safety": {
    482       "score": 0,
    483       "justification": "No safety or AI risk concerns are raised; the paper is a methodological evaluation of automated evaluation quality."
    484     },
    485     "drama_conflict": {
    486       "score": 1,
    487       "justification": "Challenges the growing trend of using LLM-as-a-judge as a cheap substitute for human evaluation in SE, but framed constructively rather than confrontationally."
    488     },
    489     "demo_ability": {
    490       "score": 2,
    491       "justification": "Prompts are provided verbatim, replication package is publicly available, and experiments use accessible APIs, enabling replication with moderate effort."
    492     },
    493     "brand_recognition": {
    494       "score": 1,
    495       "justification": "Published in IEEE Transactions on Software Engineering (top venue) but authors are from USI and William & Mary rather than major AI labs."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [
    500       {
    501         "hn_id": "45028439",
    502         "title": "No evidence ageing/declining populations compromise socio-economic performance",
    503         "points": 82,
    504         "comments": 101,
    505         "url": "https://news.ycombinator.com/item?id=45028439",
    506         "created_at": "2025-08-26T16:05:54Z"
    507       },
    508       {
    509         "hn_id": "47213997",
    510         "title": "Von Neumann on Consciousness in Quantum Mechanics",
    511         "points": 3,
    512         "comments": 0,
    513         "url": "https://news.ycombinator.com/item?id=47213997",
    514         "created_at": "2026-03-02T04:46:53Z"
    515       },
    516       {
    517         "hn_id": "43557330",
    518         "title": "Ultra-high resolution multimodal MRI dense labelled holistic brain atlas",
    519         "points": 2,
    520         "comments": 0,
    521         "url": "https://news.ycombinator.com/item?id=43557330",
    522         "created_at": "2025-04-02T14:48:56Z"
    523       }
    524     ],
    525     "top_points": 82,
    526     "total_points": 87,
    527     "total_comments": 101
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs