scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28410B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating Large Language Models Trained on Code",
      6     "authors": [
      7       "Chen, M.",
      8       "Tworek, J.",
      9       "Jun, H.",
     10       "Yuan, Q.",
     11       "Pinto, H. P. d. O.",
     12       "et al."
     13     ],
     14     "year": 2021,
     15     "venue": "arXiv",
     16     "arxiv_id": "2107.03374",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Core abstract claims (28.8% pass@1, GPT-3 ~0%, GPT-J 11.4%) agree with Table 1 to within rounding or small sampling differences (Codex-12B 28.81%, GPT-J-6B 11.62%); the 70.2%/100-sample claim is slightly inconsistent with Table 1's 72.31% for Codex-12B but is within plausible temperature-configuration variation.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Causal claims (fine-tuning on code improves performance, supervised fine-tuning further helps) are supported by controlled ablations comparing GPT vs. Codex vs. Codex-S across multiple model sizes.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Claims are consistently bounded to Python code generation from docstrings on HumanEval; the broader impacts section explicitly flags economic and societal speculation as preliminary.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No alternative explanations are discussed for why Codex outperforms GPT-3 (data volume vs. architecture vs. fine-tuning distribution); the alignment appendix discusses robustness vs. misalignment but the main results section lacks this.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper explicitly argues pass@k (functional correctness via unit tests) is superior to BLEU and demonstrates empirically that BLEU scores do not reliably distinguish correct from incorrect solutions (Figure 8).",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6 is a dedicated Limitations section covering docstring length degradation, variable binding failures, and sample efficiency relative to human programmers.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific, quantified threats are provided: exponential pass-rate degradation per additional chained operation (Figure 11, factor of 2-3 per step), concrete variable-binding failure examples, and acknowledgment that unit test coverage averages only 7.7 tests per problem.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly scopes to standalone Python function synthesis from docstrings and acknowledges this is not representative of full software engineering (design, collaboration, debugging, upgrading stacks).",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No formal funding disclosure; acknowledgments mention GitHub partnership and Microsoft Azure infrastructure but do not constitute a funding statement.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations with OpenAI, Anthropic (work performed while at OpenAI), and Zipline are explicitly listed in the author block.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "OpenAI employees evaluate their own model (Codex) which directly powers a commercial product (GitHub Copilot); the organization has clear financial interest in positive results.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or financial disclosure is included; the commercial relationship between Codex and GitHub Copilot is mentioned in passing but not formally declared as a conflict.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Pass@k is formally defined with an unbiased estimator (Section 2.1), functional correctness is defined and contrasted with match-based metrics, and 'alignment' is operationalized in Appendix E with sufficient and necessary conditions.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper clearly states it introduces Codex, the HumanEval benchmark, and an improved pass@k estimator; the relationship to GitHub Copilot is also stated explicitly.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 8 provides comprehensive related work covering program induction, synthesis (SPoC, TransCoder, RobustFill), neural code models (CodeBERT, PyMT5), and prior benchmarks (APPS, CodeSearchNet, CodeXGLUE), situating Codex's contributions clearly.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "HumanEval benchmark and evaluation framework are released at github.com/openai/human-eval; alignment evaluation data released at github.com/openai/code-align-evals-data.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "HumanEval (164 problems with unit tests) is publicly released; the training data (159GB of GitHub Python) is not released, but the evaluation benchmark is fully available.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No requirements.txt or Dockerfile provided; gVisor sandbox is described at a conceptual level but reproduction requires proprietary model access not available to the public.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "Evaluation code is released but the Codex model is proprietary (API-only), making full end-to-end reproduction impossible; benchmark results cannot be independently verified.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No confidence intervals are reported around any pass@k estimates; the unbiased estimator is described but uncertainty bounds on the point estimates are absent throughout.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are conducted for any comparative claims between Codex and baselines (GPT-J, GPT-Neo, Tabnine).",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Absolute pass@k percentages with baseline context are reported for all model comparisons (e.g., Codex-12B 28.81% vs. GPT-J-6B 11.62%), providing interpretable effect sizes.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "HumanEval's 164 problems are not justified by power analysis; sample size is described as driven by hand-authoring constraints rather than statistical reasoning.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No standard deviations or variance across multiple model runs are reported; only point estimates for pass@k are given in all tables and figures.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "GPT-3, GPT-J-6B, GPT-Neo (125M, 1.3B, 2.7B), and Tabnine (commercial) are all included as baselines in Table 1.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "GPT-J and GPT-Neo were state-of-the-art open-source models at time of publication (2021); Tabnine is a leading commercial autocomplete tool, providing a practical comparator.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Ablations include Codex vs. Codex-S (supervised fine-tuning effect), 8 model sizes (12M to 12B), fine-tuning from GPT vs. random init, and temperature effects on pass@k.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Pass@1, pass@10, pass@100, BLEU score, mean log-probability ranking, and back-translation score are all reported as evaluation metrics.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Human evaluation is conducted for docstring generation (Codex-D): 10 samples graded per problem across all 164 HumanEval problems, assessing whether docstrings uniquely and accurately specify the code body.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "HumanEval is explicitly hand-written after the May 2020 training data cutoff and kept entirely separate from training; APPS test split is also used as held-out evaluation.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "APPS results are broken down by introductory/interview/competition difficulty (Table 2); synthetic tasks are broken down by number of chained operations (Figure 11).",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 6 provides code-level failure examples (do_work variable binding failure), Figure 11 quantifies degradation per chained operation, and Appendix E shows alignment failures with specific prompt examples.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Negative results reported: fine-tuning from GPT showed no accuracy improvement over random init (only convergence speed), back-translation ranking underperforms mean log-probability (Figure 7), and Codex underperforms SAST tools at vulnerability detection.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "As the paper introducing the models, full specifications are provided: parameter counts (12M to 12B), training data source (159GB Python from GitHub, May 2020), and complete optimizer settings.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Figure 2 shows actual example prompts with function signatures and docstrings; stop sequences are explicitly listed ('\\nclass', '\\ndef', '\\n#', '\\nif', '\\nprint').",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Learning rate, 175-step linear warmup, cosine decay, Adam parameters (β1=0.9, β2=0.95, ε=10⁻⁸, weight decay=0.1), nucleus sampling top_p=0.95, temperature settings, and 100B token training budget are all reported.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used; this evaluates a base code generation model without orchestration layers.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 3.1 documents all filtering criteria (auto-generated file removal, average line length >100, max line >1000, low alphanumeric percentage) and tokenizer modification for whitespace encoding.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "HumanEval benchmark with all 164 problems and unit tests is publicly released at github.com/openai/human-eval for independent verification of benchmark claims.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 3.1 describes data collection in detail: 54M public GitHub repos, May 2020 snapshot, 179GB Python files under 1MB, filtering pipeline, resulting in 159GB final dataset.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": false,
    283           "justification": "Human graders for docstring evaluation are used (1,640 gradings) but their recruitment, qualifications, grading criteria details, and inter-rater reliability are not described.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline from GitHub data collection through filtering, tokenization, training, and evaluation (including sandbox execution via gVisor) is documented across Sections 2-4.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": true,
    297           "justification": "Training data was collected in May 2020, explicitly stated in Section 3.1.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": true,
    303           "justification": "The paper explicitly motivates hand-written HumanEval problems because 'our models are trained on a large fraction of GitHub, which already contains solutions to problems from a variety of sources' including Codeforces.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": true,
    309           "justification": "HumanEval is hand-written specifically after the May 2020 training cutoff to prevent overlap; APPS contamination is noted as a concern and motivates the 1-shot evaluation approach.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No controlled human participants study requiring pre-registration; docstring grading is internal evaluation.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human subjects research requiring IRB approval.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in the main study.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in the main study.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in the main study.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in the main study.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in the main study.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference latency, API cost, or per-query compute figures are reported for running Codex evaluations.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "The paper states GPT-3-12B pre-training consumed 'hundreds of petaflop/s-days' and Codex-12B fine-tuning 'consumed a similar amount'; Azure platform is identified.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Codex-12B achieves 28.81% pass@1 on HumanEval, far exceeding GPT-J-6B (11.62%) and GPT-3 (~0%)",
    376       "evidence": "Table 1 provides exact pass@k numbers for all models across k=1, 10, 100 with consistent results",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Repeated sampling is highly effective: Codex-12B achieves 72.31% pass@100 with oracle unit-test selection",
    381       "evidence": "Table 1 shows the dramatic improvement from pass@1 to pass@100 is consistent across all model sizes",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "BLEU score is not a reliable indicator of functional correctness for code generation",
    386       "evidence": "Figure 8 shows significant overlap in BLEU distributions between correct and incorrect Codex-12B solutions across 4 random HumanEval tasks",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Supervised fine-tuning on curated standalone functions (Codex-S) improves pass@1 by 6.5pp and pass@100 by 15.1pp on average",
    391       "evidence": "Section 4.5 reports these averages; Figure 10 shows the improvement is consistent across model sizes with one or two orders of magnitude parameter efficiency improvement",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Codex frequently generates clearly insecure cryptographic code (RSA keys <2048 bits, ECB AES mode) at significant rates regardless of model size",
    396       "evidence": "Figure 15 shows insecure configuration rates across model sizes for RSA and AES based on ~30k generated samples",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Codex performance degrades exponentially with the number of chained operations in a docstring, dropping by a factor of 2-3 per added operation",
    401       "evidence": "Figure 11 quantifies this degradation using 13 synthetic building blocks composed into chains of increasing length",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "Mean log-probability sample ranking outperforms random selection but underperforms oracle unit-test selection",
    406       "evidence": "Figure 7 directly compares oracle, mean log-probability, back-translation, and random selection curves for Codex-12B",
    407       "supported": "strong"
    408     },
    409     {
    410       "claim": "Code model test loss follows the same power-law scaling with model size observed in language models",
    411       "evidence": "Figure 4 shows power-law fit with functional form (N/5.92×10^7)^-0.13 on held-out validation set",
    412       "supported": "strong"
    413     }
    414   ],
    415   "methodology_tags": [
    416     "benchmark-eval",
    417     "case-study"
    418   ],
    419   "key_findings": "Codex, a GPT model fine-tuned on 159GB of public GitHub Python code, achieves 28.81% pass@1 on the new HumanEval benchmark—dramatically exceeding GPT-J (11.62%) and GPT-3 (~0%)—and 72.31% pass@100 with oracle unit-test selection, with Codex-S reaching 77.5% after supervised fine-tuning on curated functions. The paper demonstrates that BLEU score is a poor proxy for functional correctness, introduces an unbiased pass@k estimator that remains standard years later, and shows performance scales as a power law with model size. Critical safety findings include that Codex generates insecure cryptographic code at significant rates regardless of model size, exhibits misalignment (producing buggy code when prompted with buggy code despite having capability to produce correct code), and encodes societal biases from training data.",
    420   "red_flags": [
    421     {
    422       "flag": "Self-evaluation conflict",
    423       "detail": "OpenAI employees evaluate their own proprietary model (Codex) powering a commercial product (GitHub Copilot); no independent external validation of results is included."
    424     },
    425     {
    426       "flag": "No uncertainty quantification",
    427       "detail": "No confidence intervals, standard errors, or significance tests for any of the main pass@k comparisons between Codex and baselines across Tables 1 and 2."
    428     },
    429     {
    430       "flag": "Abstract-body inconsistency",
    431       "detail": "Abstract claims 70.2% with 100 samples, but Table 1 shows Codex-12B at 72.31% pass@100; the source configuration for the 70.2% figure is not clearly identified in the paper."
    432     },
    433     {
    434       "flag": "Small benchmark (164 problems)",
    435       "detail": "HumanEval contains only 164 problems with no statistical justification for this size; with so few problems, comparisons between models that differ by only a few percentage points lack the statistical power to reach significance."
    436     },
    437     {
    438       "flag": "Proprietary model barrier",
    439       "detail": "Full reproduction requires access to the proprietary Codex model; while evaluation code and benchmark are released, independent verification of model performance is not possible."
    440     },
    441     {
    442       "flag": "Human grading underdescribed",
    443       "detail": "Docstring evaluation uses human graders for 1,640 assessments but provides no information on grader recruitment, qualifications, or inter-rater reliability."
    444     }
    445   ],
    446   "cited_papers": [
    447     {
    448       "title": "Measuring Coding Challenge Competence with APPS",
    449       "relevance": "Key benchmark paper for evaluating code generation on competitive programming; used as secondary evaluation dataset and direct baseline comparator"
    450     },
    451     {
    452       "title": "Language Models are Few-Shot Learners (GPT-3)",
    453       "relevance": "Foundation model that Codex is fine-tuned from; establishes the baseline that Codex dramatically improves for code generation"
    454     },
    455     {
    456       "title": "SPoC: Search-based Pseudocode to Code",
    457       "relevance": "Introduces the pass@k metric concept and functional correctness evaluation for code synthesis; Codex adopts and extends this framework"
    458     },
    459     {
    460       "title": "Unsupervised Translation of Programming Languages (TransCoder)",
    461       "relevance": "Establishes that functional correctness better captures code quality than BLEU for translation tasks, supporting Codex's methodological choice"
    462     },
    463     {
    464       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    465       "relevance": "Prior code-NL model trained on docstring-function pairs; represents the state of the art for code understanding at time of publication"
    466     },
    467     {
    468       "title": "CodeSearchNet Challenge: Evaluating the State of Semantic Code Search",
    469       "relevance": "Large-scale GitHub corpus that established the multimodal code-NL dataset paradigm; predecessor to Codex's training approach"
    470     },
    471     {
    472       "title": "GPT-J-6B: A 6 Billion Parameter Autoregressive Language Model",
    473       "relevance": "Key open-source baseline trained on The Pile (8% GitHub code); primary competitive comparator demonstrating Codex's advantage from code-focused training"
    474     },
    475     {
    476       "title": "Extracting Training Data from Large Language Models",
    477       "relevance": "Shows LLMs can memorize and reproduce training data; directly cited in Codex's legal/privacy analysis regarding code reproduction from training"
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 3,
    483       "justification": "Directly introduces GitHub Copilot's underlying model and HumanEval benchmark that remained the standard code evaluation for years."
    484     },
    485     "surprise_contrarian": {
    486       "score": 2,
    487       "justification": "The findings that repeated sampling is 'surprisingly effective' (72% with 100 samples vs 29% with 1) and that BLEU is unreliable for code were non-obvious and highly cited."
    488     },
    489     "fear_safety": {
    490       "score": 2,
    491       "justification": "Explicit security analysis showing Codex generates insecure cryptographic code, misalignment analysis, and polymorphic malware concerns raise concrete AI safety issues."
    492     },
    493     "drama_conflict": {
    494       "score": 1,
    495       "justification": "Commercial relationship between OpenAI and GitHub Copilot creates implicit tension, but the paper itself is measured and academic in tone with no direct controversy."
    496     },
    497     "demo_ability": {
    498       "score": 3,
    499       "justification": "GitHub Copilot based on Codex was publicly available at launch; HumanEval is released for immediate community use and reproduction."
    500     },
    501     "brand_recognition": {
    502       "score": 3,
    503       "justification": "OpenAI, GitHub Copilot, and the Codex name have extremely high brand recognition; several authors (Sutskever, Amodei, Brockman) are prominent AI industry figures."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [
    508       {
    509         "hn_id": "27786283",
    510         "title": "Evaluating Large Language Models Trained on Code",
    511         "points": 12,
    512         "comments": 1,
    513         "url": "https://news.ycombinator.com/item?id=27786283",
    514         "created_at": "2021-07-09T17:39:30Z"
    515       },
    516       {
    517         "hn_id": "27767328",
    518         "title": "Evaluating Large Language Models Trained on Code",
    519         "points": 11,
    520         "comments": 1,
    521         "url": "https://news.ycombinator.com/item?id=27767328",
    522         "created_at": "2021-07-08T00:36:26Z"
    523       },
    524       {
    525         "hn_id": "27777657",
    526         "title": "Evaluating Large Language Models Trained on Code (paper about GH copilot model)",
    527         "points": 4,
    528         "comments": 1,
    529         "url": "https://news.ycombinator.com/item?id=27777657",
    530         "created_at": "2021-07-08T21:10:26Z"
    531       },
    532       {
    533         "hn_id": "27770978",
    534         "title": "Evaluating Large Language Models Trained on Code(GitHub Copilot)",
    535         "points": 3,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=27770978",
    538         "created_at": "2021-07-08T12:20:59Z"
    539       },
    540       {
    541         "hn_id": "34552130",
    542         "title": "Evaluating Large Language Models Trained on Code",
    543         "points": 2,
    544         "comments": 0,
    545         "url": "https://news.ycombinator.com/item?id=34552130",
    546         "created_at": "2023-01-27T21:27:58Z"
    547       },
    548       {
    549         "hn_id": "29172572",
    550         "title": "Measuring mathematical problem solving with the MATH dataset",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=29172572",
    554         "created_at": "2021-11-10T09:00:47Z"
    555       },
    556       {
    557         "hn_id": "26070039",
    558         "title": "On the Reproducibility of Neural Network Predictions",
    559         "points": 2,
    560         "comments": 0,
    561         "url": "https://news.ycombinator.com/item?id=26070039",
    562         "created_at": "2021-02-08T21:00:30Z"
    563       }
    564     ],
    565     "top_points": 12,
    566     "total_points": 36,
    567     "total_comments": 3
    568   }
    569 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs