scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19078B)
      1 {
      2   "paper": {
      3     "title": "CRQBench: A Benchmark of Code Reasoning Questions",
      4     "authors": ["Elizabeth Dinella", "Satish Chandra", "Petros Maniatis"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2408.08453"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive link is provided in the paper. The paper mentions reproducing corporate results for open source release using GitHub PR comments, but no link to released code is given."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper describes the benchmark of 100 C++ code reasoning questions derived from the CodeReviewer dataset, but no download link or repository for CRQBench itself is provided in the paper."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, dependency lists, or setup instructions are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are included. The curation methodology is described at a high level but not with reproducible commands or scripts."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (e.g., 65/100, precision/recall values) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares LLM classification vs keyword matching and GPT-4 vs Falcon performance without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Only raw numbers and percentages are reported (e.g., 65%, 1.8x reduction). No effect sizes with baseline context beyond simple percentages."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The benchmark size of 100 questions and classifier evaluation on 100 comments are not justified. No power analysis or rationale for these sample sizes."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance or standard deviation is reported. Results appear to be from single runs with no spread measures."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares LLM-based classification against keyword matching (Table 2) and evaluates both GPT-4 and Falcon on the benchmark."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
      68         "justification": "GPT-4 was a contemporary frontier model at the time of the study (2024). Falcon-7B is used as a secondary comparison. The keyword matching baseline is appropriate for the classification task."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is performed. The curation pipeline has multiple components (classifier, query type classifier, rephraser, validator) but their individual contributions are not isolated."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Precision, recall, and F1 score are reported for the classification task (Table 2). Accuracy with breakdown by query type (VALUE vs EQUIV) is reported for GPT-4 evaluation."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Human evaluation is central: manual labeling validates the classifier (Section 3.1), human inspection validates rephrased questions (Section 3.2), and GPT-4 outputs are manually evaluated for correctness and contextual relevance (Section 4)."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a benchmark construction paper, not a model training paper. There is no train/test split to evaluate."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "GPT-4 performance is broken down by query type (VALUE: 61%, EQUIV: 70%) in Table 3. Error analysis categorizes 35 incorrect responses into missing context (25), C++ knowledge gaps (5), and logic errors (5)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4 provides error analysis with three categories of failures. Appendix Figures 8-10 show specific examples of incorrect GPT-4 responses."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that Falcon achieved only ~25% accuracy, that the keyword matching approach had significantly more false positives, and that the LLM classifier had low recall (52% on Google)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims GPT-4 produces correct responses for 65 of 100 questions, which matches Table 3. The claim about reducing manual effort is supported by Section 3.3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper's causal claims are modest: the LLM-based approach reduces manual effort (supported by comparing pipeline stages). The error categorization attributes failures to specific causes (missing context, C++ knowledge, logic errors) based on manual inspection, which is adequate."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper tests only C++ code from the CodeReviewer dataset and evaluates only GPT-4 and Falcon, but the title and introduction frame this as a general 'code reasoning' benchmark without adequately bounding to these specific settings."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed for the results. For instance, GPT-4's 65% accuracy could be influenced by prompt design, context window limitations, or dataset characteristics, but these are not explored."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper says 'GPT-4' and 'Falcon' without specifying exact model versions, snapshot dates, or API versions. The Google internal LLM used for curation is not named at all."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Full prompt text is provided in the appendix for the Code Reasoning Classifier (Figure 12), Query Type Classifier (Figure 13), Edit Generator (Figure 14), Expression Extractor (Figure 15), Equiv Rewriter (Figure 16), Value Rewriter (Figure 17), Validator (Figure 18), and GPT-4 evaluation prompt (Figure 11)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the LLM calls."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The multi-step curation pipeline is described in detail in Section 3 and Figure 4, including the chain-of-thought reasoning, query type classification, rephrasing, and validation steps."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper describes the filtering pipeline: starting from code review comments, classifying with the Code Reasoning Classifier, rephrasing, validating, and human inspection. Table 1 quantifies comment categories. Section 3.3 provides sample counts at each stage."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5 'Limitations' discusses two specific limitations: the manual answer extraction approach (5.1) and the size of the target environment requiring 10x more comments (5.2)."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations are specific to this study: answers are derived by human best-effort inspection rather than symbolic verification (Section 5.1), and the cooperative approach requires >10x more comments due to false negatives (Section 5.2)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to C++ only, to the specific dataset used, or discuss what types of code reasoning are not covered."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The benchmark data (100 questions, answers, code contexts) is not made available for download. The underlying CodeReviewer dataset is public, but the curated CRQBench subset is not released with a link."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 3 describes the data collection procedure in detail: sourcing from CodeReviewer dataset GitHub PR comments, classification, rephrasing, and validation steps."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited for a study. The human inspectors appear to be the authors themselves. Data comes from a public dataset."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
     191         "justification": "Figure 4 illustrates the full pipeline. Section 3.3 provides counts for reaching 100 CRQs: 285 GitHub comments inspected under the fully manual approach versus 160 under the cooperative LLM + human approach. Each transformation step is described."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: Bryn Mawr College and Google/Google DeepMind."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Two of three authors are affiliated with Google/Google DeepMind. The curation technique uses a 'Google code aware LLM.' No discussion of whether Google has a stake in the benchmark results. Funding source is undisclosed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "GPT-4 is evaluated on the benchmark but no training cutoff date is stated. The CodeReviewer dataset is from 2022 and likely in GPT-4's training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether GPT-4 may have seen the CodeReviewer dataset or similar GitHub PR comments during training."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
     230         "justification": "The CodeReviewer dataset was published in 2022, before GPT-4's release, and the benchmark derives from public GitHub PRs that could be in GPT-4's training data. This contamination risk is not discussed."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human subjects study was conducted. Human inspection was part of the curation methodology, not a study of human behavior."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human subjects study requiring IRB approval."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in a study context."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in a study context."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects experiment."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects experiment."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects experiment."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference costs, API costs, or latency figures are reported for any of the LLM calls (curation pipeline or GPT-4 evaluation)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No computational budget or resource usage is reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "GPT-4 produces correct responses grounded in the given context for 65 of 100 CRQBench questions.",
    286       "evidence": "Table 3 shows 65% overall accuracy, with VALUE queries at 61% and EQUIV queries at 70% (Section 4).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "The cooperative LLM + human approach reduces manual inspection by 1.8x on GitHub and 3.3x on Google comments.",
    291       "evidence": "Section 3.3 compares manual (285 GitHub comments) vs cooperative (160 comments) approaches, shown in Figure 5.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "The majority of GPT-4 errors (25/35) are due to the model lacking necessary context.",
    296       "evidence": "Section 4 error analysis categorizes 35 errors: 25 missing context, 5 C++ knowledge gaps, 5 logic errors.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Most code review comments (65% GitHub, 80% Google) are not related to code reasoning.",
    301       "evidence": "Table 1 shows CRQ-related comments at 35% (GitHub) and 20% (Google), based on manual analysis.",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "CRQBench is a benchmark of 100 C++ code reasoning questions derived from real code review comments using an LLM-assisted curation pipeline. GPT-4 achieves 65% accuracy on these questions, with most errors (71%) due to missing context rather than reasoning failures. The LLM-assisted curation approach reduces manual inspection effort by 1.8-3.3x compared to fully manual curation, though it requires 10x more source comments due to classifier false negatives.",
    307   "red_flags": [
    308     {
    309       "flag": "Contamination risk unaddressed",
    310       "detail": "The benchmark is derived from public GitHub PRs in the CodeReviewer dataset (2022), which is likely in GPT-4's training data. No contamination analysis is performed."
    311     },
    312     {
    313       "flag": "Google affiliation with undisclosed internal tools",
    314       "detail": "Two authors are from Google/Google DeepMind. The curation pipeline uses an unnamed 'Google code aware LLM' and Google internal code reviews. No conflict of interest statement is provided."
    315     },
    316     {
    317       "flag": "Very small benchmark size",
    318       "detail": "100 questions is small for a benchmark. No justification for this sample size is provided, and no statistical uncertainty is reported for the 65% accuracy figure."
    319     },
    320     {
    321       "flag": "No variance or uncertainty quantification",
    322       "detail": "All results are single-run point estimates with no confidence intervals, error bars, or repeated measurements."
    323     }
    324   ],
    325   "cited_papers": [
    326     {
    327       "title": "Evaluating Large Language Models Trained on Code",
    328       "authors": ["Mark Chen", "Jerry Tworek"],
    329       "year": 2021,
    330       "arxiv_id": "2107.03374",
    331       "relevance": "Introduces HumanEval, a foundational benchmark for LLM code generation evaluation."
    332     },
    333     {
    334       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    335       "authors": ["Carlos E. Jimenez", "John Yang"],
    336       "year": 2023,
    337       "arxiv_id": "2310.06770",
    338       "relevance": "A realistic benchmark for evaluating LLM code reasoning through software engineering tasks."
    339     },
    340     {
    341       "title": "Program Synthesis with Large Language Models",
    342       "authors": ["Jacob Austin"],
    343       "year": 2021,
    344       "relevance": "Introduces MBPP benchmark for LLM code generation evaluation."
    345     },
    346     {
    347       "title": "GPT-4 Technical Report",
    348       "authors": ["OpenAI"],
    349       "year": 2024,
    350       "arxiv_id": "2303.08774",
    351       "relevance": "Technical report for GPT-4, the primary model evaluated in this benchmark study."
    352     },
    353     {
    354       "title": "Automating Code Review Activities by Large-Scale Pre-training",
    355       "authors": ["Zhiyu Li", "Shuai Lu"],
    356       "year": 2022,
    357       "arxiv_id": "2203.09095",
    358       "relevance": "Source of the CodeReviewer dataset used to derive the CRQBench benchmark."
    359     },
    360     {
    361       "title": "Can ChatGPT replace StackOverflow? A Study on Robustness and Reliability of Large Language Model Code Generation",
    362       "authors": ["Li Zhong", "Zilong Wang"],
    363       "year": 2023,
    364       "arxiv_id": "2308.10335",
    365       "relevance": "Evaluates LLM code generation reliability, relevant to understanding LLM code reasoning limitations."
    366     },
    367     {
    368       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    369       "authors": ["Jason Wei", "Xuezhi Wang"],
    370       "year": 2022,
    371       "relevance": "Chain-of-thought prompting technique used in the CRQBench curation pipeline."
    372     },
    373     {
    374       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    375       "authors": ["Xuezhi Wang", "Jason Wei"],
    376       "year": 2023,
    377       "arxiv_id": "2203.11171",
    378       "relevance": "Self-consistency technique used for validation in the CRQBench curation pipeline."
    379     }
    380   ]
    381 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs