scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24203B)
      1 {
      2   "paper": {
      3     "title": "CodeInsight: A Curated Dataset of Practical Coding Solutions from Stack Overflow",
      4     "authors": ["Nathanaël Beau", "Benoît Crabbé"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2409.16819"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'The benchmark can be accessed at https://github.com/NathanaelBeau/CodeInsight', providing a GitHub URL for the dataset and code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The dataset is publicly available via the GitHub repository linked in the abstract (https://github.com/NathanaelBeau/CodeInsight). The 3,409 examples are released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions training on 'an a100 GPU with 40GB memory' and 'half-precision computation (FP16)' but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. While the dataset is released, there is no README or 'Reproducing Results' section with commands to replicate the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Table 3 reports mean and standard deviation across five seeds for CodeLLaMa fine-tuning experiments (e.g., '48.9 ± 0.6%' for pass@1). Table 1 also reports standard deviations for dataset statistics."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., CodeLLaMa outperforms Mistral, fine-tuning improves over prompting) but provides no statistical significance tests (no p-values, t-tests, or similar)."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports absolute performance numbers with baselines for context. For example, Table 2 shows pass@1 improving from 4.7% to 10.1% for Mistral with the second prompt, and Table 3 shows performance across splits with absolute values. This provides enough context to assess magnitude of differences."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the number of seeds (5) used in the fine-tuning experiments, nor for why 3 models were selected. The dataset size of 3,409 examples is described but not justified via power analysis or similar reasoning."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Table 3 reports standard deviations across five seeds for all metrics (e.g., '52.6 ± 0.8%' for pass@1 on the 40-60 split). However, Table 5 (final results) reports only single-run numbers without variance, which is inconsistent. Credit is given because Table 3 does report multi-run variance for the key fine-tuning analysis."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper evaluates three models (Mistral 7B, CodeLLaMa 13B, Starcoder 15B) and compares prompting vs. fine-tuning configurations, providing multiple baselines in Tables 2, 3, and 5."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The models evaluated (Mistral 7B, CodeLLaMa 13B, Starcoder 15B) were contemporary at submission but the paper does not compare against stronger models like GPT-4 on the full benchmark (GPT-4 is only used on a subset of 812 examples for the contamination analysis). No comparison with other leading code models like DeepSeek Coder or WizardCoder."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper ablates prompting strategies (no prompt, first prompt, second prompt in Table 2) and training/test split sizes (20-80, 40-60, 60-40, 80-20 in Table 3), effectively serving as ablation studies showing which configurations matter."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses pass@1, BLEU score, and codeBLEU score (Section 4.1, Table 3), providing three different evaluation metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of model outputs is included. All evaluation is automated via unit tests and BLEU/codeBLEU metrics. Given that the paper claims the dataset mirrors real-world coding tasks, human evaluation of output quality would be relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.3 describes explicit train/test splits (20-80, 40-60, 60-40, 80-20). The final evaluation in Table 5 uses the 40-60 split with a distinct test set of 1,860 examples."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 5 provides per-category breakdowns across Labels (MULTILINE, ASSIGN, COMPLEXTASK, etc.) and Packages (Pandas, Numpy, Regex, etc.) for all three models."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Appendix E provides a detailed error analysis of CodeLLaMa outputs with three specific failure examples (Table 12): argument specification error, regex syntax error, and annotation discrepancy."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that the first prompt actually decreased CodeLLaMa's performance from 44.7% to 40.3% (Table 2), and that Mistral's non-code-specific pre-training caused it to struggle. The annotation discrepancy in Appendix E is also a candid negative finding."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims a dataset of 3,409 examples with unit tests, support for fine-tuning and evaluation, reduced contamination confirmed by model performance, and public availability. All are supported in the paper: Section 2 describes construction, Section 4 presents evaluations, Section 4.5 addresses contamination, and a GitHub link is provided."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper's causal claims are modest and supported by controlled experiments. The ablation of prompt strategies (Table 2) isolates the effect of prompts, and the fine-tuning experiments (Table 3) use multiple seeds with controlled train/test splits. The contamination analysis in Section 4.5 uses before/after comparisons on the same examples."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper title claims 'Practical Coding Solutions' broadly but the dataset is exclusively Python. While the Limitations section mentions the Python restriction, the title and abstract do not bound the scope to Python-only tasks. The claims about 'aiding developers in common tasks' are broader than what Python-only evidence supports."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 4.5 discusses whether GPT-4's performance is due to genuine coding ability versus data memorization/contamination, presenting evidence for both interpretations. The error analysis in Appendix E discusses annotation discrepancies as an alternative explanation for model failures."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper refers to models by name and size only: 'Mistral 7B', 'CodeLLAMA 13B', 'Starcoder 15B', and 'GPT-4'. No specific version identifiers, snapshot dates, or API versions are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The two prompts used are provided verbatim in Section 4.2: 'You are a powerful code generation model. Your job is to convert a given natural language prompt into Python function code and return the result.' and 'Return the Result.' appended to the intent."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.3 reports fine-tuning hyperparameters in detail: LoRA r=16, alpha=16, dropout=0.05, batch size=128, warmup=100 steps, total=400 steps, learning rate=3e-5, AdamW optimizer, FP16 precision. However, no temperature/sampling parameters for the zero-shot/prompting evaluations are reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The models are evaluated in a standard prompt-to-completion setting without tool use, feedback loops, or multi-step workflows."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 2 documents the full pipeline: starting with 7,300 raw examples, filtering to 2,707, then annotating to 3,409. Filtering criteria are described in Section 2.2 (authenticity, extractability, alignment, executability), and the annotation process is described in Section 2.3 with specific tasks."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing the dataset's specialized nature, potential annotation biases, scope limitations, and Python-only restriction."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The Limitations section contains only generic disclaimers: 'specialized nature... may not fully represent the broader spectrum', 'could introduce biases', 'current scope may limit its adaptability'. None are specific threats tied to particular findings or design choices in this study."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The Limitations section mentions Python restriction but does not explicitly state what the dataset does NOT show or claim. There is no equivalent of 'our results do not generalize to X' or 'we did not test Y'. The boundaries are implied but not explicitly stated."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The full dataset is released on GitHub (https://github.com/NathanaelBeau/CodeInsight) including all 3,409 examples with intents, code, and unit tests, allowing independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2.1 describes data sources in detail: CoNaLa dataset (2,379 hand-written + 3,121 top-ranked unrefined examples), plus 600 additional Stack Overflow samples for each of Pandas, Numpy, and Regex (1,800 in total). Selection criteria include community engagement metrics (votes, views) and temporal weighting."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 2.3 describes the annotation team: 'A team of five data science professionals, each with a minimum of five years of experience, contributed to the labeling.' While recruitment channels are not described, the annotator qualifications and time constraints (20 minutes per example, ~12 minutes average, ~540 total hours) are documented."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline is documented across Sections 2.1-2.3: 7,300 raw examples → filtering (with specific counts per source: 1,993/5,500 from CoNaLa, 294/600 Pandas, 242/600 Numpy, 178/600 Regex) → 2,707 filtered examples → annotation (3 tasks) → 3,409 final examples (from 2,702 distinct problems, some yielding multiple solutions)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: Nathanaël Beau is affiliated with both Université de Paris (LLF, CNRS) and onepoint (a company). Benoît Crabbé is affiliated with Université de Paris."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Since no funding is disclosed, it is impossible to assess funder independence. The first author has a corporate affiliation (onepoint) which could represent a potential conflict, but no funding relationship is stated."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper. The first author's dual affiliation with onepoint (a consulting company) is noted but no explicit financial interest declaration is made."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates pre-trained models (Mistral 7B, CodeLLaMa 13B, Starcoder 15B, GPT-4) on the benchmark but does not state training data cutoff dates for any of them."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Section 4.5 directly addresses data contamination by comparing GPT-4's performance on original CoNaLa examples versus the rewritten CodeInsight versions. The BLEU score drop from 58.8 to 47.6 after annotation is used as evidence of the rewriting's effectiveness against memorization."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Section 2.3 describes deliberate annotation design to reduce contamination: rephrasing intents, normalizing variable names, converting snippets to function format. Section 4.5 empirically tests contamination by comparing GPT-4 on original vs. rewritten examples. Appendix F provides concrete examples of potential memorization patterns."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in an experimental study. The annotators created the dataset but were not subjects of research."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants as research subjects. Annotators were professional contributors to dataset construction, not study participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants as research subjects. Annotator qualifications (5+ years data science experience) are described but this is about dataset construction, not a human subjects study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants as research subjects."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants as research subjects; not an experimental study involving human randomization."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants as research subjects; not an experimental study requiring blinding."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants as research subjects."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or tokens consumed are reported for any of the model evaluations. The paper does not mention API costs or wall-clock time for generating predictions."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The paper mentions using 'an a100 GPU with 40GB memory' for fine-tuning but does not state total GPU hours, training time, or overall computational budget. The annotation effort of 540 hours is mentioned but computational cost is not quantified."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CodeInsight comprises 3,409 expert-curated Python examples derived from Stack Overflow, with an average of three unit tests per example.",
    286       "evidence": "Section 2.3 states the annotation yielded 3,409 examples from 2,702 distinct problems. Table 10 confirms average 3.0 ± 0.4 unit tests across the full dataset.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "The dataset's annotation process (rephrasing intents, normalizing variables, converting to functions) reduces data contamination, confirmed by GPT-4's BLEU score dropping from 58.8 to 47.6 on rewritten examples.",
    291       "evidence": "Section 4.5 reports GPT-4 achieved BLEU 58.8 on original CoNaLa examples and 47.6 on the rewritten CodeInsight versions of the same 812 examples. GPT-4 still passed 64% of unit tests.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Fine-tuning with as little as 40% of the dataset matches the performance of training on 60-80% for CodeLLaMa.",
    296       "evidence": "Table 3 shows pass@1 of 52.6% (40-60), 53.4% (60-40), and 53.1% (80-20) across five seeds, with overlapping standard deviations.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Fine-tuning improves model performance over zero-shot prompting.",
    301       "evidence": "Table 2 shows CodeLLaMa at 48.1% with the best prompt; Table 5 shows 53.1% after fine-tuning on the 40-60 split. Mistral improves from 10.1% to 38.4%.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Appending 'Return the Result.' to the intent leads to overall improvement in performance across all models.",
    306       "evidence": "Table 2 shows improvements for all three models with the second prompt: Mistral 4.7%→10.1%, CodeLLaMa 44.7%→48.1%, Starcoder 45.1%→46.8%. No significance tests accompany these comparisons.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "CodeInsight introduces a curated dataset of 3,409 Python code generation examples from Stack Overflow, each with expert-refined intents and an average of three unit tests. The annotation process (rephrasing intents, normalizing variable names) demonstrably reduces data contamination, as shown by GPT-4's BLEU score dropping from 58.8 to 47.6 on rewritten examples while still passing 64% of unit tests. Fine-tuning CodeLLaMa on just 40% of the dataset achieves comparable performance to using 60-80%, reaching ~53% pass@1. Per-category breakdowns reveal that complex tasks (COMPLEXTASK, Regex) remain challenging for all models.",
    312   "red_flags": [
    313     {
    314       "flag": "Incomplete variance reporting",
    315       "detail": "Table 3 reports standard deviations across 5 seeds for training split experiments, but Table 5 (the final results table) reports only single-run numbers without any variance measure, making it impossible to assess result stability for the main findings."
    316     },
    317     {
    318       "flag": "No significance tests for comparisons",
    319       "detail": "The paper compares three models and multiple prompting strategies, claiming some outperform others, but no statistical significance tests are performed. The improvements could be within noise."
    320     },
    321     {
    322       "flag": "GPT-4 only partially evaluated",
    323       "detail": "GPT-4 is evaluated only on 812 CoNaLa-derived examples for contamination analysis, not on the full benchmark. This selective evaluation leaves the strongest baseline's full performance unknown, potentially making the other models look more competitive."
    324     },
    325     {
    326       "flag": "Corporate affiliation without a competing-interests statement",
    327       "detail": "First author is affiliated with onepoint (a consulting company) but no funding disclosure or competing interests statement is provided. The relationship between the company and the research is unclear."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Evaluating Large Language Models Trained on Code",
    333       "authors": ["Mark Chen", "Jerry Tworek"],
    334       "year": 2021,
    335       "arxiv_id": "2107.03374",
    336       "relevance": "Introduced HumanEval benchmark and Codex, foundational for code generation evaluation."
    337     },
    338     {
    339       "title": "Program Synthesis with Large Language Models",
    340       "authors": ["Jacob Austin", "Augustus Odena"],
    341       "year": 2021,
    342       "arxiv_id": "2108.07732",
    343       "relevance": "Introduced MBPP benchmark for code generation evaluation."
    344     },
    345     {
    346       "title": "Measuring Coding Challenge Competence with APPS",
    347       "authors": ["Dan Hendrycks", "Steven Basart"],
    348       "year": 2021,
    349       "relevance": "Introduced APPS dataset for evaluating code generation on competitive programming problems."
    350     },
    351     {
    352       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    353       "authors": ["Yuhang Lai", "Chengxi Li"],
    354       "year": 2023,
    355       "relevance": "Data science code generation benchmark from Stack Overflow, directly compared with CodeInsight."
    356     },
    357     {
    358       "title": "Code LLaMA: Open Foundation Models for Code",
    359       "authors": ["Baptiste Rozière", "Jonas Gehring"],
    360       "year": 2023,
    361       "arxiv_id": "2308.12950",
    362       "relevance": "One of the primary baseline models evaluated on CodeInsight."
    363     },
    364     {
    365       "title": "StarCoder: May the Source Be With You!",
    366       "authors": ["Raymond Li", "Loubna Ben Allal"],
    367       "year": 2023,
    368       "arxiv_id": "2305.06161",
    369       "relevance": "One of the primary baseline models evaluated on CodeInsight."
    370     },
    371     {
    372       "title": "Mistral 7B",
    373       "authors": ["Albert Q. Jiang", "Alexandre Sablayrolles"],
    374       "year": 2023,
    375       "arxiv_id": "2310.06825",
    376       "relevance": "One of the primary baseline models evaluated on CodeInsight."
    377     },
    378     {
    379       "title": "Learning to Mine Aligned Code and Natural Language Pairs from Stack Overflow",
    380       "authors": ["Pengcheng Yin", "Bowen Deng"],
    381       "year": 2018,
    382       "relevance": "Introduced CoNaLa dataset, the primary data source for CodeInsight's examples."
    383     },
    384     {
    385       "title": "Execution-Based Evaluation for Open-Domain Code Generation",
    386       "authors": ["Zhiruo Wang", "Shuyan Zhou"],
    387       "year": 2022,
    388       "arxiv_id": "2212.10481",
    389       "relevance": "Introduced ODEX benchmark for execution-based code generation evaluation, compared in Table 4."
    390     },
    391     {
    392       "title": "CodeBLEU: A Method for Automatic Evaluation of Code Synthesis",
    393       "authors": ["Shuo Ren", "Daya Guo"],
    394       "year": 2020,
    395       "arxiv_id": "2009.10297",
    396       "relevance": "Proposed the codeBLEU metric used as one of the evaluation metrics in CodeInsight."
    397     }
    398   ]
    399 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs