scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (15925B)
      1 {
      2   "paper": {
      3     "title": "A Comprehensive Survey of AI-Driven Advancements and Techniques in Automated Program Repair and Code Generation",
      4     "authors": ["Avinash Anand", "Nishchay Yadav", "Akshit Gupta", "Shaurya Bajaj"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2411.07586"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No code or analysis scripts are released. No repository URL is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or structured extraction of the surveyed papers is released."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a survey paper with no computational experiments requiring an environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No instructions provided for reproducing the survey methodology or paper selection process."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "Survey paper with no original experiments or statistical analysis."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "Survey paper; no statistical comparisons are performed by the authors."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Survey paper; no original effect sizes computed."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Survey paper; no experimental sample size to justify."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Survey paper; no original experimental runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The survey does not compare itself against prior surveys or reviews in the same area."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No prior surveys are compared against."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "Survey paper; no system components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Survey paper; no original evaluation metrics applied."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Survey paper; human evaluation is not relevant to the claims."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Survey paper; no test set involved."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The survey splits papers into APR and code generation categories (Fig. 1) and provides per-category discussion and comparison tables (Tables 1-4)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The survey discusses limitations and challenges of the reviewed tools, including areas where models struggle (e.g., complex logic, load balancing issues with Mixtral, bias inheritance in Magicoder)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports weaknesses and limitations of reviewed tools, such as WizardCoder lagging behind ChatGPT in some areas, and Smaug struggling with low edit distance preference datasets."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims to review 27 papers split into APR and code generation groups, which matches the paper's content structure."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper is a survey that summarizes existing work without making original causal claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title says 'Comprehensive Survey' but covers only 27 papers with no clear justification for why this constitutes comprehensive coverage of the field."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "Survey/taxonomy paper presenting no original empirical results requiring alternative explanations."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "Survey paper; no models are run by the authors."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "Survey paper; no prompting is performed."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper; no experiments are run."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Survey paper; no agentic scaffolding used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 2.2 describes methods employed (systematic literature review, taxonomy development, etc.) but does not provide specific filtering criteria, search queries, databases searched, or counts at each filtering stage. The inclusion/exclusion criteria are mentioned abstractly but never specified concretely."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 1 states the paper covers only code generation and bug fixing, explicitly excluding other code-related tasks like summarization and repository understanding."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data (e.g., list of all candidate papers, extraction spreadsheets) is made available."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Section 2.2 mentions 'systematic literature review' and 'inclusion and exclusion criteria' but never specifies what those criteria actually are, which databases were searched, or what search terms were used."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; the data source is published papers."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline from initial search to 27 included papers is not documented with specific counts or filtering steps."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding information or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All four authors are listed as affiliated with Indraprastha Institute of Information Technology, India."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial disclosure statement is present."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "Survey paper; no pre-trained model is evaluated on any benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Survey paper; no model evaluation performed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper; no benchmark evaluation performed by the authors."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this survey."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "Survey paper; no method with inference costs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "Survey paper; no computation performed."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "LLMs have significantly improved the quality and speed of automating programming and discovering bugs in code.",
    286       "evidence": "Stated in Section 1 as motivation; no original evidence provided beyond citing reviewed papers.",
    287       "supported": "weak"
    288     },
    289     {
    290       "claim": "27 recent papers were reviewed covering APR and code generation with LLMs.",
    291       "evidence": "The reference list contains 27 entries, and the paper discusses each. Sections 3-6 cover these papers.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Trends include use of LLMs, feedback loops for iterative code improvement, and open-source models.",
    296       "evidence": "Discussed throughout Sections 3-6 with examples from reviewed papers, but no systematic analysis quantifying these trends.",
    297       "supported": "moderate"
    298     }
    299   ],
    300   "methodology_tags": ["meta-analysis"],
    301   "key_findings": "This survey reviews 27 papers on automated program repair and code generation using LLMs, categorizing them into APR tools and code generation models. It provides comparative tables of tools, benchmarks, and programming language support. The survey identifies trends such as increasing LLM integration, iterative feedback loops, and growing use of open-source models, while noting challenges in functional correctness and security.",
    302   "red_flags": [
    303     {
    304       "flag": "No quality assessment of reviewed papers",
    305       "detail": "The survey summarizes 27 papers without any structured quality assessment or critical evaluation of their methodology. This risks laundering weak results by presenting them alongside strong ones without distinction."
    306     },
    307     {
    308       "flag": "Undocumented paper selection process",
    309       "detail": "Section 2.2 mentions systematic literature review with inclusion/exclusion criteria but never specifies what those criteria are, which databases were searched, what search terms were used, or how many candidates were found at each stage. The selection of 27 papers appears arbitrary."
    310     },
    311     {
    312       "flag": "No limitations section",
    313       "detail": "The paper has no limitations or threats-to-validity section. Section 1 states which tasks are in scope, but the survey never discusses the limitations of its own coverage or methodology, which is a significant methodological omission for a survey paper."
    314     },
    315     {
    316       "flag": "Overclaimed comprehensiveness",
    317       "detail": "The title claims 'Comprehensive Survey' but covers only 27 papers in a field with hundreds of relevant publications, without justifying why 27 constitutes comprehensive coverage."
    318     }
    319   ],
    320   "cited_papers": [
    321     {
    322       "title": "Evaluating Large Language Models Trained on Code",
    323       "authors": ["Mark Chen"],
    324       "year": 2021,
    325       "relevance": "Foundational work on LLM code generation evaluation (Codex/HumanEval)."
    326     },
    327     {
    328       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    329       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven C.H. Hoi"],
    330       "year": 2021,
    331       "arxiv_id": "2109.00859",
    332       "relevance": "Pre-trained code model for code understanding and generation tasks."
    333     },
    334     {
    335       "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow",
    336       "authors": ["Daya Guo"],
    337       "year": 2021,
    338       "arxiv_id": "2009.08366",
    339       "relevance": "Pre-training approach incorporating data flow for code representation learning."
    340     },
    341     {
    342       "title": "Deepseek-coder",
    343       "authors": ["Daya Guo"],
    344       "year": 2024,
    345       "arxiv_id": "2401.14196",
    346       "relevance": "Open-source code LLM with large context window and fill-in-the-middle training."
    347     },
    348     {
    349       "title": "Magicoder: Empowering Code Generation with OSS-Instruct",
    350       "authors": ["Yuxiang Wei"],
    351       "year": 2024,
    352       "arxiv_id": "2312.02120",
    353       "relevance": "Code generation model using open-source code snippets for training data generation."
    354     },
    355     {
    356       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    357       "authors": ["Anton Lozhkov"],
    358       "year": 2024,
    359       "arxiv_id": "2402.19173",
    360       "relevance": "Next-generation open-source code LLM trained on large-scale programming data."
    361     },
    362     {
    363       "title": "OpenCodeInterpreter: Integrating Code Generation with Execution and Refinement",
    364       "authors": ["Tianyu Zheng"],
    365       "year": 2024,
    366       "arxiv_id": "2402.14658",
    367       "relevance": "Code generation model integrating execution feedback for iterative refinement."
    368     },
    369     {
    370       "title": "A Unified Debugging Approach via LLM-Based Multi-Agent Synergy",
    371       "authors": ["Cheryl Lee"],
    372       "year": 2024,
    373       "relevance": "Multi-agent LLM approach to automated debugging."
    374     },
    375     {
    376       "title": "Automatic Programming: Large Language Models and Beyond",
    377       "authors": ["Michael R. Lyu"],
    378       "year": 2024,
    379       "relevance": "Survey of LLMs for automatic programming tasks."
    380     },
    381     {
    382       "title": "WizardLM: Empowering Large Language Models to Follow Complex Instructions",
    383       "authors": ["Can Xu"],
    384       "year": 2024,
    385       "arxiv_id": "2304.12244",
    386       "relevance": "Evol-Instruct method for complex instruction following in code generation."
    387     }
    388   ]
    389 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs