scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31155B)
      1 {
      2   "paper": {
      3     "title": "Revisiting Evolutionary Program Repair via Code Language Model",
      4     "authors": ["Yunan Wang", "Tingyu Guo", "Zilong Huang", "Yuan Yuan"],
      5     "year": 2024,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2408.10486",
      8     "doi": "10.48550/arXiv.2408.10486"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "ARJA-CLM combines Code Language Models (CodeLlama, InCoder) with multiobjective evolutionary algorithms for multi-point automated program repair in Java, correctly fixing 56 of 224 Defects4J bugs — 64% more than ARJA-e and 43% more than DEAR. Context-aware prompts that include callable fields and methods improve repair by 21% (56 vs 46 correct patches). On the APR-2024 competition benchmark, ARJA-CLM produced 19 plausible patches (7 correct), outperforming all participating tools. Combining search spaces from different sources before searching does not improve results — searching separately and merging patches is better.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. No promise of future release is made either."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses Defects4J v1.0.1, a widely-used public benchmark, and the APR-2024 competition dataset, both publicly available."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Section 4.2 mentions hardware (DELL C4140, Intel Xeon Gold 6148, NVIDIA Tesla V100 32GB), OS (Ubuntu 22.04.3 LTS), and Java version (1.7.0_80), but no Python version, ML framework versions, requirements.txt, or dependency specifications are provided. Not enough to recreate the software environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The approach is described algorithmically but not in a form that enables direct replication."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., 56 correct patches, 94 plausible patches). No confidence intervals or error bars appear in any table or figure."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Claims like 'ARJA-CLM surpasses all participating repair tools' and '64% more defects' are made by comparing raw counts without any statistical significance test (no p-values, t-tests, or non-parametric tests)."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Percentage improvements with baseline context are consistently reported: 64% more than ARJA-e (56 vs 34), 43% more than DEAR (56 vs 39), 21% from context-aware prompts (56 vs 46), 280% more plausible patches on APR-2024 (19 vs 5). Raw counts and percentages are both provided."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses 224 bugs from Defects4J and 100 from APR-2024 without justifying why these sizes are adequate. The choice of 224 is inherited from ARJA-e ('following ARJA-e') without independent justification."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The evolutionary algorithm is stochastic, yet the paper states 'all experiments only perform one round of multiobjective evolution' (Section 4.2). No variance, standard deviation, or spread measures are reported across runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 10 compares ARJA-CLM against 8 baselines: DEAR, HERCULES, Recoder, DLFix, AlphaRepair, GAMMA, TBar, and SimFix."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent tools: DEAR (2022), GAMMA (2023), AlphaRepair (2022), Recoder (2021). The mix of traditional and learning-based tools covers the current landscape well."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "RQ1 ablates model type and size (CodeLlama-7B vs InCoder-6.7B vs InCoder-1.3B). RQ2 ablates search space sources (CodeLlama vs ARJA-e vs combined). RQ3 ablates context-aware prompts (with vs without extra field/method information)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports plausible patches, correct patches, plausible percentage, correct percentage, and evaluation cost (number of fitness evaluations to find first plausible patch, Table 6)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.3: 'we manually verify the correctness of the plausible patches by our repair approach.' They inspect the 10 smallest patches for each bug to determine correctness."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Defects4J provides standard test suites not used for tuning. APR-2024 has separate public test suites (visible to repair tools) and private test suites (for verification only), as stated in Section 4.3."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "All result tables (Tables 3-11) break down results by project (Chart, Lang, Math, Time). Table 4 lists individual bug IDs fixed."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 5 shows bugs ARJA-e fixes but ARJA-CLM cannot. Section 5.6 discusses limitations with algorithmic bugs: 'the direct context of the single buggy statement is more likely to contain errors besides the erroneous statement itself, misleading the CLM to generate incorrect candidates.'"
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 5 shows combining search spaces before searching does not improve results (56 correct for CodeLlama alone vs 45 for CodeLlama+ARJA-e). Section 5.3 explains why: 'combining space also exponentially increases the difficulty for the evolutionary algorithm to find the correct patch.'"
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims '64% more defects compared to ARJA-e' matches 56 vs 34 (Table 5). '43% more than DEAR' matches 56 vs 39 (Table 10). 'Context-aware prompts improved repair effectiveness by 21%' matches 56 vs 46 (Table 7). '280% more plausible patches' on APR-2024 matches 19 vs 5 (Table 11). All verified."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The claim that context-aware prompts 'improve' repair is supported by a controlled ablation (RQ3) comparing prompts with and without extra info under identical conditions. The claim that model size improves performance is supported by comparing InCoder-1.3B vs InCoder-6.7B under identical conditions. These are adequate single-variable manipulations."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 7 explicitly bounds scope: 'our repair approach is aimed at passing the JUnit test suite in Java programs, for which it cannot generalize well to other programming languages or the Java program without test suites.' They also note Defects4J 'cannot fully cover all the defects in the real world.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 7 discusses data leakage as an alternative explanation for performance, notes that different experimental settings may affect comparisons, discusses prompt trimming as a confound, and addresses GPU variation. They argue that context-aware prompts (which differ from the training data) mitigate the leakage explanation."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper clearly distinguishes between plausible patches (pass test suite) and correct patches (manually verified), acknowledging that plausible patches may be overfitting. Section 4.3 states they 'skip the overfitting patches (e.g., those deleting necessary statements).'"
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific models are named: CodeLlama-7B, InCoder-1.3B, InCoder-6.7B. These are specific enough identifiers with known architectures and parameter counts. Half-precision inference is also stated."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Figure 2 provides a concrete example of the full context-aware replacement prompt including the comment structure, field/method information format, FILL_ME marker, and actual code context. Figure 6 shows additional prompt examples. The prompt structure is fully described in Section 3.1."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 1 reports sampling parameters (top_p=0.9, top_k=50, temperature=1.0, num_return_sequences=10, max_new_tokens=100). Table 2 reports evolutionary algorithm parameters (N=40, G=50, γ_min=0.1, n_max=60, w=0.5, μ=0.06)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. ARJA-CLM is a traditional pipeline: fault localization → prompt construction → CLM inference → candidate extraction → evolutionary search. No agent loops, tool use, or feedback mechanisms."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.1 describes prompt construction (context extraction, field/method extraction, trimming to max tokens). Section 3.3 describes code sequence filtering (invalid outputs removed, multi-statement sequences handled, duplicates removed). Section 4.3 describes bug selection (Mockito and Closure excluded with reasons)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Threats to validity' provides substantive discussion divided into 'Internal' and 'External' subsections covering multiple specific concerns."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 7 discusses specific threats: manual patch verification bias, CodeLlama/InCoder data leakage from GitHub, different experimental settings across compared tools ('this may affect the fairness'), prompt trimming losing information due to memory limits, and GPU-dependent output variation."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 7 explicitly states: 'our repair approach is aimed at passing the JUnit test suite in Java programs, for which it cannot generalize well to other programming languages or the Java program without test suites' and 'the 224 bugs... cannot fully cover all the defects in the real world.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (generated patches, candidate statements, intermediate results) is made available for independent verification. Only aggregate results are reported in tables."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.3 describes data sources: Defects4J v1.0.1 with 224 real-world bugs in four projects (Chart, Lang, Math, Time), with reasons for excluding Mockito and Closure. APR-2024 is described as 100 bugs generated by GPT-3.5 and GPT-4 for Leetcode problems."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The data sources are standard public benchmarks (Defects4J, APR-2024 competition)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 3 documents the full pipeline: fault localization with Ochiai → likely-buggy statement (LBS) extraction → prompt construction with context and field/method info → CLM inference → code sequence filtering and transformation (Figure 3) → candidate set construction → evolutionary search. Figure 4 reports valid statement counts per project."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is mentioned anywhere in the paper. No acknowledgments section listing grants or sponsors."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All four authors are affiliated with the School of Computer Science and Engineering, Beihang University, Beijing, China. They evaluate third-party open-source models (CodeLlama, InCoder), so no product-affiliation conflict exists."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence of funders cannot be assessed. The absence of any funding disclosure means this criterion is not satisfiable."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement appears in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The training data cutoff dates for CodeLlama and InCoder are not stated in the paper, despite both being trained on GitHub data that may overlap with Defects4J."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 5.6 and Section 7 explicitly discuss this: 'We notice the possibility of source code from Defect4J leaking into CodeLlama which we uses.' They argue context-aware prompts make prompts 'different from training data' and test on APR-2024 to mitigate."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "They address contamination by conducting additional experiments on the newly released APR-2024 competition dataset (Section 5.6) and arguing that 'the buggy programs in the newly released APR-2024 AI Track benchmark are written by GPT, with less data leakage issues.' Results on APR-2024 confirm performance."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. This is a benchmark evaluation of an automated program repair system."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. This is a benchmark evaluation of an automated program repair system."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. This is a benchmark evaluation of an automated program repair system."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. This is a benchmark evaluation of an automated program repair system."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. This is a benchmark evaluation of an automated program repair system."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants. This is a benchmark evaluation of an automated program repair system."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants. This is a benchmark evaluation of an automated program repair system."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Section 4.2 states 'the time of one round of search is within 1 hour' but this is a vague upper bound. No per-bug inference time, token counts, or monetary costs are reported. Table 6 reports evaluation cost as number of fitness evaluations, not wall-clock time or compute cost."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is mentioned (V100 32GB, Intel Xeon Gold 6148) but total GPU hours, total experiment time, or overall computational budget are not quantified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The evolutionary algorithm is inherently stochastic, yet only 'one round of multiobjective evolution' is performed per experiment (Section 4.2). No analysis of sensitivity to random seeds or initial populations."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.2 explicitly states: 'All experiments only perform one round of multiobjective evolution on the candidate statements.' The number of runs (1) is clearly stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Evolutionary algorithm parameters are inherited from ARJA-e (Table 2) and sampling parameters are set without justification (Table 1). No hyperparameter search or budget is reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper transparently compares all three models (RQ1) and all search space configurations (RQ2), showing all results in Tables 3 and 5. CodeLlama-7B is selected as best based on comprehensive comparison, not cherry-picking."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "Section 7 acknowledges: 'our experimental results for comparison come directly from the papers whose experimental settings are different, this may affect the fairness of the repair effect comparison.' They also note that AlphaRepair and GAMMA use perfect fault localization while ARJA-CLM does not, disadvantaging their own system."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No analysis of performance as a function of compute budget. The search cost (Table 6) reports fitness evaluations but not compute time, and there is no comparison of compute requirements across methods."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
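    331         "justification": "Defects4J is adopted as a standard benchmark. Section 7 concedes that its 224 bugs 'cannot fully cover all the defects in the real world' and the paper separates plausible patches from manually verified correct ones, but there is no substantive analysis of whether the benchmark's bug distribution or its test-suite-based validation measures real-world repair ability."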
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No agentic scaffolding is involved. The pipeline (fault localization + CLM + evolutionary search) is the system under test, not a scaffold around a model."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "The paper explicitly discusses that Defects4J source code may have leaked into CodeLlama's training data (Section 7) and addresses this by testing on the newly released APR-2024 dataset with AI-generated bugs less likely to be in training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup (e.g., fault localization information, test suite feedback during search) provides information beyond what would be available in real deployment scenarios."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether Defects4J bugs from the same project (e.g., multiple Math bugs) are independent, or whether project-level correlations could inflate results."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The mitigation strategy of using APR-2024 is a workaround, not a detection method."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "ARJA-CLM correctly fixes 56 of 224 Defects4J bugs, 64% more than ARJA-e (34) and 43% more than DEAR (39).",
    365       "evidence": "Tables 4, 5, and 10 report correct patch counts. Table 10 provides per-project breakdown showing ARJA-CLM surpasses all compared tools.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Context-aware prompts that include callable fields and methods improve repair effectiveness by 21%.",
    370       "evidence": "Table 7 shows 56 correct patches with extra info vs 46 without. Table 8 identifies 12 correct patches using extra info, 7 of which are unfixable without it. Figure 6 shows concrete examples.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "ARJA-CLM generates 280% more plausible patches than ARJA-e on the APR-2024 benchmark (19 vs 5), surpassing all participating tools.",
    375       "evidence": "Table 11 compares ARJA-CLM against 7 other tools on 100 APR-2024 bugs. ARJA-CLM produces 19 plausible and 7 correct patches.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Better overall model performance (HumanEval scores) correlates with better candidate statement generation for APR.",
    380       "evidence": "Section 5.2 compares CodeLlama-7B (HumanEval 33.5%) vs InCoder-6.7B (15.2%). Figure 4 shows CodeLlama generates more valid statements across all projects. Table 3 shows CodeLlama produces more plausible patches (94 vs 71).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Combining search spaces before evolutionary search does not improve repair effectiveness compared to using CodeLlama alone.",
    385       "evidence": "Table 5 shows CodeLlama alone yields 56 correct patches vs 45 for CodeLlama+ARJA-e combined. Section 5.3 explains that combining exponentially increases search difficulty.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "ARJA-CLM fixes 22 unique bugs that no other compared tool can fix.",
    390       "evidence": "Figure 7 Venn diagram shows 22 bugs unique to ARJA-CLM when compared against DEAR, GAMMA, Recoder, and HERCULES.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "ARJA-CLM repairs more multilocation bugs than ARJA-e (17 correct vs 10 correct).",
    395       "evidence": "Table 9 breaks down multilocation bug fixes by project, showing ARJA-CLM generates plausible patches for 38 and correct patches for 17 multilocation bugs vs 32/10 for ARJA-e.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Single stochastic run without variance",
    402       "detail": "The evolutionary algorithm is inherently stochastic, yet all experiments use only one round of evolution (Section 4.2). Without multiple runs, it is impossible to know whether the reported counts are typical or lucky outcomes. This is especially concerning for the APR-2024 results where the margins are small (7 vs 5 correct patches)."
    403     },
    404     {
    405       "flag": "Unfair baseline comparison conditions",
    406       "detail": "Section 7 acknowledges that baseline results come from other papers with different experimental settings. AlphaRepair and GAMMA use perfect fault localization while ARJA-CLM uses imperfect (Ochiai), making the comparison non-uniform. While this actually disadvantages ARJA-CLM, it prevents apples-to-apples conclusions."
    407     },
    408     {
    409       "flag": "No code released",
    410       "detail": "Despite proposing a novel tool (ARJA-CLM), no source code is released or linked. This prevents independent verification of the reported results and limits practical impact."
    411     },
    412     {
    413       "flag": "Manual patch verification subjectivity",
    414       "detail": "Correctness of plausible patches is determined by manual inspection. Section 7 acknowledges: 'Due to the limitations of knowledge and the complexity of defects, we may misjudge the correctness of the patch.' No inter-rater agreement or independent verification is reported."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    420       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    421       "year": 2022,
    422       "relevance": "AlphaRepair is a key CLM-based APR baseline that uses CodeBERT for zero-shot program repair, directly compared against ARJA-CLM."
    423     },
    424     {
    425       "title": "Code llama: Open foundation models for code",
    426       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    427       "year": 2023,
    428       "arxiv_id": "2308.12950",
    429       "relevance": "CodeLlama is the primary code language model used in ARJA-CLM for generating candidate repair statements."
    430     },
    431     {
    432       "title": "Incoder: A generative model for code infilling and synthesis",
    433       "authors": ["Daniel Fried", "Armen Aghajanyan", "Jessy Lin"],
    434       "year": 2022,
    435       "arxiv_id": "2204.05999",
    436       "relevance": "InCoder is one of the code language models evaluated in ARJA-CLM for code infilling in program repair."
    437     },
    438     {
    439       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    440       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    441       "year": 2020,
    442       "arxiv_id": "2002.08155",
    443       "relevance": "Pre-trained code model used in related APR work (AlphaRepair), relevant to understanding CLM capabilities for code tasks."
    444     },
    445     {
    446       "title": "GAMMA: Revisiting template-based automated program repair via mask prediction",
    447       "authors": ["Quanjun Zhang", "Chunrong Fang", "Tongke Zhang"],
    448       "year": 2023,
    449       "relevance": "State-of-the-art CLM-based APR using mask prediction, compared against ARJA-CLM. Uses perfect fault localization."
    450     },
    451     {
    452       "title": "Evaluating large language models trained on code",
    453       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    454       "year": 2021,
    455       "arxiv_id": "2107.03374",
    456       "relevance": "Introduces HumanEval benchmark used to compare overall model performance between CodeLlama and InCoder."
    457     },
    458     {
    459       "title": "Impact of code language models on automated program repair",
    460       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    461       "year": 2023,
    462       "arxiv_id": "2302.05020",
    463       "relevance": "Directly studies the impact of CLMs on APR effectiveness, closely related to this paper's research questions."
    464     },
    465     {
    466       "title": "Practical program repair in the era of large pre-trained language models",
    467       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    468       "year": 2022,
    469       "arxiv_id": "2210.14179",
    470       "relevance": "Studies practical aspects of using pre-trained LLMs for program repair, addressing similar challenges as ARJA-CLM."
    471     },
    472     {
    473       "title": "Automated repair of programs from large language models",
    474       "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev", "Abhik Roychoudhury", "Shin Hwei Tan"],
    475       "year": 2023,
    476       "relevance": "Explores automated repair of LLM-generated code, relevant to understanding CLM reliability and repair needs."
    477     },
    478     {
    479       "title": "DeepCode AI Fix: Fixing security vulnerabilities with large language models",
    480       "authors": ["Berkay Berabi", "Alexey Gronskiy", "Veselin Raychev"],
    481       "year": 2024,
    482       "arxiv_id": "2402.13291",
    483       "relevance": "Uses LLMs for fixing security vulnerabilities, extending APR to the safety/security domain."
    484     },
    485     {
    486       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    487       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven C.H. Hoi"],
    488       "year": 2021,
    489       "arxiv_id": "2109.00859",
    490       "relevance": "Pre-trained code model relevant to the broader landscape of code language models used in software engineering tasks."
    491     }
    492   ],
    493   "engagement_factors": {
    494     "practical_relevance": {
    495       "score": 2,
    496       "justification": "Proposes a concrete tool for multi-point automated program repair in Java, but no code is released, which limits immediate practical use."
    497     },
    498     "surprise_contrarian": {
    499       "score": 1,
    500       "justification": "Combining evolutionary algorithms with CLMs is a novel integration but the finding that larger/better models help is expected; the negative result about merging search spaces is mildly surprising."
    501     },
    502     "fear_safety": {
    503       "score": 0,
    504       "justification": "No AI safety or security implications; this is a software engineering tool for fixing bugs."
    505     },
    506     "drama_conflict": {
    507       "score": 0,
    508       "justification": "No controversy or conflict; straightforward empirical comparison of repair tools."
    509     },
    510     "demo_ability": {
    511       "score": 0,
    512       "justification": "No code repository, demo, or installable tool is provided."
    513     },
    514     "brand_recognition": {
    515       "score": 0,
    516       "justification": "From Beihang University; uses open-source models (CodeLlama, InCoder) rather than products from famous AI labs."
    517     }
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs