scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30204B)
      1 {
      2   "paper": {
      3     "title": "Exploring and Lifting the Robustness of LLM-powered Automated Program Repair with Metamorphic Testing",
      4     "authors": [
      5       "Pengyu Xue",
      6       "Linhao Wu",
      7       "Zhen Yang",
      8       "Zhongxing Yu",
      9       "Zhi Jin",
     10       "Ge Li",
     11       "Yan Xiao",
     12       "Shuo Liu",
     13       "Xinyi Li",
     14       "Hongyi Lin",
     15       "Jingwen Wu"
     16     ],
     17     "year": 2024,
     18     "venue": "arXiv.org",
     19     "arxiv_id": "2410.07516",
     20     "doi": "10.48550/arXiv.2410.07516"
     21   },
     22   "scan_version": 3,
     23   "active_modules": ["experimental_rigor", "data_leakage"],
     24   "methodology_tags": ["benchmark-eval", "qualitative"],
     25   "key_findings": "MT-LAPR, a metamorphic testing framework with nine code perturbation rules across token, statement, and block levels, reveals that 34.4%–48.5% of test cases expose instability in four LLMs' automated program repair capabilities. A negative correlation between perturbation distance and repair performance suggests that code readability positively affects LAPR robustness. A CodeT5-based readability improvement model, embedded as a preprocessing step, enhances LAPR robustness by up to 49.32% without modifying the LLMs' parameters.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No repository URL, code archive, or link to the MT-LAPR implementation is provided anywhere in the paper."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The base datasets (Defects4J and QuixBugs) are standard public benchmarks. The perturbations are deterministic AST transformations fully described in the paper, making reconstruction feasible from public data."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No environment specifications, dependency files, or library versions are provided. The paper mentions JavaParser and Python difflib but gives no version or setup details."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The perturbation algorithms are described at a conceptual level but full implementation details are absent."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All results (R-scores in Tables II–VI) are reported as point estimates without confidence intervals or error bars."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper uses Spearman correlation with p-value for the edit-distance analysis (Section V-D) and kappa coefficients for inter-rater agreement, but the main comparative claims about model performance differences (e.g., Mistral Large vs LLaMA3-70B) lack significance tests."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Improvements are reported with baseline context: e.g., LLaMA3-8B R-score from 0.440 to 0.657 (49.32% improvement), and percentage declines per model are quantified in Tables II and V."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper uses 60 base samples per dataset (15 per LLM) with no justification for why 15 is sufficient, no power analysis, and no discussion of whether this sample size supports generalizable conclusions."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No variance, standard deviation, or spread measures are reported. Temperature=0 makes outputs deterministic, but this eliminates only inference randomness — variance across different base sample selections is not assessed."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No baseline robustness testing framework or alternative perturbation approach is compared against MT-LAPR. The paper evaluates MT-LAPR in isolation."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No baselines are included, so contemporaneity cannot be assessed."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "RQ4 (Table III) evaluates each of the 9 MRs individually, and RQ3 (Figure 2) studies the effect of combining different numbers of MRs (perturbation distance), effectively ablating components of the framework."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The paper uses R-score as the primary metric, supplemented by edit distance analysis (Table III), readability scores (Figure 2), and kappa coefficients for agreement assessment."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Developer surveys are included: RQ1 (Section V-A) has 10 developers rate perturbation frequency on a Likert scale, and RQ3 includes readability assessments by developer pairs."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "For the CodeT5 fine-tuning (Section VI-A), the paper states: 'we still test on the dataset used in previous experiments (RQ2~5), while preparing the training dataset with the rest of the samples,' establishing a proper train/test split."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Results are broken down by perturbation rule (Table III), repair pattern (Table IV), perturbation distance (Figure 2), and model (Table II)."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "RQ5 (Table IV) identifies repair patterns where LLMs are most vulnerable (Missing Null-Check at 0.298 R-score), and Section VI-C discusses approaches that failed (LLM refactoring causing 20.5% R-score reduction)."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section VI-C 'Trial and error' reports that direct LLM code refactoring led to a 20.5% reduction in R-score, and that manual reverse rule design has scalability limitations."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Abstract claims of '34.4%~48.5% instability' match Table II averages (1-0.656=34.4%, 1-0.515=48.5%). The '49.32% improvement' claim matches Table V (LLaMA3-8B with CodeT5-large)."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The perturbation study uses controlled single-variable manipulation (same code, only perturbation differs, temperature=0 eliminates inference randomness), which is adequate for the causal claim that perturbations cause repair failures. The readability improvement experiment similarly isolates the preprocessing step."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title makes a broad claim about 'LLM-powered Automated Program Repair', but the experiments are restricted to Java, 4 specific LLMs, and 2 datasets. The threats section acknowledges the Java limitation, but the title and abstract do not bound the claims."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section VII discusses data leakage as an alternative explanation and runs a dedicated experiment (Table VI). The paper also considers whether edit distance rather than perturbation content drives results (Section V-D, finding no significant correlation)."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "R-score directly measures what is claimed: the proportion of perturbed test cases on which the LLM still produces a correct repair. The proxy aligns with the measured outcome (robustness to perturbation)."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Models are identified as 'Mistral Large', 'LLaMA3-70B', 'LLaMA3-8B', and 'CodeGemma-7B' without snapshot dates, API versions, or exact model identifiers. 'Mistral Large' is especially vague as Mistral has released multiple versions."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "The paper states 'the parameter settings and prompt templates for all LLMs reviewed are fixed the same' (Section IV-A) but does not provide the actual prompt text used for repair."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Temperature=0 is stated for all LLMs (Section IV-A). For CodeT5 fine-tuning: 3 epochs, learning rate 5×10⁻⁵, batch size 1, weight decay 0.01 (Section VI-A)."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. LLMs are called directly for code repair without agent loops, tools, or iterative workflows."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section IV-B documents the base sample construction: filter for successfully repaired samples, follow Sobreira et al.'s taxonomy to sample across repair patterns, obtain 60 samples per dataset. Section IV-D details test case construction with combinatorial sampling."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section VII 'Threats to Validity' provides substantive discussion of internal, external, and construct validity threats."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The threats section identifies specific issues: the nine MRs may not cover all coding discrepancies, Java-only limits generality, Defects4J data leakage risk (with dedicated experiment), and test suite evaluation may miss literal inconsistencies."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section VII explicitly states that focusing on Java limits generality, the MRs may not encapsulate all real-world coding styles, and the selected LLMs may affect conclusions. Future work to extend to other PLs is noted."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No raw experimental data (perturbed test cases, LLM outputs, repair results) is released for independent verification."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The pilot study describes collecting 500 Codeforces samples (10 problems, 50 Java codes each, average difficulty, highest passing rate). The base sample selection from Defects4J and QuixBugs is documented in Section IV-B."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The paper states '10 full-time Java developers (at least 3-5 years of coding experience) from the industry' but does not describe how they were recruited, from which companies, or whether recruitment could introduce bias."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline is documented: original datasets → filter successfully repaired samples → select across repair patterns (60 base samples) → apply perturbation rules → construct single/multi-distance test cases with combinatorial sampling → evaluate via test suites with reverse processing for variable/method names."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No funding sources, grants, or acknowledgments section is present in the paper."
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "All author affiliations are clearly listed: Shandong University, Peking University, Nanyang Technological University, Sun Yat-sen University, and City University of Hong Kong."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No funding source is disclosed, making independence impossible to assess."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interest declarations appear in the paper."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No training data cutoff dates are stated for any of the four LLMs (Mistral Large, LLaMA3-70B/8B, CodeGemma-7B)."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Section VII explicitly discusses data leakage: 'since the datasets we used have been widely studied, data leakage may pose an internal threat.' A dedicated experiment (Table VI) uses perturbed samples as leakage-free test sets."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": true,
    251         "answer": true,
    252         "justification": "Section VII constructs Defects4J_free-leakage and QuixBugs_free-leakage datasets from perturbed samples that LLMs are unlikely to have seen during pre-training, and re-runs experiments to validate conclusions hold (Table VI)."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No pre-registration is mentioned for the developer surveys."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "No IRB or ethics board approval is mentioned for the developer surveys."
    265       },
    266       "demographics_reported": {
    267         "applies": true,
    268         "answer": true,
    269         "justification": "Survey respondents are characterized as 'ten full-time Java developers (at least 3-5 years of coding experience for each) from the industry' (Section IV-D1)."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Inclusion criterion is stated: 'at least 3-5 years of practical Java development experience' and 'full-time Java developers from the industry' (Sections III-A, IV-D1)."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "The human component is a cross-sectional survey/rating task, not an experimental study with conditions requiring randomization."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "The human component is a Likert-scale rating task on perturbation frequency and readability, not an experimental comparison where blinding would apply."
    285       },
    286       "attrition_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The paper does not report whether all 10 developers completed all survey/rating tasks or whether any dropped out."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No inference cost, API cost, or latency is reported despite running 4 LLMs across thousands of test cases."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No GPU hours, hardware specifications, total API spend, or training time is reported for either the LLM evaluations or the CodeT5 fine-tuning."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Results are not reported across multiple random seeds. Temperature=0 is used to make outputs deterministic, but the paper does not report sensitivity to other sources of variance (e.g., base sample selection)."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The number of experimental runs is not explicitly stated. Temperature=0 implies single-run deterministic outputs, but this is not stated as '1 run per test case.'"
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No hyperparameter search is reported for the CodeT5 fine-tuning (learning rate, batch size, epochs appear fixed without search). LLM temperature is fixed at 0 without justification."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Temperature=0 is stated without justification for why this is the best setting. CodeT5 hyperparameters are reported but their selection is not justified."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Many comparisons are made across 4 models, 9 perturbation rules, 9 perturbation distances, and 9 repair patterns, but no correction for multiple comparisons is applied."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors evaluate their own MT-LAPR framework without acknowledging the bias of self-evaluation or using independent evaluators for the framework's effectiveness."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No performance-vs-compute analysis is provided. Model sizes range from 7B to an undisclosed parameter count (Mistral Large), but cost/compute differences are not discussed."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The paper notes that QuixBugs contains simpler lab bugs vs Defects4J's real-world bugs, but does not substantively discuss whether these benchmarks adequately measure LAPR robustness or their construct validity limitations."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No agentic scaffolding is used; LLMs are called directly for repair."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "Section VII discusses that Defects4J and QuixBugs have been widely studied and may appear in training data. A dedicated experiment (Table VI) creates leakage-free datasets from perturbed samples to assess temporal leakage impact."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the repair prompt format or surrounding context could leak information about the expected fix."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether train and test examples share structural similarities (e.g., same projects in Defects4J appearing across samples)."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": true,
    370         "justification": "The paper constructs 'free-leakage' datasets (Defects4J_free-leakage, QuixBugs_free-leakage) using perturbed samples that LLMs are unlikely to have encountered during pre-training, then re-evaluates performance (Table VI)."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "34.4%–48.5% of MT-LAPR test cases expose the instability of LAPR techniques on average across four LLMs.",
    377       "evidence": "Table II shows average R-scores of 0.515 (Defects4Jtest) and 0.656 (QuixBugstest), meaning 48.5% and 34.4% of test cases caused repair failure after perturbation. Per-model R-scores range from 0.248 (CodeGemma on QuixBugs) to 0.951 (Mistral Large on QuixBugs).",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "There is a significant negative correlation between perturbation distance and LLMs' program repair performance.",
    382       "evidence": "Figure 2 shows R-score declining as perturbation distance increases from 1 to 9 on both datasets. The trend is consistent across all models.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Code readability positively correlates with LAPR robustness.",
    387       "evidence": "Figure 2 shows readability scores (from developer surveys) decreasing in parallel with R-score as perturbation distance increases. Cohen's kappa for inter-rater agreement is 0.65 and 0.67 (Section V-C).",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "CodeT5-based readability improvement enhances LAPR robustness by up to 49.32%.",
    392       "evidence": "Table V: LLaMA3-8B R-score improved from 0.440 to 0.657 with CodeT5-large (49.32% improvement). LLaMA3-70B improved from 0.536 to 0.617 (15.11%). Both CodeT5 models reduced edit distances of ~75% of perturbed codes.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "All nine proposed MRs are prevalent and practically significant among developers.",
    397       "evidence": "Figure 1 shows average frequency scores >3 (Generally Frequent) for all nine perturbations from 10 industry developers. Randolph's Kappa = 0.76 indicating 'almost perfect agreement' (Section V-A).",
    398       "supported": "weak"
    399     },
    400     {
    401       "claim": "Perturbation content is more significant than the number of perturbed tokens in affecting LLM repair performance.",
    402       "evidence": "Section V-D: Spearman correlation between edit distances and R-scores is 0.042 with p=0.914, showing no significant relationship between the amount of perturbation and LLM performance.",
    403       "supported": "moderate"
    404     },
    405     {
    406       "claim": "Larger LLMs exhibit better perturbation resistance in APR (scaling effect).",
    407       "evidence": "Table II: LLaMA3-70B (R-score 0.536/0.757) consistently outperforms LLaMA3-8B (0.440/0.668) on both datasets. Mistral Large performs best overall.",
    408       "supported": "moderate"
    409     }
    410   ],
    411   "red_flags": [
    412     {
    413       "flag": "Very small base sample sizes",
    414       "detail": "Only 15 base samples per LLM (60 total per dataset) are used, selected from samples each LLM can already repair. This is a very small foundation for claims about LAPR robustness in general. The combinatorial expansion to construct test cases does not compensate for the narrow base."
    415     },
    416     {
    417       "flag": "No code or framework released",
    418       "detail": "Neither the MT-LAPR implementation, the generated test cases, the CodeT5 models, nor the experimental outputs are released, preventing independent verification or replication."
    419     },
    420     {
    421       "flag": "Survey with only 10 developers",
    422       "detail": "Claims about perturbation prevalence and code readability rely on surveys with only 10 industry developers. This is too small to generalize about developer practices broadly, and recruitment methods are undisclosed."
    423     },
    424     {
    425       "flag": "No baseline testing framework comparison",
    426       "detail": "MT-LAPR is evaluated in isolation without comparison to any existing robustness testing approach (e.g., random code perturbation, existing mutation testing tools). It is unclear how much of the instability detected is attributable to the specific MR design vs. any perturbation."
    427     },
    428     {
    429       "flag": "Temperature=0 does not reflect practical deployment",
    430       "detail": "All experiments use temperature=0 for deterministic outputs, but practical deployments often use non-zero temperatures. The robustness results may differ substantially under realistic sampling settings."
    431     },
    432     {
    433       "flag": "Selection bias in base samples",
    434       "detail": "Only samples that LLMs can successfully repair are included as base samples. This excludes harder bugs and may overestimate the practical impact of perturbations by focusing on the easiest cases."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Automated program repair in the era of large pre-trained language models",
    440       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    441       "year": 2023,
    442       "relevance": "Extensively explored 9 state-of-the-art LLMs for automated program repair, demonstrating LLMs significantly outperform existing APR techniques."
    443     },
    444     {
    445       "title": "On the robustness of code generation techniques: An empirical study on github copilot",
    446       "authors": ["A. Mastropaolo", "L. Pascarella", "E. Guglielmi"],
    447       "year": 2023,
    448       "relevance": "Studied robustness of GitHub Copilot in code generation, finding ~46% of cases produce different recommendations when prompts are modified."
    449     },
    450     {
    451       "title": "NLPerturbator: Studying the robustness of code LLMs to natural language variations",
    452       "authors": ["J. Chen", "Z. Li", "X. Hu", "X. Xia"],
    453       "year": 2024,
    454       "arxiv_id": "2406.19783",
    455       "relevance": "Found that perturbed prompts can decrease code generation performance by up to 21.2% across six code LLMs."
    456     },
    457     {
    458       "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair",
    459       "authors": ["Y. Wei", "C. S. Xia", "L. Zhang"],
    460       "year": 2023,
    461       "relevance": "Proposes hybrid LLM-based program repair combining LLMs with code completion engines."
    462     },
    463     {
    464       "title": "RepairAgent: An autonomous, LLM-based agent for program repair",
    465       "authors": ["I. Bouzenia", "P. Devanbu", "M. Pradel"],
    466       "year": 2024,
    467       "arxiv_id": "2403.17134",
    468       "relevance": "Proposes an autonomous LLM-based agent for program repair, directly relevant to agentic AI workflows for code."
    469     },
    470     {
    471       "title": "Syntactic robustness for LLM-based code generation",
    472       "authors": ["L. Sarker", "M. Downing", "A. Desai", "T. Bultan"],
    473       "year": 2024,
    474       "arxiv_id": "2404.01535",
    475       "relevance": "Studies syntactic robustness of LLM-based code generation, directly related to code perturbation effects on LLMs."
    476     },
    477     {
    478       "title": "The plastic surgery hypothesis in the era of large language models",
    479       "authors": ["C. S. Xia", "Y. Ding", "L. Zhang"],
    480       "year": 2023,
    481       "relevance": "Investigates LLMs' APR capabilities in the context of the plastic surgery hypothesis about code reuse for patches."
    482     },
    483     {
    484       "title": "Hybrid automated program repair by combining large language models and program analysis",
    485       "authors": ["F. Li", "J. Jiang", "J. Sun", "H. Zhang"],
    486       "year": 2024,
    487       "arxiv_id": "2406.00992",
    488       "relevance": "Proposes combining LLMs with program analysis for APR, relevant to understanding LAPR technique design."
    489     },
    490     {
    491       "title": "VulRepair: a T5-based automated software vulnerability repair",
    492       "authors": ["M. Fu", "C. Tantithamthavorn", "T. Le", "V. Nguyen", "D. Phung"],
    493       "year": 2022,
    494       "relevance": "Uses T5 models for automated vulnerability repair, relevant to LLM-based code repair and the CodeT5 model used in this paper."
    495     },
    496     {
    497       "title": "PromptBench: Towards evaluating the robustness of large language models on adversarial prompts",
    498       "authors": ["K. Zhu", "J. Wang", "J. Zhou"],
    499       "year": 2023,
    500       "arxiv_id": "2306.04528",
    501       "relevance": "Benchmark for evaluating LLM robustness to adversarial prompts, foundational work on prompt sensitivity that motivates this paper."
    502     },
    503     {
    504       "title": "Exploring and unleashing the power of large language models in automated code translation",
    505       "authors": ["Z. Yang", "F. Liu", "Z. Yu"],
    506       "year": 2024,
    507       "relevance": "Studies LLMs for automated code translation, relevant to understanding LLM capabilities and limitations in code tasks."
    508     },
    509     {
    510       "title": "How effective are neural networks for fixing security vulnerabilities",
    511       "authors": ["Y. Wu", "N. Jiang", "H. V. Pham"],
    512       "year": 2023,
    513       "relevance": "Evaluates neural networks for security vulnerability repair, relevant to LLM-based code repair effectiveness."
    514     }
    515   ],
    516   "engagement_factors": {
    517     "practical_relevance": {
    518       "score": 2,
    519       "justification": "The framework and readability-improvement preprocessing concept are actionable for teams deploying LAPR tools, but no code is released for immediate use."
    520     },
    521     "surprise_contrarian": {
    522       "score": 1,
    523       "justification": "LLM sensitivity to input formatting is already well-known; the readability-robustness correlation is a useful quantification but not deeply surprising."
    524     },
    525     "fear_safety": {
    526       "score": 1,
    527       "justification": "Highlights that LLM-based repair tools are unreliable under common code variations, relevant to deployment safety but not a novel attack vector."
    528     },
    529     "drama_conflict": {
    530       "score": 0,
    531       "justification": "No controversy, no challenge to specific companies or benchmarks."
    532     },
    533     "demo_ability": {
    534       "score": 0,
    535       "justification": "No code repository, no demo, no installable tool provided."
    536     },
    537     "brand_recognition": {
    538       "score": 1,
    539       "justification": "Uses LLaMA3 and Mistral (moderately well-known) but not tier-1 brands like GPT-4 or Copilot."
    540     }
    541   }
    542 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs