scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31902B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Exploring and Lifting the Robustness of LLM-powered Automated Program Repair with Metamorphic Testing",
      6     "authors": [
      7       "Pengyu Xue",
      8       "Linhao Wu",
      9       "Zhen Yang",
     10       "Zhongxing Yu",
     11       "Zhi Jin"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2410.07516",
     16     "doi": "10.48550/arXiv.2410.07516"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims of '34.4%~48.5% instability' match Table II averages (1-0.656=34.4%, 1-0.515=48.5%). The '49.32% improvement' claim matches Table V (LLaMA3-8B with CodeT5-large).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The perturbation study uses controlled single-variable manipulation (same code, only perturbation differs, temperature=0 eliminates inference randomness), which is adequate for the causal claim that perturbations cause repair failures. The readability improvement experiment similarly isolates the preprocessing step.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title makes a broad claim about 'LLM-powered Automated Program Repair', but the experiments are restricted to Java only, 4 specific LLMs, and 2 datasets. The threats section acknowledges the Java limitation, but the title and abstract do not bound the claims.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section VII discusses data leakage as an alternative explanation and runs a dedicated experiment (Table VI). The paper also considers whether edit distance rather than perturbation content drives results (Section V-D, finding no significant correlation).",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "R-score directly measures what is claimed: the proportion of perturbed test cases on which the LLM still produces a correct repair. The proxy aligns with the measured outcome (robustness to perturbation).",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section VII 'Threats to Validity' provides substantive discussion of internal, external, and construct validity threats.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The threats section identifies specific issues: the nine MRs may not cover all coding discrepancies, Java-only limits generality, Defects4J data leakage risk (with dedicated experiment), and test suite evaluation may miss literal inconsistencies.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section VII explicitly states that focusing on Java limits generality, the MRs may not encapsulate all real-world coding styles, and the selected LLMs may affect conclusions. Future work to extend to other PLs is noted.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding sources, grants, or acknowledgments section is present in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are clearly listed: Shandong University, Peking University, Nanyang Technological University, Sun Yat-sen University, and City University of Hong Kong.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding source is disclosed, making independence impossible to assess.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interest declarations appear in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "LAPR, Metamorphic Relations (MRs), perturbation distance (with formal definition in Eq. 10-11), R-score (Eq. 12), and code readability (cited definition from Sedano 2016) are all explicitly defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are stated in the introduction: the MT-LAPR framework with nine MRs, empirical experiments on robustness deficiencies across four LLMs and two datasets, and the CodeT5-based readability improvement approach.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II reviews prior LAPR techniques (Xia et al., Zhang et al., InferFix) and metamorphic testing literature (Chen et al., Wang et al., Tao et al.), positioning MT-LAPR as the first application of metamorphic testing specifically to LAPR.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No repository URL, code archive, or link to the MT-LAPR implementation is provided anywhere in the paper.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The base datasets (Defects4J and QuixBugs) are standard public benchmarks. The perturbations are deterministic AST transformations fully described in the paper, making reconstruction feasible from public data.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No environment specifications, dependency files, or library versions are provided. The paper mentions JavaParser and Python difflib but gives no version or setup details.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The perturbation algorithms are described at a conceptual level but full implementation details are absent.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results (R-scores in Tables II–VI) are reported as point estimates without confidence intervals or error bars.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper uses Spearman correlation with p-value for the edit-distance analysis (Section V-D) and kappa coefficients for inter-rater agreement, but the main comparative claims about model performance differences (e.g., Mistral Large vs LLaMA3-70B) lack significance tests.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Improvements are reported with baseline context: e.g., LLaMA3-8B R-score from 0.440 to 0.657 (49.32% improvement), and percentage declines per model are quantified in Tables II and V.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper uses 60 base samples per dataset (15 per LLM) with no justification for why 15 is sufficient, no power analysis, and no discussion of whether this sample size supports generalizable conclusions.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread measures are reported. Temperature=0 makes outputs deterministic, but this eliminates only inference randomness — variance across different base sample selections is not assessed.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No baseline robustness testing framework or alternative perturbation approach is compared against MT-LAPR. The paper evaluates MT-LAPR in isolation.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "No baselines are included, so contemporaneity cannot be assessed.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "RQ4 (Table III) evaluates each of the 9 MRs individually, and RQ3 (Figure 2) studies the effect of combining different numbers of MRs (perturbation distance), effectively ablating components of the framework.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper uses R-score as the primary metric, supplemented by edit distance analysis (Table III), readability scores (Figure 2), and kappa coefficients for agreement assessment.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Developer surveys are included: RQ1 (Section V-A) has 10 developers rate perturbation frequency on a Likert scale, and RQ3 includes readability assessments by developer pairs.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "For the CodeT5 fine-tuning (Section VI-A), the paper states: 'we still test on the dataset used in previous experiments (RQ2~5), while preparing the training dataset with the rest of the samples,' establishing a proper train/test split.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by perturbation rule (Table III), repair pattern (Table IV), perturbation distance (Figure 2), and per-model (Table II).",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "RQ5 (Table IV) identifies repair patterns where LLMs are most vulnerable (Missing Null-Check at 0.298 R-score), and Section VI-C discusses approaches that failed (LLM refactoring causing 20.5% R-score reduction).",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Section VI-C 'Trial and error' reports that direct LLM code refactoring led to a 20.5% reduction in R-score, and that manual reverse rule design has scalability limitations.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Models are identified as 'Mistral Large', 'LLaMA3-70B', 'LLaMA3-8B', and 'CodeGemma-7B' without snapshot dates, API versions, or exact model identifiers. 'Mistral Large' is especially vague as Mistral has released multiple versions.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper states 'the parameter settings and prompt templates for all LLMs reviewed are fixed the same' (Section IV-A) but does not provide the actual prompt text used for repair.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature=0 is stated for all LLMs (Section IV-A). For CodeT5 fine-tuning: 3 epochs, learning rate 5×10⁻⁵, batch size 1, weight decay 0.01 (Section VI-A).",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. LLMs are called directly for code repair without agent loops, tools, or iterative workflows.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section IV-B documents the base sample construction: filter for successfully repaired samples, follow Sobreira et al.'s taxonomy to sample across repair patterns, obtain 60 samples per dataset. Section IV-D details test case construction with combinatorial sampling.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw experimental data (perturbed test cases, LLM outputs, repair results) is released for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The pilot study describes collecting 500 Codeforces samples (10 problems, 50 Java codes each, average difficulty, highest passing rate). The base sample selection from Defects4J and QuixBugs is documented in Section IV-B.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "The paper states '10 full-time Java developers (at least 3-5 years of coding experience) from the industry' but does not describe how they were recruited, from which companies, or whether recruitment could introduce bias.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline is documented: original datasets → filter successfully repaired samples → select across repair patterns (60 base samples) → apply perturbation rules → construct single/multi-distance test cases with combinatorial sampling → evaluate via test suites with reverse processing for variable/method names.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the four LLMs (Mistral Large, LLaMA3-70B/8B, CodeGemma-7B).",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Section VII explicitly discusses data leakage: 'since the datasets we used have been widely studied, data leakage may pose an internal threat.' A dedicated experiment (Table VI) uses perturbed samples as leakage-free test sets.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Section VII constructs Defects4J_free-leakage and QuixBugs_free-leakage datasets from perturbed samples that LLMs are unlikely to have seen during pre-training, and re-runs experiments to validate conclusions hold (Table VI).",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration is mentioned for the developer surveys.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No IRB or ethics board approval is mentioned for the developer surveys.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": true,
    328           "justification": "Survey respondents are characterized as 'ten full-time Java developers (at least 3-5 years of coding experience for each) from the industry' (Section IV-D1).",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "Inclusion criterion is stated: 'at least 3-5 years of practical Java development experience' and 'full-time Java developers from the industry' (Sections III-A, IV-D1).",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "The human component is a cross-sectional survey/rating task, not an experimental study with conditions requiring randomization.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "The human component is a Likert-scale rating task on perturbation frequency and readability, not an experimental comparison where blinding would apply.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "The paper does not report whether all 10 developers completed all survey/rating tasks or whether any dropped out.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, API cost, or latency is reported despite running 4 LLMs across thousands of test cases.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No GPU hours, hardware specifications, total API spend, or training time is reported for either the LLM evaluations or the CodeT5 fine-tuning.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "Results are not reported across multiple random seeds. Temperature=0 is used to make outputs deterministic, but the paper does not report sensitivity to other sources of variance (e.g., base sample selection).",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is not explicitly stated. Temperature=0 implies single-run deterministic outputs, but this is not stated as '1 run per test case.'",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search is reported for the CodeT5 fine-tuning (learning rate, batch size, epochs appear fixed without search). LLM temperature is fixed at 0 without justification.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "Temperature=0 is stated without justification for why this is the best setting. CodeT5 hyperparameters are reported but their selection is not justified.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "Many comparisons are made across 4 models, 9 perturbation rules, 9 perturbation distances, and 9 repair patterns, but no correction for multiple comparisons is applied.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors evaluate their own MT-LAPR framework without acknowledging the bias of self-evaluation or using independent evaluators for the framework's effectiveness.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "No performance-vs-compute analysis is provided. Model sizes range from 7B to unknown (Mistral Large) but cost/compute differences are not discussed.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper notes that QuixBugs contains simpler lab bugs vs Defects4J's real-world bugs, but does not substantively discuss whether these benchmarks adequately measure LAPR robustness or their construct validity limitations.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No agentic scaffolding is used; LLMs are called directly for repair.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": true,
    430           "justification": "Section VII discusses that Defects4J and QuixBugs have been widely studied and may appear in training data. A dedicated experiment (Table VI) creates leakage-free datasets from perturbed samples to assess temporal leakage impact.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "The paper does not discuss whether the repair prompt format or surrounding context could leak information about the expected fix.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "The paper does not discuss whether train and test examples share structural similarities (e.g., same projects in Defects4J appearing across samples).",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": true,
    448           "justification": "The paper constructs 'free-leakage' datasets (Defects4J_free-leakage, QuixBugs_free-leakage) using perturbed samples that LLMs are unlikely to have encountered during pre-training, then re-evaluates performance (Table VI).",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "MT-LAPR detects instability in 34.4%–48.5% of test cases on average across 4 LLMs and 2 datasets",
    457       "evidence": "Table II reports overall R-scores of 0.515 (Defects4J) and 0.656 (QuixBugs), meaning 34.4–48.5% of cases expose instability",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "Code readability positively correlates with LAPR robustness: more perturbations reduce both readability and repair success",
    462       "evidence": "Figure 2 shows a parallel decline in R-score and developer-rated readability scores as perturbation distance increases from 1 to 9; inter-rater Cohen's kappa 0.65–0.67",
    463       "supported": "moderate"
    464     },
    465     {
    466       "claim": "CodeT5-based readability improvement model enhances LAPR robustness by 7.46%–49.32%",
    467       "evidence": "Table V shows LLaMA3-8B R-score improving from 0.440 to 0.657 (+49.32%) and LLaMA3-70B from 0.536 to 0.617 (+15.11%) with CodeT5-large",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "Larger LLMs exhibit greater robustness than smaller ones in APR under perturbation (scaling effect)",
    472       "evidence": "Table II shows LLaMA3-70B consistently outperforms LLaMA3-8B (0.536 vs 0.440 on Defects4J; 0.757 vs 0.668 on QuixBugs)",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Perturbation content type has greater impact on LLM performance than number of edited tokens",
    477       "evidence": "Spearman correlation between edit distance and R-score is r=0.042, p=0.914 (not significant), while individual MR types show varying R-scores ranging from 0.500 to 0.750",
    478       "supported": "moderate"
    479     },
    480     {
    481       "claim": "Missing Null-Check and Wraps-with/Unwraps-from repair patterns are most sensitive to perturbations",
    482       "evidence": "Table IV shows Missing Null-Check R-scores of 0.298 (Defects4J) and 0.357 (QuixBugs), the lowest among all repair patterns with sufficient samples",
    483       "supported": "moderate"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval",
    488     "empirical"
    489   ],
    490   "key_findings": "MT-LAPR, a metamorphic testing framework with 9 code perturbation rules across token, statement, and block levels, reveals that 34.4%–48.5% of semantically-equivalent code variants cause instability in LLM-powered program repair across 4 LLMs and 2 datasets. Perturbation distance negatively correlates with both developer-rated code readability and repair success rate. A CodeT5 model trained specifically to undo the MR perturbations improves robustness by up to 49.32% when used as a preprocessing step, while direct LLM-based code refactoring reduces performance by 20.5%. Repair patterns involving complex conditional logic (Missing Null-Check) are most sensitive to perturbations.",
    491   "red_flags": [
    492     {
    493       "flag": "Circular intervention design",
    494       "detail": "The CodeT5 'readability improvement' model is trained on inverted MR pairs from the same 9 perturbation rules used in testing. It is learning to de-perturb, not to improve general readability, making the 49.32% improvement largely circular and unlikely to generalize to real-world readability issues."
    495     },
    496     {
    497       "flag": "Prompts not disclosed",
    498       "detail": "The paper states prompt templates are fixed across all LLMs but never includes or describes the actual prompts used for bug repair, making the core LLM evaluation non-reproducible."
    499     },
    500     {
    501       "flag": "No statistical significance on main results",
    502       "detail": "Main comparative claims (LLM A vs LLM B robustness, before vs after CodeT5) have no confidence intervals, effect size tests, or significance tests; only a secondary edit-distance analysis uses Spearman correlation."
    503     },
    504     {
    505       "flag": "Model versions unspecified",
    506       "detail": "LLMs are cited by marketing name and announcement URL rather than specific checkpoint versions, API snapshot dates, or model hashes, preventing exact replication."
    507     },
    508     {
    509       "flag": "Human study lacks basic reporting standards",
    510       "detail": "The 10-developer surveys have no IRB approval, no pre-registration, no demographics beyond years of experience, and no blinding or randomization procedure described."
    511     },
    512     {
    513       "flag": "Java-only generalization",
    514       "detail": "All experiments use Java (Defects4J, QuixBugs, Codeforces Java samples), but the paper makes broad claims about 'LAPR techniques' in general without restricting conclusions to Java."
    515     }
    516   ],
    517   "cited_papers": [
    518     {
    519       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models (Xia et al., ICSE 2023)",
    520       "relevance": "Core prior work establishing LLM superiority in APR; MT-LAPR directly tests robustness of such systems"
    521     },
    522     {
    523       "title": "On the Robustness of Code Generation Techniques: An Empirical Study on GitHub Copilot (Mastropaolo et al., ICSE 2023)",
    524       "relevance": "Direct prior work on LLM robustness for code tasks using natural language perturbations; this paper extends to structural code perturbations"
    525     },
    526     {
    527       "title": "NLPerturbator: Studying the Robustness of Code LLMs to Natural Language Variations (Chen et al., 2024)",
    528       "relevance": "Related work testing code LLM robustness via NL prompt perturbations across 6 models; MT-LAPR addresses the complementary code-level perturbation gap"
    529     },
    530     {
    531       "title": "A Survey on Metamorphic Testing (Segura et al., IEEE TSE 2016)",
    532       "relevance": "Foundational methodology survey for the metamorphic testing technique adopted in MT-LAPR"
    533     },
    534     {
    535       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs (Just et al., ISSTA 2014)",
    536       "relevance": "Primary evaluation dataset used in all main experiments"
    537     },
    538     {
    539       "title": "QuixBugs: A Multi-Lingual Program Repair Benchmark Set Based on the Quixey Challenge (Lin et al., 2017)",
    540       "relevance": "Second evaluation dataset used in all main experiments"
    541     },
    542     {
    543       "title": "CodeT5: Identifier-Aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation (Wang et al., 2021)",
    544       "relevance": "Base model fine-tuned for the code readability improvement approach"
    545     },
    546     {
    547       "title": "Syntactic Robustness for LLM-Based Code Generation (Sarker et al., 2024)",
    548       "relevance": "Concurrent work on syntactic perturbations affecting LLM code generation, used for comparison in discussion"
    549     }
    550   ],
    551   "engagement_factors": {
    552     "practical_relevance": {
    553       "score": 2,
    554       "justification": "The framework and readability-improvement preprocessing concept are actionable for teams deploying LAPR tools, but no code is released for immediate use."
    555     },
    556     "surprise_contrarian": {
    557       "score": 1,
    558       "justification": "LLM sensitivity to input formatting is already well-known; the readability-robustness correlation is a useful quantification but not deeply surprising."
    559     },
    560     "fear_safety": {
    561       "score": 1,
    562       "justification": "Highlights that LLM-based repair tools are unreliable under common code variations, relevant to deployment safety but not a novel attack vector."
    563     },
    564     "drama_conflict": {
    565       "score": 0,
    566       "justification": "No controversy, no challenge to specific companies or benchmarks."
    567     },
    568     "demo_ability": {
    569       "score": 0,
    570       "justification": "No code repository, no demo, no installable tool provided."
    571     },
    572     "brand_recognition": {
    573       "score": 1,
    574       "justification": "Uses LLaMA3 and Mistral (moderately well-known) but not tier-1 brands like GPT-4 or Copilot."
    575     }
    576   },
    577   "hn_data": {
    578     "threads": [
    579       {
    580         "hn_id": "24800245",
    581         "title": "World Age in Julia: Optimizing Method Dispatch in the Presence of Eval",
    582         "points": 8,
    583         "comments": 1,
    584         "url": "https://news.ycombinator.com/item?id=24800245"
    585       },
    586       {
    587         "hn_id": "37860517",
    588         "title": "Llark: An LLM which understands music",
    589         "points": 2,
    590         "comments": 1,
    591         "url": "https://news.ycombinator.com/item?id=37860517"
    592       },
    593       {
    594         "hn_id": "42048023",
    595         "title": "Text Embedding Benchmark (2022)",
    596         "points": 2,
    597         "comments": 0,
    598         "url": "https://news.ycombinator.com/item?id=42048023"
    599       },
    600       {
    601         "hn_id": "36512785",
    602         "title": "Can Language Representation Models Think in Bets?",
    603         "points": 1,
    604         "comments": 0,
    605         "url": "https://news.ycombinator.com/item?id=36512785"
    606       }
    607     ],
    608     "top_points": 8,
    609     "total_points": 13,
    610     "total_comments": 2
    611   }
    612 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs