scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29218B)
      1 {
      2   "paper": {
      3     "title": "RePaCA: Leveraging Reasoning Large Language Models for Static Automated Patch Correctness Assessment",
      4     "authors": [
      5       "Marcos Fuster-Peña",
      6       "David de-Fitero-Dominguez",
      7       "Antonio Garcia-Cabot",
      8       "Eva Garcia-Lopez"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2507.22580",
     13     "doi": "10.48550/arXiv.2507.22580"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "RePaCA fine-tunes Qwen2.5-Coder 3B with GRPO reinforcement learning for static automated patch correctness assessment, achieving 83.1% accuracy and 84.8% F1 on the standard Defects4J-derived benchmark, surpassing prior SOTA (APPT) by 3.4pp accuracy. The model shows stronger cross-dataset generalization (72.7% vs 60.5% accuracy when trained on small and tested on large dataset). Chain-of-thought reasoning provides explainable justifications, though the reasoning contains occasional hallucinations and contradictions.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No source code repository is provided. The paper references the APPT GitHub repo (https://github.com/iSEngLab/APPT) for datasets but provides no link to their own implementation, model weights, or training scripts."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The datasets used are publicly available from the APPT GitHub repository (Section 3.5, footnote 1: https://github.com/iSEngLab/APPT), derived from CACHE. Both the small (1,183 patches from Defects4J) and large (49,694 patches) datasets are accessible."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Section 3.6 mentions NVIDIA H100 NVL GPU, TRL library, unsloth, and PyTorch framework, but provides no library versions, requirements.txt, Dockerfile, or detailed environment specification sufficient to recreate the setup."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided. While hyperparameters are listed in Section 3.6, there are no scripts, README, or commands to reproduce the experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables 4 and 5 report only point estimates (e.g., 83.1% accuracy, 84.8% F1). Despite using 5-fold cross-validation, no confidence intervals, error bars, or standard deviations across folds are reported."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims superiority over APPT and CACHE based solely on comparing point estimates (e.g., 'surpassing APPT by 3.4 percentage points'). No statistical significance tests (t-test, Wilcoxon, bootstrap) are applied."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 4.1 reports percentage point differences with baseline context throughout (e.g., 'surpassing APPT by 3.4 percentage points', 'CACHE by 7.8 points', 'our Precision of 84.0%, which is 3.2 points higher than APPT (80.8%)')."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The small dataset contains 1,183 patches and the large dataset 49,694 patches, inherited from prior work. No power analysis or justification for these sizes is provided. Section 3.5 simply describes the datasets as given."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Despite using 5-fold cross-validation, no variance, standard deviation, or spread measure across folds is reported. Tables 4 and 5 show only single aggregate numbers per metric."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 4 compares against four baselines: ODS, Quatrain, CACHE, and APPT. Table 5 compares against APPT for cross-dataset generalization."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "APPT (2024) is the leading prior method. CACHE (2022) and ODS (2022) are also relatively recent. The baselines represent the state of the art in static APCA."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The system has multiple components (prompt design with CoT guidance, GRPO training, one-shot example, asymmetric reward, model selection) but no ablation study isolates the contribution of any individual component. It is impossible to attribute the improvement to GRPO vs prompting vs model choice."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Five evaluation metrics are reported: Accuracy, Precision, Recall, F1-score, and AUC (Section 3.4, Tables 4 and 5)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No systematic human evaluation is performed. Section 4.3 shows two hand-picked examples of model reasoning, but there is no structured human assessment of reasoning quality across the dataset."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "5-fold cross-validation separates train and test splits (Section 3.5). In the cross-dataset experiment (Section 4.2), the model is trained on the small dataset and tested on the entirely separate large dataset."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Only aggregate metrics are reported. No breakdown by bug type, project, patch complexity, or any other category is provided, despite Defects4J containing patches from multiple projects."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 5.2 discusses specific failure modes: hallucination, failure to identify actual code changes, overemphasis on superficial changes, contradictory conclusions (reasoning says correct but label says overfitting), and ~1% format failures. Section 4.3.1 shows an example where the model makes an error in its reasoning."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 3.3 reports that distilled DeepSeek models 'showed poor performance on our APCA task' despite being pre-trained for CoT. Smaller models (0.5B, 1.5B) 'struggled with complex reasoning tasks.' Larger models (7B) had memory issues without significant gains."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims 83.1% accuracy and 84.8% F1, which match Table 4. Claims of 'state-of-the-art performance' and 'superior generalization' are supported by Tables 4 and 5 respectively."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes causal claims throughout — e.g., 'GRPO finetuning process appears to be effective and critical' (Section 5.1), 'This improvement comes from the LLM's understanding' (Section 1). No ablation study isolates the causal contribution of any single component. The improvement could be due to model size, prompting strategy, or other factors."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title and abstract claim general 'Static Automated Patch Correctness Assessment' but results are limited to Java patches from Defects4J and ManySStuBs4J. The paper does not explicitly bound conclusions to Java or to these specific benchmarks."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "Section 5.2 discusses threats to validity but does not consider alternative explanations for the results. For example, the improvement could be due to Qwen2.5-Coder's pre-training on Java code, the model's larger parameter count vs BERT-based APPT, or data contamination. None of these are discussed."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures binary classification accuracy on labeled correct/overfitting patches, which is exactly what it claims to measure. No proxy gap exists — the measurements match the granularity of the claims."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section 3.3 specifies 'Qwen2.5-Coder 3B' as the base model, which is a specific model family, version (2.5), variant (Coder), and parameter count (3B). Other evaluated models are also named with versions and sizes."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 3.1 describes the prompt design in detail (structure, one-shot example, output format with <think>/<answer> tags) but does not provide the actual prompt text used. The reader cannot reconstruct the exact prompt."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section 3.6 reports: 20 epochs, learning rate 5e-5, batch size 8, 2 gradient accumulation steps, 8 GRPO candidate generations, temperature 0.6, max completion length 2048 tokens."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. RePaCA is a fine-tuned LLM with a single prompt-response pattern, not an agent with tools or multi-step workflows."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "Section 3.5 states datasets were 'extracted from files provided on the GitHub, following the methodology outlined in the original paper.' The preprocessing is deferred to prior work. Filtering and deduplication are mentioned but specific criteria and counts at each step are not provided."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5.2 'Threats to Validity' is a dedicated subsection discussing practical and technical limitations of the approach."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 5.2 discusses threats specific to this study: hallucination in reasoning, failure to identify actual code changes, ~1% format failures, contradictory conclusions between reasoning and labels, and computational overhead of GRPO training."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what the results do NOT show. No bounds are given for language (Java only), benchmark (Defects4J/ManySStuBs4J only), or patch source. Section 6 discusses future work but does not bound current claims."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The datasets are publicly available from the APPT GitHub repository (https://github.com/iSEngLab/APPT), which provides the raw patch data."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.5 describes data sources: the small dataset combines patches from Tian et al. (1,000 patches) and Wang et al. (902 patches) from Defects4J v1.2. The large dataset uses correct patches from ManySStuBs4J and overfitting patches from RepairThemAll."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data sources are standard public benchmarks (Defects4J, ManySStuBs4J, RepairThemAll)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The pipeline from raw patches to final datasets involves filtering and deduplication, but specific counts at each step are incomplete. For the small dataset: 1,000 + 902 → 1,183 after filtering, but criteria are not detailed. For the large dataset: 'duplicated and biased patches were removed' with no counts."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The Acknowledgements section discloses two funding sources: project SBPLY/23/180225/000063 and project TIFON (PLEC2023-010251), both from Spanish research funding calls."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All four authors are affiliated with Universidad de Alcalá, Departamento de Ciencias de la Computación. No commercial product is being evaluated, so no product-related conflict exists."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funding comes from Spanish government research grants (Ayudas para la realización de proyectos de investigación científica). These funders have no commercial stake in APCA outcomes."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No mention of Qwen2.5-Coder's training data cutoff date. The base model could have seen Defects4J patches or related code during pretraining."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether the Qwen2.5-Coder base model may have encountered Defects4J patches, Java bug-fix datasets, or related code during pretraining."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "Defects4J (2014) and its patches have been widely published and are available on GitHub. Qwen2.5 was trained on web data that likely includes these repositories. This contamination risk is not discussed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Section 5.2 states inference is 'relatively fast and efficient' with a ~3B model but provides no actual inference time, cost per patch, or latency measurements."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Section 3.6 mentions an NVIDIA H100 NVL GPU but does not report total training time, GPU hours, wall-clock duration, or any quantified compute budget."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "5-fold cross-validation is used, but no results across different random seeds are reported, and there is no sensitivity analysis of initialization randomness in the GRPO training."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "Section 3.5 states '5-fold cross-validation method is used to test the performance,' which explicitly fixes the number of train/test runs at five (one per fold)."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Section 3.6 mentions '20 epochs was determined to be generally optimal based on observing the convergence' and 'preliminary experimentation' for reward values, but no systematic search budget (number of configurations tried, search method) is reported."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Hyperparameters appear partially justified (learning rate same as APPT, others for VRAM reduction) but no validation-based selection process is described. The reward asymmetry (1.0 vs 2.0) was chosen after 'preliminary experimentation' with no details on alternatives tried."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "No statistical tests are performed at all, let alone correction for multiple comparisons across the 4 baselines and 5 metrics."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "In Section 4.2, the authors re-implement and train APPT themselves for the cross-dataset comparison. No acknowledgment that their re-implementation of the baseline may underperform the original authors' version."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "RePaCA uses a 3B parameter model with GRPO training (8 candidate generations per iteration) while APPT uses BERT with LSTM. These vastly different compute requirements are not discussed as a potential confound."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper uses Defects4J-derived benchmarks without questioning whether correct/overfitting labels are reliable, whether the benchmark captures real-world APCA challenges, or whether the small dataset (1,183 patches) adequately represents the problem space."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. RePaCA is a single-prompt fine-tuned model, not an agentic system."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "Defects4J (2014) and its patches predate Qwen2.5-Coder's training. The base model may have seen patch solutions during pretraining. This temporal leakage risk is not discussed."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the model's pretraining on code repositories provides information leakage about the specific bugs and patches in the test set."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether train and test folds contain patches from the same Defects4J projects, which could introduce structural dependencies. The 5-fold split strategy is not described."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "RePaCA achieves state-of-the-art performance on the standard small dataset benchmark with 83.1% accuracy and 84.8% F1-score.",
    370       "evidence": "Table 4 (Section 4.1) shows accuracy 83.1%, precision 84.0%, recall 85.7%, F1 84.8%, AUC 82.8%, surpassing APPT (79.7% accuracy, 81.8% F1) and all other static APCA baselines under 5-fold cross-validation.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "RePaCA demonstrates superior generalization when trained on the small dataset and tested on the large dataset, outperforming APPT.",
    375       "evidence": "Table 5 (Section 4.2) shows 72.7% accuracy and 75.4% F1 vs APPT's 60.5% accuracy and 71.4% F1 when trained on small and tested on large dataset.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "GRPO fine-tuning is effective and critical for enhancing reasoning capabilities for the APCA task.",
    380       "evidence": "Section 5.1 claims GRPO is 'effective and critical' but no ablation compares GRPO-trained vs non-GRPO-trained versions. The only evidence is the final system's performance.",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "The Chain-of-Thought reasoning provides explainable, transparent justifications for patch assessments.",
    385       "evidence": "Section 4.3 shows two hand-picked examples. Section 5.2 acknowledges the model sometimes hallucinates, makes contradictory conclusions, and has ~1% format failures.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Qwen2.5-Coder 3B provides the best balance between reasoning capability and computational efficiency for APCA.",
    390       "evidence": "Section 3.3 describes evaluating 0.5B, 1.5B, 3B, 7B variants and DeepSeek distilled models, concluding 3B is optimal. No detailed comparison table or metrics are provided for model selection.",
    391       "supported": "weak"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No ablation study",
    397       "detail": "The system combines multiple components (prompt design, CoT guidance, GRPO training, one-shot example, asymmetric reward, model choice) but no ablation isolates any single contribution. The improvement cannot be attributed to any specific innovation."
    398     },
    399     {
    400       "flag": "No statistical significance testing",
    401       "detail": "All comparative claims (e.g., 'surpassing APPT by 3.4 percentage points') are based on comparing point estimates without any statistical tests. With only 1,183 patches and 5-fold CV, the variance across folds could easily account for observed differences."
    402     },
    403     {
    404       "flag": "No variance reported despite cross-validation",
    405       "detail": "5-fold cross-validation is used but no standard deviation, confidence interval, or spread measure across folds is reported. The reader cannot assess whether the improvements are within noise."
    406     },
    407     {
    408       "flag": "Contamination risk unaddressed",
    409       "detail": "Defects4J (published 2014) and its patches likely overlap with Qwen2.5-Coder's training data. The base model may have memorized correct patches during pretraining, inflating classification accuracy. This is never discussed."
    410     },
    411     {
    412       "flag": "Explainability claims based on two examples",
    413       "detail": "The paper claims 'enhanced explainability' and 'transparent justification' based on two hand-picked examples (Section 4.3), one of which contains a reasoning error (inventing a new variable). No systematic evaluation of reasoning quality is performed."
    414     },
    415     {
    416       "flag": "Self-comparison bias in cross-dataset experiment",
    417       "detail": "In Section 4.2, the authors re-implement and retrain APPT themselves for the cross-dataset comparison. Their re-implementation may underperform the original, but this bias is not acknowledged."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "A Survey of Learning-based Automated Program Repair",
    423       "authors": ["Q. Zhang", "C. Fang", "Y. Ma", "W. Sun", "Z. Chen"],
    424       "year": 2023,
    425       "arxiv_id": "2301.03270",
    426       "doi": "10.48550/arXiv.2301.03270",
    427       "relevance": "Survey of deep learning approaches to automated program repair, directly relevant to understanding the LLM-based APR landscape."
    428     },
    429     {
    430       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    431       "authors": ["DeepSeek-AI"],
    432       "year": 2025,
    433       "arxiv_id": "2501.12948",
    434       "doi": "10.48550/arXiv.2501.12948",
    435       "relevance": "Foundational work on GRPO-based reasoning in LLMs, the core training methodology adapted by RePaCA."
    436     },
    437     {
    438       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    439       "authors": ["J. Wei"],
    440       "year": 2023,
    441       "arxiv_id": "2201.11903",
    442       "doi": "10.48550/arXiv.2201.11903",
    443       "relevance": "Foundational CoT prompting paper that underlies the reasoning approach used in RePaCA."
    444     },
    445     {
    446       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    447       "authors": ["Z. Feng"],
    448       "year": 2020,
    449       "arxiv_id": "2002.08155",
    450       "doi": "10.48550/arXiv.2002.08155",
    451       "relevance": "Foundational pre-trained code model used as a component in INVALIDATOR, relevant to understanding LLM-based code analysis."
    452     },
    453     {
    454       "title": "APPT: Boosting Automated Patch Correctness Prediction via Fine-Tuning Pre-Trained Models",
    455       "authors": ["Q. Zhang"],
    456       "year": 2024,
    457       "doi": "10.1109/TSE.2024.3354969",
    458       "relevance": "Previous SOTA static APCA technique using fine-tuned BERT+LSTM; the primary baseline that RePaCA aims to surpass."
    459     },
    460     {
    461       "title": "Leveraging Large Language Model for Automatic Patch Correctness Assessment",
    462       "authors": ["X. Zhou"],
    463       "year": 2024,
    464       "doi": "10.1109/TSE.2024.3452252",
    465       "relevance": "LLM4PatchCorrect — directly relevant prior work applying LLMs to patch correctness assessment."
    466     },
    467     {
    468       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    469       "authors": ["Z. Shao"],
    470       "year": 2024,
    471       "arxiv_id": "2402.03300",
    472       "doi": "10.48550/arXiv.2402.03300",
    473       "relevance": "Source of the GRPO algorithm used for fine-tuning in RePaCA."
    474     },
    475     {
    476       "title": "Qwen2.5 Technical Report",
    477       "authors": ["Qwen"],
    478       "year": 2025,
    479       "arxiv_id": "2412.15115",
    480       "doi": "10.48550/arXiv.2412.15115",
    481       "relevance": "Technical report for the Qwen2.5 model family from which RePaCA's base model (Qwen2.5-Coder 3B) is drawn."
    482     },
    483     {
    484       "title": "Proximal Policy Optimization Algorithms",
    485       "authors": ["J. Schulman", "F. Wolski", "P. Dhariwal", "A. Radford", "O. Klimov"],
    486       "year": 2017,
    487       "arxiv_id": "1707.06347",
    488       "doi": "10.48550/arXiv.1707.06347",
    489       "relevance": "Foundational RL algorithm (PPO) that GRPO builds upon; relevant to understanding the training methodology."
    490     },
    491     {
    492       "title": "Defects4J: a database of existing faults to enable controlled testing studies for Java programs",
    493       "authors": ["R. Just", "D. Jalali", "M. D. Ernst"],
    494       "year": 2014,
    495       "doi": "10.1145/2610384.2628055",
    496       "relevance": "Standard Java bug benchmark from which the primary evaluation dataset is derived."
    497     }
    498   ],
    499   "engagement_factors": {
    500     "practical_relevance": {
    501       "score": 1,
    502       "justification": "APCA is a niche sub-field of APR; most developers would not directly use a patch correctness classifier in their daily workflow."
    503     },
    504     "surprise_contrarian": {
    505       "score": 0,
    506       "justification": "Confirms the expected trend that larger LLMs with reasoning fine-tuning outperform older NLP methods on code analysis tasks."
    507     },
    508     "fear_safety": {
    509       "score": 0,
    510       "justification": "No AI safety, security, or risk concerns raised by the work."
    511     },
    512     "drama_conflict": {
    513       "score": 0,
    514       "justification": "No controversy or conflicting claims with other groups."
    515     },
    516     "demo_ability": {
    517       "score": 0,
    518       "justification": "No code, demo, or model weights are released."
    519     },
    520     "brand_recognition": {
    521       "score": 0,
    522       "justification": "From Universidad de Alcalá, a relatively unknown institution in the AI/SE community; no well-known product or lab involved."
    523     }
    524   }
    525 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs