scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27809B)
      1 {
      2   "paper": {
      3     "title": "Repair-R1: Better Test Before Repair",
      4     "authors": [
      5       "Haichuan Hu",
      6       "Xiaochen Xie",
      7       "Quanjun Zhang"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2507.22853",
     12     "doi": "10.48550/arXiv.2507.22853"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "Repair-R1 jointly optimizes test generation and bug repair via reinforcement learning (GRPO), requiring the model to generate discriminative tests before producing patches. Across four benchmarks (HumanEval, MBPP, CodeForces, CodeContests) and three Qwen models, RL-Both improves repair success rate by 2.68%–48.29% over vanilla models while avoiding the catastrophic forgetting observed with SFT on underrepresented benchmarks. Ablation shows joint optimization (RL-Both) outperforms single-objective RL in 11 of 12 settings.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract states 'We publish the code and weights at Github and HuggingFace' but no repository URL or archive link is provided anywhere in the paper text. Without a verifiable URL, this does not meet the criterion."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The base benchmarks (HumanEval, MBPP, CodeForces, CodeContests) are publicly available, but the core experimental dataset—5,421 training and 1,358 test defective variants generated by GPT-4o—is not released. The paper mentions publishing code and weights but not the custom defect dataset."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, conda environment, or library versions are specified. The paper mentions model names but provides no environment setup details."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README commands, or scripts for replicating experiments are described in the paper."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Table I reports only point estimates (e.g., '81.25%', '66.15%') with no confidence intervals, error bars, or ± notation."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Claims of superiority (e.g., 'Repair-R1 improves repair success rate') are based solely on comparing raw percentages. No p-values, t-tests, or other significance tests are reported."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table I provides absolute performance for all methods across all benchmarks, and the paper reports improvement ranges (e.g., '2.68% to 48.29%') with baseline context, allowing readers to assess effect magnitude."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Benchmark sizes are stated (HumanEval 112, MBPP 257, CodeForces 585, CodeContests 404) but there is no justification for why these sizes are adequate or any power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviation, variance across seeds, or spread measures are reported. Results appear to be from single runs."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table I compares five configurations: Vanilla (no adaptation), SFT (supervised fine-tuning), RL-Repair (RL on repair only), RL-Test (RL on test only), and RL-Both (Repair-R1)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "All baselines are internal variants of their own approach (Vanilla, SFT, single-objective RL). No comparison with contemporary external APR methods such as RepairAgent, CREF, or other LLM-based APR systems that are cited in the related work."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section V-C presents an explicit ablation study comparing RL-Repair (repair reward only), RL-Test (test reward only), and RL-Both (joint) to isolate the contribution of each objective."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Three evaluation metrics are used: Bugfix (repair success rate), Test (discriminative test generation success rate), and Tcov (test coverage of bugs)."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of patch quality is performed. All evaluation is automated via oracle test execution. Human evaluation would be relevant to assess whether patches are semantically correct rather than merely test-passing."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section IV-A states 'the filtered defect dataset was partitioned into training and test sets at a 4:1 ratio, with care taken to ensure that no original sample appeared in both sets, thereby preventing data contamination.'"
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table I provides results broken down by all four benchmarks (HumanEval, MBPP, CodeForces, CodeContests), three models, and five method configurations."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No systematic failure analysis is presented. The paper does not discuss specific cases where Repair-R1 fails or analyze failure modes."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section V-B reports that SFT causes performance degradation on HumanEval and MBPP. Section V-C notes that RL-Test can decrease repair effectiveness (e.g., Qwen3-4B RL-Test shows decreased repair on HumanEval, CodeForces, and CodeContests)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims of improvement ranges (repair 2.68%–48.29%, test generation 16.38%–53.28%, test coverage 0.78%–53.96%) can be verified from Table I data across the three models and four benchmarks."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims ('test generation helps repair better'). The ablation study in Section V-C provides controlled single-variable manipulation: RL-Test, RL-Repair, and RL-Both differ only in the reward signal, providing adequate evidence for the causal contribution of each component."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper claims to show 'a new direction for LLM-based APR' and the contribution of a 'novel method' without bounding claims to the tested setting: Python-only, artificially mutated bugs, three Qwen models. Real-world bugs, other languages, and other model families are not tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations are considered. The paper does not discuss whether improvements could stem from increased training signal, compute differences between methods, or the fact that RL-Both simply gets more gradient updates on repair-relevant objectives."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures oracle test pass rate but frames it as the model 'understanding the underlying cause of the defect' and 'performing more effective repairs.' The well-known plausible-patch problem in APR (patches that pass tests but are semantically incorrect) is not acknowledged."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The three evaluated models are specified by exact HuggingFace identifiers: Qwen2.5-Coder-1.5B-Instruct, Qwen2.5-Coder-3B-Instruct, and Qwen3-4B. GPT-4o is used for data generation without a snapshot version, but the primary evaluated models are precisely identified."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Figure 3 provides the complete prompt including system prompt, task prompt, one-shot example with full text, and task input template. The fill values are the buggy functions from the dataset."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "The GRPO objective references hyperparameters ε and β but their values are not stated. Learning rate, batch size, training epochs, sampling temperature, and other critical hyperparameters are not reported."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The approach is a standard prompted LLM with RL training, not an agentic system."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section IV-A documents the four-step pipeline: (1) normal sample collection from four benchmarks with runtime filtering, (2) GPT-4o mutation to generate ≥10 defective variants per sample, (3) oracle test validation and semantic deduplication, (4) 4:1 train-test split ensuring no sample overlap."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The paper has no limitations section or threats-to-validity discussion. It proceeds directly from experimental results to conclusion."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No threats to validity are discussed anywhere in the paper."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit scope boundaries are stated. The paper does not acknowledge that results are limited to Python, artificially mutated bugs, or Qwen model families."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw data is available for verification. No download links, supplementary data files, or data repository is provided."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section IV-A describes data collection: samples from HumanEval, MBPP, CodeForces, and CodeContests were collected, filtered by runtime (<3s for competitive programming), mutated via GPT-4o, validated against oracle tests, and deduplicated."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard public benchmarks."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The pipeline stages are described (collection → mutation → validation → filtering → split), but intermediate counts are missing. The paper does not report how many samples were collected per benchmark, how many mutants were generated, or how many were filtered out at each stage. Only final counts are given (5,421 train, 1,358 test)."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source is disclosed anywhere in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: Haichuan Hu (Alibaba Cloud), Xiaochen Xie (Zhejiang University), Quanjun Zhang (Nanjing University of Science and Technology)."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funder is disclosed, so funder independence cannot be assessed. One author is affiliated with Alibaba Cloud, and the paper exclusively evaluates Alibaba's Qwen models, which creates a potential conflict of interest."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement or financial disclosures are present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for any of the three Qwen models used."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "Section IV-A addresses overlap in their own RL training split but does not discuss whether the pre-trained Qwen models may have seen HumanEval, MBPP, CodeForces, or CodeContests solutions during pre-training."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "HumanEval (2021) and MBPP (2021) were published well before Qwen's training data was collected. The paper does not discuss contamination risk from these benchmarks being in the pre-training data."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference cost, latency, or per-example cost is reported despite the method requiring both test generation and patch generation per bug."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No GPU hours, training time, or total computational budget is stated for the RL training of three models."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No results across multiple random seeds are reported. All results appear to be single-run."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs is not stated anywhere in the paper."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search budget is reported. The GRPO hyperparameters ε and β are referenced but their values and selection process are not described."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No explanation of how the final hyperparameter configuration was selected. No validation set performance comparison or selection criterion is described."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper compares 5 methods across 4 benchmarks and 3 models (60 experimental settings) with no statistical testing at all, let alone multiple comparison correction."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "All baselines (Vanilla, SFT, RL-Repair, RL-Test) are the authors' own implementations. No external baselines are included and no acknowledgment of self-comparison bias is made."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "RL training presumably requires substantially more compute than SFT or Vanilla, but compute costs are not reported for any method. Performance is not compared at matched compute budgets."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The bugs are artificially generated via GPT-4o mutation, which may not represent real-world defects. The paper does not discuss whether these synthetic benchmarks measure real repair capability or merely the ability to fix GPT-4o-style mutations."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. All methods use the same prompting approach and differ only in training strategy."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "HumanEval and MBPP were published in 2021, well before Qwen model training. The paper does not discuss whether solutions to these benchmarks appeared in Qwen's pre-training data."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information beyond what would be available in practice."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section IV-A explicitly states 'care taken to ensure that no original sample appeared in both sets, thereby preventing data contamination' for the train-test split of their RL training data."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The structural train-test split addresses their own RL data but not pre-training contamination."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Repair-R1 improves repair success rate by 2.68% to 48.29% compared to vanilla models across three models and four benchmarks.",
    369       "evidence": "Table I shows RL-Both results versus Vanilla for all 12 model×benchmark combinations. Improvements range from 2.68% (Qwen3-4B on HumanEval) to 48.29% (Qwen2.5-Coder-1.5B on HumanEval).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Test generation success rate improves by 16.38% to 53.28% and test coverage by 0.78% to 53.96% compared to vanilla models.",
    374       "evidence": "Table I shows Test and Tcov metrics for RL-Both versus Vanilla across all settings.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Joint RL optimization (RL-Both) outperforms single-objective optimization (RL-Repair or RL-Test) on repair.",
    379       "evidence": "Section V-C and Table I: RL-Both achieves higher Bugfix than RL-Repair in 11 of 12 settings, with improvements of 0.24%–6.25%.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "SFT suffers from catastrophic forgetting on underrepresented benchmarks while RL avoids this.",
    384       "evidence": "Section V-B: SFT degrades Bugfix on HumanEval and MBPP (e.g., Qwen2.5-Coder-3B drops 32.26% on HumanEval) while improving on CodeForces/CodeContests. RL-Both improves across all benchmarks.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Test generation capability and repair capability are correlated—better test generation leads to better repair.",
    389       "evidence": "Section V-A and Figure 4: After RL training, successful repairs are more frequently accompanied by correct test cases. The correlation is shown visually but not tested statistically.",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "Qwen3-4B (general-purpose reasoning model) shows better test-time scaling than code-specific models at higher sampling sizes.",
    394       "evidence": "Section V-D and Figure 5: Qwen3-4B surpasses Qwen2.5-Coder-3B at sampling sizes >4 on most benchmarks.",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No external baselines",
    401       "detail": "Despite citing RepairAgent, CREF, and other LLM-based APR methods in the related work, the paper compares only against internal variants (Vanilla, SFT, single-objective RL). No contemporary APR methods are used as baselines, making it impossible to assess whether Repair-R1 advances the state of the art."
    402     },
    403     {
    404       "flag": "No uncertainty quantification",
    405       "detail": "All 60 experimental settings report single point estimates with no error bars, confidence intervals, significance tests, or multi-seed results. The smallest benchmark (HumanEval, 112 bugs) is particularly susceptible to variance."
    406     },
    407     {
    408       "flag": "Artificial bugs only",
    409       "detail": "All defects are generated by GPT-4o mutation, not collected from real-world bug repositories. The paper does not discuss whether GPT-4o-generated mutations are representative of real bugs, raising questions about ecological validity."
    410     },
    411     {
    412       "flag": "Potential conflict of interest",
    413       "detail": "First author is affiliated with Alibaba Cloud and the paper exclusively evaluates Alibaba's Qwen model family. No independent model families (e.g., Llama, CodeLlama, StarCoder) are tested. No funding disclosure or competing interests statement is provided."
    414     },
    415     {
    416       "flag": "No limitations discussion",
    417       "detail": "The paper has no limitations section, no threats to validity, and no discussion of scope boundaries. Claims about 'a new direction for LLM-based APR' are unbounded."
    418     },
    419     {
    420       "flag": "Missing critical hyperparameters",
    421       "detail": "RL hyperparameters (ε, β, learning rate, batch size, epochs, temperature) are not reported, making the approach non-reproducible even if code were available."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Repairagent: An autonomous, llm-based agent for program repair",
    427       "authors": ["I. Bouzenia", "P. Devanbu", "M. Pradel"],
    428       "year": 2024,
    429       "arxiv_id": "2403.17134",
    430       "relevance": "Directly relevant LLM-based agentic approach to automated program repair."
    431     },
    432     {
    433       "title": "Stepcoder: Improve code generation with reinforcement learning from compiler feedback",
    434       "authors": ["S. Dou", "Y. Liu", "H. Jia"],
    435       "year": 2024,
    436       "arxiv_id": "2402.01391",
    437       "relevance": "RL-based code generation using compiler feedback, closely related methodology for code tasks."
    438     },
    439     {
    440       "title": "Acecoder: An effective prompting technique specialized in code generation",
    441       "authors": ["J. Li", "Y. Zhao", "Y. Li", "G. Li", "Z. Jin"],
    442       "year": 2024,
    443       "relevance": "Prompting technique for LLM-based code generation, relevant to code-related LLM capabilities."
    444     },
    445     {
    446       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    447       "authors": ["D. Guo", "D. Yang", "H. Zhang"],
    448       "year": 2025,
    449       "arxiv_id": "2501.12948",
    450       "relevance": "GRPO-based RL training for LLM reasoning, foundational method used by Repair-R1."
    451     },
    452     {
    453       "title": "The use of large language models for program repair",
    454       "authors": ["F. Zubair", "M. Al-Hitmi", "C. Catal"],
    455       "year": 2025,
    456       "relevance": "Survey of LLM-based program repair methods, provides landscape context for APR research."
    457     },
    458     {
    459       "title": "Cref: An llm-based conversational software repair framework for programming tutors",
    460       "authors": ["B. Yang", "H. Tian", "W. Pian"],
    461       "year": 2024,
    462       "relevance": "LLM-based conversational approach to software repair with human interaction."
    463     },
    464     {
    465       "title": "Selfapr: Self-supervised program repair with test execution diagnostics",
    466       "authors": ["H. Ye", "M. Martinez", "X. Luo", "T. Zhang", "M. Monperrus"],
    467       "year": 2022,
    468       "relevance": "Self-supervised APR using test execution, relevant prior work on integrating test information with repair."
    469     },
    470     {
    471       "title": "Neural program repair with execution-based backpropagation",
    472       "authors": ["H. Ye", "M. Martinez", "M. Monperrus"],
    473       "year": 2022,
    474       "relevance": "Execution-based learning signals for neural program repair, similar motivation of leveraging test execution."
    475     },
    476     {
    477       "title": "Direct preference optimization: Your language model is secretly a reward model",
    478       "authors": ["R. Rafailov", "A. Sharma", "E. Mitchell"],
    479       "year": 2023,
    480       "relevance": "DPO as an alternative RL alignment method for LLMs, relevant to the training methodology landscape."
    481     },
    482     {
    483       "title": "Deepseekmath: Pushing the limits of mathematical reasoning in open language models",
    484       "authors": ["Z. Shao", "P. Wang", "Q. Zhu"],
    485       "year": 2024,
    486       "arxiv_id": "2402.03300",
    487       "relevance": "Introduced GRPO algorithm used as the core RL method in Repair-R1."
    488     },
    489     {
    490       "title": "Proximal policy optimization algorithms",
    491       "authors": ["J. Schulman", "F. Wolski", "P. Dhariwal", "A. Radford", "O. Klimov"],
    492       "year": 2017,
    493       "arxiv_id": "1707.06347",
    494       "relevance": "Foundational RL algorithm (PPO) that GRPO builds upon for LLM optimization."
    495     }
    496   ],
    497   "engagement_factors": {
    498     "practical_relevance": {
    499       "score": 2,
    500       "justification": "The test-before-repair paradigm is potentially usable by practitioners building APR tools, and model weights are claimed to be released."
    501     },
    502     "surprise_contrarian": {
    503       "score": 1,
    504       "justification": "Inverting the conventional test-after-repair paradigm is mildly novel but not deeply counterintuitive."
    505     },
    506     "fear_safety": {
    507       "score": 0,
    508       "justification": "No AI safety or security implications."
    509     },
    510     "drama_conflict": {
    511       "score": 0,
    512       "justification": "No controversy or conflict angle."
    513     },
    514     "demo_ability": {
    515       "score": 1,
    516       "justification": "Code and weights claimed available on GitHub/HuggingFace but no URLs provided in the paper."
    517     },
    518     "brand_recognition": {
    519       "score": 1,
    520       "justification": "Alibaba Cloud affiliation and use of Qwen models provide some recognition but not top-tier visibility."
    521     }
    522   }
    523 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs