ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (28897B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Input Reduction Enhanced LLM-based Program Repair",
      6     "authors": [
      7       "Boyang Yang",
      8       "Luyao Ren",
      9       "Xin Yin",
     10       "Jiadong Ren",
     11       "Haoye Tian"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2507.15251",
     16     "doi": "10.48550/arXiv.2507.15251"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are verified: 89.1% average input reduction (Table 3), 53.8% relative pass@10 improvement over Origin Test (GLM overall 6.5%→10.0%), 17.6% over Baseline (GLM 8.5%→10.0%), ChatRepair +21.3% and CREF +2.6% (Tables 10–11).",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims that input reduction improves repair accuracy; these are backed by controlled ablations (RQ-3 isolating length vs. information), statistical MWW tests (p<0.05), and plug-in integration experiments that hold all other variables fixed.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are explicitly bounded to LLM-based APR with long failure-inducing test inputs; the threats-to-validity section acknowledges the competitive-programming domain and validates on OSS-Fuzz for broader applicability, without overclaiming universal generalization.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "RQ-3 specifically investigates whether gains come from shorter prompts alone vs. preserved failure information; five prompt variants (Diff Lines, Reduced+Origin, etc.) systematically rule out length-only explanations.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "pass@k is defined as patch correctness against the full official test suite; the paper does not conflate this metric with broader software quality or developer productivity, staying within the measured granularity.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 'Threats to Validity' contains three dedicated subsections (Internal, Construct, External validity), each with specific threats and mitigations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are identified: compression ratio reaching 100% hiding variation (addressed by reporting both mean and median), dataset restriction to AtCoder (mitigated with OSS-Fuzz), stochasticity addressed by pass@k sampling — these are concrete, not boilerplate.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly scopes evaluation to long failure-inducing inputs and notes 'ReduceFix might benefit only the pipeline evaluated in this study and fail to transfer,' testing transferability via ChatRepair and CREF plug-in experiments.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments section lists three explicit funding sources: National Natural Science Foundation of China (62273292), Central Leading Local Science and Technology Development Project of Hebei Province (246Z0804G), and Hebei Innovation Capability Improvement Plan Project (22567626H).",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are listed on the first page: Yanshan University, Peking University, Zhejiang University, and Aalto University.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "All funders are Chinese government science foundations with no commercial interest in the ReduceFix tool or the LLMs evaluated.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or conflicts-of-interest statement is included anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: APR is introduced in the opening sentence, 'lost-in-the-middle' is explained with citation, compression rate is formally defined in Eq. 2, and pass@k is defined with its formula in Section 4.3.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 enumerates four explicit contributions: (1) ReduceFix framework, (2) LFTBench benchmark, (3) comprehensive evaluation results, (4) plug-in integration into ChatRepair and CREF.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 7 provides a structured related work covering LLM-based APR and test input reduction, explicitly positioning ReduceFix as the first approach combining both and showing how it differs from ddmin-family methods, HDD, Perses, and LPR.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Section 4.4 explicitly states 'the full artifact is published at https://github.com/GLEAM-Lab/ReduceFix,' and the README is referenced for reducer visualizations.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "LFTBench and LFTBench-Py are stated as released in the contributions section; the GitHub artifact URL implies data availability alongside code.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or dependency specification is provided in the paper; only hyperparameters (Table 2) and hardware category ('single 24 GB consumer GPU') are mentioned.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions appear in the paper itself; the GitHub README is referenced, but its content is not reproduced in the paper, which provides only algorithm pseudocode.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All pass@k results are reported as single point estimates in tables; no confidence intervals or error bars appear anywhere in the paper.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Section 5.3 applies two-sided Mann-Whitney-Wilcoxon tests to compare ReduceFix against Origin Test, reporting p < 0.05 for key comparisons including the ddmin-only gap.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Both absolute and relative improvements are consistently reported (e.g., 'pass@10 rises from 30.5% to 37.0%, an absolute gain of 6.5 percentage points and a relative gain of 21.3%').",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 200-bug, 20-task sample size is motivated by the availability of AtCoder data after LLM cutoff dates, but no power analysis or statistical justification for sufficiency is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Pass@k figures are reported as single values with no standard deviation, confidence intervals, or run-to-run variance; the reducer uses temperature=0 (deterministic), but repair sampling variance is not reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple baselines are tested: Baseline (no test), Origin Test (full test), ddmin-only, and pure-LLM reduction, plus ChatRepair and CREF with and without ReduceFix.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "ChatRepair (ISSTA 2024) and CREF (ISSTA 2024) are recent and competitive APR systems; ddmin is classical but appropriate as a known algorithm baseline for the reduction task.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "RQ-3 (Section 5.4) is an explicit ablation with five prompt variants to isolate the contributions of length reduction vs. information selection.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "The paper reports pass@1, pass@5, pass@10, reduction success rate, mean and median compression rate, and token/cost comparisons.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Automated test suite evaluation is the appropriate measure for APR; no human evaluation is needed or applicable.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The full official AtCoder hidden test archive is used for final validation; the failure-inducing input used in the prompt is distinct from the full test suite used to judge patch correctness.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by task difficulty (C, D, E&F in Table 6) and by input format (6 categories in Table 4), as well as by project for OSS-Fuzz (Tables 12–13).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 5.2 includes a dedicated 'Failed Case Study' for ABC372E where vertex renumbering masked the defect, with a proposed fix described in detail.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper explicitly reports that including the full failing test (Origin Test) hurts performance relative to no test for GLM-4-9B-chat (8.5%→6.5%) and DeepSeek-V3 (66.5%→63.0%).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Model names are given (Qwen2.5-Coder-7B-instruct, GLM-4-9B-chat, DeepSeek-V3, Qwen2.5-Plus) but no API snapshot dates or commit hashes are provided; Qwen2.5-Plus is a cloud service with non-disclosed parameter count and no versioning timestamp.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Listing 1 provides the full one-shot reducer prompt template with all placeholder variables labeled; the repair prompt structure is also described in Section 3.4.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 2 lists all hyperparameters: temperature 0.0 for reduction, 0.8 for repair, 60s wall-clock limit, 10s compilation timeout, 5s execution timeout per test case, k∈{1,5,10}.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Algorithm 1 provides the complete three-stage pipeline control logic; Sections 3.2–3.4 describe each stage in detail including reducer generation, iterative reduction, and patch validation loops.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 4.2 describes the full benchmark construction: AtCoder ABC 361–377 selection, test size filter (≥4 KB), difficulty filter (C–F), manual collection of wrong-answer submissions before July 1, 2025.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The GitHub artifact at GLEAM-Lab/ReduceFix is stated to contain the full artifact, including LFTBench and LFTBench-Py; OSS-Fuzz data reuses publicly available ARVO scripts.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.2 describes the collection procedure in detail: AtCoder tasks selected by contest number range, difficulty level, test file size threshold, and manual identification of failing submissions before a cutoff date.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; LFTBench is constructed from publicly available AtCoder contest data with no subject recruitment.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from AtCoder test archives → filtering → manual submission collection → benchmark packaging is described; ARVO's Docker-based data and scripts are reused for OSS-Fuzz.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "Section 4.2 explicitly states LFTBench covers ABC 361–377, 'a span entirely after the knowledge cut-offs of the 4 LLMs we evaluate,' directly addressing training cutoffs.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Section 4.2 explicitly motivates the post-cutoff design by noting that existing benchmarks 'were released years ago' and are ones that 'large language models have almost certainly seen,' providing qualitative context for the leakage concern.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "For OSS-Fuzz, the paper acknowledges potential overlap but argues relative comparisons remain valid since the same LLM and instances are held fixed across strategies; LFTBench is explicitly designed to be post-cutoff.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Table 5 reports exact API costs: ReduceFix costs $0.017 vs. pure-LLM $0.632 on 20 problems; Section 4.1 lists per-token pricing for Qwen2.5-Plus and DeepSeek-V3.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Only API costs for one reduction comparison are reported; total compute for all experiments across 200 bugs × 4 LLMs × 3 conditions is not stated, and local GPU experiments have no wall-clock totals.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "ReduceFix successfully reduces 95% of 200 LFTBench bugs with 89.1% average compression rate",
    375       "evidence": "Table 3 reports 95.0% overall success rate and mean/median compression of 89.1%/100.0%; all 200 generated reducers were syntactically valid.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Providing reduced test inputs improves pass@10 by up to 53.8% relative to full test inputs across LLMs",
    380       "evidence": "Table 6: GLM-4-9B-chat improves from 6.5% (Origin Test) to 10.0% (Reduced Test) overall pass@10, a 53.8% relative gain; gains are consistent across all 4 LLMs.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Including the full unmodified failing test often hurts repair accuracy below the no-test baseline",
    385       "evidence": "Table 6: GLM-4-9B-chat drops from 8.5% (Baseline) to 6.5% (Origin Test) pass@10; DeepSeek-V3 drops from 66.5% to 63.0%.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Both compact length and complete failure evidence are required; neither alone suffices",
    390       "evidence": "Table 9 (RQ-3): Diff Lines (sparse info, short prompt) achieves 20.0% pass@10; Reduced+Origin (complete info, long prompt) achieves 19.0%; Reduced Test (both) achieves 25.5%.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "ReduceFix integrates as a drop-in plug-in and improves ChatRepair by 21.3% and CREF by 2.6% relative pass@10",
    395       "evidence": "Tables 10–11: ChatRepair 30.5%→37.0% (+21.3% relative); CREF 39.0%→40.0% (+2.6% relative) on LFTBench.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "ReduceFix outperforms both ddmin-only (35.5% success) and pure-LLM (40.0% success) reduction baselines",
    400       "evidence": "Table 3 reports ddmin-only 35.5%, pure-LLM 40.0%, ReduceFix 95.0% overall reduction success rate.",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "ReduceFix generalizes to repository-level OSS-Fuzz crashes, improving pass@10 from 16.7% (Origin Test) to 41.7%",
    405       "evidence": "Table 13: micro-average pass@10 rises from 16.7% (Origin Test) and 25.0% (Baseline) to 41.7% (Reduced Test) on 12 OSS-Fuzz instances with Qwen2.5-Plus.",
    406       "supported": "moderate"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "empirical"
    412   ],
    413   "key_findings": "ReduceFix demonstrates that automatically reducing long failure-inducing test inputs before LLM-based repair substantially improves patch accuracy: inputs shrink by 89.1% on average with 95% success, and pass@10 improves by up to 53.8% relative over using the full test. Crucially, including unmodified long test inputs often hurts performance below the no-test baseline, confirming the 'lost-in-the-middle' effect. Ablation studies show that both prompt length reduction and preserved failure information are independently necessary—neither alone achieves the full gain. The approach integrates as a drop-in component for existing APR systems (ChatRepair +21.3%, CREF +2.6%) and generalizes to OSS-Fuzz repository-level crashes.",
    414   "red_flags": [
    415     {
    416       "flag": "Tiny OSS-Fuzz dataset",
    417       "detail": "Repository-level generalization claims are based on only 12 OSS-Fuzz instances across 5 projects; this is too small to support confident conclusions about real-world applicability."
    418     },
    419     {
    420       "flag": "No confidence intervals on pass@k",
    421       "detail": "All pass@k results are point estimates with no variance, confidence intervals, or error bars; statistical significance is tested for one comparison (RQ-2) but not reported for most tables."
    422     },
    423     {
    424       "flag": "Cloud model versions unpinned",
    425       "detail": "Qwen2.5-Plus is a cloud API service with no snapshot date or version identifier; results may not be reproducible if the underlying model is updated."
    426     },
    427     {
    428       "flag": "Competitive programming domain gap",
    429       "detail": "LFTBench is entirely AtCoder competitive programming problems, which have reference solutions and exact output oracles not present in most real-world software bugs; the benchmark may not reflect typical APR scenarios."
    430     },
    431     {
    432       "flag": "Marginal gains for strong models",
    433       "detail": "For DeepSeek-V3, improvement over Baseline is 45.2%→45.9% pass@1 and 66.5%→67.0% pass@10 — within statistical noise range despite the MWW test; the approach appears most valuable for weaker models."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT",
    439       "relevance": "Primary baseline system (ChatRepair) integrated with ReduceFix; demonstrates conversational APR with test feedback."
    440     },
    441     {
    442       "title": "CREF: An LLM-based Conversational Software Repair Framework for Programming Tutors",
    443       "relevance": "Second baseline system integrated with ReduceFix; representative of conversational repair with tutor guidance."
    444     },
    445     {
    446       "title": "Lost in the Middle: How Language Models Use Long Contexts",
    447       "relevance": "Foundational motivation for ReduceFix: documents the attention degradation in long prompts that ReduceFix targets."
    448     },
    449     {
    450       "title": "Simplifying and Isolating Failure-Inducing Input (ddmin)",
    451       "relevance": "Classical delta debugging algorithm that ReduceFix builds upon; the paper's LLM generates task-specific adaptations of ddmin."
    452     },
    453     {
    454       "title": "LPR: Large Language Models-Aided Program Reduction",
    455       "relevance": "Closest prior work to ReduceFix's approach; focuses on source code reduction rather than arbitrary test input formats."
    456     },
    457     {
    458       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    459       "relevance": "Standard APR benchmark the paper explicitly contrasts with LFTBench, noting its short test inputs and leakage risk."
    460     },
    461     {
    462       "title": "Perses: Syntax-Guided Program Reduction",
    463       "relevance": "Grammar-based reduction baseline that motivates ReduceFix's LLM-based approach to handle diverse input formats."
    464     },
    465     {
    466       "title": "ARVO: Atlas of Reproducible Vulnerabilities for Open Source Software",
    467       "relevance": "Provides the OSS-Fuzz data, scripts, and Docker images used for the repository-level validation experiments."
    468     },
    469     {
    470       "title": "Automated Repair of Programs from Large Language Models",
    471       "relevance": "Representative LLM-based APR work; evaluates LLMs on programs they generated, establishing state of the art."
    472     },
    473     {
    474       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    475       "relevance": "Repository-level repair framework cited for its SEARCH/REPLACE patch format used in the OSS-Fuzz experiments."
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 3,
    481       "justification": "ReduceFix is a drop-in component for existing APR pipelines with a public artifact, directly usable by practitioners building or extending LLM-based repair systems."
    482     },
    483     "surprise_contrarian": {
    484       "score": 2,
    485       "justification": "The finding that adding more test information (full failing test) often hurts repair accuracy below the no-test baseline is counterintuitive and challenges common APR prompt design assumptions."
    486     },
    487     "fear_safety": {
    488       "score": 0,
    489       "justification": "The paper addresses software reliability tooling with no safety, alignment, or misuse implications."
    490     },
    491     "drama_conflict": {
    492       "score": 1,
    493       "justification": "Mild tension with prior APR work that includes full test inputs by default; no major controversy."
    494     },
    495     "demo_ability": {
    496       "score": 2,
    497       "justification": "GitHub artifact is released with LFTBench benchmark; practitioners can run the pipeline on the benchmark, though API keys and AtCoder problem access are required."
    498     },
    499     "brand_recognition": {
    500       "score": 1,
    501       "justification": "Authors from Yanshan University, Peking University, Zhejiang University, and Aalto University — credible institutions but not famous AI labs; no well-known product affiliation."
    502     }
    503   },
    504   "hn_data": {
    505     "threads": [
    506       {
    507         "hn_id": "44309345",
    508         "title": "Reasoning by Superposition: A Perspective on Chain of Continuous Thought",
    509         "points": 60,
    510         "comments": 1,
    511         "url": "https://news.ycombinator.com/item?id=44309345"
    512       },
    513       {
    514         "hn_id": "44996548",
    515         "title": "The JWST Rocky Worlds DDT Program reveals GJ 3929B to likely be a bare rock",
    516         "points": 18,
    517         "comments": 0,
    518         "url": "https://news.ycombinator.com/item?id=44996548"
    519       },
    520       {
    521         "hn_id": "44047804",
    522         "title": "Code Improvement Practices at Meta",
    523         "points": 4,
    524         "comments": 0,
    525         "url": "https://news.ycombinator.com/item?id=44047804"
    526       },
    527       {
    528         "hn_id": "45300655",
    529         "title": "Generalizable Geometric Image Caption Synthesis",
    530         "points": 3,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=45300655"
    533       },
    534       {
    535         "hn_id": "36942453",
    536         "title": "Open Problems and Fundamental Limitations of RLHF",
    537         "points": 3,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=36942453"
    540       },
    541       {
    542         "hn_id": "44324675",
    543         "title": "ProtoReasoning: Prototypes as the Foundation for Generalizable Reasoning in LLMs",
    544         "points": 2,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=44324675"
    547       },
    548       {
    549         "hn_id": "43781749",
    550         "title": "A Comprehensive Benchmark for C-to-Safe-Rust Transpilation",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=43781749"
    554       },
    555       {
    556         "hn_id": "43776339",
    557         "title": "The Bitter Lesson Learned from 2k Multilingual Benchmarks",
    558         "points": 2,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=43776339"
    561       },
    562       {
    563         "hn_id": "45537808",
    564         "title": "The role of non–metricity on neutrino behavior in bumblebee gravity",
    565         "points": 1,
    566         "comments": 0,
    567         "url": "https://news.ycombinator.com/item?id=45537808"
    568       },
    569       {
    570         "hn_id": "44971896",
    571         "title": "OS-R1: Agentic Operating System Kernel Tuning with Reinforcement Learning",
    572         "points": 1,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=44971896"
    575       }
    576     ],
    577     "top_points": 60,
    578     "total_points": 96,
    579     "total_comments": 1
    580   }
    581 }

Impressum · Datenschutz