scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22592B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "GitBugs: Bug Reports for Duplicate Detection, Retrieval Augmented Generation, Triage, and More",
      6     "authors": [
      7       "Avinash Patil"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2504.09651",
     12     "doi": "10.48550/arXiv.2504.09651"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All abstract claims are supported: 150K+ reports confirmed by Table I (196,388 total), multi-tracker aggregation confirmed in Section III.A, standardized fields and metadata listed, train/test splits for duplicate detection mentioned, EDA notebooks and statistics provided in Section III.C.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Paper avoids strong causal claims. High-priority bugs being resolved as Fixed/Won't Fix is presented as an observation, not causal inference. No ablation studies or RCTs. No problematic causal claims without justification.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Claims are bounded to 9 specific open-source projects; case study explicitly restricted to Cassandra; paper acknowledges variation across projects (duplicate rates 2-28%, resolution times vary). Title matches scope (bug reports dataset, not universal software quality measure).",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Case study acknowledges challenges (class imbalance, linguistic subtlety in duplicates, poor time-to-fix prediction) but does not discuss alternative explanations for why the dataset has these characteristics or whether findings could be interpreted differently.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Paper is clear about what it measures: duplicate rates, resolution times, and severity labels. Proxies (e.g., text similarity) are not conflated with the underlying construct; the duplicate detection task is explicitly stated.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No dedicated Limitations or Threats-to-Validity section. Section V (Conclusion) does not discuss limitations. Challenges mentioned only in case study (class imbalance, linguistic variation, poor prediction) but not systematized.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No specific threats to dataset validity discussed. Heavy-tailed resolution times and class imbalance are observed but not analyzed as threats. No discussion of measurement error, annotation bias, or generalizability limits.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Paper does not explicitly state what is NOT in scope (e.g., applicability to closed-source projects, other domains, proprietary trackers). No boundaries on generalization beyond the 9 projects studied.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding acknowledgment section present. No statement of funding sources or financial support.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "Author listed as 'avinashpatil@ieee.org' with ORCID but no institutional affiliation (university, company) disclosed.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Not applicable—no funding disclosed.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement. No declaration of patents, equity, or consulting relationships related to GitBugs or evaluated projects.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Terms like 'bug report', 'duplicate', 'triaging', and 'benchmark' are used but not formally defined. Readers must infer meaning from context (e.g., 'duplicate' assumes prior knowledge of what constitutes semantic equivalence).",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Clearly stated: 'we present GitBugs—a comprehensive and up-to-date dataset' (abstract). Key contributions listed in Section I (dataset, metadata, analytics, reproducible artifacts).",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section II.A comprehensively reviews existing datasets (Lamkanfi, BugRepo, Defects4J, etc.). Table II directly compares GitBugs to 16 prior datasets on scope, size, and features. Shows how GitBugs addresses limitations (scale, multi-tracker, temporal currency).",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "benchmark-creation": {
    116       "construct_design": {
    117         "construct_validity_argued": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "Paper states the dataset 'supports various software engineering research tasks' but does not argue WHY this particular collection of fields, projects, and metadata measures these capabilities. No validation that bug summaries + descriptions are sufficient for duplicate detection, for example.",
    121           "source": "haiku"
    122         },
    123         "difficulty_distribution_characterized": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "For duplicate detection, the paper only states that 'most top-10 similarity scores fell below 0.5', which suggests difficulty but does not systematically characterize it. For severity classification, class imbalance is noted (macro F1=0.35), but there are no explicit difficulty tiers or distribution analysis.",
    127           "source": "haiku"
    128         },
    129         "ceiling_floor_effects_checked": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Severity classification shows 82% accuracy (ceiling effect on majority class) but macro F1=0.35 indicates floor effects on minorities. Duplicate detection Recall@10=0.61 is low. Effects are implicitly shown but not formally analyzed for benchmark validity.",
    133           "source": "haiku"
    134         },
    135         "human_baseline_included": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No human performance reported on any task (duplicate detection, severity classification, time-to-fix). No inter-annotator agreement for duplicate labels. Models evaluated against no human reference point.",
    139           "source": "haiku"
    140         },
    141         "scoring_rubric_justified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Standard metrics used (F1, MAE, RMSE, R², Recall@10, cosine similarity) but not justified for this benchmark. No discussion of why these metrics are appropriate or how edge cases in scoring are handled (e.g., how partial duplicate matches are scored).",
    145           "source": "haiku"
    146         }
    147       },
    148       "robustness": {
    149         "contamination_resistance_designed": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No anti-gaming or contamination-resistance measures mentioned. No temporal train/test splits discussed. No canary strings, dynamic generation, or version control strategy to prevent benchmark gaming or contamination.",
    153           "source": "haiku"
    154         },
    155         "temporal_robustness_discussed": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Data spans 2020–2024 but no discussion of temporal robustness, obsolescence risk, or update plan. No mention of whether benchmark will be gamed over time or if maintenance is planned.",
    159           "source": "haiku"
    160         },
    161         "failure_modes_discussed": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "Case study identifies challenges (class imbalance, high variance, textual signal limitations) but these are framed as modeling problems, not benchmark failure modes. No discussion of what the benchmark cannot or will not measure.",
    165           "source": "haiku"
    166         },
    167         "baseline_implementations_provided": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Paper mentions 'EDA notebooks' and 'model training and validation scripts' in abstract but does not provide explicit baseline implementations (e.g., reference BERT model for duplicate detection, decision tree for severity). Case study code examples are illustrative, not reusable baselines.",
    171           "source": "haiku"
    172         }
    173       },
    174       "documentation": {
    175         "dataset_documentation_complete": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "Section III describes sources, collection (API + scraping), and summary statistics. Missing: data card format, detailed preprocessing steps (e.g., how 'non-bug entries' filtered), null/missing value handling, and field-level documentation.",
    179           "source": "haiku"
    180         },
    181         "licensing_and_access_clear": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "States 'openly licensed dataset' and GitHub URL provided (https://github.com/av9ash/gitbugs/) but does not specify license type (MIT, Apache, GPL). Terms of use and data re-sharing rights unclear.",
    185           "source": "haiku"
    186         },
    187         "intended_use_specified": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "Section IV.A/B lists research opportunities and industry use cases but does not explicitly specify what should/should not be concluded from results. No guidance on dataset limitations or misuse scenarios (e.g., should not be used to generalize to closed-source bug patterns).",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "GitBugs contains over 150,000 bug reports from 9 open-source projects",
    199       "evidence": "Table I sums to 196,388 total reports across Cassandra, Firefox, Hadoop, HBase, Mozilla Core, VS Code, SeaMonkey, Spark, and Thunderbird.",
    200       "supported": "strong"
    201     },
    202     {
    203       "claim": "VS Code and Thunderbird have duplicate rates exceeding 25%",
    204       "evidence": "Table I reports VS Code 28.2% and Thunderbird 27.6% duplicate rates.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "Semantic embedding-based duplicate detection achieves Recall@10 = 0.61",
    209       "evidence": "Section IV.C duplicate detection experiment using Sentence-BERT embeddings and cosine similarity on 300 sampled queries.",
    210       "supported": "moderate"
    211     },
    212     {
    213       "claim": "Spark shows fastest bug resolution times (under 10 days median)",
    214       "evidence": "Figure 2 kernel density estimates show Spark peaking at <10 days; Figure 3 box plots confirm compressed distribution.",
    215       "supported": "strong"
    216     },
    217     {
    218       "claim": "Time-to-fix prediction achieves poor performance (MAE=86.06, RMSE=158.44, R²=-0.09)",
    219       "evidence": "Section IV.C regression model on Cassandra data; R² < 0 indicates worse than naive mean predictor.",
    220       "supported": "strong"
    221     },
    222     {
    223       "claim": "Bug severity classification suffers from class imbalance (macro F1=0.35 despite 82% accuracy)",
    224       "evidence": "Section IV.C reports 82% accuracy on majority Normal class but macro F1=0.35, indicating poor performance on minority classes (High, Low, Urgent).",
    225       "supported": "strong"
    226     },
    227     {
    228       "claim": "GitBugs supports duplicate detection, triaging, resolution prediction, and RAG tasks",
    229       "evidence": "Case study (Section IV.C) demonstrates duplicate detection, severity classification, time-to-fix prediction, topic modeling, and RAG pipeline. Mixed results (recall@10=0.61, R²=-0.09) show feasibility but not excellence.",
    230       "supported": "moderate"
    231     }
    232   ],
    233   "methodology_tags": [
    234     "benchmark-creation",
    235     "empirical"
    236   ],
    237   "key_findings": "GitBugs aggregates 150K+ bug reports from 9 open-source projects (Firefox, VS Code, Cassandra, etc.) across GitHub, Jira, and Bugzilla, providing standardized metadata for benchmark research. Case study on Cassandra demonstrates mixed utility: semantic similarity-based duplicate detection achieves Recall@10=0.61 (linguistically subtle duplicates remain hard to detect), severity classification suffers from class imbalance (macro F1=0.35), and time-to-fix prediction fails (R²=−0.09, worse than a naive mean predictor). The dataset is practical for industry use (tool benchmarking, LLM fine-tuning) but benchmarking rigor is limited by absent human baselines, no construct validity arguments, and no contamination-resistance design.",
    238   "red_flags": [
    239     {
    240       "flag": "No human baseline",
    241       "detail": "No human performance reported on duplicate detection, severity classification, or other tasks. No inter-annotator agreement for duplicate labels; unclear if human agreement is high enough to justify task as benchmark."
    242     },
    243     {
    244       "flag": "Missing construct validity",
    245       "detail": "Paper does not argue WHY this dataset's fields and projects measure claimed capabilities (duplicate detection, triaging, etc.). No validation that summary+description are sufficient constructs."
    246     },
    247     {
    248       "flag": "Class imbalance unaddressed",
    249       "detail": "Severity classification macro F1=0.35 despite 82% accuracy; case study notes this as a 'challenge' but proposes no rebalancing, cost-sensitive learning, or evaluation strategy."
    250     },
    251     {
    252       "flag": "Poor prediction performance",
    253       "detail": "Time-to-fix regression R²=−0.09 (worse than always-predicting-mean). Indicates either task is intractable with available features or benchmark is poorly designed; no analysis of root cause."
    254     },
    255     {
    256       "flag": "No limitations section",
    257       "detail": "Paper lacks dedicated discussion of dataset limitations, scope boundaries, or generalizability constraints. Challenges mentioned only in case study conclusions, not systematized."
    258     },
    259     {
    260       "flag": "No temporal robustness",
    261       "detail": "No discussion of contamination resistance, temporal train/test splits, or update strategy. Unclear if benchmark will remain useful or be gamed over time."
    262     },
    263     {
    264       "flag": "Incomplete documentation",
    265       "detail": "No data card or detailed preprocessing steps. License type unspecified ('openly licensed' but not which license). Missing field-level documentation and null/missing value handling."
    266     },
    267     {
    268       "flag": "Single-project validation",
    269       "detail": "Case study restricted to Cassandra only. Generalization to other 8 projects and beyond not validated."
    270     },
    271     {
    272       "flag": "No inter-rater agreement",
    273       "detail": "Duplicate mappings are labeled data but no inter-annotator agreement or validation of label quality reported."
    274     }
    275   ],
    276   "cited_papers": [
    277     {
    278       "title": "The eclipse and mozilla defect tracking dataset: a genuine dataset for mining bug information",
    279       "authors": "Lamkanfi, A., Pérez, J., Demeyer, S.",
    280       "year": 2013,
    281       "relevance": "Foundational large-scale bug dataset paper; established methodological precedent for multi-project bug analysis."
    282     },
    283     {
    284       "title": "An automatically created novel bug dataset and its validation in bug prediction",
    285       "authors": "Ferenc, R., Gyimesi, P., Gyimesi, G., Tóth, Z., Gyimóthy, T.",
    286       "year": 2020,
    287       "relevance": "Demonstrates dataset-to-benchmark pipeline with validation on bug prediction task."
    288     },
    289     {
    290       "title": "Defects4J: A database of existing faults to enable controlled testing studies for java programs",
    291       "authors": "Just, R., Jalali, D., Ernst, M. D.",
    292       "year": 2014,
    293       "relevance": "Widely-used reproducible bug benchmark; establishes standard for controlled bug datasets with metadata."
    294     },
    295     {
    296       "title": "RegMiner: mining replicable regression dataset from code repositories",
    297       "authors": "Song, X., Lin, Y., Wu, Y., Zhang, Y., Ng, S. H., Peng, X., Dong, J. S., Mei, H.",
    298       "year": 2022,
    299       "relevance": "Recent large-scale regression bug dataset; demonstrates multi-project mining and temporal analysis."
    300     },
    301     {
    302       "title": "From reports to bug-fix commits: A 10 years dataset of bug-fixing activity from 55 apache's open source projects",
    303       "authors": "Vieira, R., da Silva, A., Rocha, L., Gomes, J.",
    304       "year": 2019,
    305       "relevance": "Longitudinal bug dataset linking reports to commits; establishes precedent for bug lifecycle analysis."
    306     },
    307     {
    308       "title": "CUPID: Leveraging chatgpt for more accurate duplicate bug report detection",
    309       "authors": "Zhang, T., Irsan, I. C., Thung, F., Lo, D.",
    310       "year": 2023,
    311       "relevance": "Recent LLM application to duplicate detection; demonstrates modern evaluation methodology for task."
    312     },
    313     {
    314       "title": "AndroR2: A dataset of manually-reproduced bug reports for android apps",
    315       "authors": "Wendland, T., Sun, J., Mahmud, J., Mansur, S. H., Huang, S., Moran, K., Rubin, J., Fazzini, M.",
    316       "year": 2021,
    317       "relevance": "Domain-specific (Android) bug dataset with reproduction steps; example of specialized benchmark design."
    318     },
    319     {
    320       "title": "BuGL–a cross-language dataset for bug localization",
    321       "authors": "Muvva, S., Rao, A. E., Chimalakonda, S.",
    322       "year": 2020,
    323       "relevance": "Multilingual bug dataset demonstrating cross-language benchmark design and evaluation methodology."
    324     }
    325   ],
    326   "engagement_factors": {
    327     "practical_relevance": {
    328       "score": 2,
    329       "justification": "Dataset is directly usable by practitioners for bug triage, duplicate detection, and LLM fine-tuning; case study demonstrates real-world applications, but poor results on time-to-fix prediction limit immediate utility across all stated use cases."
    330     },
    331     "surprise_contrarian": {
    332       "score": 0,
    333       "justification": "No surprising findings. Results confirm expected challenges (class imbalance in severity, semantic variation in duplicates, difficulty predicting fix times). No contrarian insights about bug patterns or software engineering practices."
    334     },
    335     "fear_safety": {
    336       "score": 0,
    337       "justification": "Dataset is about open-source bug management and software quality. No AI safety, alignment, or adversarial concerns raised or relevant."
    338     },
    339     "drama_conflict": {
    340       "score": 0,
    341       "justification": "Straightforward dataset contribution; no controversy, competing approaches, or contentious claims. Low drama angle."
    342     },
    343     "demo_ability": {
    344       "score": 3,
    345       "justification": "GitHub repository (https://github.com/av9ash/gitbugs/) publicly available with data and code; users can immediately download, explore, and train models on real bug data. RAG example in paper shows concrete usage."
    346     },
    347     "brand_recognition": {
    348       "score": 1,
    349       "justification": "Aggregates data from well-known projects (Firefox, VS Code, Mozilla) but single-author paper from non-major lab (no institutional affiliation visible). Lower visibility than papers from top research groups."
    350     }
    351   },
    352   "hn_data": {
    353     "threads": [
    354       {
    355         "hn_id": "43563265",
    356         "title": "Search-R1: Training LLMs to Reason and Leverage Search Engines with RL",
    357         "points": 101,
    358         "comments": 12,
    359         "url": "https://news.ycombinator.com/item?id=43563265"
    360       },
    361       {
    362         "hn_id": "45240856",
    363         "title": "Pipes: A Meta-Dataset of Machine Learning Pipelines",
    364         "points": 3,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=45240856"
    367       },
    368       {
    369         "hn_id": "35657519",
    370         "title": "Quantum Decoherence in Microtubules",
    371         "points": 3,
    372         "comments": 0,
    373         "url": "https://news.ycombinator.com/item?id=35657519"
    374       },
    375       {
    376         "hn_id": "44376843",
    377         "title": "Migrating Code at Scale with LLMs at Google",
    378         "points": 2,
    379         "comments": 0,
    380         "url": "https://news.ycombinator.com/item?id=44376843"
    381       },
    382       {
    383         "hn_id": "45226714",
    384         "title": "Are ArXiv submissions on Wednesday better cited?",
    385         "points": 2,
    386         "comments": 0,
    387         "url": "https://news.ycombinator.com/item?id=45226714"
    388       },
    389       {
    390         "hn_id": "39989953",
    391         "title": "InternLM-XComposer2-4KHD: A Pioneering LVLM Handling Resolutions from 336 to 4K",
    392         "points": 2,
    393         "comments": 0,
    394         "url": "https://news.ycombinator.com/item?id=39989953"
    395       },
    396       {
    397         "hn_id": "44000446",
    398         "title": "Language Agents Mirror Human Causal Reasoning Biases. How Can We Help Them Think",
    399         "points": 1,
    400         "comments": 0,
    401         "url": "https://news.ycombinator.com/item?id=44000446"
    402       },
    403       {
    404         "hn_id": "43430336",
    405         "title": "Empowering LLMs for Time Series Forecasting with Temporal Patterns and Semantics",
    406         "points": 1,
    407         "comments": 0,
    408         "url": "https://news.ycombinator.com/item?id=43430336"
    409       },
    410       {
    411         "hn_id": "35650604",
    412         "title": "How Secure Is Code Generated by ChatGPT?",
    413         "points": 1,
    414         "comments": 0,
    415         "url": "https://news.ycombinator.com/item?id=35650604"
    416       },
    417       {
    418         "hn_id": "35635743",
    419         "title": "UniMax: More Effective Language Sampling for LargeScale Multilingual Pretraining",
    420         "points": 1,
    421         "comments": 0,
    422         "url": "https://news.ycombinator.com/item?id=35635743"
    423       }
    424     ],
    425     "top_points": 101,
    426     "total_points": 117,
    427     "total_comments": 12
    428   }
    429 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs