scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26966B)
      1 {
      2   "paper": {
      3     "title": "GitBugs: Bug Reports for Duplicate Detection, Retrieval Augmented Generation, Triage, and More",
      4     "authors": ["Avinash Patil"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2504.09651",
      8     "doi": "10.48550/arXiv.2504.09651"
      9   },
     10   "scan_version": 3,
     11   "active_modules": [],
     12   "methodology_tags": ["observational", "case-study"],
     13   "key_findings": "GitBugs aggregates over 150,000 bug reports from 9 open-source projects across GitHub, Jira, and Bugzilla. A case study on Cassandra demonstrates dataset utility: ARIMA outperformed Prophet for bug volume forecasting (MAE 10.92 vs 20.10), priority classification achieved 82% accuracy but only 0.35 macro-F1 due to class imbalance, and time-to-fix regression yielded R\u00b2 = -0.09 (worse than a mean predictor). Sentence-BERT duplicate detection achieved Recall@10 of 0.61 across 300 queries.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract provides a GitHub repository URL: https://github.com/av9ash/gitbugs/. The paper states it includes 'exploratory data analysis (EDA) notebooks, model training and, validation scripts.'"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The dataset is described as 'openly licensed' and accessible at the GitHub repository. The abstract states: 'Access the data and code at https://github.com/av9ash/gitbugs/.'"
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, conda environment, or library version specifications are mentioned in the paper. The paper does not describe the computational environment needed to run the provided scripts."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions reproducible artifacts (EDA notebooks, training scripts) at the GitHub repository but provides no step-by-step reproduction instructions in the paper itself. A reader would need to navigate the repository without guidance."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All case study results are point estimates: MAE of 10.92 vs 20.10, accuracy 82%, macro-F1 0.35, R\u00b2 = -0.09, Recall@10 = 0.61. No confidence intervals or error bars are reported for any metric."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims 'ARIMA outperformed Prophet' based solely on comparing two MAE numbers (10.92 vs 20.10) without any statistical test. No p-values or significance tests are used anywhere in the case study."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Results are reported as raw metrics (MAE, accuracy, F1, R\u00b2) without formal effect size measures or contextualized comparisons against baselines. The R\u00b2 = -0.09 comparison to a mean predictor is implicit but no formal effect sizes are computed."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for why 300 randomly selected query bugs were used for duplicate detection, why the last 6 months were reserved for testing, or why only the Cassandra project was selected for the case study."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "All results appear to be single-run experiments. No standard deviations, interquartile ranges, or variance across multiple runs are reported for any of the prediction tasks."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "For bug volume forecasting, ARIMA and Prophet are compared against each other and against actual data. The R\u00b2 = -0.09 implicitly compares against a mean-predictor baseline. However, most other tasks (classification, duplicate detection, RAG) use only a single method."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper uses ARIMA, Prophet, and Sentence-BERT (2019), which are established but not contemporary methods. Table III surveys LLM-based approaches for these same tasks but the case study does not benchmark against any of them."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The paper presents a dataset, not a multi-component system. There are no components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used across tasks: MAE for forecasting, accuracy and macro-F1 for classification, MAE/RMSE/R\u00b2 for regression, and Recall@10 for duplicate detection."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is conducted. The RAG-generated explanations are shown as a single illustrative example without human quality assessment. Dataset quality is not evaluated by human annotators."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section IV.C states 'reserving the final six months of data for testing' for the Cassandra case study, implementing a temporal train/test split."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table I provides per-project statistics. Figure 5 shows per-class confusion matrix for priority classification. Figures 2-3 show per-project resolution time distributions. Table II compares against existing datasets."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper explicitly discusses failures: the time-to-fix model's R\u00b2 = -0.09 ('the model underperforms even a naive predictor'), the priority classifier's poor macro-F1 of 0.35 due to class imbalance, and limitations of textual signals."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The time-to-fix prediction R\u00b2 = -0.09 is explicitly reported as a negative result. The priority classifier's 0.35 macro-F1 is honestly presented as poor. The paper states these 'highlight the need for more informative features or alternative modeling approaches.'"
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims a 'comprehensive and up-to-date dataset comprising over 150,000 bug reports from nine actively maintained open-source projects.' Table I confirms 196,387 total reports across 9 projects. Claims about standardized fields, train/test splits, and EDA notebooks are described in the paper."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no causal claims. Comparisons are descriptive ('ARIMA outperformed Prophet' by metric comparison). Correlations are stated as correlations ('Bug priority correlates with outcomes'). The trend analysis uses 'suggesting' rather than causal language."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper describes the dataset as covering 9 specific projects from 3 specific trackers. Claims are framed as the dataset being a 'resource' and 'benchmark' for specific listed tasks. The scope is described by the included projects and domains (browsers, IDEs, distributed systems, cloud infrastructure)."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for its results. The poor R\u00b2 could stem from many factors (feature selection, model choice, outliers, data quality) but only 'more informative features or alternative modeling approaches' is briefly mentioned without substantive discussion."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's claims match the granularity of its measurements. It reports MAE, accuracy, F1, R\u00b2, and Recall@10 as direct metrics without framing them as proxies for broader constructs. No proxy gap exists between what is measured and what is claimed."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper mentions 'Sentence-BERT embeddings' without specifying which Sentence-BERT model variant. ARIMA and Prophet library versions are not specified. The language model used in the RAG demonstration is not identified at all."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The RAG demonstration (Section IV.C) shows an input bug report and retrieved examples but does not provide the actual prompt sent to the language model. The LLM used for generation is not even identified."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters are reported for any model: no ARIMA (p,d,q) parameters, no Prophet configuration, no classifier hyperparameters, no Sentence-BERT parameters, no LLM temperature or sampling settings for the RAG example."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The ML models are standard classifiers and time-series models. The RAG pipeline is described at a high level but is not an agentic system."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Section III.B provides only high-level preprocessing: 'filtered reports to exclude non-bug entries such as feature requests or tasks.' How fields were standardized across GitHub, Jira, and Bugzilla trackers is not described. No filtering counts or intermediate pipeline stages are documented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper has no dedicated Limitations or Threats to Validity section. The Conclusion and Future Work section (Section V) does not discuss limitations. The Insights paragraph briefly mentions 'challenges' but this is not a limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. The paper does not address potential biases in the dataset (e.g., survivorship bias in open-source projects), the representativeness of 9 projects for software engineering generally, or limitations of the standardization process."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge limitations such as: only English-language reports, only open-source projects, only 3 tracker types, or that the Cassandra case study may not generalize to other projects."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The dataset is publicly available at the GitHub repository (https://github.com/av9ash/gitbugs/). The paper describes it as 'openly licensed' with the full bug report data accessible for download."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section III.B describes collection: 'RESTful API calls (for Jira-based systems like Apache projects) and HTML scraping or CSV archive parsing (for Bugzilla-based systems like Firefox and Thunderbird).' Filtering for bug-only entries is described."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data is sourced from public bug tracking systems (GitHub, Jira, Bugzilla) for 9 named open-source projects."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The collection method is described at a high level (APIs, scraping, filtering) but the full pipeline is not documented. No counts are given for how many raw entries were collected vs. how many survived filtering. How duplicate mappings were established across trackers is not described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "This appears to be unfunded solo work by an independent researcher (single author with IEEE email and ORCID, no institutional affiliation listed)."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "The author lists only an IEEE email address and ORCID. No institutional affiliation is disclosed anywhere in the paper, making it impossible to assess potential conflicts with the projects included in the dataset."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "This appears to be unfunded solo work with no disclosed institutional backing."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement appears anywhere in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. The ML models (ARIMA, Prophet, classifiers) are trained from scratch on the dataset. The RAG demonstration is illustrative, not a benchmark evaluation."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No pre-trained model is evaluated on a benchmark. The paper trains its own models on the dataset with a temporal split."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-trained model is evaluated on a benchmark. Contamination concerns do not apply to this dataset paper."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. The paper mines public bug repositories."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. Data comes from public issue trackers."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants involved in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. Project selection criteria (9 well-established open-source projects) are stated but not participant criteria."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants or experimental conditions involving participant assignment."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants or conditions requiring blinding."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a dataset paper. The case study models are illustrative demonstrations of the dataset's utility, not proposed methods requiring cost analysis."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a dataset paper. The computational cost of the illustrative case study is not the paper's contribution."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "GitBugs comprises over 150,000 bug reports from 9 open-source projects spanning GitHub, Jira, and Bugzilla.",
    296       "evidence": "Table I shows 196,387 total reports across 9 projects: Cassandra (4,612), Firefox (28,824), Hadoop (2,503), HBase (5,403), Mozilla Core (85,673), VS Code (32,829), Seamonkey (1,076), Spark (20,275), Thunderbird (15,192).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "ARIMA outperforms Prophet for bug volume forecasting on Cassandra data with MAE of 10.92 vs 20.10.",
    301       "evidence": "Section IV.C and Figure 4 show the comparison on the last 6 months of Cassandra data. Both MAE values are reported.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Bug priority classification achieves 82% accuracy but only 0.35 macro-F1, indicating severe class imbalance effects.",
    306       "evidence": "Section IV.C and Figure 5 show the confusion matrix. The Normal class achieves F1=0.90 while minority classes perform poorly.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Time-to-fix regression yields R\u00b2 = -0.09, performing worse than a naive mean predictor.",
    311       "evidence": "Section IV.C and Figure 6 report MAE of 86.06, RMSE of 158.44, and R\u00b2 of -0.09. The paper explicitly states the negative R\u00b2 indicates underperformance vs. mean prediction.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Sentence-BERT-based duplicate detection achieves Recall@10 of 0.61 across 300 randomly selected query bugs.",
    316       "evidence": "Section IV.C and Figure 9 describe the experiment: cosine similarity over Sentence-BERT embeddings, top-10 retrieval. Most similarity scores fall below 0.5.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "GitBugs supports diverse research tasks including duplicate detection, triaging, resolution prediction, RAG, and temporal analysis.",
    321       "evidence": "Section IV demonstrates each task type with at least one experiment or example. However, most demonstrations use a single method on a single project subset.",
    322       "supported": "weak"
    323     }
    324   ],
    325   "red_flags": [
    326     {
    327       "flag": "No limitations section",
    328       "detail": "A dataset paper presenting itself as a benchmark resource has no dedicated limitations section. Key limitations such as English-language bias, open-source-only scope, representativeness of 9 projects, and potential selection effects in the included projects are never discussed."
    329     },
    330     {
    331       "flag": "No statistical rigor in case study",
    332       "detail": "All case study results are single-run point estimates with no confidence intervals, error bars, significance tests, or variance across runs. Claims like 'ARIMA outperformed Prophet' rest on comparing two numbers without any uncertainty quantification."
    333     },
    334     {
    335       "flag": "Anecdotal RAG demonstration",
    336       "detail": "The Retrieval-Augmented Generation example (Section IV.C) shows a single hand-picked input and output. No systematic evaluation of the RAG pipeline is conducted \u2014 no metrics, no multiple examples, and the LLM used is not even identified."
    337     },
    338     {
    339       "flag": "Incomplete data standardization documentation",
    340       "detail": "The paper does not describe how fields were harmonized across three fundamentally different issue trackers (GitHub Issues, Jira, Bugzilla). How categorical fields like Status, Priority, and Resolution were mapped across systems with different taxonomies is not documented."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Cupid: Leveraging ChatGPT for More Accurate Duplicate Bug Report Detection",
    346       "authors": ["T. Zhang", "I. C. Irsan", "F. Thung", "D. Lo"],
    347       "year": 2023,
    348       "arxiv_id": "2308.10022",
    349       "relevance": "Evaluates ChatGPT for duplicate bug detection, directly relevant to LLM capability assessment on software engineering tasks."
    350     },
    351     {
    352       "title": "Can we enhance bug report quality using LLMs?: An empirical study of LLM-based bug report generation",
    353       "authors": ["J. Acharya", "G. Ginde"],
    354       "year": 2025,
    355       "relevance": "Empirical evaluation of LLM capabilities for improving bug report quality, relevant to LLM productivity in software engineering."
    356     },
    357     {
    358       "title": "ChatBR: Automated Assessment and Improvement of Bug Report Quality Using ChatGPT",
    359       "authors": ["L. Bo", "W. Ji", "X. Sun", "T. Zhang", "X. Wu", "Y. Wei"],
    360       "year": 2024,
    361       "relevance": "Evaluates ChatGPT for automated bug report quality assessment, relevant to LLM-based software engineering automation."
    362     },
    363     {
    364       "title": "An empirical study on the capability of LLMs in decomposing bug reports",
    365       "authors": ["Z. Chen", "V. Nava-Camal", "A. Suleiman", "Y. Tang", "D. Hou", "W. Shang"],
    366       "year": 2025,
    367       "arxiv_id": "2504.20911",
    368       "relevance": "Directly evaluates LLM capabilities for bug report decomposition, relevant to understanding LLM limitations in software engineering."
    369     },
    370     {
    371       "title": "Can LLMs demystify bug reports?",
    372       "authors": ["L. Plein", "T. F. Bissyand\u00e9"],
    373       "year": 2023,
    374       "arxiv_id": "2310.06310",
    375       "relevance": "Evaluates LLM ability to understand and explain bug reports, relevant to LLM comprehension capabilities."
    376     },
    377     {
    378       "title": "Evaluating diverse large language models for automatic and general bug reproduction",
    379       "authors": ["S. Kang", "J. Yoon", "N. Askarbekkyzy", "S. Yoo"],
    380       "year": 2024,
    381       "relevance": "Benchmark evaluation of multiple LLMs for automated bug reproduction, relevant to LLM capability and agentic workflows."
    382     },
    383     {
    384       "title": "Large language models are few-shot testers: Exploring LLM-based general bug reproduction",
    385       "authors": ["S. Kang", "J. Yoon", "S. Yoo"],
    386       "year": 2023,
    387       "relevance": "Evaluates LLMs as few-shot bug reproduction agents, directly relevant to LLM capability evaluation and code generation."
    388     },
    389     {
    390       "title": "Agentic bug reproduction for effective automated program repair at Google",
    391       "authors": ["R. Cheng", "M. Tufano", "J. Cito", "J. Cambronero", "P. Rondon", "R. Wei", "A. Sun", "S. Chandra"],
    392       "year": 2025,
    393       "arxiv_id": "2502.01821",
    394       "relevance": "Describes an agentic AI system for bug reproduction at Google, directly relevant to agentic workflows and automated program repair."
    395     },
    396     {
    397       "title": "An ensemble method for bug triaging using large language models",
    398       "authors": ["A. Kumar Dipongkor"],
    399       "year": 2024,
    400       "relevance": "Applies LLMs to automated bug triaging, relevant to LLM capability in software engineering automation."
    401     },
    402     {
    403       "title": "Graph Neural Network vs. Large Language Model: A comparative analysis for bug report priority and severity prediction",
    404       "authors": ["J. Acharya", "G. Ginde"],
    405       "year": 2024,
    406       "relevance": "Directly compares GNN vs LLM approaches for bug severity prediction, relevant to LLM benchmark evaluation."
    407     },
    408     {
    409       "title": "Method-level bug severity prediction using source code metrics and LLMs",
    410       "authors": ["E. Mashhadi", "H. Ahmadvand", "H. Hemmati"],
    411       "year": 2023,
    412       "relevance": "Evaluates LLMs for code-level bug severity prediction, relevant to LLM capability assessment."
    413     },
    414     {
    415       "title": "Aegis: An agent-based framework for bug reproduction from issue descriptions",
    416       "authors": ["X. Wang", "P. Gao", "X. Meng", "C. Peng", "R. Hu", "Y. Lin", "C. Gao"],
    417       "year": 2025,
    418       "relevance": "Agent-based framework for automated bug reproduction, directly relevant to agentic AI workflows in software engineering."
    419     },
    420     {
    421       "title": "Let's fix this together: Conversational debugging with GitHub Copilot",
    422       "authors": ["Y. Bajpai", "B. Chopra", "P. Biyani", "C. Aslan", "D. Coleman", "S. Gulwani", "C. Parnin", "A. Radhakrishna", "G. Soares"],
    423       "year": 2024,
    424       "relevance": "Evaluates conversational debugging with GitHub Copilot, relevant to LLM-powered developer productivity and agentic code assistance."
    425     },
    426     {
    427       "title": "Burt: A chatbot for interactive bug reporting",
    428       "authors": ["Y. Song", "J. Mahmud", "N. De Silva", "Y. Zhou", "O. Chaparro", "K. Moran", "A. Marcus", "D. Poshyvanyk"],
    429       "year": 2023,
    430       "relevance": "Chatbot system for interactive bug reporting, relevant to LLM-powered software engineering automation."
    431     },
    432     {
    433       "title": "BugRepro: Enhancing Android Bug Reproduction with Domain-Specific Knowledge Integration",
    434       "authors": ["H. Yin", "J. Huang", "Y. Li", "Y. Dong", "T. Zhang"],
    435       "year": 2025,
    436       "arxiv_id": "2505.14528",
    437       "relevance": "Uses domain-specific knowledge for automated bug reproduction, relevant to AI-powered software engineering and agentic workflows."
    438     }
    439   ],
    440   "engagement_factors": {
    441     "practical_relevance": {
    442       "score": 2,
    443       "justification": "The dataset is immediately downloadable and usable for bug report analysis, duplicate detection, and ML training on realistic software engineering data."
    444     },
    445     "surprise_contrarian": {
    446       "score": 0,
    447       "justification": "Straightforward dataset paper that confirms known challenges (class imbalance, prediction difficulty) without challenging any conventional wisdom."
    448     },
    449     "fear_safety": {
    450       "score": 0,
    451       "justification": "No AI risk or security concerns raised."
    452     },
    453     "drama_conflict": {
    454       "score": 0,
    455       "justification": "No controversy or conflict angle."
    456     },
    457     "demo_ability": {
    458       "score": 2,
    459       "justification": "GitHub repository with data, EDA notebooks, and training scripts available for immediate use."
    460     },
    461     "brand_recognition": {
    462       "score": 1,
    463       "justification": "Covers well-known projects (Firefox, VS Code, Cassandra) but the author is an independent researcher, not from a famous lab."
    464     }
    465   }
    466 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs