scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24912B)
      1 {
      2   "paper": {
      3     "title": "Measuring Technical Debt in AI-Based Competition Platforms",
      4     "authors": ["Dionysios Sklavenitis", "Dimitris Kalles"],
      5     "year": 2024,
      6     "venue": "13th Hellenic Conference on Artificial Intelligence (SETN 2024)",
      7     "arxiv_id": "2405.11825",
      8     "doi": "10.1145/3688671.3688783"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["survey_methodology"],
     12   "methodology_tags": ["meta-analysis", "qualitative"],
     13   "key_findings": "Through a scoping review of 72 studies, the authors identified and categorized 18 types of technical debt in AI-based systems, including algorithm, architectural, code, configuration, data, model, ethics, infrastructure, and test debt. They propose a new type — Accessibility Debt — specific to AI competition platforms, addressing barriers participants face from inadequate platform usability. A 68-question questionnaire with a 1-5 scoring system is developed for organizers and participants to assess technical debt, but it has not been empirically validated.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code, analysis scripts, or repository links are provided in the paper. The questionnaire and classification data are presented only within the paper text."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset or supplementary data files are released. The full list of the 72 included studies is presented in tables within the paper but not as a downloadable dataset."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment or tool specifications are provided. The scoping review methodology does not describe any software tools used for analysis."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While the search strategy is documented (Appendix A), there are no step-by-step reproduction instructions for replicating the full review process including screening decisions and data extraction."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a qualitative scoping review with no statistical experiments or quantitative results requiring confidence intervals."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No comparative quantitative claims are made that would require significance testing. The paper is a qualitative classification study."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "No quantitative effects are measured. The paper classifies types of technical debt and proposes a questionnaire without empirical testing."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "As a scoping review, the sample size (number of papers) is determined by the search strategy rather than requiring statistical justification."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "No experimental runs or quantitative measurements are conducted that would require variance reporting."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The paper mentions that 'to our knowledge, there are no comparable initiatives' and references [30] as partially related but does not conduct a structured comparison against prior surveys or frameworks for measuring technical debt."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No structured comparison with prior surveys or technical debt measurement tools is conducted. The only reference to a comparable effort ([30] Breck et al. 2017) is noted as focusing solely on ML testing/monitoring."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "There is no system with components to ablate. The paper proposes a classification taxonomy and questionnaire."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "No experiments are conducted and no metrics are evaluated. The questionnaire is proposed but not tested."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No system outputs are evaluated by humans. The questionnaire is proposed for future evaluation by organizers and participants."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "No test set or experimental evaluation exists. This is a literature review with a proposed framework."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides a breakdown of 18 types of technical debt with document counts and publication spans. Table 2 shows the number of questionnaire questions per debt type. Table 6 in the appendix lists individual studies by debt type."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper does not discuss cases where its classification scheme might fail, borderline categorization decisions, or studies that were difficult to classify. All examples are hypothetical success scenarios."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results are reported. The paper does not discuss any classification approaches that were tried and abandoned, or types of debt that were considered but rejected."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims are supported: they identify and categorize types of technical debt (Table 1, Section 3.2), develop a questionnaire (Section 4, Appendix C), and introduce Accessibility Debt (Section 3.2.9). Claims are appropriately scoped as a framework proposal."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper makes no empirical causal claims. Statements like 'may lead to' and 'can compromise' are qualitative assessments from the literature, not causal claims from the authors' own data."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper broadly claims the framework applies to AI-based competition platforms generally, but the questionnaire has not been tested on any actual platform. The title and abstract make broad claims ('measuring technical debt') for a framework that is entirely unvalidated. Future work acknowledges the need to test across 'various competition platforms' and 'diverse stakeholders.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative classification schemes, whether different categorizations of technical debt would be equally valid, or whether the 18 types might overlap or be reducible to fewer dimensions."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The questionnaire scores are presented as a measure of technical debt, but the paper does not discuss whether questionnaire responses actually correlate with real technical debt levels. The gap between self-reported questionnaire answers and actual technical debt presence is not acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No AI models are used in this study. It is a literature review with a proposed framework."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting of AI models is involved in this study."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No models or algorithms requiring hyperparameters are used."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used in this study."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix A documents the PRISMA flow: 870 records identified → duplicates removed (n=65) → 169 screened → 85 excluded → 84 assessed for eligibility → 10 excluded → 74 full-text assessed → 2 excluded → 72 included. Search strings, databases, date ranges, and inclusion/exclusion criteria are specified in Sections A.3-A.8."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 'Threats to Validity' provides a dedicated discussion of internal and external validity threats. Additional limitations are discussed in Appendix A.12."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The threats section discusses generic concerns: researcher subjectivity ('other researchers who wish to use our strategy may come up with slightly different results') and that 'the area of technical debt in AI-based systems is constantly evolving.' These are standard disclaimers rather than threats specific to this study's particular findings or classification decisions."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the framework does NOT cover or cannot measure. Future work implies the scope is limited to academic platforms (mentioning plans to test commercial environments like Azure and Kaggle), but no explicit scope boundaries are stated in the paper body."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw review data (screening decisions, extraction sheets, coding forms) is available for independent verification. Only aggregated results appear in tables."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Appendix A describes the data collection in detail: five databases searched (Google Scholar, ACM, IEEE Xplore, Scopus, Springer), exact search strings, date range (2012-Feb 2024), and inclusion/exclusion criteria (I1-I3, E1-E5)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved in this study. Paper selection methods are covered under data_collection_described."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The PRISMA flow diagram in Appendix A.13 documents each pipeline stage with counts: 870 records identified → duplicates removed → 169 screened → 85 excluded → 84 assessed for eligibility → 10 excluded → 74 full-text assessed → 2 excluded → 72 included. Exclusion reasons are provided."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Both authors are identified as affiliated with Hellenic Open University, Patras, Greece, with email addresses provided."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is disclosed, so independence of funding cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate any pre-trained model on a benchmark. It is a scoping literature review."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No model evaluation is conducted. This is a literature review and framework proposal."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is involved in this study."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved. The proposed questionnaire is not yet administered to participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects are involved in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are studied."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are recruited. Inclusion/exclusion criteria for papers are described separately."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No experimental study with human participants is conducted."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No experimental study requiring blinding is conducted."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a survey paper with no system to measure inference cost for."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a survey paper with no computational experiments."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Appendix A.13 explicitly references PRISMA guidelines and includes a PRISMA flow diagram. The methodology includes structured search strings across five databases, defined inclusion/exclusion criteria, and a two-phase screening process."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The survey does not assess the methodological quality of its 72 included studies. All papers are treated equally regardless of rigor. No quality scoring rubric or risk-of-bias assessment is applied to source papers."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No discussion of publication bias is present. The paper does not consider whether its sources skew toward positive results, nor does it use funnel plots or other publication bias tests."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "18 distinct types of technical debt exist in AI-based systems, categorized from 72 reviewed studies.",
    313       "evidence": "Table 1 lists 18 types (Algorithm, Architectural, Build, Code, Configuration, Data, Defect, Design, Documentation, Ethics, Infrastructure, Model, People, Process, Requirements, SATD, Test, Versioning) with document counts and publication spans. Table 6 in the appendix maps individual studies to debt types. Section A.9 describes the PRISMA selection yielding 72 studies.",
    314       "supported": "moderate"
    315     },
    316     {
    317       "claim": "Accessibility Debt is a new type of technical debt specific to AI-based competition platforms.",
    318       "evidence": "Section 3.2.9 defines Accessibility Debt as 'barriers that participants encounter due to the lack of immediate usability of platform technologies.' A hypothetical example is provided but no empirical evidence demonstrates this is a distinct or measurable category.",
    319       "supported": "weak"
    320     },
    321     {
    322       "claim": "A 68-question questionnaire can quantify technical debt in AI-based competition platforms.",
    323       "evidence": "Section 4 describes the scoring methodology (1-5 scale, YES/NO/NA/Don't Know responses) and Tables 3-4 provide worked examples. The full 68 questions are listed in Appendix C. However, the questionnaire has not been tested on any platform or with any participants — validation is explicitly listed as future work (Section 7).",
    324       "supported": "weak"
    325     },
    326     {
    327       "claim": "The nine most significant types of technical debt for AI competition platforms are Algorithm, Architectural, Configuration, Data, Model, Ethics, Infrastructure, Test, and Accessibility.",
    328       "evidence": "Section 3.2 states these were selected 'based on their prevalence in the literature [8, 14]' and the unique relevance of Accessibility Debt to competition platforms. The selection criteria for 'most significant' are not rigorously defined.",
    329       "supported": "weak"
    330     }
    331   ],
    332   "red_flags": [
    333     {
    334       "flag": "Unvalidated measurement instrument",
    335       "detail": "The paper's primary contribution — a 68-question questionnaire for measuring technical debt — has not been tested on any actual platform, with any real organizers or participants. The authors explicitly state validation is future work (Section 7). Yet the title claims 'measuring' technical debt, implying an operational capability that does not yet exist."
    336     },
    337     {
    338       "flag": "All examples are hypothetical",
    339       "detail": "Every example illustrating the 18 types of technical debt in competition platforms is a hypothetical scenario, not a real case study. No actual AI competition platform was analyzed to demonstrate the framework's applicability."
    340     },
    341     {
    342       "flag": "No quality assessment of source papers",
    343       "detail": "The scoping review treats all 72 included papers equally without assessing their methodological quality. This risks laundering weak or preliminary findings alongside rigorous studies."
    344     },
    345     {
    346       "flag": "Single-author initial screening",
    347       "detail": "The first selection phase was conducted by a single author who read all articles and made comments. The second author reviewed only the comments, not the full papers. This introduces potential screening bias despite the described mitigation."
    348     },
    349     {
    350       "flag": "Questionnaire scoring lacks psychometric justification",
    351       "detail": "The 1-5 scoring weights assigned to questionnaire items appear arbitrary with no psychometric basis. For example, why does 'usability testing' score 5 while 'feedback mechanisms' scores 3? No weighting methodology, expert panel, or calibration is described."
    352     }
    353   ],
    354   "cited_papers": [
    355     {
    356       "title": "Hidden technical debt in machine learning systems",
    357       "authors": ["D. Sculley", "G. Holt", "D. Golovin", "E. Davydov", "T. Phillips", "D. Ebner"],
    358       "year": 2015,
    359       "relevance": "Foundational work introducing the concept of hidden technical debt in ML systems, including boundary erosion, entanglement, and feedback loops."
    360     },
    361     {
    362       "title": "Is using deep learning frameworks free? Characterizing technical debt in deep learning frameworks",
    363       "authors": ["J. Liu", "Q. Huang", "X. Xia", "E. Shihab", "D. Lo", "S. Li"],
    364       "year": 2020,
    365       "doi": "10.1145/3377815.3381377",
    366       "relevance": "Identifies extensive technical debt in deep learning frameworks across algorithm, build, defect, and SATD categories."
    367     },
    368     {
    369       "title": "Characterizing Technical Debt and Antipatterns in AI-Based Systems: A Systematic Mapping Study",
    370       "authors": ["J. Bogner", "R. Verdecchia", "I. Gerostathopoulos"],
    371       "year": 2021,
    372       "doi": "10.1109/TechDebt52882.2021.00016",
    373       "relevance": "Systematic mapping of 72 antipatterns and 46 solutions for technical debt in AI systems, major source for the paper's classification."
    374     },
    375     {
    376       "title": "Software Engineering for AI-Based Systems: A Survey",
    377       "authors": ["S. Martínez-Fernández", "J. Bogner", "X. Franch", "M. Oriol", "J. Siebert", "A. Trendowicz", "S. Wagner"],
    378       "year": 2022,
    379       "doi": "10.1145/3487043",
    380       "relevance": "Comprehensive survey synthesizing SE practices for AI development, examining integration of software engineering in AI."
    381     },
    382     {
    383       "title": "Software Engineering for Machine Learning: A Case Study",
    384       "authors": ["S. Amershi", "A. Begel", "C. Bird", "R. DeLine", "H. Gall", "E. Kamar", "N. Nagappan"],
    385       "year": 2019,
    386       "doi": "10.1109/ICSE-SEIP.2019.00042",
    387       "relevance": "Industry case study examining SE challenges in ML development at Microsoft, providing practical perspective on AI technical debt."
    388     },
    389     {
    390       "title": "The ML test score: A rubric for ML production readiness and technical debt reduction",
    391       "authors": ["E. Breck", "S. Cai", "E. Nielsen", "M. Salib", "D. Sculley"],
    392       "year": 2017,
    393       "doi": "10.1109/BigData.2017.8258038",
    394       "relevance": "Proposes a rubric for ML production readiness and debt reduction — the closest prior work to this paper's questionnaire approach."
    395     },
    396     {
    397       "title": "Code and Architectural Debt in Artificial Intelligence-Enabled Systems: On the Prevalence, Severity, Impact, and Management Strategies",
    398       "authors": ["G. Recupito", "F. Pecorelli", "G. Catolino", "V. Lenarduzzi", "D. Taibi"],
    399       "year": 2024,
    400       "relevance": "Studies prevalence and management of code and architectural debt specifically in AI systems."
    401     },
    402     {
    403       "title": "Code Smells in Machine Learning Systems",
    404       "authors": ["J. Gesi", "S. Liu", "J. Li", "I. Ahmed", "N. Nagappan", "D. Lo", "L. Bao"],
    405       "year": 2022,
    406       "relevance": "Identifies five new code smells affecting maintainability of deep learning systems, contributing to understanding of code debt in AI."
    407     },
    408     {
    409       "title": "CodaLab Competitions: An open source platform to organize scientific challenges",
    410       "authors": ["A. Pavao"],
    411       "year": 2022,
    412       "relevance": "Proposes using CodaLab to manage technical debt in competitive AI settings, directly relevant to AI competition platform quality."
    413     },
    414     {
    415       "title": "Software Quality for AI: Where We Are Now?",
    416       "authors": ["V. Lenarduzzi", "F. Lomio", "S. Moreschini", "D. Taibi", "D. A. Tamburri"],
    417       "year": 2021,
    418       "doi": "10.1007/978-3-030-65854-0_4",
    419       "relevance": "Examines the evolving nature of software quality metrics for AI systems, highlighting that AI systems are 'just like traditional software until they're not.'"
    420     }
    421   ],
    422   "engagement_factors": {
    423     "practical_relevance": {
    424       "score": 1,
    425       "justification": "The questionnaire framework could be useful to AI competition organizers, but it is entirely unvalidated and requires significant adaptation before practical use."
    426     },
    427     "surprise_contrarian": {
    428       "score": 0,
    429       "justification": "The findings confirm expected challenges with technical debt in AI systems; no conventional wisdom is challenged."
    430     },
    431     "fear_safety": {
    432       "score": 0,
    433       "justification": "No AI safety or security concerns are raised; the paper addresses software engineering quality, not risks."
    434     },
    435     "drama_conflict": {
    436       "score": 0,
    437       "justification": "No controversy or conflict is present in the paper's findings or framing."
    438     },
    439     "demo_ability": {
    440       "score": 0,
    441       "justification": "No code, tool, or demo is available; the questionnaire exists only as text in the appendix."
    442     },
    443     "brand_recognition": {
    444       "score": 0,
    445       "justification": "Authors are from Hellenic Open University; the venue (SETN) is a regional conference with limited international visibility."
    446     }
    447   }
    448 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs