scan.json (20122B)
1 { 2 "paper": { 3 "title": "Source Code Comprehension: A Contemporary Definition and Conceptual Model for Empirical Investigation", 4 "authors": ["Marvin Wyrich"], 5 "year": 2023, 6 "venue": "arXiv.org (submission under review)", 7 "arxiv_id": "2310.11301", 8 "doi": "10.48550/arXiv.2310.11301" 9 }, 10 "scan_version": 3, 11 "active_modules": [], 12 "methodology_tags": ["theoretical"], 13 "key_findings": "The paper proposes the first formal definition of source code comprehension: 'a person's intentional act and degree of accomplishment in inferring the meaning of source code.' It provides a conceptual model for code comprehension experiments with three lanes (mental model, mental state, experimental variables) and distinguishes between measuring comprehension outcomes (link c) and observing the cognitive comprehension process (link d). The narrative review of ~50 years of program comprehension theory shows that the field has never agreed on a definition, leading to incomparable operationalizations across studies.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No code, supplementary materials, or digital artifacts are released. The paper is purely theoretical but could have provided structured data on the reviewed models/theories." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No dataset is released. The paper reviews literature narratively but does not provide a structured corpus or extracted data from the reviewed works." 25 }, 26 "environment_specified": { 27 "applies": false, 28 "answer": false, 29 "justification": "Purely theoretical paper with no computational experiments requiring an environment specification." 30 }, 31 "reproduction_instructions": { 32 "applies": false, 33 "answer": false, 34 "justification": "No empirical experiments to reproduce. The paper proposes a definition and conceptual model." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": false, 40 "answer": false, 41 "justification": "Purely theoretical paper with no quantitative results." 42 }, 43 "significance_tests": { 44 "applies": false, 45 "answer": false, 46 "justification": "No statistical comparisons are made. The paper proposes a definition and conceptual model." 47 }, 48 "effect_sizes_reported": { 49 "applies": false, 50 "answer": false, 51 "justification": "No empirical results requiring effect size reporting." 52 }, 53 "sample_size_justified": { 54 "applies": false, 55 "answer": false, 56 "justification": "No data collection or sampling is performed." 57 }, 58 "variance_reported": { 59 "applies": false, 60 "answer": false, 61 "justification": "No experimental runs or quantitative data." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": false, 67 "answer": false, 68 "justification": "No system or method is empirically evaluated. The paper proposes a theoretical framework." 69 }, 70 "baselines_contemporary": { 71 "applies": false, 72 "answer": false, 73 "justification": "No empirical evaluation requiring baselines." 74 }, 75 "ablation_study": { 76 "applies": false, 77 "answer": false, 78 "justification": "No system with components to ablate." 79 }, 80 "multiple_metrics": { 81 "applies": false, 82 "answer": false, 83 "justification": "No empirical evaluation requiring metrics." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "No system outputs to evaluate." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "No dataset or test set involved." 94 }, 95 "per_category_breakdown": { 96 "applies": false, 97 "answer": false, 98 "justification": "No quantitative results to break down." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 4.2 (Limitations) discusses where the conceptual model falls short: it cannot adequately anchor ethnographic/qualitative studies since 'the model is mostly reduced to the cognitive comprehension aspect,' and the definition is limited to human comprehension, not machine understanding." 104 }, 105 "negative_results_reported": { 106 "applies": false, 107 "answer": false, 108 "justification": "No experiments that could yield negative results." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims the paper (1) defines source code comprehension and (2) presents a conceptual framework for empirical research. Both are delivered: Definition 3.1 in Section 3.1 and the conceptual model in Section 3.2 (Figures 2-5), with case examples in Section 4." 116 }, 117 "causal_claims_justified": { 118 "applies": false, 119 "answer": false, 120 "justification": "The paper makes no causal claims. It proposes a definition and conceptual model without asserting causal relationships." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 4.2 explicitly bounds the scope: the conceptual model is limited to experiments ('we focused on experiments, although definition 3.1 applies regardless'), cannot fully capture qualitative/ethnographic studies, and the definition covers only human comprehension, not machine comprehension of code." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": false, 129 "answer": false, 130 "justification": "The paper presents no empirical results. It is a theoretical framework and definition paper (pure taxonomy/conceptual work)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": false, 134 "answer": false, 135 "justification": "Theoretical paper with no measurements. 
The paper does discuss proxy-outcome gaps in other studies (Section 3.2, link c) as part of its conceptual contribution, but makes no measurements itself." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": false, 141 "answer": false, 142 "justification": "No AI/ML models are used in this paper." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "No prompting is used in this paper." 148 }, 149 "hyperparameters_reported": { 150 "applies": false, 151 "answer": false, 152 "justification": "No models or experiments requiring hyperparameters." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper reviews literature narratively across ~50 years but does not document how the reviewed works were identified, selected, or filtered. No search strategy, inclusion/exclusion criteria, or systematic selection process is described." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 4.2 is titled 'Limitations' and provides substantive discussion of two specific limitations of the proposed conceptual model." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 4.2 identifies specific limitations: (1) the model is restricted to experiments and 'the outward conduct of a person is not depicted in this process,' limiting applicability to ethnographic studies; (2) the definition focuses on humans and does not address machine/LLM comprehension. These are specific to this work, not generic." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 4.2 explicitly states: 'we focused on experiments' (not qualitative methods), the model 'is mostly reduced to the cognitive comprehension aspect,' and 'a discussion of when a machine understands source code is left for future work.' Section 3.2 also restricts scope to 'empirical studies with an experimental character.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": false, 185 "answer": false, 186 "justification": "No data is collected or analyzed. This is a purely theoretical/definitional paper." 187 }, 188 "data_collection_described": { 189 "applies": false, 190 "answer": false, 191 "justification": "No data collection occurs. The paper reviews existing theory narratively." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No participants or subjects. This is a theoretical paper." 197 }, 198 "data_pipeline_documented": { 199 "applies": false, 200 "answer": false, 201 "justification": "No data pipeline exists. The paper is a theoretical contribution." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "The Acknowledgments section states: 'Wyrich's work is supported by the European Union as part of the ERC Advanced Grant 101052182.'" 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliation is clearly stated: Marvin Wyrich, Saarland University, Saarbrücken, Germany." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "The ERC (European Research Council) is an independent funding body with no commercial stake in the definition of source code comprehension." 
219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "No pre-trained model is evaluated. This is a theoretical paper." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "No model evaluation or benchmark testing occurs." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "No benchmarks are used. Purely theoretical work." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants. This is a theoretical paper proposing a definition and conceptual model." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants or experimental conditions." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants or experimental conditions." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "Purely theoretical paper with no computational experiments or inference." 
285 }, 286 "compute_budget_stated": { 287 "applies": false, 288 "answer": false, 289 "justification": "No computation is performed." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "The research community has not agreed on a definition of code comprehension, and most primary studies do not define the construct they measure.", 296 "evidence": "Section 1 cites Sjøberg and Bergersen [54] noting 'most of the concepts are often not theoretically defined' in SE research, and Wyrich et al. [66] confirming 'hardly any study defines code comprehension' in the code comprehension experiment literature.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Different common ways of measuring code comprehension do not correlate with each other.", 301 "evidence": "Section 1 cites references [2, 7, 16, 23, 69] which 'overwhelmingly conclude that different common ways of measuring code comprehension do not correlate with each other.'", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Developers spend 58-70% of their time on program comprehension activities.", 306 "evidence": "Section 2 cites Minelli et al. [33] finding ~70% from IDE interaction data of 740 sessions from 18 developers, and Xia et al. [68] finding ~58% from a field study with 78 professionals across applications.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "The proposed conceptual model covers the main research branches of code comprehension research: behavioral research on influences (links b, c) and neuroscientific investigation (links a, d).", 311 "evidence": "Section 4 demonstrates this by mapping two representative studies to the model: Wagner and Wyrich [64] for behavioral research (Figure 4) and Siegmund et al. 
[53] for neuroscientific investigation (Figure 5).", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "An implicit definition-by-task prevails in code comprehension research, where 'code comprehension is what the experimental tasks measure,' creating construct validity threats.", 316 "evidence": "Section 1 argues this leads to two problems: inability to justify operationalization decisions (face/content validity) and inability to compare across studies using different measures. Supported by [42, 54, 66].", 317 "supported": "moderate" 318 } 319 ], 320 "red_flags": [ 321 { 322 "flag": "Narrative rather than systematic literature review", 323 "detail": "The historical survey in Section 2 is a narrative review without documented search methodology, inclusion/exclusion criteria, or completeness assessment. While acceptable for a theoretical paper, the selection of which historical works are 'milestones' relies entirely on the author's judgment, potentially omitting alternative theoretical traditions." 324 } 325 ], 326 "cited_papers": [ 327 { 328 "title": "40 Years of Designing Code Comprehension Experiments: A Systematic Mapping Study", 329 "authors": ["Marvin Wyrich", "Justus Bogner", "Stefan Wagner"], 330 "year": 2023, 331 "doi": "10.1145/3626522", 332 "relevance": "Systematic mapping of code comprehension experiment methodology — directly relevant to understanding how AI coding tool evaluations measure comprehension." 333 }, 334 { 335 "title": "Construct Validity in Software Engineering", 336 "authors": ["Dag I.K. Sjøberg", "Gunnar R. Bergersen"], 337 "year": 2022, 338 "doi": "10.1109/TSE.2022.3176725", 339 "relevance": "Establishes construct validity framework for SE research, directly applicable to evaluating whether AI tool studies measure what they claim." 
340 }, 341 { 342 "title": "Construct Validity in Software Engineering Research and Software Metrics", 343 "authors": ["Paul Ralph", "Ewan Tempero"], 344 "year": 2018, 345 "doi": "10.1145/3210459.3210461", 346 "relevance": "Foundational work on construct validity for SE metrics including productivity and maintainability — relevant for evaluating AI productivity claims." 347 }, 348 { 349 "title": "Code Comprehension Confounders: A Study of Intelligence and Personality", 350 "authors": ["Stefan Wagner", "Marvin Wyrich"], 351 "year": 2021, 352 "doi": "10.1109/TSE.2021.3127131", 353 "relevance": "Studies how developer characteristics (intelligence, personality, experience) confound code comprehension measurements — relevant for AI pair-programming study design." 354 }, 355 { 356 "title": "Evidence Profiles for Validity Threats in Program Comprehension Experiments", 357 "authors": ["Marvin Muñoz Barón", "Marvin Wyrich", "Daniel Graziotin", "Stefan Wagner"], 358 "year": 2023, 359 "relevance": "Catalogs validity threats in code comprehension experiments — directly applicable to evaluating methodological quality of AI-assisted coding studies." 360 }, 361 { 362 "title": "Toward an Objective Measure of Developers' Cognitive Activities", 363 "authors": ["Zohreh Sharafi", "Yu Huang", "Kevin Leach", "Westley Weimer"], 364 "year": 2021, 365 "doi": "10.1145/3434643", 366 "relevance": "Develops objective cognitive activity measures for developers — relevant for evaluating cognitive load claims in AI-assisted programming studies." 367 }, 368 { 369 "title": "Understanding Understanding Source Code with Functional Magnetic Resonance Imaging", 370 "authors": ["Janet Siegmund", "Christian Kästner", "Sven Apel", "Chris Parnin"], 371 "year": 2014, 372 "relevance": "First fMRI study of code comprehension, identifying activated brain regions — foundational for neurocognitive studies of AI-assisted coding." 
373 }, 374 { 375 "title": "Measuring Program Comprehension: A Large-Scale Field Study with Professionals", 376 "authors": ["Xin Xia", "Lingfeng Bao", "David Lo", "Zhenchang Xing"], 377 "year": 2018, 378 "doi": "10.1109/TSE.2017.2734091", 379 "relevance": "Large-scale field study finding developers spend ~58% of time on comprehension — relevant baseline for claims about AI tools reducing comprehension effort." 380 }, 381 { 382 "title": "The ABC of Software Engineering Research", 383 "authors": ["Klaas-Jan Stol", "Brian Fitzgerald"], 384 "year": 2018, 385 "doi": "10.1145/3241743", 386 "relevance": "Classification framework for SE research methodologies — relevant for evaluating study design quality in AI/LLM research." 387 }, 388 { 389 "title": "Use and Misuse of the Term 'Experiment' in Mining Software Repositories Research", 390 "authors": ["Claudia Ayala", "Burak Turhan", "Xavier Franch", "Natalia Juristo"], 391 "year": 2022, 392 "doi": "10.1109/TSE.2021.3113558", 393 "relevance": "Documents misuse of 'experiment' terminology in SE research — relevant for evaluating methodological claims in AI/LLM papers." 394 } 395 ], 396 "engagement_factors": { 397 "practical_relevance": { 398 "score": 1, 399 "justification": "Useful for researchers designing code comprehension studies but not directly applicable by software practitioners." 400 }, 401 "surprise_contrarian": { 402 "score": 1, 403 "justification": "The observation that 50 years of code comprehension research has never produced a formal definition is mildly surprising but not paradigm-challenging." 404 }, 405 "fear_safety": { 406 "score": 0, 407 "justification": "No AI risk or security concerns are raised." 408 }, 409 "drama_conflict": { 410 "score": 0, 411 "justification": "No controversy or conflict — the paper is a constructive theoretical contribution." 412 }, 413 "demo_ability": { 414 "score": 0, 415 "justification": "No code, tool, or demo — purely a conceptual framework on paper." 
416 }, 417 "brand_recognition": { 418 "score": 0, 419 "justification": "Solo author from Saarland University; no famous lab or product association." 420 } 421 } 422 }