scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20495B)
      1 {
      2   "paper": {
      3     "title": "Lumen: Developer Agency Through Transparent Context Control in AI-Assisted Programming",
      4     "authors": ["Nakul Goel", "Glaucia Melo"],
      5     "year": 2025,
      6     "venue": "CASCON",
      7     "doi": "10.1109/CASCON66301.2025.00024"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["case-study", "qualitative"],
     12   "key_findings": "Lumen is an open-source tool that uses a double-copy clipboard interaction paradigm to provide transparent context assembly for AI-assisted programming. A cognitive walkthrough analysis using the Cognitive Dimensions of Notations framework found that Lumen reduces context assembly viscosity from 15-20 manual operations to 2-5 guided selections while maintaining full visibility of file dependencies. The paper presents illustrative scenarios but no empirical user study with real participants.",
     13   "claims": [
     14     {
     15       "claim": "Lumen's double-copy interaction design reduces the viscosity of context assembly from 15-20 manual operations to 2-5 guided selections.",
     16       "evidence": "Cognitive walkthrough analysis in Section V, Table I comparing Traditional (15-20 operations), Automatic (single command), and Lumen (2-5 operations) approaches.",
     17       "supported": "weak"
     18     },
     19     {
     20       "claim": "Lumen reduces debugging time from 15 minutes to 3 minutes compared to traditional copy-paste approaches.",
     21       "evidence": "Scenario walkthrough in Section V-B describing a login failure debugging scenario. No actual measurement — times are hypothetical walkthrough estimates.",
     22       "supported": "unsupported"
     23     },
     24     {
     25       "claim": "Lumen eliminates premature commitment by allowing developers to preview and adjust context before AI processing.",
     26       "evidence": "Cognitive dimension analysis in Table I and Section V-A. This is an analytical claim based on the tool's design, not measured empirically.",
     27       "supported": "weak"
     28     },
     29     {
     30       "claim": "Lumen's approach makes hidden dependencies explicit through visual dependency graph display.",
     31       "evidence": "Table I cognitive dimension analysis and implementation details in Section IV-C. Supported by design analysis but not validated with users.",
     32       "supported": "weak"
     33     }
     34   ],
     35   "checklist": {
     36     "artifacts": {
     37       "code_released": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "GitHub repository provided: https://github.com/lumenEngines/Core (Section IV-A)."
     41       },
     42       "data_released": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No evaluation data released. The cognitive walkthrough scenarios are described textually but no structured data or analysis artifacts are provided."
     46       },
     47       "environment_specified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper mentions PyQt5, NetworkX, ThreadPoolExecutor, and API connections (Anthropic, Groq) but provides no requirements.txt, version specifications, or environment setup instructions."
     51       },
     52       "reproduction_instructions": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no instructions for running the tool or reproducing the evaluation."
     56       }
     57     },
     58     "statistical_methodology": {
     59       "confidence_intervals_or_error_bars": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No quantitative experiments are conducted. The evaluation is a qualitative cognitive walkthrough analysis."
     63       },
     64       "significance_tests": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "No quantitative comparisons are made. The evaluation is analytical, not statistical."
     68       },
     69       "effect_sizes_reported": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "No quantitative experiments producing effect sizes. The paper reports estimated operation counts from walkthrough analysis."
     73       },
     74       "sample_size_justified": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "No empirical sample. The evaluation uses cognitive walkthrough analysis, not participant studies."
     78       },
     79       "variance_reported": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No repeated experimental runs. The evaluation is a design-oriented analysis."
     83       }
     84     },
     85     "evaluation_design": {
     86       "baselines_included": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Table I compares three approaches: Traditional (manual copy-paste), Automatic (Cursor/Claude Code), and Lumen across four cognitive dimensions."
     90       },
     91       "baselines_contemporary": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Baselines include contemporary tools: ChatGPT, Claude, Cursor, and Claude Code, which are current AI coding assistants."
     95       },
     96       "ablation_study": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "Lumen is presented as a unified tool; no component ablation is feasible in a cognitive walkthrough evaluation."
    100       },
    101       "multiple_metrics": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Four cognitive dimensions are analyzed: viscosity, visibility, premature commitment, and hidden dependencies (Table I)."
    105       },
    106       "human_evaluation": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "No human evaluation was conducted. The paper acknowledges this: 'a key limitation is the absence of an empirical user study' (Section VII-E)."
    110       },
    111       "held_out_test_set": {
    112         "applies": false,
    113         "answer": false,
    114         "justification": "No dataset or test set is used. The evaluation is a cognitive walkthrough, not a benchmark evaluation."
    115       },
    116       "per_category_breakdown": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Results are broken down by cognitive dimension (viscosity, visibility, premature commitment, hidden dependencies) and by scenario (bug fix, feature addition) in Table I and Section V-B."
    120       },
    121       "failure_cases_discussed": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section IV-G discusses limitations: no detection of runtime imports, conditional imports, memory issues with large projects, 500KB file size limit, no streaming support."
    125       },
    126       "negative_results_reported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section IV-G explicitly lists trade-offs and limitations including static analysis gaps, memory constraints, and API rate limiting challenges."
    130       }
    131     },
    132     "claims_and_evidence": {
    133       "abstract_claims_supported": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The abstract claims Lumen 'reduces context assembly overhead' which is only supported by analytical walkthrough estimates, not empirical measurement. The abstract also says 'We demonstrate through Cognitive Walkthrough Analysis' which is accurate about the method used."
    137       },
    138       "causal_claims_justified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper makes causal claims like 'Lumen reduces context assembly overhead' and scenario estimates ('Time: 3 minutes' vs '15 minutes') without empirical measurement. The cognitive walkthrough is an analytical method, not a causal identification strategy."
    142       },
    143       "generalization_bounded": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The title claims 'AI-Assisted Programming' broadly. The evaluation uses only three illustrative scenarios (login debugging, rate limiting, refactoring) with no real users. Claims about 'production code' are not bounded to the tested scenarios."
    147       },
    148       "alternative_explanations_discussed": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper does not discuss alternative explanations for the claimed benefits. For example, whether the double-copy paradigm might be slower than keyboard shortcuts, or whether the dependency graph might cause information overload."
    152       },
    153       "proxy_outcome_distinction": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "The paper measures cognitive dimensions analytically and uses these as proxies for developer productivity and trust, but does not acknowledge this proxy gap. Scenario time estimates (3 min vs 15 min) are presented without distinguishing them from actual measured productivity."
    157       }
    158     },
    159     "setup_transparency": {
    160       "model_versions_specified": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "The paper mentions using Anthropic and Groq APIs for file summarization but does not specify model versions. Section IV-D code shows 'call_anthropic_api' and 'call_groq_api' without version details."
    164       },
    165       "prompts_provided": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "Section IV-D mentions 'structured prompts requesting specific information' for file summarization but does not provide the actual prompt text. Only '_create_summary_prompt' is referenced without showing content."
    169       },
    170       "hyperparameters_reported": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No API hyperparameters (temperature, top-p, max tokens) are reported for the AI summary generation or query processing."
    174       },
    175       "scaffolding_described": {
    176         "applies": false,
    177         "answer": false,
    178         "justification": "Lumen is a developer tool, not an agentic scaffolding system. It assembles context for AI queries but does not use agentic scaffolding itself."
    179       },
    180       "data_preprocessing_documented": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No evaluation data pipeline is documented. The cognitive walkthrough methodology is described at a high level but without detailed preprocessing or analysis steps."
    184       }
    185     },
    186     "limitations_and_scope": {
    187       "limitations_section_present": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section VII-E 'Limitations' provides substantive discussion of six specific limitation areas including dynamic behavior, semantic understanding, cross-repository dependencies, collaborative context, performance at scale, and need for empirical validation."
    191       },
    192       "threats_to_validity_specific": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section VII-E includes specific threats: 'a key limitation is the absence of an empirical user study', static analysis cannot capture runtime dependencies, 500KB file size limit for detection, and performance challenges with 'millions of files'."
    196       },
    197       "scope_boundaries_stated": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The paper explicitly states what it does NOT show: 'To fully assess Lumen's impact on usability, productivity, and trust in AI assistance, we plan to conduct structured studies with professional developers' (Section VII-E). It also notes static analysis limitations."
    201       }
    202     },
    203     "data_integrity": {
    204       "raw_data_available": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No raw evaluation data is provided. The cognitive walkthrough results are presented as narrative analysis without underlying data artifacts."
    208       },
    209       "data_collection_described": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "The cognitive walkthrough methodology is described briefly in Section V-A but the actual analysis process (who performed it, how judgments were made) is not documented."
    213       },
    214       "recruitment_methods_described": {
    215         "applies": false,
    216         "answer": false,
    217         "justification": "No human participants were recruited. The evaluation is an analytical cognitive walkthrough performed by the authors."
    218       },
    219       "data_pipeline_documented": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No data pipeline is documented. The cognitive walkthrough analysis proceeds directly from scenario descriptions to conclusions without documenting intermediate analytical steps."
    223       }
    224     },
    225     "conflicts_of_interest": {
    226       "funding_disclosed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No funding or acknowledgments section is present in the paper."
    230       },
    231       "affiliations_disclosed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Both authors are affiliated with Toronto Metropolitan University Computer Science Department, clearly listed on the first page."
    235       },
    236       "funder_independent_of_outcome": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No funding information is disclosed, so independence cannot be assessed."
    240       },
    241       "financial_interests_declared": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No competing interests or financial interests statement is present in the paper."
    245       }
    246     },
    247     "contamination": {
    248       "training_cutoff_stated": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It presents a developer tool and evaluates it via cognitive walkthrough."
    252       },
    253       "train_test_overlap_discussed": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No benchmark evaluation of model capabilities is performed."
    257       },
    258       "benchmark_contamination_addressed": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No benchmark evaluation of model capabilities is performed."
    262       }
    263     },
    264     "human_studies": {
    265       "pre_registered": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants. The evaluation is an analytical cognitive walkthrough performed by the authors."
    269       },
    270       "irb_or_ethics_approval": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants involved."
    274       },
    275       "demographics_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants involved."
    279       },
    280       "inclusion_exclusion_criteria": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants involved."
    284       },
    285       "randomization_described": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants involved."
    289       },
    290       "blinding_described": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No human participants involved."
    294       },
    295       "attrition_reported": {
    296         "applies": false,
    297         "answer": false,
    298         "justification": "No human participants involved."
    299       }
    300     },
    301     "cost_and_practicality": {
    302       "inference_cost_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Lumen makes API calls for file summarization and query processing but no costs, token consumption, or latency measurements are reported."
    306       },
    307       "compute_budget_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No computational budget is stated despite the tool requiring API calls and local processing."
    311       }
    312     }
    313   },
    314   "red_flags": [
    315     {
    316       "flag": "No empirical user study",
    317       "detail": "The paper's evaluation is entirely analytical (cognitive walkthrough). All quantitative claims (e.g., '3 minutes vs 15 minutes', '2-5 operations vs 15-20') are hypothetical estimates from walkthrough scenarios, not measured with real developers. The authors acknowledge this limitation."
    318     },
    319     {
    320       "flag": "Claims significantly outrun evidence",
    321       "detail": "The paper makes strong claims about reducing cognitive load, preserving flow state, and building trust, but these are supported only by analytical scenarios, not empirical data. The scenario time estimates appear to be hypothetical figures chosen for illustration, not measurements."
    322     },
    323     {
    324       "flag": "Authors evaluate their own tool without independent validation",
    325       "detail": "The cognitive walkthrough was performed by the tool's creators. No independent evaluators or external developers assessed the tool's usability or benefits."
    326     },
    327     {
    328       "flag": "Selective scenario construction",
    329       "detail": "The real-world usage examples in Section VI are constructed to favor Lumen's design. Each scenario contrasts an idealized Lumen workflow against a worst-case traditional workflow, with no consideration of cases where Lumen's approach might be slower or unnecessary."
    330     }
    331   ],
    332   "cited_papers": [
    333     {
    334       "title": "Measuring the impact of early-2025 AI on experienced open-source developer productivity",
    335       "authors": ["METR"],
    336       "year": 2025,
    337       "relevance": "RCT finding that experienced open-source developers took 19% longer to complete tasks when allowed to use AI tools, directly relevant to AI coding tool effectiveness."
    338     },
    339     {
    340       "title": "Evaluating large language models trained on code",
    341       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    342       "year": 2021,
    343       "arxiv_id": "2107.03374",
    344       "relevance": "Foundational Codex/HumanEval paper for AI code generation evaluation."
    345     },
    346     {
    347       "title": "Grounded Copilot: How programmers interact with code-generating models",
    348       "authors": ["Shraddha Barke", "Michael B. James", "Nadia Polikarpova"],
    349       "year": 2023,
    350       "relevance": "Empirical study of developer interaction modes with AI coding assistants, revealing dual control/loss-of-control patterns."
    351     },
    352     {
    353       "title": "A large-scale survey on the usability of AI programming assistants: Successes and challenges",
    354       "authors": ["Jenny T. Liang", "Chenyang Yang", "Brad A. Myers"],
    355       "year": 2024,
    356       "relevance": "Large-scale survey (410 developers) identifying contextual understanding as a barrier to AI assistant adoption."
    357     },
    358     {
    359       "title": "Using AI-based coding assistants in practice: State of affairs, perceptions, and ways forward",
    360       "authors": ["Alexandr Sergeyuk", "Yaroslav Golubev", "Timofey Bryksin", "Iftekhar Ahmed"],
    361       "year": 2024,
    362       "arxiv_id": "2406.07765",
    363       "relevance": "Survey of 481 programmers confirming lack of project-scale context as key AI assistant limitation."
    364     },
    365     {
    366       "title": "Do users write more insecure code with AI assistants?",
    367       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    368       "year": 2023,
    369       "relevance": "Study on security implications of AI-assisted coding, relevant to trust and safety concerns."
    370     },
    371     {
    372       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    373       "authors": ["Priyan Vaithilingam", "Tianshi Zhang", "Elena L. Glassman"],
    374       "year": 2022,
    375       "relevance": "Usability evaluation of LLM code generation tools, relevant to workflow disruption and adoption barriers."
    376     },
    377     {
    378       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming",
    379       "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"],
    380       "year": 2024,
    381       "relevance": "Models of human-AI interaction costs in programming, relevant to understanding developer workflow with AI tools."
    382     },
    383     {
    384       "title": "Investigating and designing for trust in AI-powered code generation tools",
    385       "authors": ["Ruotong Wang", "Ruijia Cheng", "Denae Ford", "Thomas Zimmermann"],
    386       "year": 2024,
    387       "relevance": "Study on trust factors in AI code generation, directly relevant to Lumen's transparency motivation."
    388     },
    389     {
    390       "title": "Examining the use and impact of an AI code assistant on developer productivity and experience in the enterprise",
    391       "authors": ["Justin D. Weisz", "Shraddha Kumar", "Michael Muller"],
    392       "year": 2024,
    393       "arxiv_id": "2412.06603",
    394       "relevance": "Enterprise-scale study of AI coding assistant impact on developer productivity."
    395     }
    396   ]
    397 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs