scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22244B)
      1 {
      2   "paper": {
      3     "title": "Agentic AI Modernization: Transforming Institutional Infrastructure Through Orchestrated Multi-Agent LLM Framework",
      4     "authors": ["Mahesh Kumar Damarched"],
      5     "year": 2026,
      6     "venue": "Journal of Computer Science and Technology Studies",
      7     "doi": "10.32996/jcsts.2026.8.4.1"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, code archive, or any mention of code release found in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The legacy code corpus was collected under institutional data governance agreements. No dataset is released or made available."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Section 4.5.3 lists minimum hardware specs (GPU, RAM, CPU) but no software environment specification (requirements.txt, library versions, Dockerfile) is provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results are reported as point estimates (e.g., 87% behavioral equivalence, 65% timeline reduction) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims improvements over traditional approaches (e.g., 65% timeline reduction) but provides no statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports improvements with baseline context, e.g., 'manual intervention required 35% vs. 65-75% traditional' (Table 9), 'documentation accuracy 78% vs. 40-50% manual approaches' (Table 9), and cost reduction from $1,105,000 to $468,000 (Table 12)."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The study uses 3 institutional partners with no justification for why 3 is sufficient or any power analysis. The paper acknowledges in future work that '20-30 diverse institutions would enable statistical generalization.'"
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported across any results. Per-institution results are shown (Table 9), but no aggregate variance is reported."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against 'traditional approaches' with specific numbers, e.g., Table 11 compares agentic vs. traditional timelines activity-by-activity, and Table 12 compares costs."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The 'traditional approach' baseline is described generically (manual SME-driven modernization) with no reference to specific contemporary AI-assisted modernization tools or recent competing approaches."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The system has 7 specialized agents but no ablation study examines the contribution of individual agents or components."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are reported: behavioral equivalence (87%), manual intervention (35%), timeline reduction (65%), documentation accuracy (78%), cost reduction (58%), and per-agent success rates (Table 13)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The paper mentions stakeholder approval (38 percentage points higher with governance alignment) but provides no details on how human evaluation was conducted, who evaluated, or the methodology."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.6.1 states 'Evaluated fine-tuned models on held-out test sets (20% of training data) to ensure improvement and prevent catastrophic forgetting.'"
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 9 provides per-institution breakdown across all metrics. Table 13 provides per-agent-task breakdown of success rates. Table 10 provides per-regulation compliance breakdown."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 4.8.1 reports '11% required further refinement, and 2% exposed ambiguities in the original legacy code that necessitated stakeholder clarification.' Section 5.1.2 identifies specific compliance gaps."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Every metric shows improvement. The infrastructure cost is higher ($60,000 upfront) but framed positively. No experiments that failed or approaches that were tried and abandoned are reported."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 87% modernization rate, 65% decrease in manual intervention, and 78% documentation accuracy improvement, all of which appear in Table 9. Note: the abstract says '65% decrease in manual intervention' but Table 9 shows manual intervention at 35% vs. 65-75% traditional, which is a different framing."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims ('agentic approach reduces timelines by 65%') but the comparison is against estimated traditional approach hours (e.g., '12 SMEs x 40 hours'), not actual observed traditional modernization. There is no controlled comparison."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims 'Transforming Institutional Infrastructure' broadly, and the conclusion states the framework is a 'roadmap for transformation,' but results come from only 3 anonymized institutions. The paper does acknowledge needing 20-30 institutions for generalization in future work, but the title and conclusion do not bound claims to the tested setting."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No alternative explanations are discussed for the observed improvements. For example, the timeline reduction could be partly due to the specific characteristics of the codebases chosen, or the traditional approach estimates could be inflated."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper specifies 'Llama 3 70B' as the primary model (Section 4.4.2) but no specific version, snapshot, or checkpoint is given. Other models evaluated (GPT-4 Turbo, Claude 3 Opus, Mixtral 8x22B, Deepseek-Coder-33B) also lack version specifics."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No actual prompts used for any of the 7 agents are provided. The paper describes agent roles in natural language but never shows the actual prompt text."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 6 reports temperature (0.3), top-p (0.9), frequency penalty (0.5), max tokens (8192), repetition penalty (1.2), and top-k (50). Table 7 provides agent-specific parameters."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The multi-agent architecture is described in detail in Section 3.1, with 7 specialized agents across 4 tiers, their roles, interactions, and workflow patterns (sequential, parallel). The governance alignment pattern is described in Section 3.2."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.2 describes code preprocessing steps: comment removal, normalization, dead code removal, and language detection, with outcome (2.3M to 1.8M lines, 18.1% reduction). Section 4.2.2 describes segmentation."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section. Section 6.4 is 'Future Research Prospects' which mentions needing more institutions but does not constitute a limitations discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed. The future work section mentions expanding to 20-30 institutions but does not frame this as a limitation of the current study."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statements about what the results do NOT show or what populations/settings are excluded. The paper presents results from 3 institutions without bounding the applicability of the conclusions."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data is available. The legacy code is under institutional governance agreements. No supplementary data files or processed datasets are released."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 4.1.1 describes the data collection from 3 institutional partners and a commercial repository, with line counts, languages, date ranges, and a disclaimer about FERPA/GDPR compliance and PII stripping."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The institutional partners are described by type but not how they were recruited or selected."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Sections 4.2 and 4.7 document the full pipeline from collection through preprocessing, parsing, semantic analysis, agent processing, and output generation, including processing times (Table 8) and volume at each stage."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The author is identified as 'Enterprise Programmer Analyst, University of Louisville, USA.'"
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper uses Llama 3 70B fine-tuned on institutional code but does not state the base model's training data cutoff date."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The held-out test set (20% of training data) is mentioned, but there is no discussion of whether the base Llama 3 model may have seen similar code patterns or institutional data in pre-training."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate on any public benchmark. The evaluation is on proprietary institutional code, so benchmark contamination is not applicable."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 12 reports total 12-month costs for the agentic approach ($468,000) broken down by labor, infrastructure, tools, and training. Section 4.6.1 mentions fine-tuning cost ('a few hundred USD'). Table 8 reports processing times."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 4.6.1 states fine-tuning took '40 GPU-hours.' Section 4.5.3 specifies hardware requirements (NVIDIA A100/H100). Table 8 gives total pipeline time of 420 hours (17.5 days)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The framework achieves 87% behavioral equivalence in modernized code across three institutional partners.",
    286       "evidence": "Table 9 shows per-institution behavioral equivalence: 91% (Institution 1), 84% (Institution 2), 86% (Institution 3), aggregate 87%.",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "The agentic approach reduces modernization timelines by 65% compared to traditional approaches.",
    291       "evidence": "Table 11 compares activity-by-activity timelines, showing total reduction from 3,600 hours to 552 hours (85% reduction). Table 9 shows 65% aggregate timeline reduction.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "Documentation accuracy reaches 78%, compared to 40-50% for manual approaches.",
    296       "evidence": "Table 9 shows documentation accuracy of 82%, 76%, and 77% across three institutions (aggregate 78%). The 40-50% baseline for manual approaches is stated without citation or evidence.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The framework reduces total 12-month modernization cost by 58% ($1,105,000 to $468,000).",
    301       "evidence": "Table 12 provides itemized cost comparison. However, the traditional approach costs appear to be estimates rather than actual observed costs.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "Governance-aligned deployment achieves 38 percentage points higher stakeholder approval.",
    306       "evidence": "Mentioned only in the conclusion with no supporting data, methodology, or section reference for how stakeholder approval was measured.",
    307       "supported": "unsupported"
    308     },
    309     {
    310       "claim": "Fine-tuned Llama 3 model demonstrates 78-82% improvement on institution-specific tasks relative to base model.",
    311       "evidence": "Stated in Section 4.6.1 outcome but no details on what tasks, metrics, or evaluation methodology produced this figure.",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["case-study"],
    316   "key_findings": "The paper proposes a seven-agent, four-tier agentic LLM architecture for modernizing legacy systems (COBOL, MUMPS, PeopleSoft) in higher education, with on-premises deployment for data sovereignty. Pilot implementations across 3 anonymized institutional partners report 87% behavioral equivalence in modernized code, a 65% timeline reduction relative to traditional approaches, and 78% documentation accuracy (vs. 40-50% for manual approaches). However, the 'traditional approach' baselines appear to be estimated rather than observed, no statistical tests are applied, and N=3 institutions limits generalizability.",
    317   "red_flags": [
    318     {
    319       "flag": "Estimated baselines rather than observed comparisons",
    320       "detail": "The traditional approach timelines (Table 11) are calculated estimates (e.g., '12 SMEs x 40 hours = 480 hours') rather than actual observed modernization efforts. The cost comparison (Table 12) also appears estimated. This makes the claimed improvements unverifiable."
    321     },
    322     {
    323       "flag": "Unsupported claim in conclusion",
    324       "detail": "The conclusion states '38 percentage points higher stakeholder approval when governance-alignment strategies are employed' but this figure appears nowhere else in the paper and has no supporting methodology or data."
    325     },
    326     {
    327       "flag": "No limitations section",
    328       "detail": "A paper reporting pilot results from 3 institutions with no limitations section, no threats to validity, and no discussion of alternative explanations is a significant methodological gap."
    329     },
    330     {
    331       "flag": "Single-author self-citation pattern",
    332       "detail": "Reference [93] is a self-citation (Damarched, 2026b) that is cited 3 times in the discussion section for claims about economic impact, governance implications, and compliance enhancement."
    333     },
    334     {
    335       "flag": "Suspiciously clean results",
    336       "detail": "Every metric shows substantial improvement (65-90% reductions). No negative results, no failed experiments, no approaches that were tried and abandoned. All compliance frameworks show >93% pass rates."
    337     },
    338     {
    339       "flag": "Tiny sample size for claims made",
    340       "detail": "N=3 institutions is insufficient for the broad claims made. The paper acknowledges in future work that 20-30 institutions would be needed for 'statistical generalization' but does not frame this as a limitation of current claims."
    341     },
    342     {
    343       "flag": "Anonymous institutions prevent verification",
    344       "detail": "All three institutional partners are anonymized ('Large Research University', 'Regional Comprehensive University', 'Community College Consortium'), making independent verification of claims impossible."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    350       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang", "Yiran Wu", "Beibin Li"],
    351       "year": 2023,
    352       "arxiv_id": "2308.08155",
    353       "relevance": "Foundational multi-agent LLM framework used as the primary framework selection in this paper."
    354     },
    355     {
    356       "title": "QualityFlow: an agentic workflow for program synthesis controlled by LLM quality checks",
    357       "authors": ["Ye Hu", "Qiaochu Zhou", "Qinshi Chen"],
    358       "year": 2025,
    359       "arxiv_id": "2501.17167",
    360       "relevance": "Agentic workflow for code synthesis with quality validation, relevant to LLM-based code generation evaluation."
    361     },
    362     {
    363       "title": "AgentMesh: a cooperative Multi-Agent generative AI framework for software development automation",
    364       "authors": ["Saman Khanzadeh"],
    365       "year": 2025,
    366       "arxiv_id": "2507.19902",
    367       "relevance": "Multi-agent framework for software development automation, directly comparable architecture."
    368     },
    369     {
    370       "title": "VAPU: System for Autonomous Legacy Code Modernization",
    371       "authors": ["Ville Ala-Salmi", "Zeeshan Rasheed"],
    372       "year": 2025,
    373       "arxiv_id": "2510.18509",
    374       "relevance": "Autonomous legacy code modernization system using LLMs, directly related research."
    375     },
    376     {
    377       "title": "Code Reborn AI-Driven Legacy Systems Modernization from COBOL to Java",
    378       "authors": ["Gopala Bandarupalli"],
    379       "year": 2025,
    380       "arxiv_id": "2504.11335",
    381       "relevance": "AI-driven COBOL to Java modernization, directly relevant to legacy code transformation evaluation."
    382     },
    383     {
    384       "title": "Evaluating large language models trained on code",
    385       "authors": ["Mark Chen", "Jerry Tworek"],
    386       "year": 2021,
    387       "arxiv_id": "2107.03374",
    388       "relevance": "Foundational work on LLM code generation evaluation (Codex/HumanEval)."
    389     },
    390     {
    391       "title": "A Survey on Code Generation with LLM-based Agents",
    392       "authors": ["Yetao Dong", "Xiangyu Jiang"],
    393       "year": 2025,
    394       "arxiv_id": "2508.00083",
    395       "relevance": "Survey of LLM-based agent code generation approaches."
    396     },
    397     {
    398       "title": "Exploration of LLM Multi-Agent Application implementation based on LangGraph+CrewAI",
    399       "authors": ["Zhihua Duan", "Jialin Wang"],
    400       "year": 2024,
    401       "arxiv_id": "2411.18241",
    402       "relevance": "Multi-agent framework implementation combining LangGraph and CrewAI, relevant to framework comparison."
    403     },
    404     {
    405       "title": "A middle path for On-Premises LLM deployment: preserving privacy without sacrificing model confidentiality",
    406       "authors": ["Hao Huang", "Yifan Li"],
    407       "year": 2024,
    408       "arxiv_id": "2410.11182",
    409       "relevance": "On-premises LLM deployment with privacy preservation, relevant to data sovereignty research."
    410     }
    411   ]
    412 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs