scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26704B)
      1 {
      2   "paper": {
      3     "title": "A Multi-agent Onboarding Assistant based on Large Language Models, Retrieval Augmented Generation, and Chain-of-Thought",
      4     "authors": [
      5       "Andrei-Cristian Ionescu",
      6       "Sergey Titov",
      7       "Maliheh Izadi"
      8     ],
      9     "year": 2025,
     10     "venue": "FSE'25 (SIGSOFT FSE Companion)",
     11     "arxiv_id": "2503.23421",
     12     "doi": "10.1145/3696630.3728611"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'Our tool, source code [7], and demonstration video [6] are publicly available.' Reference [7] points to https://onboarding.software/register?colab=true."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The study was conducted on a closed-source project (Section 5: 'We chose a closed-source project to avoid data contamination'). None of the study telemetry data, the questionnaire responses, or the evaluated codebase are released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, conda environment, or environment setup details are provided in the paper. Only the embedding model (OpenAI text-embedding-3-large) and FAISS are named without version numbers."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided for either the system setup or the user study. The paper describes the architecture but not how to replicate the evaluation."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Only means and standard deviations are reported (M=3.26, SD=0.86 for helpfulness; M=3.0, SD=0.96 for ease). No confidence intervals or error bars are provided, even though they would be especially important given n=8."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are performed. The paper reports only descriptive statistics (mean and SD) without any inferential tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. Only raw means and standard deviations are provided with no baseline for comparison."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The sample size of n=8 is not justified. Section 5 acknowledges 'Our pilot study involved eight participants on one codebase due to the limitations of the demonstration project' but provides no power analysis or formal justification."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviations are reported for both metrics: helpfulness (SD=0.86) and ease of onboarding (SD=0.96). These describe the spread of ratings across the eight participants and three tasks."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No formal baseline comparison is included. The paper mentions GitHub Copilot and ChatGPT informally but does not compare against them. Section 6 notes participants 'who used mainstream AI coding tools in their daily workflow noted... Onboarding Buddy gave more context-specific advice' but this is anecdotal, not a formal comparison."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No baselines are included at all. The paper acknowledges the need for 'rigorous A/B testing' in future work (Section 6) but does not perform any baseline comparison."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The system has multiple components (Contextualization Agent, Onboarding Agent, Step Processor, Message Enhancer, RAG, CoT, planning scratchpad) but no ablation study is performed to measure individual component contributions."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Three evaluation dimensions are reported: perceived helpfulness (M=3.26), ease of onboarding (M=3.0), and task completion rate (7/8 participants completed all tasks)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The evaluation is a human user study with 8 participants performing onboarding tasks and providing ratings on helpfulness and ease of onboarding (Section 5-6)."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is a user study evaluating a tool, not a machine learning evaluation with train/test splits. The concept of a held-out test set is structurally inapplicable."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Three distinct tasks are used (Setup Task, New Payment Option Task, Questionnaire Duplication Task) but results are only reported as aggregate means across all tasks and participants. No per-task breakdown of ratings is provided."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "Section 6 briefly mentions 'a number of users experienced minor hiccups' on setup and questionnaire duplication tasks, and one participant had coding errors. But there is no systematic error analysis, no qualitative examples of failures, and no discussion of where the approach breaks down."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 6 reports user feedback requesting improvements: more context awareness, better UI, faster response times. Section 7 acknowledges 'identified shortcomings' and the need to enhance 'technical reliability.' These constitute negative findings from the evaluation."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The reported statistics (M=3.26, SD=0.86; M=3.0, SD=0.96) match the results section. However, the abstract claims 'Onboarding Buddy holds great potential for enhancing developer productivity and satisfaction' which is not supported by an n=8 pilot study with no baseline comparison. The claim that it 'uniquely integrates' CoT with RAG 'tailored specifically for dynamic onboarding contexts' is a design claim without comparative evidence."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper implies the system causes improved onboarding (Section 1: 'improve onboarding', Section 7: 'easy-to-use and useful tool that helps users'). However, there is no control group, no randomization, and no baseline — the study design cannot support causal claims about the system's effect on onboarding outcomes."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "Section 5 acknowledges 'eight participants on one codebase... This may affect generalizability.' However, the title ('A Multi-agent Onboarding Assistant') and conclusion ('the Onboarding Buddy is an easy-to-use and useful tool that helps users in performing development tasks') make unbounded claims. The abstract states 'great potential for enhancing developer productivity and satisfaction' without qualification."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations are discussed. Novelty effect, Hawthorne effect (being observed), selection bias in participants, or the possibility that any IDE-integrated assistant would achieve similar ratings are not considered."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper measures 'perceived helpfulness' and 'ease of onboarding' on self-report Likert scales but frames results as evidence of 'enhancing developer productivity and satisfaction' (abstract). The gap between subjective ratings (proxy) and actual productivity improvement (claimed outcome) is never acknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper refers to 'large language models' throughout but never specifies which LLM powers the agents. Only the embedding model is named (OpenAI text-embedding-3-large, Section 4.2), and even that lacks a version date. The core LLM is completely unspecified."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No actual prompt text is provided anywhere in the paper. The agents' behaviors are described in natural language (e.g., 'Allows the user interaction to be conversational') but the actual system prompts, instructions, or templates used are not shown."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. Only RAG parameters are mentioned: chunk size (2000 characters), overlap (200 characters), similarity threshold (0.1), top-k (5 documents)."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The multi-agent architecture is described in detail across Sections 3-4 with Figures 1-2. Four agents are specified (Contextualization, Onboarding, Step Processor, Message Enhancer), along with the planning scratchpad mechanism, retrieval tools, memory storage, and the CoT generation workflow."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The RAG chunking pipeline is described (2000 char chunks, 200 overlap, FAISS indexing) but data preprocessing for the evaluation is not documented. How telemetry was collected, how questionnaire responses were aggregated from per-task to overall means, and what data cleaning was applied are not described."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations or threats-to-validity section. A single sentence in Section 5 notes 'This may affect generalizability' and the conclusion mentions 'identified shortcomings' without substantive discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The only specific threat mentioned is the small sample size and single codebase (Section 5: 'eight participants on one codebase'). No discussion of construct validity, internal validity threats (no control group, novelty effect), or external validity beyond the generic generalizability mention."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. While acknowledging the small sample, it does not state specific things it did not test or claim. The conclusion makes broad unbounded claims ('easy-to-use and useful tool that helps users in performing development tasks')."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "None of the raw questionnaire responses, telemetry data, or per-participant ratings are made available. Only aggregate statistics (means and SDs) are reported."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5 describes the data collection: three onboarding tasks, project telemetry to track progress, post-task questionnaire with helpfulness and ease ratings on a 0-4 scale, and open-ended feedback. The tasks themselves are described in detail."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "The paper states 'We gathered a sample of eight programmers' (Section 5) but provides no information about how they were recruited, from what population, or whether they were students, colleagues, or external participants."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No description of how raw data was processed into the reported statistics. How telemetry was analyzed, how per-task ratings were aggregated to overall means, and whether any responses were excluded are not documented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 8 acknowledges: 'This work was conducted as part of the AI for Software Engineering (AI4SE) collaboration between JetBrains and Delft University of Technology. The authors gratefully acknowledge the financial support provided by JetBrains.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: Sergey Titov is from JetBrains Research. The tool is an IntelliJ Plugin (JetBrains product ecosystem). The JetBrains affiliation is transparent."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "JetBrains funds the research and has a direct commercial interest in demonstrating that AI-powered tools enhance their IDE ecosystem. The Onboarding Buddy is an IntelliJ plugin, making JetBrains a non-independent funder with a stake in positive outcomes."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided. While the JetBrains funding and affiliation are disclosed, there is no formal declaration of financial interests, patents, or equity."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper evaluates a tool through a user study, not a pre-trained model's capability on a benchmark. The contamination category is structurally inapplicable."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No benchmark evaluation of model knowledge is performed. The study evaluates a tool through human participants on a closed-source project."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No benchmark evaluation is performed. The paper evaluates a tool via user study, not model performance on a benchmark dataset."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No mention of pre-registration (OSF, AsPredicted, or any registry), so there is no evidence that the study design was committed to before data collection."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No mention of IRB or ethics board approval despite conducting a study with human participants performing tasks and providing personal evaluations."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "Participants are described only as 'eight programmers' (Section 5). No demographics are reported — no experience level, programming background, years of experience, or familiarity with LLM tools."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No inclusion or exclusion criteria are stated. The paper says 'We gathered a sample of eight programmers' with no description of who was eligible or how they were selected."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "This is a single-condition study where all participants use the same tool. There are no experimental conditions requiring randomized assignment."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "Single-condition study with no comparison group. Blinding is not feasible when all participants use the same tool and there is nothing to blind against."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Section 6 accounts for all 8 participants: 'Seven out of eight participants successfully completed all tasks, with only one participant encountering minor coding errors on a single task.' No dropout is reported."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, API costs, token consumption, or response latency is reported. Users mentioned wanting 'faster response times' (Section 6) but no actual latency measurements are provided."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No computational budget, API spend, or hardware requirements are stated for running the system."
    290       }
    291     }
    292   },
    293   "scan_version": 3,
    294   "active_modules": [],
    295   "claims": [
    296     {
    297       "claim": "Onboarding Buddy achieves a mean perceived helpfulness rating of M=3.26 (SD=0.86) out of 4 across all participants and tasks.",
    298       "evidence": "Section 6 reports aggregated questionnaire results from 8 participants across 3 tasks on a 0-4 scale.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "Ease of onboarding averaged M=3.0 (SD=0.96) out of 4.",
    303       "evidence": "Section 6 reports aggregated questionnaire results from 8 participants across 3 tasks on a 0-4 scale.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Seven out of eight participants successfully completed all tasks with close to 100% accuracy.",
    308       "evidence": "Section 6: 'Seven out of eight participants successfully completed all tasks, with only one participant encountering minor coding errors on a single task.' Verified via collected telemetry.",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "Onboarding Buddy provides more context-specific advice than mainstream AI coding tools like GitHub Copilot.",
    313       "evidence": "Section 6: 'participants who used mainstream AI coding tools in their daily workflow noted in their feedback that Onboarding Buddy gave more context-specific advice, particularly on setup tasks.' This is anecdotal participant feedback, not a controlled comparison.",
    314       "supported": "weak"
    315     },
    316     {
    317       "claim": "Onboarding Buddy holds great potential for enhancing developer productivity and satisfaction.",
    318       "evidence": "Abstract and Section 7. Based solely on the n=8 pilot study results with no baseline comparison and no actual productivity measurement.",
    319       "supported": "weak"
    320     }
    321   ],
    322   "methodology_tags": [
    323     "case-study",
    324     "qualitative"
    325   ],
    326   "key_findings": "Onboarding Buddy, a multi-agent LLM system with RAG and chain-of-thought reasoning, was evaluated in a pilot study with 8 programmers performing 3 onboarding tasks on a closed-source project. Participants rated perceived helpfulness at M=3.26/4 and ease of onboarding at M=3.0/4, with 7/8 completing all tasks. Users requested improvements including more specific code snippets, better UI, and faster response times. The evaluation lacks baseline comparisons and formal statistical analysis.",
    327   "red_flags": [
    328     {
    329       "flag": "Tiny sample size with broad claims",
    330       "detail": "Only 8 participants evaluated the system, yet the paper claims 'great potential for enhancing developer productivity and satisfaction.' This sample is far too small for the generalizations made, and no power analysis or sample size justification is provided."
    331     },
    332     {
    333       "flag": "No baseline comparison",
    334       "detail": "The system is evaluated in isolation with no control condition (e.g., onboarding without the tool, or using GitHub Copilot/ChatGPT). Without a baseline, it is impossible to attribute the outcomes to the tool rather than to the tasks being straightforward, the documentation being adequate, or other factors."
    335     },
    336     {
    337       "flag": "Non-independent funder evaluating own ecosystem",
    338       "detail": "JetBrains funds the research, employs a co-author (Sergey Titov), and the tool is an IntelliJ plugin — directly benefiting the JetBrains product ecosystem. This conflict of interest is not explicitly acknowledged beyond the acknowledgments section."
    339     },
    340     {
    341       "flag": "Core LLM unspecified",
    342       "detail": "The paper never discloses which large language model powers the agents. Without knowing the model, the results cannot be interpreted in the context of model capabilities, nor can the system be reproduced."
    343     },
    344     {
    345       "flag": "Recruitment bias unknown",
    346       "detail": "How the 8 participants were recruited is not described. If they were colleagues, students of the authors, or JetBrains employees, this would introduce significant selection bias. No demographics or selection criteria are reported."
    347     },
    348     {
    349       "flag": "No IRB/ethics approval for human subjects",
    350       "detail": "A user study with 8 participants collecting performance data and subjective ratings was conducted without any mention of ethics review or IRB approval."
    351     }
    352   ],
    353   "cited_papers": [
    354     {
    355       "title": "Extending source code pre-trained language models to summarise decompiled binaries",
    356       "authors": ["Ali Al-Kaswan", "Toufique Ahmed", "Maliheh Izadi", "Anand Ashok Sawant", "Premkumar Devanbu", "Arie van Deursen"],
    357       "year": 2023,
    358       "relevance": "Evaluates pre-trained language models for code understanding tasks (binary summarization), relevant to LLM capabilities for code."
    359     },
    360     {
    361       "title": "Leveraging large language models for enhancing the understandability of generated unit tests",
    362       "authors": ["Amirhossein Deljouyi", "Roham Koohestani", "Maliheh Izadi", "Andy Zaidman"],
    363       "year": 2024,
    364       "arxiv_id": "2408.11710",
    365       "relevance": "Uses LLMs to improve code test understandability, directly relevant to LLM-assisted software engineering productivity."
    366     },
    367     {
    368       "title": "Language models for code completion: A practical evaluation",
    369       "authors": ["Maliheh Izadi", "Jonathan Katzy", "Tim Van Dam", "Marc Otten", "Razvan Mihai Popescu", "Arie Van Deursen"],
    370       "year": 2024,
    371       "relevance": "Practical evaluation of LLMs for code completion, relevant to benchmarking AI coding assistants."
    372     },
    373     {
    374       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    375       "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin"],
    376       "year": 2021,
    377       "arxiv_id": "2005.11401",
    378       "relevance": "Foundational RAG paper used as the basis for the retrieval-augmented approach in this and many other LLM-based tools."
    379     },
    380     {
    381       "title": "Experiences from Using Code Explanations Generated by Large Language Models in a Web Software Development E-Book",
    382       "authors": ["Stephen MacNeil", "Andrew Tran", "Arto Hellas", "Joanne Kim", "Sami Sarsa", "Paul Denny"],
    383       "year": 2023,
    384       "relevance": "Studies LLM-generated code explanations for learning, directly relevant to AI-assisted developer onboarding and education."
    385     },
    386     {
    387       "title": "The Design Space of in-IDE Human-AI Experience",
    388       "authors": ["Agnia Sergeyuk", "Ekaterina Koshchenko", "Ilya Zakharov", "Timofey Bryksin", "Maliheh Izadi"],
    389       "year": 2024,
    390       "arxiv_id": "2410.08676",
    391       "relevance": "Explores the design space for human-AI interaction within IDEs, relevant to how AI coding assistants should be integrated."
    392     },
    393     {
    394       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    395       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Brian Ichter", "Fei Xia", "Ed Chi", "Quoc Le", "Denny Zhou"],
    396       "year": 2023,
    397       "arxiv_id": "2201.11903",
    398       "relevance": "Foundational chain-of-thought prompting paper that inspired the CoT approach used in the onboarding agent architecture."
    399     },
    400     {
    401       "title": "Automatic Chain of Thought Prompting in Large Language Models",
    402       "authors": ["Zhuosheng Zhang", "Aston Zhang", "Mu Li", "Alex Smola"],
    403       "year": 2022,
    404       "arxiv_id": "2210.03493",
    405       "relevance": "Automated CoT generation technique that influenced the automated chain-of-thought approach in the Onboarding Buddy system."
    406     }
    407   ],
    408   "engagement_factors": {
    409     "practical_relevance": {
    410       "score": 2,
    411       "justification": "An IDE plugin for developer onboarding has clear practical value, though it's tied to IntelliJ and requires specific setup with a project's codebase."
    412     },
    413     "surprise_contrarian": {
    414       "score": 0,
    415       "justification": "The finding that an LLM-RAG system can help with onboarding tasks confirms expectations rather than challenging them."
    416     },
    417     "fear_safety": {
    418       "score": 0,
    419       "justification": "No safety, security, or risk concerns are raised by this work."
    420     },
    421     "drama_conflict": {
    422       "score": 0,
    423       "justification": "No controversy or conflict; a straightforward tool paper with positive preliminary results."
    424     },
    425     "demo_ability": {
    426       "score": 2,
    427       "justification": "Source code and a demonstration video are publicly available, and there is a web registration portal, though setting up the full system requires effort."
    428     },
    429     "brand_recognition": {
    430       "score": 1,
    431       "justification": "JetBrains is well-known in developer tooling but not a major AI research brand; the paper comes from a university-industry collaboration."
    432     }
    433   }
    434 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs