ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26129B)


      1 {
      2   "paper": {
      3     "title": "CoLadder: Supporting Programmers with Hierarchical Code Generation in Multi-Level Abstraction",
      4     "authors": ["Ryan Yen", "Jiawen Zhu", "Sangho Suh", "Haijun Xia", "Jian Zhao"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2310.08699",
      8     "doi": "10.48550/arXiv.2310.08699"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, Zenodo archive, or other code release link is provided anywhere in the paper. The system is described as 'deployed on Vercel, accessible through a public domain URL' (Section 5.4) but no URL is given for reproducibility."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No dataset, interview transcripts, log data, or survey responses are released. The programming tasks are described in Appendix C but the actual interaction log data and qualitative coding are not made available."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper mentions Next.js, Monaco Editor, Pyodide, Firebase, OpenAI GPT-4 API, and LangChain (Section 5.4), but no requirements.txt, Dockerfile, or version-specific dependency list is provided."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided. The system architecture is described at a high level but not enough to reproduce."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Figure 11 shows 95% confidence intervals (error bars) for the frequency distribution of block types across layers. Standard deviations are also reported for various measures throughout."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Wilcoxon signed-rank tests are used throughout for comparing CoLadder vs. Baseline conditions, with p-values reported (e.g., Q4: p=.008, Q5: p=.007, Q3: p=.007). Effect sizes (r) are also reported alongside p-values (Section 6.5, Section 7)."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Effect sizes (r) are reported alongside every Wilcoxon test result, e.g., 'Q4: p=.008, r=.76', 'Q5: p=.007, r=.77', 'Q3: p=.007, r=.77' (Section 7). Medians for both conditions are also provided for context."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The sample size of 12 participants is not justified with a power analysis or explicit rationale. No acknowledgment that N=12 may be too small for certain statistical claims."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Standard deviations are reported for task completion times (Section 7.1.1: M=11.74, SD=0.50 min for CoLadder; M=10.05, SD=2.79 min for Baseline), participant demographics, and other measures throughout."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "A baseline system is included: a web-based code editor generating code from inline comments, similar to GitHub Copilot, powered by the same GPT-4 model (Section 6.3)."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The baseline mimics GitHub Copilot's inline code suggestion paradigm, which was the contemporary standard for LLM-driven code assistants at the time of writing. Both baseline and CoLadder use GPT-4 (Section 6.3)."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No ablation study is conducted. CoLadder has multiple components (prompt tree, block operations, List Steps, auto-completion, semantic highlight, recommendation) but their individual contributions are not isolated through systematic ablation."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple evaluation metrics are used: UMUX-LITE/SUS scores, NASA-TLX (6 dimensions), 9 self-defined Likert scale items, task correctness rates, completion times, and event log counts (Sections 6.4-7)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The entire evaluation is a human user study with 12 programmers evaluating the system through task performance, Likert scale ratings, think-aloud protocols, and semi-structured interviews (Section 6)."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is a user study, not a benchmark evaluation. The concept of held-out test sets does not apply to this paper type."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by individual questionnaire items (Q1-Q9), individual NASA-TLX dimensions, task correctness categories (complete/correct, complete/incorrect, incomplete), and feature usage counts (Figure 14)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Several failure cases are discussed: 3/12 participants forgot about bi-directional editing (Section 8), 2 participants noted misalignment between prompt tree and code structure, and specific challenges with OOP/complex projects are acknowledged (Section 8)."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Negative results are reported: CoLadder took significantly more time than Baseline (p=.040, Section 7.1.1), confidence in correctness was not significantly different (Q7: p=.58), and most NASA-TLX dimensions were not statistically significant (Section 7.1.5)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims CoLadder is 'effective in helping programmers externalize their problem-solving intentions flexibly, improving their ability to evaluate and modify code across various abstraction levels.' This is supported by significant results on Q4 (p=.008), Q5 (p=.007), Q9 (p=.006), and qualitative findings in Section 7."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The study uses a within-subjects experimental design with counterbalanced task-category combinations via Latin square (Section 6.4). Causal claims like 'CoLadder reduces cognitive switching' are backed by this controlled comparison. The design is adequate for the causal claims made."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper tests only with experienced programmers on Python scripting tasks (ML and data visualization) but the title and abstract make broad claims about 'supporting programmers' without bounding to this specific population and task type. Section 8 acknowledges some limitations but the title generalizes beyond the tested setting."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 9.4 discusses task familiarity as an alternative explanation and reports Spearman correlations showing it was not a dominant factor. Section 9.2 discusses the distinction between controlling the program vs. controlling AI interaction. Section 8 discusses novelty effects and learnability as potential confounds."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper states 'OpenAI GPT-4 API' (Section 5.4) and 'OpenAI's GPT-4' (Section 5.4.1) but does not specify the exact model version or snapshot date (e.g., gpt-4-0613). Reference [64] cites the GPT-4 Technical Report but no API version is stated."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Prompt templates for block operations (Add, Edit, List Steps, Semantic Highlighting) are provided as figures in Appendix A (Figures 15-18). These show the actual template text with placeholders, and the fill values are defined by the system's tree structure context."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No temperature, top-p, max tokens, or other API hyperparameters are reported for the GPT-4 calls. These settings significantly affect output quality but are not mentioned anywhere in the paper."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The agentic scaffolding is described in detail: Section 5.4 covers the prompting techniques (few-shot examples, Chain-of-Thought via LangChain, output parser), Section 5.4.2 covers block operations, error prevention, sequential chaining, and the Myers diff algorithm for code updates."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Data analysis methods are documented: thematic analysis for qualitative data (Section 6.5), inter-coder agreement of 97% refined to 100%, classification of prompt types (procedural, declarative, mixed), and statistical analysis approach (Wilcoxon signed-rank test) are all described."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 8 'Limitations' is a dedicated section spanning approximately two columns discussing multiple specific limitations of the work."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Specific threats are discussed: limited diversity of programming tasks (Section 8), CoLadder tested only with scripting languages (Section 8), prompt tree may not align with code structure in OOP scenarios (Section 8), 3/12 participants forgot bi-directional editing feature (Section 8), and task familiarity correlations (Section 9.4)."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 8 explicitly states that CoLadder was tested 'exclusively with scripting languages', acknowledges it may be less effective for 'certain programming languages, particularly compiled languages', rapid iteration tasks, and OOP scenarios. Section 9.4 bounds to experienced programmers."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw data (interaction logs, survey responses, interview transcripts, coding sheets) is made available for independent verification."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Data collection is described: interviews were audio-recorded and transcribed (Section 4.1), think-aloud data recorded (Section 6.4), screen activity recorded, post-task questionnaires administered, and system events logged (Section 6.4)."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Recruitment methods are described: purposive sampling for both studies, university mailing list for the evaluation study (Section 6.1), screening criteria (Python proficiency >= 4/5, LLM familiarity), and compensation (CAD$20 for formative, CAD$30 for evaluation) are all stated."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The analysis pipeline is documented: transcription of think-aloud and interview data, reflexive thematic analysis with inductive and deductive coding, collaborative coding by two authors with disagreement resolution, inter-coder agreement (97% initial, refined to 100%), and Wilcoxon tests for quantitative data (Sections 4.1, 6.5)."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding source or acknowledgments section mentioning grants or sponsors is found in the paper."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: University of Waterloo and University of California San Diego. The paper evaluates the authors' own system (CoLadder), which is standard in HCI systems research."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding information is disclosed at all, making it impossible to assess funder independence. The absence of funding disclosure is treated as non-disclosure."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "This paper evaluates a tool/system design rather than a pre-trained model's capability on a benchmark. The GPT-4 model is used as infrastructure, not as the subject of evaluation."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "The paper does not evaluate a pre-trained model on a benchmark. It evaluates a user-facing system's impact on programmer workflows."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The paper does not evaluate model performance on benchmarks. The programming tasks are novel tasks adapted from prior work for the user study."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No mention of pre-registration (OSF, AsPredicted, or other registry) is found in the paper."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "Section 6.4 states: 'The study was approved by the university's ethics review board.'"
    244       },
    245       "demographics_reported": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "Demographics are reported for both studies: formative study (5 males, 1 female, ages 25-27, programming experience M=6.67 years, Table 2) and evaluation study (7 males, 5 females, ages 23-36, M=7.88 years programming experience, Python proficiency, LLM usage frequency) in Sections 4.1 and 6.1."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": true,
    252         "answer": true,
    253         "justification": "Inclusion criteria are stated: experienced programmers with Python proficiency >= 4/5, familiarity with LLM-code assistants particularly GPT-4 rated on a 5-point Likert scale, and regular use of LLM-code generation tools (Section 6.1). For task selection, participants rated expertise in task categories with a threshold above 3 (Section 6.2)."
    254       },
    255       "randomization_described": {
    256         "applies": true,
    257         "answer": true,
    258         "justification": "Latin square design is used for counterbalancing task-category combinations across participants, ensuring equal distribution of each condition (baseline or system) across participants (Section 6.4)."
    259       },
    260       "blinding_described": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No blinding is described. Participants knew which system they were using (CoLadder vs. Baseline) as the interfaces are visually distinct. No mention of evaluator blinding for qualitative coding."
    264       },
    265       "attrition_reported": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No attrition or dropout information is reported. The paper mentions 12 participants but does not state whether any dropped out or whether all completed both conditions. A pilot study with 2 participants is mentioned but their exclusion from the final sample is not explicitly discussed."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No API costs, token consumption, or latency per operation is reported, despite the system making multiple GPT-4 API calls for every block operation, auto-completion, and recommendation."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No total computational budget, API spend, or hardware specifications are stated."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "CoLadder significantly helps programmers construct their intentions for solving programming tasks compared to Baseline.",
    287       "evidence": "Within-subjects comparison showing Q4: MdnC=6 vs. MdnB=2.5, p=.008, r=.76 (Section 7.2). Qualitative support from 11/12 participants structuring prompts hierarchically.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "CoLadder provides more controllability in scaffolding intentions to generate desired code compared to Baseline.",
    292       "evidence": "Q5: MdnC=6 vs. MdnB=3.5, p=.007, r=.77; Q3: MdnC=6 vs. MdnB=3.0, p=.007, r=.77. Participants made significantly fewer manual code edits with CoLadder (Mdn=8.0 vs. 53.0, p=.002, r=.9) (Section 7.3).",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "CoLadder significantly reduces cognitive switching between prompt authoring and code evaluation.",
    297       "evidence": "Q1: MdnC=6.0 vs. MdnB=4.5, p=.01, r=.67. Participants compiled code significantly less with CoLadder (Mdn=18.5 vs. 24.0, p=.012, r=.56) (Section 7.4).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "CoLadder significantly enhances prompt-code correspondence for evaluating generated code.",
    302       "evidence": "Q9: MdnC=6.0 vs. MdnB=3.0, p=.006, r=.80. 10/12 participants found the semantic highlighting feature helpful (Section 7.5).",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Participants using CoLadder demonstrated significantly better understanding of their programs.",
    307       "evidence": "Q8: MdnC=6.0 vs. MdnB=5.0, p=.007, r=.77. Recall test showed systematic code recall from higher-level to lower-level (Section 7.4.2).",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "CoLadder has significantly higher usability than Baseline as measured by SUS scores.",
    312       "evidence": "UMUX-LITE derived SUS scores: CoLadder Mdn=90.61 vs. Baseline Mdn=68.94, p=.02 (Section 7.1.4).",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "Participants took significantly more time to complete tasks with CoLadder compared to Baseline.",
    317       "evidence": "M=11.74 min (SD=0.50) for CoLadder vs. M=10.05 min (SD=2.79) for Baseline, p=.040 (Section 7.1.1).",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": ["rct", "qualitative"],
    322   "key_findings": "CoLadder, an interactive system supporting hierarchical code generation through a tree-based prompt editor, significantly improved programmers' ability to form and externalize intentions (p=.008), steer code generation controllably (p=.007), reduce cognitive switching (p=.01), and evaluate prompt-code correspondence (p=.006) compared to a Copilot-like baseline in a within-subjects study with 12 experienced programmers. However, CoLadder took significantly more time (11.74 vs. 10.05 minutes, p=.040), which the authors attribute to increased time spent on planning and intention formation rather than wasted effort. Task correctness was higher with CoLadder (50% correct vs. 25% for Baseline) though no significance test was reported for this metric.",
    323   "red_flags": [
    324     {
    325       "flag": "Small sample size",
    326       "detail": "The evaluation study uses only 12 participants with no power analysis. With N=12 in a within-subjects design, the study has limited statistical power. Many NASA-TLX comparisons were not statistically significant, possibly due to insufficient power rather than absence of effect."
    327     },
    328     {
    329       "flag": "No ablation study",
    330       "detail": "CoLadder has many components (prompt tree, block operations, List Steps, auto-completion, semantic highlighting, recommendations) but no ablation study isolates which components drive the observed benefits. It is unclear whether the full system or just one feature (e.g., the tree structure) accounts for most improvements."
    331     },
    332     {
    333       "flag": "No code or data release",
    334       "detail": "The system source code, interaction logs, and raw survey/interview data are all unreleased, making independent verification impossible."
    335     },
    336     {
    337       "flag": "Self-report measures dominate",
    338       "detail": "Most significant results are from self-report Likert scales rather than objective task performance measures. Task correctness rates (50% vs. 25% correct) are not tested for significance and may not reflect true capability differences given the small sample."
    339     }
    340   ],
    341   "cited_papers": [
    342     {
    343       "title": "Grounded copilot: How programmers interact with code-generating models",
    344       "authors": ["Shraddha Barke", "Michael B James", "Nadia Polikarpova"],
    345       "year": 2023,
    346       "relevance": "Empirical study of programmer interaction patterns with code-generating models, directly relevant to understanding LLM-assisted programming workflows."
    347     },
    348     {
    349       "title": "Evaluating large language models trained on code",
    350       "authors": ["Mark Chen", "Jerry Tworek"],
    351       "year": 2021,
    352       "arxiv_id": "2107.03374",
    353       "relevance": "Foundational benchmark evaluation of Codex/LLM code generation capabilities."
    354     },
    355     {
    356       "title": "\"What It Wants Me To Say\": Bridging the Abstraction Gap Between End-User Programmers and Code-Generating Large Language Models",
    357       "authors": ["Michael Xieyang Liu", "Advait Sarkar", "Carina Negreanu"],
    358       "year": 2023,
    359       "relevance": "Introduces grounded abstraction matching to address the abstraction gap in LLM-driven code generation, a core challenge CoLadder builds upon."
    360     },
    361     {
    362       "title": "AI Chains: Transparent and Controllable Human-AI Interaction by Chaining Large Language Model Prompts",
    363       "authors": ["Tongshuang Wu", "Michael Terry", "Carrie Jun Cai"],
    364       "year": 2022,
    365       "relevance": "Proposes chaining LLM prompts for task decomposition, a foundational technique for CoLadder's hierarchical approach."
    366     },
    367     {
    368       "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges",
    369       "authors": ["Jenny T Liang", "Chenyang Yang", "Brad A Myers"],
    370       "year": 2023,
    371       "relevance": "Large-scale survey on usability challenges of AI programming assistants, providing context for the problems CoLadder addresses."
    372     },
    373     {
    374       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    375       "authors": ["Priyan Vaithilingam", "Tianyi Zhang", "Elena L Glassman"],
    376       "year": 2022,
    377       "relevance": "User study evaluating usability gaps in LLM-powered code generation tools, directly informing CoLadder's design goals."
    378     },
    379     {
    380       "title": "What is it like to program with artificial intelligence?",
    381       "authors": ["Advait Sarkar", "Andrew D Gordon", "Carina Negreanu"],
    382       "year": 2022,
    383       "arxiv_id": "2208.06213",
    384       "relevance": "Qualitative study on the experience of programming with AI, identifying abstraction matching as a key challenge."
    385     },
    386     {
    387       "title": "In-ide code generation from natural language: Promise and challenges",
    388       "authors": ["Frank F Xu", "Bogdan Vasilescu", "Graham Neubig"],
    389       "year": 2022,
    390       "relevance": "Empirical evaluation of in-IDE code generation from natural language, providing the programming task categories used in CoLadder's evaluation."
    391     },
    392     {
    393       "title": "Reading between the lines: Modeling user behaviour and costs in AI-assisted programming",
    394       "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"],
    395       "year": 2022,
    396       "arxiv_id": "2210.14306",
    397       "relevance": "Models programmer behavior and cognitive costs when using AI code assistants."
    398     },
    399     {
    400       "title": "Productivity assessment of neural code completion",
    401       "authors": ["Albert Ziegler", "Eirini Kalliamvakou"],
    402       "year": 2022,
    403       "relevance": "Assesses productivity impact of neural code completion, relevant to evaluating practical benefits of AI coding tools."
    404     },
    405     {
    406       "title": "Programming without a Programming Language: Challenges and Opportunities for Designing Developer Tools for Prompt Programming",
    407       "authors": ["Alexander J Fiannaca", "Chinmay Kulkarni", "Carrie J Cai", "Michael Terry"],
    408       "year": 2023,
    409       "relevance": "Identifies design challenges for prompt-based programming tools, directly relevant to CoLadder's design space."
    410     },
    411     {
    412       "title": "Low-code LLM: Visual Programming over LLMs",
    413       "authors": ["Yuzhe Cai", "Shaoguang Mao"],
    414       "year": 2023,
    415       "arxiv_id": "2304.08103",
    416       "relevance": "Proposes visual programming approaches for LLM interaction, an alternative approach to hierarchical code generation."
    417     }
    418   ]
    419 }

Impressum · Datenschutz