ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (18809B)


      1 {
      2   "paper": {
      3     "title": "Transforming Software Development: Evaluating the Efficiency and Challenges of GitHub Copilot in Real-World Projects",
      4     "authors": ["Ruchika Pandey", "Prabhat Singh", "Raymond Wei", "Shaila Shankar"],
      5     "year": 2024,
      6     "venue": "Cisco Systems Inc"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": false,
     13         "justification": "No repository URL or code archive is mentioned anywhere in the paper."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The developer logs and efficiency data collected are not released. The data comes from proprietary Cisco projects."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No environment specifications, dependency lists, or version details are provided beyond mentioning programming languages used."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No reproduction instructions are provided. The study was conducted on proprietary Cisco codebases with no replication guidance."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Results are reported as point estimates (e.g., '50% time saved', '30-40%') with no confidence intervals or error bars."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper claims differences across languages and task types but provides no statistical significance tests. Comparisons are made by eyeballing bar charts."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Only raw percentage time savings are reported. No standardized effect sizes (Cohen's d, odds ratios) are provided."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The study uses 26 engineers with no justification for this sample size and no power analysis."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Only averages are reported. No standard deviations, ranges, or variance measures across participants or tasks."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The study compares Copilot-assisted work against a baseline of 'work on similar tasks but without Copilot' (Section 3)."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No comparison against other AI coding assistants (e.g., Amazon CodeWhisperer, Cursor). The only baseline is no-AI coding."
     68       },
     69       "ablation_study": {
     70         "applies": false,
     71         "answer": false,
     72         "justification": "The paper evaluates a commercial tool (GitHub Copilot) as a black box; there are no components to ablate."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper reports both 'Average Time Saved' and 'Average Acceptance Rate' as metrics (Fig. 1, Fig. 3)."
     78       },
     79       "human_evaluation": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The entire study is based on developer self-reports of efficiency changes and qualitative assessments of code quality from 26 engineers."
     83       },
     84       "held_out_test_set": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "This is not a benchmark evaluation; it is a field study of developer productivity. No test set is involved."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down by task type (Fig. 1), programming language (Fig. 3), and task complexity (Fig. 5), with cross-tabulations (Figs. 4, 6, 7)."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 4.1 discusses specific failure cases: complex multi-file tasks, proprietary contexts, C/C++ code, unoptimized generated code (Fig. 2 example), and mocking failures in unit tests."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper reports that Copilot struggles with complex tasks, C/C++ code, large functions, proprietary contexts, and sometimes generates unoptimized or insecure code (Sections 4.1, 5.2)."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Abstract claims of '50% time saved in code documentation and autocompletion' and '30-40% in repetitive coding tasks' are supported by the bar charts in Section 4 (Fig. 1)."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "The paper implies Copilot causes productivity gains ('specific contributions of Copilot to coding practices') but the study design lacks randomization, blinding, or control for confounds like learning effects, self-selection bias, and Hawthorne effect."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The title says 'Real-World Projects' broadly but results are from a single team at Cisco working on cloud security products. The abstract projects '33-36% time reduction' for 'cloud-first software development lifecycle' generally, which overgeneralizes from one team."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "No discussion of alternative explanations such as Hawthorne effect, learning effects during the study, or novelty effects. Self-selection bias (participants choosing tasks where Copilot works) is acknowledged in Section 4.4 but is not examined as an alternative explanation for the reported gains."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper refers only to 'GitHub Copilot' without specifying the model version, API version, or date of the Copilot version used."
    132       },
    133       "prompts_provided": {
    134         "applies": false,
    135         "answer": false,
    136         "justification": "The paper evaluates Copilot as a black-box IDE tool; prompting is ad hoc developer interaction, not a controlled experimental prompt design."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No hyperparameters (temperature, model settings) are reported for the Copilot configuration used."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper evaluates GitHub Copilot as a third-party black-box tool; the authors cannot describe its internal scaffolding."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper does not describe how developer logs were aggregated, how averages were computed, or what filtering was applied to the raw data."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 5.2 'Limitations and Cautions' discusses several limitations of Copilot's capabilities."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "Section 5.2 discusses limitations of Copilot itself (e.g., struggles with proprietary code) but does not discuss threats to the study's validity — no mention of sample bias, self-reporting bias, learning effects, or generalizability concerns."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. Section 4.4 briefly notes excluded lifecycle stages but does not bound the generalizability of the efficiency claims."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw data (developer logs, individual task measurements) is made available."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 3 describes that 'each developer maintained a detailed log of their interactions with Copilot, noting efficiency changes, challenges encountered, and the context in which the tool was used.'"
    181       },
    182       "recruitment_methods_described": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper says '26 engineers' from Cisco's Security Business Group but does not describe how they were selected or recruited, or whether participation was voluntary vs. assigned."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No description of how individual developer logs were transformed into the aggregate percentages shown in the figures."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding disclosure. All authors are from Cisco Systems Inc but no explicit funding statement is provided."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are listed as 'Security Business Group, Cisco Systems Inc' on the first page."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "Cisco is a GitHub Copilot customer evaluating the tool; they have a vested interest in justifying the license cost. This conflict is not acknowledged."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests or financial interests statement is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "This is a productivity study evaluating Copilot in real-world tasks, not a benchmark evaluation of model knowledge."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "Not a benchmark evaluation; no test set to overlap with training data."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Not a benchmark evaluation."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No mention of pre-registration."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No mention of IRB or ethics board approval despite collecting data from 26 human participants."
    242       },
    243       "demographics_reported": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The paper says participants ranged from 'junior to senior engineers' but provides no detailed demographics (years of experience distribution, gender, etc.)."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No inclusion or exclusion criteria for participant selection are stated."
    252       },
    253       "randomization_described": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No randomization is described. Participants self-selected which tasks to use Copilot for, as acknowledged in Section 4.4."
    257       },
    258       "blinding_described": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No blinding. Participants knew they were using Copilot; the baseline comparison is self-reported."
    262       },
    263       "attrition_reported": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "No information on whether all 26 participants completed the study or if any dropped out."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No mention of Copilot license costs or inference costs despite the study being about efficiency/productivity."
    274       },
    275       "compute_budget_stated": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "The study uses a commercial SaaS tool (Copilot); compute budget is not applicable."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "GitHub Copilot achieves up to 50% time savings in code documentation and autocompletion tasks.",
    285       "evidence": "Fig. 1 shows documentation and CI/CD tasks with ~50% average time saved (Section 4.1).",
    286       "supported": "moderate"
    287     },
    288     {
    289       "claim": "Copilot achieves 30-40% time savings in repetitive coding tasks, unit test generation, debugging, and pair programming.",
    290       "evidence": "Fig. 1 bar chart shows these tasks in the 30-35% range (Section 4.1).",
    291       "supported": "moderate"
    292     },
    293     {
    294       "claim": "A 33-36% overall time reduction for coding-related tasks in a cloud-first SDLC can be projected.",
    295       "evidence": "Section 4.4 models approximate time distribution across tasks and applies observed savings, arriving at 26-35% (stated as '26% and 35%'), which does not match the 33-36% figure projected in the abstract.",
    296       "supported": "weak"
    297     },
    298     {
    299       "claim": "Copilot struggles with complex tasks involving large functions, multiple files, and proprietary contexts, particularly in C/C++.",
    300       "evidence": "Section 4.1 provides qualitative examples and Fig. 5 shows complex multi-file tasks have lowest time savings. Section 4.2 shows C/C++ has lowest efficiency gains.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "JavaScript shows the highest time savings (~50%) while C/C++ shows the lowest.",
    305       "evidence": "Fig. 3 shows language-specific efficiency results (Section 4.2).",
    306       "supported": "moderate"
    307     }
    308   ],
    309   "methodology_tags": ["observational", "case-study"],
    310   "key_findings": "A study of 26 Cisco engineers using GitHub Copilot on proprietary cloud security codebases found task-dependent time savings ranging from ~50% (documentation, CI/CD) to minimal gains for complex multi-file tasks. Performance varied substantially by programming language, with JavaScript and Java showing highest gains and C/C++ the lowest. The authors project 26-35% overall coding time reduction but acknowledge Copilot struggles with proprietary contexts, large codebases, and code requiring optimization.",
    311   "red_flags": [
    312     {
    313       "flag": "Self-reported measurements without validation",
    314       "detail": "All efficiency gains are based on developer self-reports of time saved, with no objective measurement, time tracking tools, or independent verification."
    315     },
    316     {
    317       "flag": "No statistical rigor",
    318       "detail": "No significance tests, confidence intervals, standard deviations, or variance measures are reported for any results despite N=26 participants. All comparisons are visual from bar charts."
    319     },
    320     {
    321       "flag": "Self-selection bias acknowledged but not addressed",
    322       "detail": "Section 4.4 acknowledges participants 'self-selected towards using Copilot only for the tasks where it was likely to show gains,' which inflates reported efficiency numbers."
    323     },
    324     {
    325       "flag": "No threats to study validity discussed",
    326       "detail": "The limitations section discusses Copilot's limitations but not threats to the study's own validity (Hawthorne effect, learning effects, sample representativeness)."
    327     },
    328     {
    329       "flag": "Potential conflict of interest",
    330       "detail": "Cisco employees evaluating a tool their company pays for. Positive results may justify continued license expenditure. This conflict is not disclosed."
    331     },
    332     {
    333       "flag": "Small uncharacterized sample",
    334       "detail": "26 engineers from a single team at one company with no demographics, no selection criteria, and no justification for sample size."
    335     }
    336   ],
    337   "cited_papers": [
    338     {
    339       "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions",
    340       "authors": ["N.T. Nguyen", "S. Nadi"],
    341       "year": 2022,
    342       "relevance": "Empirical evaluation of Copilot correctness across programming languages on LeetCode problems."
    343     },
    344     {
    345       "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models",
    346       "authors": ["P. Vaithilingam", "T. Zhang", "E.L. Glassman"],
    347       "year": 2022,
    348       "relevance": "User study of Copilot usability finding it did not improve task completion time despite user preference."
    349     },
    350     {
    351       "title": "GitHub Copilot AI pair programmer: Asset or Liability?",
    352       "authors": ["A.M. Dakhel", "V. Majdinasab", "A. Nikanjam", "F. Khomh", "M.C. Desmarais", "Z.M. Jiang"],
    353       "year": 2022,
    354       "relevance": "Assessment of Copilot's capability in solving algorithmic problems with analysis of incorrect solutions."
    355     },
    356     {
    357       "title": "Productivity assessment of neural code completion",
    358       "authors": ["A. Ziegler", "E. Kalliamvakou", "S. Simister"],
    359       "year": 2022,
    360       "relevance": "Analysis of relationship between Copilot suggestion acceptance rates and perceived productivity."
    361     },
    362     {
    363       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    364       "authors": ["H.A. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    365       "year": 2021,
    366       "relevance": "Security evaluation finding ~40% of Copilot-generated code contained vulnerabilities."
    367     },
    368     {
    369       "title": "ChatDev: Communicative agents for software development",
    370       "authors": ["C. Qian", "W. Liu", "H. Liu"],
    371       "year": 2023,
    372       "arxiv_id": "2307.07924",
    373       "relevance": "Multi-agent framework for software development referenced as future work for agentic workflows."
    374     },
    375     {
    376       "title": "Grounded Copilot: How Programmers Interact with Code-Generating Models",
    377       "authors": ["S. Barke", "M.B. James", "N. Polikarpova"],
    378       "year": 2022,
    379       "relevance": "Study of programmer interaction modes (acceleration vs exploration) with Copilot."
    380     }
    381   ]
    382 }

Impressum · Datenschutz