scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32043B)
      1 {
      2   "paper": {
      3     "title": "Natural Language Outlines for Code: Literate Programming in the LLM Era",
      4     "authors": [
      5       "Kensen Shi",
      6       "Deniz Altınbüken",
      7       "Saswat Anand",
      8       "Mihai Christodorescu",
      9       "Katja Grünwedel",
     10       "Alexa Koenings",
     11       "Sai Naidu",
     12       "Anurag Pathak",
     13       "Marc Rasi",
     14       "Fredde Ribeiro",
     15       "Brandon Ruffin",
     16       "Siddhant Sanyam",
     17       "Maxim Tabachnyk",
     18       "Sara Toth",
     19       "Roy Tu",
     20       "Tobias Welp",
     21       "Pengcheng Yin",
     22       "Manzil Zaheer",
     23       "Satish Chandra",
     24       "Charles Sutton"
     25     ],
     26     "year": 2025,
     27     "venue": "FSE Companion '25 (33rd ACM International Conference on the Foundations of Software Engineering)",
     28     "arxiv_id": "2408.04820",
     29     "doi": "10.1145/3696630.3728541"
     30   },
     31   "scan_version": 3,
     32   "active_modules": [],
     33   "methodology_tags": ["case-study", "qualitative"],
     34   "key_findings": "Modern LLMs (especially Gemini 1.5 Pro) can generate high-quality NL outlines for code functions, with 60% rated excellent and 80% rated completely correct by professional developers on 30 real Python functions. In an Android security case study, 26 of 30 professional reverse engineers (REs) found LLM-generated outlines very or extremely helpful, 83.8% of outlines were rated completely correct, and LLM suspicion scores correlated more highly with the most experienced REs' scores (r=0.85) than the other RE groups' scores did (r=0.80, r=0.78). A Virtual CL Split prototype for code review was considered useful for 58% of sufficiently complex change lists, and was launched to ~7000 beta users at Google.",
     35   "checklist": {
     36     "artifacts": {
     37       "code_released": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No source code, repository URL, or archive is provided anywhere in the paper. Prototype features (Finish Changes, Virtual CL Split) are described but not released."
     41       },
     42       "data_released": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The datasets used in the evaluations (30 Python functions, 80 Android security functions, and 73 CLs) are not released. The paper notes functions come from 'real projects' and 'real Android apps' but provides no download link or repository."
     46       },
     47       "environment_specified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No environment specifications (requirements.txt, Dockerfile, library versions) are provided. The paper only names the LLMs used without specifying the software environment."
     51       },
     52       "reproduction_instructions": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No step-by-step reproduction instructions are provided. While Appendix E provides example prompts, there are no instructions for replicating the full experimental pipeline."
     56       }
     57     },
     58     "statistical_methodology": {
     59       "confidence_intervals_or_error_bars": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Results are reported as point estimates (e.g., '60% of outlines were rated as having excellent overall quality', '83.8% said the outline was completely correct') without any confidence intervals or error bars."
     63       },
     64       "significance_tests": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No statistical significance tests are used. Comparisons between models and techniques (e.g., 'Gemini 1.5 Flash was the next best LLM and the others were noticeably worse') are made by comparing raw percentages without any hypothesis tests."
     68       },
     69       "effect_sizes_reported": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Pearson correlation coefficients are reported for LLM vs. RE suspicion scores (r=0.85, r=0.80, r=0.78, r=0.96 in Section 6.1). Percentage breakdowns per model/technique in Fig. 5 and per experience group in Fig. 7 provide magnitude context for comparisons."
     73       },
     74       "sample_size_justified": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No power analysis or sample size justification is provided for any study. The choice of 30 functions, 6 evaluators (Section 5), 80 functions, 30 REs (Section 6.1), or 73 CLs, 4 engineers (Section 6.2) is not justified."
     78       },
     79       "variance_reported": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No variance or standard deviation is reported across evaluators or experimental conditions. All results are single-point aggregates (percentages, counts). Greedy decoding is used so LLM outputs are deterministic, but no inter-rater variability is reported."
     83       }
     84     },
     85     "evaluation_design": {
     86       "baselines_included": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Section 5 compares 5 LLMs (Gemini 1.5 Pro/Flash, Gemini 1.0 Pro/Ultra, DeepSeek-Coder-Instruct 33B) and 2 generation techniques (Interleaved Generation, Line Number Infilling). Section 6.1 compares 3 prediction styles (suspicion score, summary, NL outline)."
     90       },
     91       "baselines_contemporary": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Gemini 1.5 Pro and Flash (2024 models) are used alongside Gemini 1.0 Pro/Ultra and DeepSeek-Coder-Instruct 33B, representing a mix of current-generation and previous-generation models at the time of writing."
     95       },
     96       "ablation_study": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No ablation study is performed. While different generation techniques and models are compared, individual components of the approach (e.g., few-shot examples, prompt instructions, number of examples) are not systematically removed or varied to measure their contribution."
    100       },
    101       "multiple_metrics": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Section 5 evaluates 5 dimensions: overall quality, helpfulness, correctness, amount of detail, and style/fluency. Section 6.1 evaluates accuracy (suspicion scores), correctness, helpfulness, and detail. Table 1 reports parsing errors and number of outline statements."
    105       },
    106       "human_evaluation": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 5 uses 6 professional developers to rate outline quality. Section 6.1 surveys 30 professional reverse engineers on LLM prediction accuracy and helpfulness. Section 6.2 has 4 engineers evaluate virtual CL splits."
    110       },
    111       "held_out_test_set": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The 30 evaluation functions in Section 5 are separate from the 8 handwritten few-shot examples used in prompts. The evaluation set was curated independently from the prompt examples."
    115       },
    116       "per_category_breakdown": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Fig. 5 breaks down quality ratings by LLM and generation technique. Fig. 7 breaks down helpfulness by prediction type and RE experience group. Table 1 provides per-model parsing error rates. Section 6.1 separates results for suspicious vs. benign functions."
    120       },
    121       "failure_cases_discussed": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 5 notes '3 of 30 outlines were bad quality, 2 were not helpful, and only 1 was incorrect.' Section 6.1 discusses 4 false negatives and 2 false positives with detailed analysis. Fig. 16 shows a low-quality outline example with commentary. App. C discusses mistakes in Finish Changes."
    125       },
    126       "negative_results_reported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper reports that Gemini 1.0 Pro 'often includes too much detail' (Section 5), Line Number Infilling 'led to slightly worse predictions' (Section 5), weaker LLMs have more formatting errors (Table 1), and in the code review study only 58% of splits for complex CLs were useful (Section 6.2)."
    130       }
    131     },
    132     "claims_and_evidence": {
    133       "abstract_claims_supported": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Abstract claims are supported: '60% of outlines were rated by professionals as excellent overall (90% were acceptable or better)' matches Fig. 5 results; '26 of 30 professional reverse engineers said that outlines are very or extremely helpful' matches Fig. 7; '83.8% of our generated outlines were rated as completely correct' matches Section 6.1 results."
    137       },
    138       "causal_claims_justified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper is careful to frame most claims as survey findings ('60% were rated excellent', '26/30 said helpful') rather than strong causal claims. The comparative claims (Model A vs Model B) are adequately supported by the controlled comparison design. The broader use-case claims ('can accelerate understanding') are explicitly framed as proposals, not empirical findings."
    142       },
    143       "generalization_bounded": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The title 'Natural Language Outlines for Code' and abstract claims about 'code' generally are not bounded to the tested setting. Section 5 uses only Python (acknowledged in limitations: 'only in Python which is our expertise'). Section 6.1 uses only Java/Android. The paper proposes use cases across 'all of the different software development surfaces' from evidence in limited settings."
    147       },
    148       "alternative_explanations_discussed": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No alternative explanations for the positive survey results are discussed. Possible confounds include: novelty effect (participants may rate any new tool positively), demand characteristics (the Section 5 evaluators include 5 paper authors, each rating outlines generated for functions they themselves contributed), Hawthorne effect in the RE study, and selection bias in function curation."
    152       },
    153       "proxy_outcome_distinction": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "The paper measures perceived helpfulness and quality via surveys but frames results in terms of 'accelerating understanding and navigation' (abstract) and 'improving developer efficiency' (Section 2) without prominently distinguishing the proxy (survey ratings) from the outcome (actual productivity/understanding improvement). Section 7 briefly acknowledges 'Further research is needed to measure the perceived and actual benefits' but this distinction is not salient in the main claims."
    157       }
    158     },
    159     "setup_transparency": {
    160       "model_versions_specified": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "Models are named as 'Gemini 1.0 Pro and Ultra', 'Gemini 1.5 Flash and Pro', and 'DeepSeek-Coder-Instruct 33B' (Section 5). No snapshot dates, API versions, or specific version identifiers are provided. Marketing names like 'Gemini 1.5 Pro' without a snapshot date do not count as specified versions per the schema."
    164       },
    165       "prompts_provided": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Appendix E provides full example prompts for both Interleaved Generation and Line Number Infilling techniques, including system instructions, few-shot examples (2-3 shown, 8 used in practice), and user query templates. Actual few-shot content is shown."
    169       },
    170       "hyperparameters_reported": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5 states 'all with greedy decoding,' which specifies the sampling strategy (temperature=0, deterministic). The paper uses 'a fixed set of 8 handwritten few-shot examples.'"
    174       },
    175       "scaffolding_described": {
    176         "applies": false,
    177         "answer": false,
    178         "justification": "No agentic scaffolding is used. The approach is direct few-shot prompting of LLMs without tool use, retry logic, or multi-step agentic workflows."
    179       },
    180       "data_preprocessing_documented": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 5 documents function curation: 30 Python functions from 21 projects, 10-90 LOC (median 46.5), outline-like comments removed from 7 functions, other comments preserved in 10 functions. Section 6.1 describes the 80-function dataset: 40 suspicious (manually curated), 40 benign (randomly sampled from 8 popular apps), 24-99 LOC, controlled for code length and style distributions."
    184       }
    185     },
    186     "limitations_and_scope": {
    187       "limitations_section_present": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 7 (Discussion) includes a dedicated 'Limitations' subsection that discusses specific limitations of the three studies."
    191       },
    192       "threats_to_validity_specific": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The limitations section identifies specific threats: 'In Section 5 we used 6 survey participants, and in Section 6.2 we used 4 participants from one team, so those findings may be specific to those participants.' Also: 'the 30 dataset functions were not sampled uniformly' with 'bias toward research projects' and 'Our choice of functions can introduce bias.'"
    196       },
    197       "scope_boundaries_stated": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The paper states specific scope boundaries: evaluation only in Python ('only in Python which is our expertise'), biased function selection ('many functions are boilerplate, very short or simple, or otherwise less interesting'), small evaluator pools, and 'A full evaluation of each use case is beyond the scope of this paper' (Section 3)."
    201       }
    202     },
    203     "data_integrity": {
    204       "raw_data_available": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No raw data is available. Individual survey responses, function datasets, generated outlines, and RE ratings are not released for independent verification."
    208       },
    209       "data_collection_described": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Section 5 describes the function curation process (6 professionals, 21 real projects, variety criteria). Section 6.1 describes the Android dataset (40 suspicious from prior RE flags, 40 benign from popular apps, length/style matching). Section 6.2 describes CL selection (73 randomly selected from one team's recent submissions, >10 lines changed)."
    213       },
    214       "recruitment_methods_described": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "While participants are characterized (professional SEs/researchers in Section 5, professional REs in Section 6.1, engineers from one team in Section 6.2), the recruitment process, channels used, and potential recruitment bias are not described. Footnote 1 notes '5 of the dataset contributors are also authors' but does not discuss how the other evaluator was recruited."
    218       },
    219       "data_pipeline_documented": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "The pipeline is documented: function curation → comment removal → LLM generation with specified prompts → programmatic parsing (with error classification detailed in App. F) → survey presentation (shuffled, unlabeled) → rating collection. For the Android study: function collection → LLM prediction (3 styles in one query with 7 few-shot examples) → RE survey with balanced assignments."
    223       }
    224     },
    225     "conflicts_of_interest": {
    226       "funding_disclosed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No funding source is explicitly disclosed. The acknowledgments section thanks individuals but does not mention grants, corporate funding, or funding agencies. The work was presumably funded by Google given all authors' affiliation, but this is not stated."
    230       },
    231       "affiliations_disclosed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "All authors are listed with 'Google' affiliation on the first page. The connection between the authors' employer and the primary models evaluated (Gemini, a Google product) is clear from the affiliations."
    235       },
    236       "funder_independent_of_outcome": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "Google funds this research (all authors are Google employees) and has direct commercial interest in positive findings about Gemini's capabilities for developer tools. Google's Gemini models are the primary focus of evaluation, and the paper's positive results support Google's product narrative."
    240       },
    241       "financial_interests_declared": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No competing interests or financial interests statement is provided. Google employees evaluating Google's Gemini product is a clear potential conflict that is not explicitly acknowledged beyond listing affiliations."
    245       }
    246     },
    247     "contamination": {
    248       "training_cutoff_stated": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No training data cutoff dates are stated for any of the 5 LLMs used (Gemini 1.0 Pro/Ultra, Gemini 1.5 Flash/Pro, DeepSeek-Coder). The paper cannot assess whether evaluation functions were in training data without this information."
    252       },
    253       "train_test_overlap_discussed": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No discussion of whether the 30 evaluation functions or 80 Android functions could have been in the models' training data. Footnote 2 notes 'The LLMs in our experiments were not trained on the removed comments' but this implicitly acknowledges the code itself may be in training data without addressing it."
    257       },
    258       "benchmark_contamination_addressed": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "The evaluation functions come from 'real projects' (some from authors' prior work) which could be publicly available and in training data. Section 5 notes 'custom dependencies... not in the prompt or training data' but this refers to dependency code, not the functions themselves. No systematic contamination analysis is performed."
    262       }
    263     },
    264     "human_studies": {
    265       "pre_registered": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No pre-registration is mentioned for any of the three human evaluation studies."
    269       },
    270       "irb_or_ethics_approval": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No IRB or ethics board approval is mentioned despite three studies involving human participants (6 developers, 30 REs, 4 engineers)."
    274       },
    275       "demographics_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "Demographics are minimal. Section 5: '6 professional software engineers and researchers.' Section 6.1: '30 professional REs' split into '3 groups of 10 by their amount of experience' — no further detail on experience levels, years, or other demographics. Section 6.2: 'four software engineers from the same team.' No gender, geographic, age, or detailed experience information is provided."
    279       },
    280       "inclusion_exclusion_criteria": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No explicit inclusion or exclusion criteria are stated for any study's participants. The only characterization is professional role (SEs, researchers, REs) without formal eligibility criteria."
    284       },
    285       "randomization_described": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "The studies are survey/rating studies, not randomized experiments comparing treatment vs. control conditions. Participants all see the same types of outputs to rate."
    289       },
    290       "blinding_described": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 5 / App. G describes single-blinding: 'presenting each function's 10 outlines in shuffled order to the function's contributor' without labeling which LLM or technique produced each outline. This prevents evaluator bias toward specific models."
    294       },
    295       "attrition_reported": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No attrition or dropout information is reported for any study. It is unclear whether all invited participants completed their evaluations."
    299       }
    300     },
    301     "cost_and_practicality": {
    302       "inference_cost_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No systematic cost or latency reporting is provided. Relative speedup ('Line Number Infilling provides a 4× to 5× speedup,' Section 4) and one latency estimate ('about 5 seconds' for Finish Changes with Flash, App. C) are mentioned in passing but no API costs, tokens consumed, or per-example costs are reported."
    306       },
    307       "compute_budget_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No total computational budget is stated. The experiments involve generating 300 outlines (Section 5), predictions for 80 functions (Section 6.1), and splits for 73+ CLs (Section 6.2), but GPU hours, total API costs, and hardware used are not reported."
    311       }
    312     }
    313   },
    314   "claims": [
    315     {
    316       "claim": "60% of NL outlines generated by Gemini 1.5 Pro with Interleaved Generation were rated as having excellent overall quality (90% acceptable or better).",
    317       "evidence": "Section 5 and Fig. 5: Quality survey of 30 Python functions rated by their curators (6 professional developers) across 5 dimensions. Results shown in bar charts per model/technique.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "80% of outlines from Gemini 1.5 Pro with Interleaved Generation were rated completely correct.",
    322       "evidence": "Section 5 and Fig. 5: Same survey, correctness dimension. 17% were 'mostly correct' and only 1 of 30 was incorrect.",
    323       "supported": "moderate"
    324     },
    325     {
    326       "claim": "83.8% of NL outlines for suspicious Android functions were rated as completely correct by professional reverse engineers.",
    327       "evidence": "Section 6.1: 30 professional REs rated LLM predictions. Per-function surveys collected with each function rated by one RE from each of 3 experience groups.",
    328       "supported": "strong"
    329     },
    330     {
    331       "claim": "26 of 30 professional reverse engineers said NL outlines are very or extremely helpful for malware detection.",
    332       "evidence": "Section 6.1 and Fig. 7: Final survey about predictions overall. REs rated suspicion scores, summaries, and outlines for helpfulness, broken down by experience group.",
    333       "supported": "strong"
    334     },
    335     {
    336       "claim": "LLM suspicion scores correlate more highly with the most experienced REs (r=0.85) than other RE groups correlate with them (r=0.80, r=0.78).",
    337       "evidence": "Section 6.1: Pearson correlation between LLM scores and each RE experience group's scores on the 80-function dataset. LLM scores also achieve r=0.96 against the average of all RE scores.",
    338       "supported": "moderate"
    339     },
    340     {
    341       "claim": "Line Number Infilling provides 4-5x speedup over Interleaved Generation for outline generation.",
    342       "evidence": "Section 4: Described as an 'observed' speedup because Line Number Infilling does not spend output tokens repeating the code. No formal timing measurements reported.",
    343       "supported": "weak"
    344     },
    345     {
    346       "claim": "Virtual CL Split was useful for 58% of sufficiently complex change lists during code review.",
    347       "evidence": "Section 6.2: Preliminary study with 4 engineers on 73 CLs. 24 of 73 CLs were deemed complex enough; 14 of those 24 splits were considered useful.",
    348       "supported": "weak"
    349     },
    350     {
    351       "claim": "The LLM achieves 5% false negative rate and 0% false positive rate on Android malware suspicion scoring.",
    352       "evidence": "Section 6.1: Of 40 suspicious functions, 4 had predicted score 0 (false negatives), but further analysis shows 2 were correct predictions, leaving 2 actual false negatives. Of 40 benign functions, 2 had nonzero scores but REs unanimously agreed these were accurate.",
    353       "supported": "moderate"
    354     }
    355   ],
    356   "red_flags": [
    357     {
    358       "flag": "Company evaluating own product",
    359       "detail": "All 20 authors are Google employees evaluating Google's Gemini models as the primary system. Gemini 1.5 Pro is found to be the best model across most dimensions. The one non-Google baseline (DeepSeek-Coder 33B) is substantially smaller and older, making the comparison favorable to Gemini."
    360     },
    361     {
    362       "flag": "Evaluators are paper authors",
    363       "detail": "In Section 5, the 6 evaluators who curated the 30 functions also rated the outlines for those functions. Footnote 1 confirms '5 of the dataset contributors are also authors of this paper.' This creates a clear conflict — authors have incentive to rate outlines favorably."
    364     },
    365     {
    366       "flag": "Small and non-representative samples",
    367       "detail": "Section 5 uses only 6 evaluators and 30 curated Python functions (biased toward research projects, only Python). Section 6.2 uses only 4 engineers from one team. These are too small for the broad claims about NL outlines as a general developer assistance paradigm."
    368     },
    369     {
    370       "flag": "No statistical tests for comparative claims",
    371       "detail": "Claims that one model or technique outperforms another are made by comparing raw percentages without any significance tests, confidence intervals, or uncertainty quantification. With 30 functions and 6 evaluators, differences could easily be due to chance."
    372     },
    373     {
    374       "flag": "Baseline fairness concern",
    375       "detail": "DeepSeek-Coder-Instruct 33B (a 33B parameter open-source model) is compared against Gemini 1.5 Pro (a much larger model) without acknowledging the substantial model size disparity. This makes the comparison inherently favorable to the Google models."
    376     }
    377   ],
    378   "cited_papers": [
    379     {
    380       "title": "Large language models for software engineering: Survey and open problems",
    381       "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman", "Mitya Lyubarskiy", "Shubho Sengupta", "Shin Yoo", "Jie M Zhang"],
    382       "year": 2023,
    383       "relevance": "Survey of LLMs applied to software engineering tasks, directly relevant to understanding the landscape of AI-assisted development."
    384     },
    385     {
    386       "title": "Large language models for software engineering: A systematic literature review",
    387       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu", "Zhou Yang", "Kailong Wang", "Li Li", "Xiapu Luo", "David Lo", "John Grundy", "Haoyu Wang"],
    388       "year": 2023,
    389       "arxiv_id": "2308.10620",
    390       "relevance": "Systematic review of LLMs in SE, providing context for how NL outlines fit within the broader LLM-for-SE research space."
    391     },
    392     {
    393       "title": "In-IDE Human-AI Experience in the Era of Large Language Models; A Literature Review",
    394       "authors": ["Agnia Sergeyuk", "Sergey Titov", "Maliheh Izadi"],
    395       "year": 2024,
    396       "arxiv_id": "2401.10739",
    397       "relevance": "Literature review of human-AI interaction in IDEs, directly relevant to NL outlines as an IDE integration concept."
    398     },
    399     {
    400       "title": "Unifying the perspectives of NLP and software engineering: A survey on language models for code",
    401       "authors": ["Ziyin Zhang", "Chaoyu Chen", "Bingchang Liu", "Cong Liao", "Zi Gong", "Hang Yu", "Jianguo Li", "Rui Wang"],
    402       "year": 2023,
    403       "arxiv_id": "2311.07989",
    404       "relevance": "Survey on language models for code generation and understanding, covering the foundational capabilities that enable NL outline generation."
    405     },
    406     {
    407       "title": "Source Code Summarization in the Era of Large Language Models",
    408       "authors": ["Weisong Sun", "Yun Miao", "Yuekang Li", "Hongyu Zhang", "Chunrong Fang", "Yi Liu", "Gelei Deng", "Yang Liu", "Zhenyu Chen"],
    409       "year": 2024,
    410       "arxiv_id": "2407.07959",
    411       "relevance": "Survey of LLM-based code summarization, the most closely related capability to NL outline generation."
    412     },
    413     {
    414       "title": "A comparative analysis of large language models for code documentation generation",
    415       "authors": ["Shubhang Shekhar Dvivedi", "Vyshnav Vijay", "Sai Leela Rahul Pujari", "Shoumik Lodh", "Dhruv Kumar"],
    416       "year": 2024,
    417       "relevance": "Evaluates LLMs for code documentation generation, a closely related application to NL outline generation."
    418     },
    419     {
    420       "title": "Using an LLM to help with code understanding",
    421       "authors": ["Daye Nam", "Andrew Macvean", "Vincent Hellendoorn", "Bogdan Vasilescu", "Brad Myers"],
    422       "year": 2024,
    423       "relevance": "Studies how LLMs can help developers understand code, directly relevant to the code understanding use case of NL outlines."
    424     },
    425     {
    426       "title": "Expectations, outcomes, and challenges of modern code review",
    427       "authors": ["Alberto Bacchelli", "Christian Bird"],
    428       "year": 2013,
    429       "relevance": "Foundational study on code review practices, providing context for the Virtual CL Split application of NL outlines."
    430     },
    431     {
    432       "title": "Modern code review: a case study at Google",
    433       "authors": ["Caitlin Sadowski", "Emma Söderberg", "Luke Church", "Michal Sipko", "Alberto Bacchelli"],
    434       "year": 2018,
    435       "relevance": "Case study of code review at Google, directly relevant to the Virtual CL Split feature evaluated at Google."
    436     },
    437     {
    438       "title": "\"What it wants me to say\": Bridging the abstraction gap between end-user programmers and code-generating large language models",
    439       "authors": ["Michael Xieyang Liu", "Advait Sarkar", "Carina Negreanu", "Benjamin Zorn", "Jack Williams", "Neil Toronto", "Andrew D Gordon"],
    440       "year": 2023,
    441       "relevance": "Studies the abstraction gap in code generation from NL, relevant to the bidirectional NL-code sync proposed for NL outlines."
    442     },
    443     {
    444       "title": "Enhancing LLM-Based Coding Tools through Native Integration of IDE-Derived Static Context",
    445       "authors": ["Yichen Li", "Yun Peng", "Yintong Huo", "Michael R Lyu"],
    446       "year": 2024,
    447       "arxiv_id": "2402.03630",
    448       "relevance": "Proposes enhancing LLM coding tools with IDE context, relevant to the RAG-based improvements discussed for NL outlines."
    449     },
    450     {
    451       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming – The Rise of Code Intelligence",
    452       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang", "Zhenda Xie", "Kai Dong", "Wentao Zhang"],
    453       "year": 2024,
    454       "arxiv_id": "2401.14196",
    455       "relevance": "Describes DeepSeek-Coder, one of the models evaluated in the NL outline generation experiments."
    456     }
    457   ],
    458   "engagement_factors": {
    459     "practical_relevance": {
    460       "score": 2,
    461       "justification": "NL outlines are a practical concept for developer tooling with working prototypes demonstrated, but no tool is publicly released for practitioners to use."
    462     },
    463     "surprise_contrarian": {
    464       "score": 1,
    465       "justification": "Extends literate programming and code summarization ideas with LLMs — evolutionary rather than contrarian, though the bidirectional sync concept is novel."
    466     },
    467     "fear_safety": {
    468       "score": 0,
    469       "justification": "No AI safety or security concerns raised; the Android security application is about detecting malware, not creating it."
    470     },
    471     "drama_conflict": {
    472       "score": 0,
    473       "justification": "No controversial claims or conflicts with prior work; a constructive proposal paper."
    474     },
    475     "demo_ability": {
    476       "score": 0,
    477       "justification": "No code, demo, or tool is released; only mockups, screenshots, and prototype descriptions are provided."
    478     },
    479     "brand_recognition": {
    480       "score": 2,
    481       "justification": "From Google with evaluation of Gemini models; well-known lab and product but not as attention-grabbing as OpenAI/ChatGPT papers."
    482     }
    483   }
    484 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs