scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29098B)
      1 {
      2   "paper": {
      3     "title": "Static Program Analysis Guided LLM Based Unit Test Generation",
      4     "authors": [
      5       "Sujoy Roychowdhury",
      6       "Giriprasad Sridhara",
      7       "A K Raghavan",
      8       "Joy Bose",
      9       "Sourav Mazumdar",
     10       "Hamender Singh",
     11       "Srinivasan Bajji Sugumaran",
     12       "Ricardo Britto"
     13     ],
     14     "year": 2024,
     15     "venue": "CODS-COMAD Dec '24",
     16     "arxiv_id": "2503.05394",
     17     "doi": "10.1145/3703323.3703742"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "Augmenting LLM prompts with static program analysis information (declaring class, method signatures, field types) substantially increases the number of Java focal methods for which unit tests are generated compared to a whole-file baseline approach. On a commercial project (103 methods), the approach generated tests for 102/103 methods with llama 7b vs 37/103 for the baseline. Prompt token counts dropped from a mean of 5295 to 559. However, only test generation (not quality) was measured, and no statistical tests were applied.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No repository URL, code archive, or release link is provided anywhere in the paper. No mention of plans to release code."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The commercial project data is proprietary and not released. The specific focal methods selected from Guava are not identified (only DoubleUtils is named in the example). No dataset is provided."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Section 5.2.1 mentions 'single A100 80 GB GPU' and '8 bit quantization' but provides no requirements.txt, Dockerfile, library versions, or sufficient detail to recreate the environment. The Java Parser Library is mentioned without a version."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No reproduction instructions, scripts, or step-by-step guides are provided. A reader would have to reverse-engineer the entire experimental setup from the paper's prose."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Tables 1 and 2 report only raw counts with no confidence intervals or error bars. Table 3 reports mean/std/median for token counts but not for the main evaluation metric."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims their approach generates tests for more focal methods but provides no statistical tests (no p-values, chi-squared, or any test of significance) to support the comparison."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Tables 1 and 2 show both baseline and proposed approach counts side by side (e.g., 37 vs 102 for llama7b on commercial, 20 vs 30 for llama7b on Guava), providing baseline context for the reader to assess effect magnitude."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification for why 103 commercial focal methods or 34 open-source methods were chosen. No power analysis. Section 5.1 simply states 'We selected a random sub-project' without discussing whether the sample size is adequate."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance or standard deviation is reported for the main evaluation results (test generation counts). Table 3 reports std dev for token counts and Table 4 for time, but the primary metric has no variance measure. Temperature is set to 0 but this is not discussed in terms of result determinism."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper compares against a baseline approach from Siddiq et al. [10] that provides the entire source file as context. Both Tables 1 and 2 show baseline vs proposed approach."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The baseline (Siddiq et al. 2023 [10]) is from the prior year, which is contemporary for a 2024 publication. The related work also discusses recent approaches like RLPG [9] and Monitor Guided Decoding [1], both from 2023."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The system augments prompts with three types of information (declaring class, method signatures, field types) but no ablation study tests the contribution of each component individually. Only the full approach vs baseline is compared."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Only one metric is used: the number of focal methods for which one or more unit tests were generated. Section 8 acknowledges this limitation: 'One limitation of this study is that we are only looking at generated test cases rather than their quality.'"
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation of generated tests is performed. The only metric is automated counting of whether tests were generated, not their correctness, readability, or usefulness."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No discussion of separating development and test data. The motivating example in Listing 1 (getEmail method) appears to be from the evaluation set. No evidence that prompt design was done on separate data from evaluation."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down by model (llama7b, llama70b, codellama34b, gpt-4) in Tables 1-2 and by project type (commercial vs open source). Token-count and inference-time statistics are also broken down by approach in Tables 3 and 4, respectively."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "The paper does not discuss specific failure cases where their approach failed to generate tests. Section 7 hypothesizes why the baseline fails but does not analyze the methods for which their own approach did not generate tests: between 1 and 40 of the 103 commercial methods and up to 15 of the 34 open-source methods, depending on the model."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "No negative results are reported. Every comparison shows their approach outperforming the baseline. No configurations or design decisions that were tried and abandoned are discussed."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims that 'augmenting prompts with concise and precise context information obtained by program analysis increases the effectiveness of generating unit test code through LLMs,' which is supported by Tables 1-2. The claim of validation on commercial and open-source projects is also supported."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The causal claim that program analysis augmentation improves test generation is tested via a controlled comparison: same models, same focal methods, different prompt strategies. This controlled single-variable manipulation (prompt content) is adequate for the causal claim."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper tests only on Java but Section 7 (Portability) claims 'it should be portable across languages' without evidence. The title uses the general 'Unit Test Generation' without bounding to Java. Results are from only 2 projects (1 commercial, 1 open source) but claims are framed broadly."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "Section 7 offers only one explanation for the baseline's failure (context length/extraneous information). No alternative explanations for their own approach's success are considered — e.g., whether the improvement comes from shorter prompts alone, from the specific information content, or from prompt structure."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Section 8 explicitly acknowledges the proxy gap: 'One limitation of this study is that we are only looking at generated test cases rather than their quality — this is a scope for future study.' The metric (generation count) is distinguished from the broader outcome (test effectiveness)."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Llama models are specified as 'llama 7b/70b 2.0 and CodeLlama 34b based on llama 2.0' which includes version and size. However, GPT-4 is used in Table 2 without any version or snapshot date, violating the criterion."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 6.1 provides the full prompt text used in Listing 1 (appendix), including the system instruction, focal method, declaring class, method signatures, and fields. The baseline prompt structure is also described. The actual prompt text is shown, not just a description."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 5.2.1 reports: temperature=0, topK=50, topP=0.95, input tokens limited to 1023, context length 4096, 8-bit quantization. 'Other parameters are kept at their default.'"
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The approach is direct prompting of LLMs with program analysis-augmented prompts."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "The paper states 'We selected a random sub-project from our commercial Java project' and 'two random Java classes' from Guava but does not describe how randomization was done, what criteria defined a 'sub-project,' or how focal methods were enumerated from the selected code."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "There is no dedicated limitations section. Two limitations are mentioned in Section 8 (Conclusion and Future Work) but they are embedded in the conclusion paragraph, not a substantive standalone discussion."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 8 identifies two specific threats: (1) 'we are only looking at generated test cases rather than their quality' and (2) 'not all dependencies, especially those at runtime, can be analyzed via static program analysis — such limitations would affect our results too.' These are specific to this study."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper does not clearly bound its scope. While acknowledging the quality limitation, the Portability section (Section 7) claims cross-language applicability without evidence. No explicit statement of what the results do NOT show beyond the quality caveat."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "No raw data is available. The commercial project is proprietary, and neither the specific focal methods, generated tests, nor detailed per-method results are released."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "Section 5.1 says 'We selected a random sub-project from our commercial Java project and this had 103 focal methods' and 'two random Java classes' from Guava. The randomization process, selection criteria, and specific classes/methods are not described in adequate detail."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants in this study. The study evaluates LLM-generated unit tests on code from existing projects."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": false,
    210         "justification": "The pipeline from code selection to evaluation results is not documented step-by-step. How Java Parser extracted information, how prompts were assembled, how output was evaluated (manual or automated counting) — these steps are described only at a high level."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosure or acknowledgments section is present. Seven of eight authors are from Ericsson R&D, suggesting corporate-funded research, but this is not explicitly stated."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: seven from Ericsson R&D (Bangalore/Stockholm) and one independent researcher. The commercial project is described as 'from a telecom company,' making the Ericsson connection apparent."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "Ericsson employees developed the approach and evaluated it on Ericsson's own commercial codebase. Ericsson has a direct interest in demonstrating that their approach improves developer productivity. The funder is not independent of the outcome."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper uses Llama 2 models and GPT-4 without stating their training data cutoff dates. This is relevant because the models may have seen Guava source code and its existing tests during training."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "Guava is one of the most popular Java libraries and its source code is widely available on GitHub. The LLMs almost certainly encountered Guava code during training, yet no discussion of potential overlap between training data and evaluation subjects is provided."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "Guava's source code and existing test suites have been publicly available for over a decade. Llama 2 and GPT-4 likely trained on this code. The paper does not address whether the LLMs might already know Guava's test patterns, which would inflate results for the open-source evaluation."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 4 reports mean, std dev, and median time per inference for both baseline and the proposed approach. Section 7 discusses the cost advantage in token-based pricing systems. Section 5.2.1 specifies the hardware used (single A100 80GB GPU)."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "While per-inference time statistics and GPU type are reported, the total computational budget (total GPU hours, total API spend, total experiment wall-clock time) is not stated."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Temperature is set to 0 (Section 5.2.1), suggesting deterministic outputs, but the paper does not explicitly discuss result determinism or report results across multiple seeds/runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper does not state how many times each experiment was run. With temperature=0 the output should be deterministic, but this is not explicitly stated or verified."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No mention of how the hyperparameters (temperature=0, topK=50, topP=0.95) or prompt format were selected. No search budget or alternative configurations tried are reported."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The prompt structure and hyperparameter settings appear to be a single chosen configuration with no justification for why these specific values were selected or how they were determined."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors implement the baseline from [10] themselves and compare it against their own proposed approach. This self-comparison bias (Lucic et al. 2018) is not acknowledged or discussed."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Table 3 shows their approach uses ~10x fewer tokens than the baseline, and Table 4 shows similar inference times. However, no systematic analysis of performance as a function of compute budget is provided, and the token cost advantage is not quantified in monetary or compute terms."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The sole metric is whether any test was generated, but the paper does not discuss whether this measures the claimed 'effectiveness' of test generation. Generating a test that compiles, passes, or achieves coverage is not addressed. Section 8 acknowledges quality is not measured but does not question the metric's construct validity."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "No scaffolding or agentic framework is used. The approach is direct LLM prompting."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Guava has been publicly available since 2009 and Llama 2's training data likely includes it. GPT-4's training data almost certainly includes Guava source and tests. No discussion of temporal leakage is provided."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether providing the focal method body as input constitutes feature leakage or whether the models already know the test patterns for these methods from training data."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "Focal methods are selected from within the same project (same codebase, same coding style, same architecture). No discussion of whether methods within the same project are independent test cases."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, or decontamination analysis is performed."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Augmenting prompts with static program analysis information increases the number of focal methods for which unit tests are generated compared to the whole-file baseline.",
    374       "evidence": "Tables 1-2 show consistent improvements across all models: on 103 commercial methods, llama7b generates tests for 102 vs 37 (baseline), llama70b 88 vs 9, codellama34b 63 vs 7. On 34 open-source methods, improvements range from 1→19 (codellama34b) to 32→34 (GPT-4).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "The program analysis approach produces significantly more concise prompts than the whole-file baseline.",
    379       "evidence": "Table 3 shows mean token counts of 559 (std 516) vs 5295 (std 3350), and median of 464 vs 4056. This is a ~10x reduction in prompt length.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "The approach should be portable across programming languages.",
    384       "evidence": "Section 7 (Portability) argues this based on the availability of parsers and symbolic resolvers for all languages, but provides no empirical evidence from non-Java languages.",
    385       "supported": "unsupported"
    386     },
    387     {
    388       "claim": "The baseline fails due to increased context length and extraneous information in the prompt.",
    389       "evidence": "Section 7 states this as a hypothesis supported by the token length statistics in Table 3. However, no controlled experiment isolates context length from information content as the causal factor.",
    390       "supported": "weak"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Company evaluating its own approach on its own codebase",
    396       "detail": "Seven of eight authors are from Ericsson R&D, and the primary evaluation (103 methods) is on Ericsson's own commercial Java project. The company has a direct interest in demonstrating the approach works. No independent evaluation is provided."
    397     },
    398     {
    399       "flag": "Very small sample sizes with no statistical testing",
    400       "detail": "The evaluation uses only 103 commercial focal methods and 34 open-source methods. No statistical significance tests, confidence intervals, or power analysis support the claims of improvement."
    401     },
    402     {
    403       "flag": "Generation-only metric ignores test quality",
    404       "detail": "The sole metric is whether any test was generated, ignoring compilation success, correctness, assertion quality, branch coverage, or test smells. A generated test that doesn't compile or test anything meaningful would count as a success. The authors acknowledge this limitation."
    405     },
    406     {
    407       "flag": "Contamination risk for open-source evaluation",
    408       "detail": "Guava is one of the most widely-used Java libraries. LLMs like Llama 2 and GPT-4 almost certainly trained on Guava source code and its existing test suite, potentially inflating results on the open-source evaluation."
    409     },
    410     {
    411       "flag": "No code or data released",
    412       "detail": "Neither the implementation, the specific focal methods evaluated, nor the generated tests are released. Results cannot be independently verified or reproduced."
    413     },
    414     {
    415       "flag": "Unbounded generalization claims",
    416       "detail": "Section 7 claims the approach 'should be portable across languages' and 'should also work for different complex methods' based solely on Java experiments on 2 projects."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    422       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    423       "year": 2023,
    424       "arxiv_id": "2302.06527",
    425       "relevance": "Directly relevant as an LLM-based unit test generation approach that relies on sample usages of the focal method."
    426     },
    427     {
    428       "title": "An Empirical Study of Using Large Language Models for Unit Test Generation",
    429       "authors": ["Mohammed Latif Siddiq", "Joanna C. S. Santos", "Ridwanul Hasan Tanvir", "Noshin Ulfat", "Fahmid Al Rifat", "Vinicius Carvalho Lopes"],
    430       "year": 2023,
    431       "arxiv_id": "2305.00418",
    432       "relevance": "Serves as the baseline approach in this paper's evaluation — treats unit test generation as a code completion task using the entire source file."
    433     },
    434     {
    435       "title": "Repository-level prompt generation for large language models of code",
    436       "authors": ["Disha Shrivastava", "Hugo Larochelle", "Daniel Tarlow"],
    437       "year": 2023,
    438       "relevance": "Related approach to augmenting LLM prompts with additional project context for code completion tasks."
    439     },
    440     {
    441       "title": "Monitor-Guided Decoding of Code LMs with Static Analysis of Repository Context",
    442       "authors": ["Lakshya Agrawal", "Aditya Kanade", "Navin Goyal", "Shuvendu K Lahiri", "Sriram Rajamani"],
    443       "year": 2023,
    444       "relevance": "Combines static analysis with LLMs for code generation via a white-box decoding approach, contrasting with this paper's black-box prompting."
    445     },
    446     {
    447       "title": "Unit test case generation with transformers and focal context",
    448       "authors": ["Michele Tufano", "Dawn Drain", "Alexey Svyatkovskiy", "Shao Kun Deng", "Neel Sundaresan"],
    449       "year": 2020,
    450       "relevance": "Encoder-decoder transformer approach to unit test generation using focal method pairs, foundational work in neural test generation."
    451     },
    452     {
    453       "title": "Toga: A neural method for test oracle generation",
    454       "authors": ["Elizabeth Dinella", "Gabriel Ryan", "Todd Mytkowicz", "Shuvendu K Lahiri"],
    455       "year": 2022,
    456       "relevance": "Neural approach to automatically generating test oracles, a key component of unit test generation."
    457     },
    458     {
    459       "title": "EvoSuite: Automatic Test Suite Generation for Object-Oriented Software",
    460       "authors": ["Gordon Fraser", "Andrea Arcuri"],
    461       "year": 2011,
    462       "doi": "10.1145/2025113.2025179",
    463       "relevance": "Foundational search-based test generation tool, represents the non-AI baseline approach to automated testing."
    464     },
    465     {
    466       "title": "Code Llama: Open Foundation Models for Code",
    467       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    468       "year": 2023,
    469       "arxiv_id": "2308.12950",
    470       "relevance": "One of the LLMs evaluated in this paper (CodeLlama 34b), a code-specialized open-source model."
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "The approach of augmenting prompts with static analysis info is practically useful for developers, but no tool or code is released for immediate use."
    477     },
    478     "surprise_contrarian": {
    479       "score": 0,
    480       "justification": "Confirms the expected intuition that providing more concise, precise context to LLMs makes them more likely to generate tests; output quality itself was not measured."
    481     },
    482     "fear_safety": {
    483       "score": 0,
    484       "justification": "No safety, security, or AI risk concerns raised."
    485     },
    486     "drama_conflict": {
    487       "score": 0,
    488       "justification": "No controversy or provocative claims."
    489     },
    490     "demo_ability": {
    491       "score": 0,
    492       "justification": "No code, demo, or tool released — cannot try it."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "Ericsson is a well-known telecom company but not prominent in AI research circles."
    497     }
    498   }
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs