scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30795B)
      1 {
      2   "paper": {
      3     "title": "Python Symbolic Execution with LLM-powered Code Generation",
      4     "authors": [
      5       "Wenhan Wang",
      6       "Kaibo Liu",
      7       "An Ran Chen",
      8       "Ge Li",
      9       "Zhi Jin",
     10       "Gang Huang",
     11       "Lei Ma"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2409.09271",
     16     "doi": "10.48550/arXiv.2409.09271"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "LLM-Sym, an LLM agent augmenting an introductory-level symbolic execution engine for Python, can solve ~63% of execution path constraints on LeetCode problems by generating Z3 solver code via a multi-step pipeline with retrieval and self-refinement. The Z3 code generator (GPT-4o-mini backbone) achieves a 73% pass rate on paths it handles, higher than direct LLM solving, though GPT-4o alone solves more paths overall (71 vs 65). The approach costs $0.005 per path, roughly one-third the cost of GPT-4o direct solving.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No repository URL for LLM-Sym is provided. The paper links to The Fuzzing Book (the backbone engine) and a public LeetCode solutions repo, but the LLM-Sym code itself is not released."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The authors built a dataset of 111 execution traces from 50 LeetCode programs but provide no download link or release for this dataset. The LeetCode solutions source (walkccc/LeetCode) is public but their specific selection and trace extraction are not released."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions using GPT-4o-mini and GPT-4o with the AutoGen library and Sentence-BERT for retrieval, but provides no requirements.txt, library versions, or environment setup details."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No reproduction instructions, README, or scripts are provided. A reader would need to reverse-engineer the entire pipeline from the paper description."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results are reported as raw counts and percentages (e.g., Table 2: '70 (63.1%)') with no confidence intervals or error bars."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims LLM-Sym 'outperforms pure LLM-base approaches' (Section 1) and compares pass rates across settings (Tables 3, 5, 6, 7) but never applies any statistical test to these comparisons."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Tables 6 and 7 report pass rates for both LLM-Sym and baselines with raw numbers (e.g., LLM-Sym 65 path correct vs GPT-4o-mini 55 vs GPT-4o 71), providing enough context to assess the magnitude of differences."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The dataset consists of 50 LeetCode programs yielding 111 paths. No justification is given for why 50 programs or 111 paths is sufficient. Section 5.2 acknowledges the dataset is small but offers no power analysis."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Results appear to be from single runs. No standard deviation, variance, or spread across multiple runs is reported for any experiment."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 7 compares LLM-Sym against direct LLM solving with GPT-4o-mini and GPT-4o. The backbone engine (The Fuzzing Book) which 'can solve None of these traces' is also referenced as a baseline (Section 4.2)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The baselines use GPT-4o and GPT-4o-mini (2024 models). The paper also references SymPrompt (2024) as related work, though it does not compare against it numerically."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "RQ2 (Section 4.4) provides ablations: different numbers of retrieved templates (Table 3), chunking strategies (Table 5), and the self-refine module's behavior (Figures 11-12). RQ3 compares the Z3 code generator vs LLM solver components (Table 6)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Table 2 reports three metrics: SAT (constraint satisfiability), Execution pass (runnable test case), and Path correct (correct execution trace). Table 8 adds time and money cost metrics."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Evaluation is entirely automated: test cases are executed and execution traces are compared. No human evaluation of generated Z3 code quality, test case usefulness, or other outputs."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "The 14 retrieval templates in the knowledge base were manually created and it is unclear whether they were developed independently from the 111 test paths. No explicit train/dev/test separation is described."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Table 6 breaks down results by Z3 code generator vs LLM solver. Tables 3 and 5 break down by retrieval settings and chunking strategies. Table 4 shows retrieval recall at different k values."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Figure 10 shows a detailed failed generation example with SSA variable reference errors. Figure 12 shows a self-refine failure where the LLM cannot generate correct Z3Py for string operations."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Table 5 shows chunking by condition is worse than chunking by line across all settings. Figure 12 demonstrates a case where the self-refine module completely fails. Table 7 shows GPT-4o outperforms LLM-Sym overall (71 vs 65)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The introduction claims 'LLM-Sym can correctly solve around 60% of the given execution traces and outperforms pure LLM-base approaches.' However, Table 7 shows GPT-4o alone achieves 71 correct paths vs LLM-Sym's 65. The 'outperforms' claim only holds for GPT-4o-mini (55) but not GPT-4o, making this statement misleading."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The ablation studies (Tables 3, 5, 6) involve controlled single-variable manipulations: varying number of templates, changing chunking strategy, and comparing Z3 generator vs LLM solver. These are adequate designs for the causal claims made about component contributions."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The abstract claims LLM-Sym 'opens up new opportunities in LLM-augmented test case generation' based on only 50 LeetCode programs (≤25 lines). Section 5.2 partially acknowledges this but the title and abstract frame results broadly as 'Python Symbolic Execution' rather than bounding to short competitive programming problems."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "Section 5 discusses model choice and data limitations but does not consider alternative explanations for observed results. For example, it does not discuss whether GPT-4o-mini may have memorized LeetCode solutions, whether the knowledge base templates overfit to the test set, or whether the improvements come from the retrieval examples rather than the Z3 solving approach."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures path constraint solving (SAT, execution pass, path correct) and frames results as evaluating symbolic execution capability for test case generation. The measurements directly match the granularity of the claims being made."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "The paper uses 'GPT-4o-mini' and 'GPT-4o' throughout without specifying snapshot dates or API versions (e.g., 'gpt-4o-mini-2024-07-18'). Model behavior varies across versions."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "Figures 4, 6, 7, and 8 show prompt templates but with unfilled placeholders: '{Basic rules for Z3Py code generation}' (Figure 6), '{examples}' (Figures 4, 7), and '{Format template}' (Figures 7, 8). The actual content of these placeholders is not provided, making exact prompt reconstruction impossible."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported. The self-refine retry count (3 attempts) and max path length (20) are stated, but API sampling parameters are missing."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Figure 2 provides a complete workflow diagram. The paper describes in detail: CFG extraction (Section 3.1), type inference (Section 3.2.1), retrieval-augmented Z3 code generation with knowledge base (Section 3.2.2), self-refine mechanism (Section 3.2.2), test case generation (Section 3.2.3), and LLM solver fallback (Section 3.2.4)."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 4.2 describes: 50 LeetCode problems selected with solutions ≤25 lines, example test cases run to collect 111 execution traces, traces truncated to max length 20 (average 12.3), and CFG path extractor used to match traces to complete execution paths."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 5 'Threats to Validity' contains two subsections: 5.1 'Model validity' and 5.2 'Data validity', providing substantive discussion of limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 5.1 specifically discusses: support for only part of Python's 'list' operations (and no other data structures), dependence on Z3 solver capabilities, and restriction to GPT-4o-mini/4o models. Section 5.2 specifically notes the evaluation is on 'simple, restricted programs' from LeetCode only."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 5.2 states: 'As the first step of integrating LLMs with an SMT solver for symbolic execution, we should first evaluate simple, restricted programs and understand the behavior of LLM-Sym on these data before we move on to more complicated, real-world data.' Section 5.1 acknowledges 'only supports a part of the Python list operation and does not explicitly support other data structures.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The 111 execution paths, generated Z3 code, and test cases are not released. Only aggregate results in tables are provided. The LeetCode solution source repo is referenced but the specific selections and traces are not available."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 4.2 describes: 50 problems selected from LeetCode, Python solutions collected from public GitHub repo (walkccc/LeetCode), solutions with up to 25 lines of code, example test cases executed to collect 111 traces."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data source is LeetCode problems selected from a public repository, which is described in Section 4.2."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 4.2 documents: LeetCode problem selection (50 problems, ≤25 lines) → solution collection from public repo → execution of example test cases → 111 traces collected → truncation to max length 20 → CFG path matching. Each step and its output count are described."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding acknowledgments section is present in the paper. No grants or sponsors are mentioned."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: University of Alberta, Peking University, and University of Tokyo. No commercial product is being evaluated — the paper evaluates its own research prototype using third-party LLMs."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses OpenAI models (GPT-4o, GPT-4o-mini) but no relationship with OpenAI is declared."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is included in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The paper uses GPT-4o and GPT-4o-mini to solve LeetCode problems but never states the models' training data cutoff dates. LeetCode solutions are widely available online and likely in training data."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The LeetCode solutions are from a public GitHub repository (walkccc/LeetCode) that GPT-4o/mini could have seen during training. This potential overlap is never discussed."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "LeetCode problems and their solutions have been publicly available for years before GPT-4o's training. The paper does not address whether the models could solve the constraints simply by having memorized the solutions rather than through genuine symbolic reasoning."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study. Evaluation is entirely automated on LeetCode programs."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Table 8 reports per-sample and total costs: LLM-Sym costs $0.005 per path ($0.61 total), GPT-4o costs $0.018 per path ($1.96 total). Time per path is also reported: 20.2s for LLM-Sym vs 10.7s for GPT-4o."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Table 8 reports total time (2241s for LLM-Sym) and total API cost ($0.61). The paper notes all experiments use the OpenAI API with no local GPU compute."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of multiple random seeds or seed sensitivity. All results appear to be from single runs despite LLM outputs being stochastic."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is never stated. Results are presented as single-run numbers without clarification."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search is described. The 14 knowledge base templates and self-refine retry count (3) appear to be manually chosen without systematic search."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Tables 3 and 5 report results for all tested configurations (1-5 templates, two chunking strategies) rather than only the best. The default setting (2 templates, chunk by line) is justified by the results shown."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "No statistical tests are performed, let alone corrections for multiple comparisons. Multiple configurations are compared using only raw counts."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors compare their LLM-Sym system against their own implementation of the LLM solver baseline (Table 7). No acknowledgment of author-evaluation bias is provided."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Table 8 directly compares performance (path correct from Table 7) against time and money cost for LLM-Sym, GPT-4o-mini, and GPT-4o. The cost-performance tradeoff is explicitly discussed in RQ4."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether LeetCode problems (short competitive programming solutions ≤25 lines) are representative of real-world symbolic execution needs. No analysis of whether performance on these problems generalizes to actual software testing scenarios."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "LLM-Sym uses a complex multi-step pipeline (type inference, retrieval, generation, self-refine) while the GPT-4o baseline uses a single prompt. The performance difference could be due to the scaffold rather than the Z3 integration, but this confound is not discussed."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "LeetCode problems and solutions from walkccc/LeetCode have been publicly available for years. GPT-4o/mini were trained on data including such repositories. This temporal leakage is not discussed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "The LLM receives the full program under test and execution path as input. If the model has memorized the LeetCode solution, it could bypass symbolic reasoning entirely. This potential feature leakage is not discussed."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "Multiple execution traces (111 total) are drawn from only 50 programs, creating non-independence between traces from the same program. This is not discussed."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, or decontamination analysis is performed."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "LLM-Sym can correctly solve around 63% of execution path constraints on LeetCode problems.",
    373       "evidence": "Table 2 (Section 4.3): 70 out of 111 paths (63.1%) produce correct execution traces. 99 (89.2%) are satisfiable, 97 (87.4%) pass execution.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "LLM-Sym outperforms pure LLM-based approaches in solving path constraints.",
    378       "evidence": "Table 7 (Section 4.5): LLM-Sym achieves 65 correct paths vs GPT-4o-mini's 55, but GPT-4o alone achieves 71. The 'outperforms' claim is supported only against GPT-4o-mini, not GPT-4o. The paper emphasizes the Z3-subset pass rate (73% vs 64.9%) to support the claim.",
    379       "supported": "weak"
    380     },
    381     {
    382       "claim": "The Z3 code generator achieves higher pass rates than the LLM solver on paths it handles.",
    383       "evidence": "Table 6 (Section 4.5): Z3 code generator achieves 73.0% pass rate with 2 templates vs LLM solver's 47.8%. This pattern holds across all retrieval settings (63.6-73.0% Z3 vs 37.5-58.6% LLM).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "LLM-Sym is cost-effective: its cost is less than one-third of GPT-4o while achieving similar results.",
    388       "evidence": "Table 8 (Section 4.6): LLM-Sym costs $0.61 total ($0.005/path) vs GPT-4o's $1.96 ($0.018/path). LLM-Sym achieves 65 correct paths vs GPT-4o's 71, so results are comparable but not identical.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Retrieval of 2 templates yields the best performance for the Z3 code generator.",
    393       "evidence": "Table 3 (Section 4.4): 2 templates yield 65 correct paths, higher than 1 template (54), 3 templates (62), 4 templates (61), or 5 templates (62).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "LLM-Sym is the first Python symbolic execution engine supporting Python 'list' without concolic execution.",
    398       "evidence": "Section 1 claims this as a contribution. The paper describes list support via Z3.Array approximation (Table 1, Section 3.2.2). However, this is a novelty claim that cannot be verified from the paper alone.",
    399       "supported": "weak"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "Misleading 'outperforms' claim",
    405       "detail": "The paper claims LLM-Sym 'outperforms pure LLM-base approaches' (Section 1), but Table 7 shows GPT-4o alone achieves 71 correct paths vs LLM-Sym's 65. The claim is supported only by cherry-picking the comparison with GPT-4o-mini (55) or by focusing on the Z3-subset pass rate rather than overall performance."
    406     },
    407     {
    408       "flag": "Very small dataset",
    409       "detail": "Only 50 LeetCode programs (each solution ≤25 lines) yielding 111 execution paths. This is a very narrow evaluation scope for claims about 'Python Symbolic Execution.' The non-independence of multiple traces from the same program further reduces effective sample size."
    410     },
    411     {
    412       "flag": "No contamination analysis on LeetCode data",
    413       "detail": "LeetCode solutions are widely available online and almost certainly in GPT-4o/mini training data. The LLM may solve constraints by pattern-matching memorized solutions rather than genuine symbolic reasoning. This fundamental validity threat is never addressed."
    414     },
    415     {
    416       "flag": "No error bars or multiple runs",
    417       "detail": "All results are from apparent single runs of stochastic LLM calls. No variance, confidence intervals, or seed sensitivity analysis is reported despite LLM outputs being non-deterministic."
    418     },
    419     {
    420       "flag": "No code or data released",
    421       "detail": "Neither the LLM-Sym implementation nor the 111-path dataset is released, making independent verification impossible."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Codamosa: Escaping coverage plateaus in test generation with pre-trained large language models",
    427       "authors": ["Caroline Lemieux", "Jeevana Priya Inala", "Shuvendu K Lahiri", "Siddhartha Sen"],
    428       "year": 2023,
    429       "relevance": "Integrates LLMs with search-based testing (Pynguin) for test coverage improvement, directly relevant to LLM-augmented test generation."
    430     },
    431     {
    432       "title": "ChatUniTest: A framework for LLM-based test generation",
    433       "authors": ["Yinghao Chen", "Zehao Hu", "Chen Zhi", "Junxiao Han", "Shuiguang Deng", "Jianwei Yin"],
    434       "year": 2024,
    435       "relevance": "LLM-based unit test generation framework for Java, demonstrating LLM capabilities in automated test generation."
    436     },
    437     {
    438       "title": "Code-Aware Prompting: A Study of Coverage-Guided Test Generation in Regression Setting using LLM",
    439       "authors": ["Gabriel Ryan", "Siddhartha Jain", "Mingyue Shang", "Shiqi Wang", "Xiaofei Ma", "Murali Krishna Ramanathan", "Baishakhi Ray"],
    440       "year": 2024,
    441       "relevance": "SymPrompt combines symbolic analysis with LLM test generation using execution paths in prompts — the closest related work to LLM-Sym."
    442     },
    443     {
    444       "title": "HITS: High-coverage LLM-based Unit Test Generation via Method Slicing",
    445       "authors": ["Zejun Wang", "Kaibo Liu", "Ge Li", "Zhi Jin"],
    446       "year": 2024,
    447       "arxiv_id": "2408.11324",
    448       "relevance": "Applies program slicing to improve LLM-generated test coverage for Java methods."
    449     },
    450     {
    451       "title": "Logic-LM: Empowering Large Language Models with Symbolic Solvers for Faithful Logical Reasoning",
    452       "authors": ["Liangming Pan", "Alon Albalak", "Xinyi Wang", "William Wang"],
    453       "year": 2023,
    454       "relevance": "Demonstrates LLMs calling Z3 solver for logical reasoning, foundational to the LLM+solver integration approach."
    455     },
    456     {
    457       "title": "SatLM: Satisfiability-aided language models using declarative prompting",
    458       "authors": ["Xi Ye", "Qiaochu Chen", "Isil Dillig", "Greg Durrett"],
    459       "year": 2023,
    460       "relevance": "LLMs using SAT/SMT solvers via declarative prompting for constraint satisfaction, directly related to LLM+solver integration."
    461     },
    462     {
    463       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    464       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    465       "year": 2024,
    466       "relevance": "Multi-agent framework used as the implementation backbone for LLM-Sym's agent architecture."
    467     },
    468     {
    469       "title": "Software testing with large language models: Survey, landscape, and vision",
    470       "authors": ["Junjie Wang", "Yuchao Huang", "Chunyang Chen", "Zhe Liu", "Song Wang", "Qing Wang"],
    471       "year": 2024,
    472       "relevance": "Comprehensive survey of LLM applications in software testing, covering the broader research landscape."
    473     },
    474     {
    475       "title": "An empirical evaluation of using large language models for automated unit test generation",
    476       "authors": ["Max Schäfer", "Sarah Nadi", "Aryaz Eghbali", "Frank Tip"],
    477       "year": 2023,
    478       "relevance": "Empirical study of LLM-generated unit tests evaluating correctness, coverage, and bug-finding."
    479     },
    480     {
    481       "title": "Evaluating and improving ChatGPT for unit test generation",
    482       "authors": ["Zhiqiang Yuan", "Mingwei Liu", "Shiji Ding"],
    483       "year": 2024,
    484       "relevance": "ChatTester: evaluates and improves LLM-based unit test generation, relevant to the LLM testing pipeline."
    485     },
    486     {
    487       "title": "Pynguin: Automated unit test generation for Python",
    488       "authors": ["Stephan Lukasczyk", "Gordon Fraser"],
    489       "year": 2022,
    490       "relevance": "Search-based Python test generation tool; the primary automated testing tool for Python used as context for LLM-Sym's contribution."
    491     },
    492     {
    493       "title": "Code Agents are State of the Art Software Testers",
    494       "authors": ["Niels Mündler", "Mark Niklas Müller", "Jingxuan He", "Martin Vechev"],
    495       "year": 2024,
    496       "arxiv_id": "2406.12952",
    497       "relevance": "SWT-Bench: evaluates LLM agents as software testers, relevant to agentic testing approaches."
    498     },
    499     {
    500       "title": "Large-scale, Independent and Comprehensive study of the power of LLMs for test case generation",
    501       "authors": ["Wendkûuni C Ouédraogo", "Kader Kaboré", "Haoye Tian"],
    502       "year": 2024,
    503       "arxiv_id": "2407.00225",
    504       "relevance": "Large-scale empirical study of LLM test generation capabilities across different prompt designs."
    505     }
    506   ],
    507   "engagement_factors": {
    508     "practical_relevance": {
    509       "score": 1,
    510       "justification": "Research prototype with no released code; limited to list operations on short LeetCode programs, far from practical symbolic execution use."
    511     },
    512     "surprise_contrarian": {
    513       "score": 1,
    514       "justification": "Novel combination of LLMs with SMT solvers for symbolic execution, but does not challenge conventional wisdom — extends known approaches."
    515     },
    516     "fear_safety": {
    517       "score": 0,
    518       "justification": "No safety, security, or risk implications discussed or demonstrated."
    519     },
    520     "drama_conflict": {
    521       "score": 0,
    522       "justification": "No controversial claims or conflicts; straightforward technical contribution."
    523     },
    524     "demo_ability": {
    525       "score": 0,
    526       "justification": "No code, demo, or tool released. Cannot be tried by anyone."
    527     },
    528     "brand_recognition": {
    529       "score": 1,
    530       "justification": "Uses GPT-4o/mini (recognizable), but the work comes from academic labs (University of Alberta, Peking University, University of Tokyo) without major brand cachet."
    531     }
    532   }
    533 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs