scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27147B)
      1 {
      2   "paper": {
      3     "title": "BioPlanner: Automatic Evaluation of LLMs on Protocol Planning in Biology",
      4     "authors": [
      5       "Odhran O'Donoghue",
      6       "Aleksandar Shtedritski",
      7       "John Ginger",
      8       "Ralph Abboud",
      9       "Ali Essa Ghareeb",
     10       "Justin Booth",
     11       "Samuel G Rodriques"
     12     ],
     13     "year": 2023,
     14     "venue": "EMNLP 2023",
     15     "arxiv_id": "2310.10632"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper provides a GitHub link (https://github.com/bioplanner/bioplanner) in the footnote on page 1 and in Appendix E, stating 'The dataset and code for evaluation are available at' the URL."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The BIOPROT dataset is released alongside the code at the same GitHub repository. Section 3 states 'The dataset can be found in the Supplementary Materials' and Appendix E confirms public availability."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No mention of requirements.txt, Dockerfile, conda environment, or detailed dependency/version specifications found in the paper. Only the models used (GPT-3.5, GPT-4, Llama-2) and embedding model (text-embedding-ada-002) are named."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "While prompts are provided in the appendix and the evaluation framework is described, there are no step-by-step reproduction instructions (e.g., a README with commands to run the experiments). The paper describes the methodology but not how to execute the code."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Borderline. The paper reports mean +/- standard deviation over 5 runs in Tables 4 and 5, and the +/- notation functions much like error bars in conveying run-to-run uncertainty. However, a standard deviation across runs is a measure of spread, not a confidence interval in the statistical sense, and no formal confidence intervals are reported. Setting to false on that basis; the reported spread is credited under variance_reported instead."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper claims GPT-4 'outperforms' GPT-3.5 and that shuffling 'leads to a drop in performance' but provides no statistical significance tests (no p-values, t-tests, or similar). Comparisons are based solely on comparing mean values."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Results are reported with baseline context: e.g., GPT-4 achieves 70.6% next step accuracy vs GPT-3.5's 65%, and absolute performance numbers are given for all metrics (Tables 4-6, 9-10). The reader can compute effect sizes from the reported numbers with full context."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The dataset contains 100 protocols and experiments use 5 runs, but there is no justification for why 100 protocols or 5 runs were chosen, no power analysis, and no discussion of whether the sample size is adequate for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Tables 4, 5, and 9 explicitly report 'mean and standard deviation over 5 runs' with +/- notation for all main metrics. This provides variance information across experimental runs."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares GPT-4 against GPT-3.5 as a baseline, and additionally evaluates Llama-2-7B (Appendix D, Tables 9-10). It also includes a human benchmark comparison (Appendix F)."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "GPT-3.5, GPT-4 (2023), and Llama-2-7B (2023) were all contemporary models at the time of publication. These represented the state of the art for the evaluated task."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper systematically varies conditions: shuffled vs. non-shuffled function ordering and with/without feedback loops (Tables 4, 5), which function as ablation studies showing the contribution of ordering information and feedback mechanisms."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Multiple metrics are used: function accuracy, precision, recall, normalized Levenshtein distance, argument precision/recall, SciBERTScore, and BLEU (Tables 4, 5, 6). At least 5 different metrics per task."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Appendix F describes a human benchmarking study where an undergraduate biomedical sciences student completed next step prediction (n=32) and function selection (n=20) tasks. Sections 5.5-5.6 also describe real-world validation where a scientist reviewed generated protocols and one was executed in the lab."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All 100 protocols appear to be used for evaluation without a clear train/dev/test split. The paper does not describe holding out any subset for tuning or development. Since GPT-4 generates the ground truth pseudocode AND is evaluated as a student model, there is no separation."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Results are reported only as aggregate means across all 100 protocols. There is no breakdown by protocol difficulty, biology subdomain, protocol length, or any other category. Only the qualitative example in Appendix C shows a single protocol."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 5.2 discusses poor performance on function retrieval and offers explanations (semantic ambiguity between similar functions like Mix and MixSubstance). Section 3.3 discusses errors in generated pseudocode. Appendix D notes Llama-2 was 'unable to complete' next step prediction."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper reports that feedback loops did not consistently help (Table 5 shows mixed results with feedback), function retrieval performance was 'generally poor' (Section 5.2), GPT-4 as evaluator performed only 'slightly above chance' (Section 5.3), and Llama-2 failed on next step prediction entirely (Appendix D)."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims are well-supported: the automatic evaluation framework is presented (Sections 3-4), the BIOPROT dataset is introduced (Section 3), GPT-3.5 and GPT-4 are evaluated (Section 5), and external validation via lab execution is shown (Sections 5.5-5.6). Claims are appropriately hedged."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Causal claims are limited and justified. The paper uses controlled single-variable manipulation (shuffled vs. non-shuffled, with/without feedback) across 5 runs. Claims like 'shuffling the input functions consistently leads to a drop in performance' (Section 5.2) are supported by these controlled ablations."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper explicitly states its scope as biology protocols and acknowledges limitations: 'Our work is focused on biology, but could be extended to other fields such as chemistry and materials science' (Section 7). The title and abstract specify 'Biology' explicitly."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper discusses alternative explanations: for poor function retrieval, they suggest semantic ambiguity between function names (Section 5.2). For GPT-4 evaluator results, they note it is unclear whether the model cannot distinguish the two or whether the generations are actually correct (Section 5.3). For Llama-2's failure, they discuss training regime differences (Appendix D)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper refers to 'GPT-3.5', 'GPT-4', and 'Llama-2' without specific version identifiers or API snapshot dates. No version strings like 'gpt-4-0613' are provided. Section 5.1 says only 'We explore the performance of GPT-3.5 and GPT-4 from the OpenAI API.'"
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompts are provided in the appendix: Figure 6 (pseudofunction/pseudocode generation), Figure 7 (pseudocode prediction), Figure 8 (protocol summarization), Figure 9 (error messages for feedback), Figure 10 (protocol retrieval), Figure 12 (protocol generation from retrieved functions). These appear to be actual prompts, not just descriptions."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No temperature, top-p, max tokens, or other API hyperparameters are reported anywhere in the paper. Section 5.1 mentions 'text-embedding-ada-002 embeddings' for nearest neighbor search but no generation hyperparameters."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The scaffolding is described in detail: Section 3.2 describes the automatic feedback loop for pseudocode generation (detecting undefined functions, Python syntax errors, missing units). Section 5.5 describes the Toolformer-like chain-of-thought agent with protocol search tool, with chain-of-thought output shown in Figure 11."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix A documents dataset filtering in detail: automatic filtering criteria (no description, linked files, images, tables, fewer than 3 steps) and manual filtering criteria (not biology-related, poorly written). Section 3.3 documents the manual verification process and edit statistics (Table 3)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 7 is titled 'Limitations' and contains substantive discussion organized into three subsections: 'Use of paid API', 'Additional scientific fields', and 'Misuse'."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The limitations section includes specific threats: total API cost of approximately $1000 limits reproducibility for resource-constrained researchers (Section 7), the scope is limited to biology (Section 7), and there are potential misuse concerns with protocol generation. The GPT-4 self-evaluation concern (Section 5.3) is also specific."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly bounds scope: 'Our work is focused on biology' (Section 7), the dataset is limited to Protocols.io (Section 3.1), and the framework evaluates protocol planning specifically (not general scientific reasoning). The paper does not overclaim beyond these boundaries."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The BIOPROT dataset (source protocols, pseudocode, pseudofunctions) is available at the GitHub repository. The original protocols from Protocols.io are publicly accessible. This enables independent verification of the generated data."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3.1 describes data collection: protocols were collected from Protocols.io, a public platform with 9,000+ protocols. Automatic and manual filtering criteria are described in Appendix A. Section 3.2 describes the pseudocode translation process."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "The data source is a standard public platform (Protocols.io), not human participants. The human benchmarking in Appendix F uses one undergraduate student, but this is not a human subjects study. Marked not applicable because the main data comes from a public platform rather than recruited participants."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is documented: Protocols.io collection -> automatic filtering (Appendix A) -> manual filtering -> GPT-4 pseudocode generation (Section 3.2) -> automatic debugging feedback loop -> manual verification by scientist (Section 3.3). Table 3 shows the edit breakdown at the manual verification stage."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding or acknowledgments section is present in the paper. The authors are affiliated with Align to Innovate, Francis Crick Institute, Future House, and University of Oxford, but no funding sources are disclosed."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are listed on the first page: Align to Innovate, Francis Crick Institute, Future House, and University of Oxford. These are clearly disclosed."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding information is disclosed, so independence cannot be assessed. The paper evaluates OpenAI models (GPT-3.5, GPT-4), and authors are from organizations that may have relationships with model providers, but this is not explicitly addressed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement or financial interest declaration is present in the paper. Some authors are affiliated with 'Align to Innovate' and 'Future House', which may have commercial interests related to the findings, but this is not disclosed."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper does not state the training data cutoff dates for GPT-3.5, GPT-4, or Llama-2. This is relevant because the Protocols.io data used in BIOPROT was publicly available before these models were trained, so training data overlap is possible."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether the Protocols.io protocols (the evaluation data) could have been in the training data of GPT-3.5, GPT-4, or Llama-2. Since Protocols.io is a public platform, this overlap is plausible but unaddressed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "Protocols.io protocols are publicly available online and were published before the training cutoffs of the evaluated models. The paper does not discuss whether these specific protocols could have been included in training data, despite this being a clear contamination risk."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The paper's main evaluation is automated benchmark evaluation. The human benchmarking in Appendix F is a small validation exercise (one student), not a formal human subjects study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No formal human subjects study is conducted. The human benchmarking involved one undergraduate student as a convenience comparison, not a research study requiring ethics approval."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No formal human subjects study. The single participant is described as 'an undergraduate biomedical sciences student' which is minimal characterization but the study is not a human subjects experiment."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No formal human subjects study with participant recruitment. Only one student was used for human benchmarking."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No formal human subjects experiment with conditions requiring randomization."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No formal human subjects experiment requiring blinding."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No formal human subjects study with participants who could drop out."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Section 7 (Limitations) states: 'In total, we used approximately $1000 for API calls.' This provides the total cost of the experimental evaluation."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "The total API spend is stated as approximately $1000 (Section 7). While GPU hours for Llama-2 are not specified, the primary compute cost (API calls) is quantified."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "GPT-4 outperforms GPT-3.5 on next step prediction, with 70.6% vs 65% accuracy (unshuffled) and 57.0% vs 36.1% accuracy (shuffled).",
    294       "evidence": "Table 4, Section 5.2: Results reported as mean over 5 runs with standard deviations.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "GPT-4 significantly outperforms GPT-3.5 on protocol generation ordering (normalized Levenshtein distance), while showing similar precision and recall of function calls.",
    299       "evidence": "Table 5, Section 5.2: GPT-4 achieves a normalized Levenshtein distance (Ldn; lower is better) of 0.396 vs GPT-3.5's 0.498 (unshuffled, no feedback). Precision/recall are comparable.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Shuffling input functions consistently leads to a drop in performance across all tasks.",
    304       "evidence": "Tables 4 and 5: Every shuffled condition shows lower accuracy/higher Ldn than the corresponding unshuffled condition.",
    305       "supported": "strong"
    306     },
    307     {
    308       "claim": "GPT-4 as an evaluator performs only slightly above chance at distinguishing ground truth from generated protocols.",
    309       "evidence": "Table 8, Section 5.3: GPT-4 evaluator scores range from 35.6% to 43.9%, near the 50% chance level.",
    310       "supported": "strong"
    311     },
    312     {
    313       "claim": "An LLM-generated protocol was successfully executed in a real biology laboratory, producing viable E. coli cells.",
    314       "evidence": "Section 5.5-5.6 and Figure 3: The protocol was reviewed by a scientist and executed, with Figure 3 showing E. coli growth on agar plates vs. control.",
    315       "supported": "moderate"
    316     },
    317     {
    318       "claim": "59% of GPT-4-generated pseudocode required no manual edits, and the framework enables largely accurate dataset creation.",
    319       "evidence": "Table 3, Section 3.3: Manual verification showed 59% required no edits, 24% needed 1-3 edits, 17% needed more than 3 edits.",
    320       "supported": "strong"
    321     },
    322     {
    323       "claim": "Llama-2-7B significantly underperforms GPT-3.5 and GPT-4 in function selection and was unable to complete next step prediction.",
    324       "evidence": "Appendix D, Tables 9-10: Llama-2-7B shows substantially lower scores across all protocol generation metrics and failed entirely on next step prediction.",
    325       "supported": "strong"
    326     }
    327   ],
    328   "methodology_tags": [
    329     "benchmark-eval"
    330   ],
    331   "key_findings": "BioPlanner introduces an automatic evaluation framework for LLM protocol planning in biology, converting free-text protocols into pseudocode to enable structured comparison. The BIOPROT dataset of 100 manually-verified biology protocols shows GPT-4 outperforms GPT-3.5 on protocol planning tasks, particularly in ordering steps correctly (measured by Levenshtein distance), while Llama-2-7B substantially underperforms both. Function ordering information (non-shuffled input) significantly aids performance. A generated protocol was successfully executed in a real laboratory, demonstrating practical applicability of the framework.",
    332   "red_flags": [
    333     {
    334       "flag": "Benchmark contamination risk",
    335       "detail": "Protocols.io data is publicly available online and likely in the training data of GPT-3.5 and GPT-4. The paper does not discuss whether the evaluated models may have seen the evaluation protocols during training, which could inflate performance numbers."
    336     },
    337     {
    338       "flag": "GPT-4 generates its own evaluation data",
    339       "detail": "GPT-4 is used as the 'teacher' to generate the ground truth pseudocode (Section 3.2), and is then also evaluated as a 'student' on reconstructing pseudocode from its own pseudofunctions. This circular dependency may bias results in favor of GPT-4, as it generated the action space and ground truth in a way that may align with its own capabilities."
    340     },
    341     {
    342       "flag": "No statistical significance tests",
    343       "detail": "Claims that GPT-4 'outperforms' GPT-3.5 are based on comparing mean values without significance tests. Given the overlapping standard deviations in some metrics (e.g., argument-level metrics in Table 4), some claimed differences may not be statistically significant."
    344     },
    345     {
    346       "flag": "No model versions specified",
    347       "detail": "GPT-3.5 and GPT-4 are referenced without specific API version identifiers. Since model behavior changes across versions, results may not be reproducible."
    348     },
    349     {
    350       "flag": "Single-sample lab validation",
    351       "detail": "The real-world lab validation (Section 5.5-5.6) tested only one generated protocol (E. coli culture/cryopreservation), which is a relatively routine procedure. This single demonstration is presented as evidence that 'an LLM generates a protocol that is successfully executed,' but one success does not demonstrate general reliability."
    352     }
    353   ],
    354   "cited_papers": [
    355     {
    356       "title": "Emergent autonomous scientific research capabilities of large language models",
    357       "authors": ["Daniil A Boiko", "Robert MacKnight", "Gabe Gomes"],
    358       "year": 2023,
    359       "arxiv_id": "2304.05332",
    360       "relevance": "Demonstrates LLM-driven autonomous scientific experimentation (chemistry), directly relevant to evaluating LLM capabilities in science."
    361     },
    362     {
    363       "title": "ChemCrow: Augmenting large-language models with chemistry tools",
    364       "authors": ["Andres M Bran", "Sam Cox", "Andrew D White", "Philippe Schwaller"],
    365       "year": 2023,
    366       "arxiv_id": "2304.05376",
    367       "relevance": "LLM agent augmented with chemistry tools for synthesis planning and drug discovery, relevant to agentic AI evaluation."
    368     },
    369     {
    370       "title": "Voyager: An open-ended embodied agent with large language models",
    371       "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"],
    372       "year": 2023,
    373       "arxiv_id": "2305.16291",
    374       "relevance": "Open-ended LLM agent with self-improving code generation, relevant to agentic LLM evaluation methodology."
    375     },
    376     {
    377       "title": "Large language models are not fair evaluators",
    378       "authors": ["Peiyi Wang", "Lei Li", "Liang Chen"],
    379       "year": 2023,
    380       "arxiv_id": "2305.17926",
    381       "relevance": "Demonstrates systematic bias when using LLMs as evaluators, directly relevant to methodology quality of LLM evaluation."
    382     },
    383     {
    384       "title": "Sparks of artificial general intelligence: Early experiments with GPT-4",
    385       "authors": ["Sebastien Bubeck"],
    386       "year": 2023,
    387       "arxiv_id": "2303.12712",
    388       "relevance": "Major GPT-4 evaluation paper that relies on self-evaluation, relevant to evaluation methodology quality assessment."
    389     },
    390     {
    391       "title": "Toolformer: Language models can teach themselves to use tools",
    392       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessi"],
    393       "year": 2023,
    394       "arxiv_id": "2302.04761",
    395       "relevance": "Foundation for tool-augmented LLMs, directly relevant to agentic AI capabilities and evaluation."
    396     },
    397     {
    398       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    399       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    400       "year": 2023,
    401       "arxiv_id": "2305.10601",
    402       "relevance": "LLM planning and reasoning methodology, relevant to evaluating multi-step LLM capabilities."
    403     },
    404     {
    405       "title": "Self-Refine: Iterative refinement with self-feedback",
    406       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    407       "year": 2023,
    408       "arxiv_id": "2303.17651",
    409       "relevance": "Self-refinement framework for LLMs, relevant to feedback loop methodology used in this paper and agentic AI evaluation."
    410     },
    411     {
    412       "title": "GPT-4 Technical Report",
    413       "authors": ["OpenAI"],
    414       "year": 2023,
    415       "relevance": "Primary technical report for GPT-4, central to understanding the capabilities and limitations of the evaluated model."
    416     },
    417     {
    418       "title": "LLM+P: Empowering large language models with optimal planning proficiency",
    419       "authors": ["Bo Liu", "Yuqian Jiang", "Xiaohan Zhang"],
    420       "year": 2023,
    421       "arxiv_id": "2304.11477",
    422       "relevance": "Combines LLMs with planning frameworks, relevant to evaluating LLM planning capabilities."
    423     },
    424     {
    425       "title": "Generalized planning in PDDL domains with pretrained large language models",
    426       "authors": ["Tom Silver", "Soham Dan", "Kavitha Srinivas"],
    427       "year": 2023,
    428       "arxiv_id": "2305.11014",
    429       "relevance": "LLM planning evaluation in formal domains, relevant to methodology for evaluating LLM planning abilities."
    430     }
    431   ]
    432 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs