scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21977B)
      1 {
      2   "paper": {
      3     "title": "Automating REST API Postman Test Cases Using LLM",
      4     "authors": [
      5       "S Deepika Sri",
      6       "Mohammed Aadil S",
      7       "Sanjjushri Varshini R",
      8       "Raja CSP Raman",
      9       "Gopinath Rajagopal",
     10       "S Taranath Chan"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv",
     14     "arxiv_id": "2404.10678",
     15     "doi": "10.48550/arXiv.2404.10678"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "A GitHub link is provided at the top of the paper: https://github.com/tactlabs/test-case-generation."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper states the model is 'trained using manually collected postman test cases' but no dataset is released or linked. The GitHub repo is mentioned but no data download link is provided in the paper."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. The tech stack section mentions Flask, Postman, and OpenAI but provides no library versions or dependency specifications."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions are provided. The paper describes the architecture at a high level (training phase, testing phase) but provides no commands, configuration steps, or concrete instructions for replication."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No quantitative results are reported at all. The Results section (Section 5) is entirely qualitative, describing screenshots of the interface rather than any measured outcomes. No confidence intervals or error bars appear anywhere."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No statistical significance tests are used. The paper makes claims about efficacy and efficiency but provides no comparative numerical results and no statistical testing."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No effect sizes or quantitative improvements are reported. The paper contains no numerical performance metrics of any kind."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper mentions using 'manually collected postman test cases' for training but never states how many test cases were collected or why that number is sufficient. No sample size information is provided."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance, standard deviation, or any measure of variability is reported. There are no quantitative results at all."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No baselines are included. The paper does not compare its approach against any prior method, manual test case writing, or alternative tool. The Results section shows only screenshots of the system's interface."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No baselines are included at all, so there are no contemporary baselines to satisfy this criterion. The literature review mentions RESTTESTGEN and other tools but none are used for comparison."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No ablation study is conducted. The system has multiple components (data collection, preprocessing, OpenAI model, Flask interface) but none are varied or removed to assess their contribution."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No evaluation metrics of any kind are reported. The Results section contains no quantitative measurements — only qualitative descriptions and screenshots."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation of the generated test cases is reported. There is no user study, expert review, or manual assessment of the quality or correctness of the generated outputs."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No held-out test set is mentioned. The paper does not describe any train/test split or evaluation on unseen data."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No per-category or per-API breakdown is provided. Results are not reported in any granular way — there are no tables or figures with quantitative data."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No failure cases are discussed. The paper presents only a positive narrative about the system's capabilities without any discussion of where the approach fails or produces incorrect test cases."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "No negative results are reported. Every description is positive, with no mention of approaches that failed, limitations encountered during development, or configurations that did not work."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The abstract claims the approach 'enhance[s] the efficiency and effectiveness of test case generation' and provides 'comprehensive testing,' but the Results section contains no quantitative evidence supporting these claims — only screenshots of the web interface."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes causal claims such as 'LLMs enhance the creation of Postman test cases' and the methodology 'streamlines the test case generation process.' No controlled experiment or adequate study design supports these causal assertions."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper makes broad claims about 'comprehensive testing' and 'a wide range of REST API properties' without bounding these generalizations to the specific APIs tested. No scope limitations on the types of REST APIs or domains are stated."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "No alternative explanations are discussed. The paper does not consider whether simpler approaches (e.g., template-based test generation) could achieve similar results, or whether observed outputs are due to OpenAI's general capabilities rather than the proposed methodology."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper mentions 'OpenAI's language models, such as GPT-3' in Section 4.3 but does not specify an exact model version, API version, or snapshot date. The reference to GPT-3 is illustrative and does not identify which model was actually used."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "No prompts or system instructions are provided. The paper describes the approach at a high level but never shows the actual prompt text sent to the OpenAI model for generating test cases."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No hyperparameters are reported. Neither temperature, top-p, max tokens, learning rate, nor any other configuration parameter is mentioned."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper describes a system with training, loading, and inference phases but provides insufficient detail about the actual scaffolding — no specifics on how the OpenAI API is called, how prompts are constructed from user input, or how outputs are post-processed into Postman format."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "Section 3.1.3 mentions 'tokenization, normalization, and potentially data augmentation' in generic terms but provides no concrete details about what preprocessing was actually performed on the collected Postman test cases."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no limitations or threats-to-validity section. The paper has a Future Scope section (Section 6) that discusses extensions but does not discuss any limitations of the current work."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed anywhere in the paper. There is no acknowledgment of potential weaknesses or methodological concerns."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No scope boundaries are stated. The paper does not specify what types of REST APIs were tested, what the approach cannot handle, or what claims the authors are not making."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The raw training data (manually collected Postman test cases) is not made available. No data files, download links, or supplementary materials are provided."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper says data was 'manually collected postman test cases or instances for various Rest APIs' but provides no details on how many were collected, from which APIs, by whom, or over what time period."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants are involved. The data consists of Postman test cases, not human subject data. The paper is a tool/system paper, not a human study."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The data pipeline is described only in generic terms (Section 3.1: collection, storage, preprocessing, training, storage). No concrete details are given about the number of examples at each stage, filtering criteria, or actual transformations applied."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "Author names and email addresses are listed but no institutional affiliations are provided. It is unclear whether the authors are affiliated with any company or university."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of any funding or conflict-of-interest statement means this criterion is not satisfied."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is included in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper uses OpenAI models (GPT-3 mentioned) to generate test cases but does not state the model's training data cutoff date. This matters because the model may have seen Postman test case patterns during pre-training."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether the OpenAI model may have seen similar Postman test cases during its pre-training. The paper fine-tunes or prompts an LLM but does not address potential overlap between training data and evaluation scenarios."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No benchmark contamination is discussed. The paper does not consider whether the REST APIs used for evaluation or their test patterns could have been in the LLM's pre-training data."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants are involved in this study. It is a tool/system paper with no user study component."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost, API cost, tokens consumed, or latency figures are reported. The system calls OpenAI's API but the paper does not quantify the cost."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No computational budget is stated. Neither training time, hardware used, nor total API spend is mentioned."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "The LLM-based approach enhances the efficiency and effectiveness of test case generation for REST APIs.",
    294       "evidence": "No quantitative evidence is provided. The Results section (Section 5) shows screenshots of the web interface but reports no metrics comparing efficiency or effectiveness to any baseline.",
    295       "supported": "unsupported"
    296     },
    297     {
    298       "claim": "LLMs can intelligently formulate test cases that cover a broad range of REST API properties, ensuring comprehensive testing.",
    299       "evidence": "No measurement of coverage breadth is provided. The paper shows example output screenshots but does not quantify what proportion of API properties are covered or define 'comprehensive.'",
    300       "supported": "unsupported"
    301     },
    302     {
    303       "claim": "The approach reduces the workload for developers and testers.",
    304       "evidence": "No user study, time comparison, or workload measurement is provided. This claim is asserted without evidence in Section 5.",
    305       "supported": "unsupported"
    306     },
    307     {
    308       "claim": "The model conforms to current technological standards and holds promise for future advancements.",
    309       "evidence": "No evidence is provided for conformance to standards. This is a subjective assertion in the abstract and conclusion without supporting data.",
    310       "supported": "unsupported"
    311     }
    312   ],
    313   "methodology_tags": [
    314     "case-study"
    315   ],
    316   "key_findings": "The paper proposes a system that uses OpenAI's LLMs to automatically generate Postman test cases for REST APIs. The system is implemented as a Flask web application where users input a REST API endpoint and the desired number of test cases. The paper provides no quantitative evaluation of the generated test cases — the Results section consists entirely of screenshots of the web interface. No metrics on test case quality, coverage, correctness, or comparison to baselines are reported.",
    317   "red_flags": [
    318     {
    319       "flag": "No quantitative evaluation",
    320       "detail": "The Results section (Section 5) contains only screenshots and qualitative descriptions. There are zero quantitative metrics, tables, or figures with data. The paper makes claims about 'efficiency,' 'effectiveness,' and 'comprehensive testing' without any measurements."
    321     },
    322     {
    323       "flag": "No baselines or comparisons",
    324       "detail": "The paper does not compare its approach to any baseline — not manual test writing, not existing tools like RESTTESTGEN (which is cited in the literature review), not even a random generation approach."
    325     },
    326     {
    327       "flag": "Claims far exceed evidence",
    328       "detail": "The abstract and conclusion make broad claims about enhancing efficiency, comprehensive coverage, and conforming to technological standards, but the paper provides no evidence for any of these assertions."
    329     },
    330     {
    331       "flag": "Vague methodology",
    332       "detail": "The Proposed Solution (Section 3) describes generic machine learning training/testing phases using textbook descriptions (backpropagation, gradient descent, loss functions) but provides no specifics about the actual implementation — what model was used, how it was fine-tuned, what the prompts look like, or what the training data contains."
    333     },
    334     {
    335       "flag": "Missing model specification",
    336       "detail": "Section 4.3 mentions 'GPT-3' as an example of OpenAI models but never clarifies which model was actually used. The description of 'fine-tuning the pre-trained model' vs. the mention of APIs suggests confusion about whether the approach involves fine-tuning or prompting."
    337     },
    338     {
    339       "flag": "No limitations discussed",
    340       "detail": "The paper has no limitations section and does not acknowledge any weaknesses. The Future Scope section discusses extensions but not current shortcomings."
    341     }
    342   ],
    343   "cited_papers": [
    344     {
    345       "title": "Large language model assisted software engineering: prospects, challenges, and a case study",
    346       "authors": ["Lenz Belzner", "Thomas Gabor", "Martin Wirsing"],
    347       "year": 2023,
    348       "relevance": "Surveys LLM applications across the software engineering lifecycle, relevant to the scope of AI-assisted programming."
    349     },
    350     {
    351       "title": "Generate and pray: Using sallms to evaluate the security of llm generated code",
    352       "authors": ["Mohammed Latif Siddiq", "Joanna Santos"],
    353       "year": 2023,
    354       "arxiv_id": "2311.00889",
    355       "relevance": "Proposes a framework to systematically assess LLMs' ability to generate secure code, relevant to LLM code generation quality evaluation."
    356     },
    357     {
    358       "title": "TestSpark: IntelliJ IDEA's Ultimate Test Generation Companion",
    359       "authors": ["Arkadii Sapozhnikov"],
    360       "year": 2024,
    361       "arxiv_id": "2401.06580",
    362       "relevance": "An IDE plugin supporting LLM-based test generation techniques, directly relevant to automated testing with LLMs."
    363     },
    364     {
    365       "title": "Exploring the effectiveness of large language models in generating unit tests",
    366       "authors": ["Mohammed Latif Siddiq"],
    367       "year": 2023,
    368       "arxiv_id": "2305.00418",
    369       "relevance": "Empirical study of LLM effectiveness in generating JUnit tests, relevant to evaluating LLM code generation capabilities."
    370     },
    371     {
    372       "title": "Effective test generation using pre-trained large language models and mutation testing",
    373       "authors": ["Arghavan Moradi Dakhel"],
    374       "year": 2024,
    375       "relevance": "Evaluates LLM-generated test quality using mutation testing, relevant to methodology for assessing LLM-generated code."
    376     },
    377     {
    378       "title": "Automated unit test improvement using large language models at Meta",
    379       "authors": ["Nadia Alshahwan"],
    380       "year": 2024,
    381       "arxiv_id": "2402.09171",
    382       "relevance": "Industry-scale evaluation of LLMs for unit test improvement at Meta, relevant to practical deployment of AI coding tools."
    383     },
    384     {
    385       "title": "Resttestgen: automated black-box testing of restful apis",
    386       "authors": ["Emanuele Viglianisi", "Michael Dallago", "Mariano Ceccato"],
    387       "year": 2020,
    388       "relevance": "Prior work on automated REST API test generation using Swagger definitions, a direct baseline for this paper's domain."
    389     },
    390     {
    391       "title": "Empowering llm to use smartphone for intelligent task automation",
    392       "authors": ["Hao Wen"],
    393       "year": 2023,
    394       "arxiv_id": "2308.15272",
    395       "relevance": "AutoDroid system integrating LLMs for mobile task automation, relevant to agentic AI applications."
    396     },
    397     {
    398       "title": "Jailbreaker: Automated jailbreak across multiple large language model chatbots",
    399       "authors": ["Gelei Deng"],
    400       "year": 2023,
    401       "arxiv_id": "2307.08715",
    402       "relevance": "Automated jailbreak attacks on LLM chatbots, relevant to AI safety and security evaluation."
    403     }
    404   ]
    405 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs