scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24344B)
      1 {
      2   "paper": {
      3     "title": "LLM-ProS: Analyzing Large Language Models' Performance in Competitive Problem Solving",
      4     "authors": ["Md Sifat Hossain", "Anika Tabassum", "Md. Fahim Arefin", "Tarannum Shaila Zaman"],
      5     "year": 2025,
      6     "venue": "arXiv (IEEE preprint)",
      7     "arxiv_id": "2502.04355"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "On 166 ICPC World Finals problems (2011-2024), o1-mini and o1-preview achieved acceptance rates of 9.64% and 9.04% respectively, while GPT-4o, Mistral Large, and Llama-3.1-405B achieved 0% accepted solutions. Performance dropped on unseen 2024 problems relative to 2017 (o1-mini 25.0% → 15.4%; o1-preview 25.0% → 7.7%), suggesting data contamination inflates earlier-year results. The o1 models' chain-of-thought reasoning was identified as the key differentiator.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "A GitHub repository is provided in reference [27]: https://github.com/sifat-hossain-niloy/LLMs-Performance-in-ICPC-Problems."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The dataset of 166 ICPC World Finals problems is scraped from the ICPC official website, which is publicly available. The GitHub repo contains scripts and solutions."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions 'publicly available scripts' and 'reproducible submissions' but provides no step-by-step reproduction instructions in the paper itself."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point estimates (e.g., '25.0% accuracy', '9.64% AC rate') with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims o1 models 'significantly outperform others' but provides no statistical significance tests. Comparisons are based solely on raw numbers."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Raw accuracy percentages are reported but no formal effect sizes (Cohen's d, odds ratios, etc.). The accuracy differences are presented without baseline context beyond raw numbers."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "166 ICPC problems are used with no justification for this sample size or discussion of whether it is sufficient for the claims being made."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Single-run results are reported (pass@1 setting). No variance, standard deviation, or multiple-run results are presented."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Five models are compared against each other: GPT-4o, Mistral Large, Llama-3.1-405B, o1-mini, and o1-preview. The general-purpose models effectively serve as baselines for the o1 family."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "All five models tested (GPT-4o, Mistral Large, Llama-3.1-405B, o1-mini, o1-preview) were contemporary state-of-the-art models at the time of the study."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "The paper evaluates existing models as-is with no system components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics are used: accuracy (pass@1), verdict distribution (AC/WA/CE/RE/TLE/MLE), resource utilization (runtime and memory; listed as a metric, although no runtime or memory data is actually presented), and temporal comparison (pre/post-2024)."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is irrelevant — correctness is determined by automated judge on Codeforces Gym."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The 2024 ICPC problems serve as a temporal held-out set, tested within 24 hours of release to ensure no contamination."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Figure 3 and surrounding text provide per-category breakdown (Implementation, Graph Theory, Math, Geometry, Greedy, etc.)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Verdict distribution analysis (Table II, Figure 4) discusses failure modes: compile errors, wrong answers, TLE, runtime errors across all models."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "GPT-4o, Mistral Large, and Llama-3.1-405B achieving 0% accuracy across all years is a substantial negative result. Performance drops on 2024 data are also reported."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about o1 models outperforming others, significant differences, and dataset contamination impact are all supported by the results in Section IV-B."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper attributes o1 models' superiority to 'chain-of-thought reasoning and calibration' and 'specialized fine-tuning' without controlled experiments isolating these factors. These are causal claims from observational comparisons of different model architectures."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Performance in Competitive Problem Solving' broadly but only tests ICPC World Finals problems. The abstract mentions 'optimizing LLMs for algorithmic tasks' and 'novel problems' without bounding to ICPC specifically. Section V-B acknowledges external validity limitations but the title and framing remain broad."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section IV-B RQ3 discusses three alternative explanations for performance variability: dataset contamination, training methodologies, and reasoning strategies. The threats to validity section (Section V) also discusses alternative factors."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper uses Codeforces Gym acceptance as a proxy for 'reasoning, accuracy, and efficiency' and 'problem-solving capabilities' without discussing the gap between automated judge verdicts and these broader constructs."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are identified only by marketing names: 'GPT-4o', 'Mistral Large', 'Llama-3.1-405B', 'o1-mini', 'o1-preview'. No API versions, snapshot dates, or specific model IDs are provided."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Section III-B describes prompt formatting in natural language ('Standardized Prompt Formatting', 'Template Customization') but the actual prompt text is not provided in the paper or appendix."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the API calls."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used — models are prompted directly in a pass@1 setting."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section III-B describes five preprocessing steps in detail: extraction of problem components, standardized prompt formatting, text cleaning, template customization per model, and manual validation."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section V 'Threats to Validity' provides a dedicated, substantive discussion covering four types of threats: internal, external, construct, and conclusion validity."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The threats are specific to this study: data contamination of ICPC problems in training data (internal), ICPC not representing broader software engineering (external), zero-shot testing not leveraging full model capabilities (construct), and Codeforces Gym platform dependency (conclusion)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section V-B explicitly states 'ICPC problems... represent only a subset of the programming challenges faced in broader software engineering' and V-C notes zero-shot testing 'may not fully leverage the models' capabilities.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The GitHub repository [27] contains the generated solutions, and ICPC problems are publicly available. Codeforces submissions are logged for reproducibility."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section III-A describes scraping 166 problems from ICPC official website, covering World Finals 2011-2024, with details on problem structure (Table I)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; data source is a standard public benchmark (ICPC problems)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The full pipeline is documented in Sections III-A through III-D: scraping → preprocessing → model testing → solution generation and submission, with tools specified (Selenium, PyPDF2, APIs)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: University of Dhaka (Department of CSE) and University of Maryland, Baltimore County (Department of Information Systems)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed. The paper evaluates OpenAI products (GPT-4o, o1 family) without any disclosure about potential relationships."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper discusses contamination risk (Section IV-B RQ3) but never states the actual training cutoff dates for any of the five models tested."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section IV-B RQ3 and Section V-A discuss data contamination as a threat, noting ICPC problems may exist in training data, and the 2024 problems were tested within 24 hours of the contest."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "The paper explicitly addresses contamination by comparing pre-2024 (potentially contaminated) vs. 2024 (unseen) results, and notes the performance drop as evidence of contamination effects. Section V-A discusses this mitigation strategy."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No API costs, tokens consumed, or per-problem inference costs are reported despite using five commercial/large model APIs across 166 problems each."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget, API spend, or hardware used is stated."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Single pass@1 results only. No multiple seeds or runs reported."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The paper does not state how many runs were performed. The pass@1 setting implies single attempts but this is not explicitly stated."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search is mentioned. The paper does not report what temperature or sampling settings were used, let alone any search over them."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Template customization per model is mentioned but no justification for how configurations were selected or validated."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite comparing 5 models across multiple years and categories."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The paper evaluates third-party models, not the authors' own system. There are no baseline re-implementations."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The o1 models likely use significantly more compute (chain-of-thought reasoning) than GPT-4o or Llama-3.1-405B, but compute budgets are not compared or discussed."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper assumes ICPC problems are good LLM evaluators without questioning construct validity. No discussion of whether Codeforces Gym verdicts actually measure the claimed 'reasoning, accuracy, and efficiency.'"
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is used — all models are tested directly via API in pass@1 setting."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "The paper explicitly addresses temporal leakage by using 2024 ICPC problems tested within 24 hours of release, and comparing against older problems that may be in training data."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether prompts or problem formatting could leak information not available in a real competition setting."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether ICPC problems across years share structural similarities or whether training on earlier ICPC-style problems provides unfair advantage on later ones."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection method is used. The temporal split (2024 problems) is a prevention strategy, but no membership inference, canary strings, or decontamination checks are applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "o1-mini and o1-preview significantly outperform GPT-4o, Mistral Large, and Llama-3.1-405B on ICPC problems",
    364       "evidence": "Table II shows o1-mini achieved 16 AC and o1-preview 15 AC, while GPT-4o, Mistral Large, and Llama-3.1-405B achieved 0 AC across all 166 problems. Figure 4 shows 9.64% and 9.04% AC rates for o1-mini and o1-preview respectively.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Data contamination likely contributes to inflated performance metrics on pre-2024 problems",
    369       "evidence": "Figure 2 shows o1-preview accuracy drops from 25.0% (2017) to 7.7% (unseen 2024), and o1-mini drops from 25.0% (2017) to 15.4% (2024). Section IV-B RQ3 discusses this pattern.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Chain-of-thought reasoning is the key factor enabling o1 models' superior performance",
    374       "evidence": "Section IV-B RQ3 attributes success to CoT reasoning and specialized fine-tuning, but this is inferred from comparing models with different architectures — no controlled experiment isolates CoT as the causal factor.",
    375       "supported": "weak"
    376     },
    377     {
    378       "claim": "o1 models demonstrate superior computational efficiency",
    379       "evidence": "Section IV-C Key Insights claims 'resource efficiency' but no actual resource utilization data (runtime, memory) is presented in the paper despite being listed as an evaluation metric.",
    380       "supported": "unsupported"
    381     }
    382   ],
    383   "red_flags": [
    384     {
    385       "flag": "Claims outrun evidence",
    386       "detail": "The paper claims o1 models show 'superior computational efficiency' (Key Insight #4) but presents no resource utilization data despite listing it as an evaluation metric. Resource usage is mentioned as measured by Codeforces but no data is shown."
    387     },
    388     {
    389       "flag": "No statistical rigor",
    390       "detail": "Claims of 'significant differences' are made without any statistical tests. Five models are compared across 14 years of problems with only raw percentages — no confidence intervals, significance tests, or effect sizes."
    391     },
    392     {
    393       "flag": "Missing hyperparameters and model versions",
    394       "detail": "Temperature, top-p, and other API parameters are not reported for any model. Models are identified only by marketing names without API versions or snapshot dates, making reproduction impossible."
    395     },
    396     {
    397       "flag": "Uncontrolled compute comparison",
    398       "detail": "The o1 models use chain-of-thought reasoning that likely consumes significantly more compute per problem than the other models. Comparing pass@1 across models with vastly different inference budgets is misleading without accounting for compute."
    399     },
    400     {
    401       "flag": "Small effective sample for key claims",
    402       "detail": "The unseen 2024 set appears to contain only 13 problems (inferred from 15.4% ≈ 2/13 for o1-mini and 7.7% ≈ 1/13 for o1-preview). Contamination conclusions are drawn from this tiny sample."
    403     }
    404   ],
    405   "cited_papers": [
    406     {
    407       "title": "A survey of large language models",
    408       "authors": ["W. X. Zhao", "K. Zhou", "J. Li"],
    409       "year": 2023,
    410       "arxiv_id": "2303.18223",
    411       "relevance": "Comprehensive survey of LLM capabilities, architectures, and training methodologies."
    412     },
    413     {
    414       "title": "Evaluating large language models trained on code",
    415       "authors": ["M. Chen", "J. Tworek"],
    416       "year": 2021,
    417       "arxiv_id": "2107.03374",
    418       "relevance": "Introduces HumanEval benchmark and Codex, foundational work on LLM code generation evaluation."
    419     },
    420     {
    421       "title": "Competition-level problems are effective LLM evaluators",
    422       "authors": ["Y. Huang", "Z. Lin", "X. Liu"],
    423       "year": 2023,
    424       "arxiv_id": "2312.02143",
    425       "relevance": "Prior work using competitive programming problems as LLM evaluation benchmarks, discusses data contamination."
    426     },
    427     {
    428       "title": "Towards more realistic evaluation of LLM-based code generation: An experimental study and beyond",
    429       "authors": ["D. Zheng", "Y. Wang", "E. Shi"],
    430       "year": 2024,
    431       "arxiv_id": "2406.06918",
    432       "relevance": "Addresses evaluation methodology issues in LLM code generation benchmarks."
    433     },
    434     {
    435       "title": "LLM-Based test-driven interactive code generation: User study and empirical evaluation",
    436       "authors": ["S. Fakhoury", "A. Naik", "G. Sakkas"],
    437       "year": 2024,
    438       "arxiv_id": "2404.10100",
    439       "relevance": "Evaluates interactive code generation with test feedback, relevant to understanding LLM coding capabilities."
    440     },
    441     {
    442       "title": "A Performance Study of LLM-Generated Code on Leetcode",
    443       "authors": ["T. Coignion", "C. Quinton", "R. Rouvoy"],
    444       "year": 2024,
    445       "doi": "10.1145/3661167.3661221",
    446       "relevance": "Benchmarks LLM code generation efficiency on competitive programming platform."
    447     },
    448     {
    449       "title": "Code generation and algorithmic problem solving using Llama 3.1 405B",
    450       "authors": ["A. Deroy", "S. Maity"],
    451       "year": 2024,
    452       "arxiv_id": "2409.19027",
    453       "relevance": "Evaluates Llama 3.1 405B on algorithmic tasks, directly relevant to benchmark evaluation methodology."
    454     },
    455     {
    456       "title": "Calibration and correctness of language models for code",
    457       "authors": ["C. Spiess", "D. Gros", "K. S. Pai"],
    458       "year": 2024,
    459       "arxiv_id": "2402.02047",
    460       "relevance": "Studies LLM calibration and confidence in code generation, relevant to evaluation reliability."
    461     },
    462     {
    463       "title": "Program synthesis with large language models",
    464       "authors": ["J. Austin", "A. Odena", "M. Nye"],
    465       "year": 2021,
    466       "arxiv_id": "2108.07732",
    467       "relevance": "Early work on LLM program synthesis capabilities and evaluation."
    468     },
    469     {
    470       "title": "Quantifying memorization across neural language models",
    471       "authors": ["N. Carlini", "D. Ippolito", "M. Jagielski"],
    472       "year": 2022,
    473       "arxiv_id": "2202.07646",
    474       "relevance": "Foundational work on measuring memorization in language models, relevant to data contamination concerns."
    475     }
    476   ]
    477 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs