ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (26452B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM-ProS: Analyzing Large Language Models' Performance in Competitive Problem Solving",
      6     "authors": [
      7       "Md Sifat Hossain",
      8       "Anika Tabassum",
      9       "Md. Fahim Arefin",
     10       "Tarannum Shaila Zaman"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2502.04355",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims about evaluating five models on ICPC, revealing differences in generalization, and insights into reasoning are directly supported by methodology and results sections.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Paper attributes performance differences to 'training methodologies' and 'CoT reasoning' but this is observational comparison of off-the-shelf models with multiple confounded variables. No ablation studies or controlled experiments isolate causal factors.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Title and abstract are broad ('Analyzing LLM Performance') but scope is narrow (ICPC World Finals only). Paper acknowledges in Threats to External Validity that results may not generalize beyond ICPC, but this boundary is not stated upfront in main claims.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Paper attributes o1 superiority solely to CoT and training methodologies without exploring alternatives like model size differences, inference-time compute budget (o1 uses extended reasoning tokens), or architectural innovations beyond training.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Paper measures 'pass@1' (problems solved correctly) which directly maps to claimed outcome (competitive programming success). Verdict types (AC/WA/TLE) directly reflect problem-solving ability.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section V 'Threats to Validity' provides dedicated discussion of internal, external, construct, and conclusion validity threats.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats identified: data contamination may inflate performance (mitigated by 24-hr 2024 test), ICPC scope may not generalize, zero-shot setting may not fully leverage models, Codeforces platform dependency affects verdict reliability.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "Methodology specifies 166 ICPC World Finals problems, pass@1 zero-shot evaluation, Codeforces submission platform. But scope boundaries (what the study does NOT show) are buried in threats section, not stated upfront as explicit limitations.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding section or acknowledgment. No disclosure of whether work was unfunded, grant-supported, or industry-funded.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Authors clearly listed with affiliations (University of Dhaka; University of Maryland, Baltimore County (UMBC)). No undisclosed industry affiliations.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding disclosed; work appears to be unfunded academic research.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or disclosure of patents, equity, or consulting relationships.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "Key terms are used without precise definitions: 'Chain-of-thought reasoning' mentioned repeatedly but never formally defined; 'pass@1' defined as 'percentage of problems solved correctly' but context-dependent; competitive programming assumed known.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Paper explicitly states: 'We propose LLM-ProS, to assess the performance of advanced LLMs' and lists two contributions: (1) performance analyzer, (2) experimental evaluation identifying performance factors.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section VI comprehensively covers related work (competition-level evaluation, methodologies, code refinement, model-specific insights) and positions this work as extending prior studies with multiple models and detailed error analysis.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Paper references GitHub repository (sifat-hossain-niloy/LLMs-Performance-in-ICPC-Problems) with 'publicly available scripts' [27] for reproducibility.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Dataset consists of 166 ICPC World Finals problems scraped from icpc.global, which are publicly available. Problems themselves are public benchmark data.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Section IV lists tools (Selenium, PyPDF2, re) but no requirements.txt, environment.yml, or Docker provided. Python version unspecified. Insufficient for environment reproduction.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "Process is described (data collection, preprocessing, model testing, submission) but step-by-step reproduction instructions are not provided. Reader must reverse-engineer from GitHub repo.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results reported as single percentages (e.g., 15.4%, 7.7%) without confidence intervals, error bars, or variance estimates. No statistical aggregation across runs shown.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No p-values, t-tests, or statistical significance tests reported when comparing model accuracies. Differences are descriptive only.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "Raw accuracy percentages provided but not framed as effect sizes with baselines. For example, '0% accuracy' for GPT-4o lacks context of what would be expected by chance or random baseline.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "166 ICPC problems appear to be all available World Finals problems from 2011-2024, not a justified sample. No power analysis or sample size calculation provided. Some categories have only 1-5 problems.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Accuracy metrics reported as single values (e.g., o1-mini 15.4% on 2024) without standard deviation, variance, or indication of whether results are single-run or averaged across multiple runs.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Five models evaluated against each other (GPT-4o, Mistral, Llama-3.1, o1-mini, o1-preview) serve as implicit baselines.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "All models are contemporary (GPT-4o, Llama 3.1, o1 family all released 2024-2025). No suspiciously weak or outdated baselines.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": false,
    193           "justification": "Paper attributes differences to training methodologies and CoT reasoning but does not ablate CoT prompts, control for training data, or systematically isolate contributing factors.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
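    199           "justification": "Metrics include accuracy (pass@1), verdict distribution (AC/WA/TLE/CE/RE/MLE counts), and resource utilization (runtime/memory) across categories, although the runtime/memory measurements are described as a metric but not actually reported in the results.",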
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Problems evaluated by automated Codeforces judging system, not human evaluation. Not applicable.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "2024 ICPC problems treated as unseen held-out test set, released after all model training cutoffs. Explicitly tested within 24 hours of contest.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Figure 3 shows problems solved breakdown by category (Implementation, Graph Theory, Math, Geometry, Greedy, etc.). Per-category analysis provided.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Paper discusses failure modes: high WA rates (74.70% for o1-mini), CE dominance (24.7% for GPT-4o), TLE occurrences. Failure patterns categorized by verdict type.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Paper reports that GPT-4o, Mistral, and Llama-3.1 achieved 0% accuracy on all years tested, which are clear negative results.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": false,
    237           "justification": "Models named without snapshot dates or API versions: 'GPT-4o', 'o1-preview', 'Llama-3.1-405B' are given but when these were accessed or specific API versions used is not stated.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "Section III-B describes prompt template structure (Problem Statement, Input, Output, Sample Cases) and mentions 'additional guiding instructions for o1 models', but actual prompts are not shown.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "Temperature, top-p, max_tokens, and other generation parameters are not reported. Critical for reproducing exact behavior.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "Prompt customization for each model is described, with mention of 'additional guiding instructions' for CoT models. Though not highly detailed, scaffolding approach is stated.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section III-B details five preprocessing steps: component extraction, standardized formatting, text cleaning, template customization, and validation. Pipeline is well-documented.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "ICPC problems are publicly available from icpc.global. Paper states scripts are publicly available on GitHub for reproducibility.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section III-A explicitly describes scraping 166 problems from ICPC official website, selecting World Finals editions 2011-2024 to avoid training data overlap. Collection procedure is clear.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; recruitment N/A. Data is problems from public contest.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "Full pipeline documented in Section III: data collection (scrape) → preprocessing (extract/standardize/clean/validate) → model testing (5 LLMs) → solution submission (Codeforces) → verdict collection.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Paper states intent to 'avoid potential overlap with training data of major LLMs' but specific training cutoff dates for each model (GPT-4o, o1, etc.) are not formally provided.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": true,
    301           "justification": "Section IV-C-1 and Section V-A extensively discuss contamination risk. Paper analyzes performance drop (25% on 2017 → 7.7% on 2024 for o1-preview) as evidence of contamination on older problems.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": true,
    307           "justification": "Paper addresses contamination through: selecting problems from pre-cutoff years, testing 2024 problems within 24 hours of contest release, and analyzing performance degradation as contamination proxy.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants; all N/A.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants; all N/A.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants; all N/A.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants; all N/A.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants; all N/A.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants; all N/A.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants; all N/A.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No API costs, token costs, or compute expenses reported. Total cost of evaluation across five models and 166 problems is not disclosed.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Total computational budget, hours of compute, or financial spend not stated. Resource utilization measurements promised but not reported in results.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "o1-mini and o1-preview significantly outperform general-purpose models on ICPC problems",
    374       "evidence": "o1-mini achieved 16 Accepted solutions, o1-preview 15 ACs; GPT-4o, Mistral, Llama-3.1 all achieved 0 ACs across 166 problems (Table II). o1-mini accuracy 15.4% on 2024 unseen problems vs 0% for others (Figure 2).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Training methodologies emphasizing chain-of-thought reasoning enable better problem-solving performance",
    379       "evidence": "o1 models trained for CoT outperform general models. However, this is correlational evidence from comparing off-the-shelf models with multiple confounded differences (architecture, training data, inference compute).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Data contamination inflates model performance metrics on older ICPC problems",
    384       "evidence": "o1-preview achieves 25% accuracy on 2017 problems but only 7.7% on unseen 2024 problems. Similar drop for o1-mini (25% → 15.4%). Suggestive of contamination but not definitive proof.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "General-purpose LLMs fail completely on World Finals-level competitive programming",
    389       "evidence": "GPT-4o, Mistral Large, Llama-3.1-405B achieved 0% accuracy across all years (2011-2024). Consistent across all categories tested.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Model performance varies significantly across problem categories",
    394       "evidence": "Figure 3 shows variation by category. Implementation and Graph Theory have higher success rates; Geometry and Greedy categories show lower performance. However, categories have unequal sample sizes (1-5 problems).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Resource efficiency is superior in o1 models compared to general-purpose models",
    399       "evidence": "Paper claims o1 models demonstrate 'superior computational efficiency' (Key Insights section) but does not provide actual runtime or memory usage comparisons in results.",
    400       "supported": "weak"
    401     },
    402     {
    403       "claim": "Models struggle with high-difficulty problems and complex algorithmic challenges",
    404       "evidence": "Low accuracy rates on World Finals problems (at most 15.4% on the unseen 2024 problems) indicate difficulty. Specific failure mechanisms (WA dominance, CE rates) are shown in the verdict analysis.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "benchmark-eval",
    410     "observational"
    411   ],
    412   "key_findings": "Evaluating five LLMs on 166 ICPC World Finals problems (2011-2024) reveals stark performance differences: o1-mini and o1-preview achieve 16 and 15 correct solutions, respectively, while GPT-4o, Mistral Large, and Llama-3.1-405B achieve 0% accuracy. Performance drops dramatically on unseen 2024 problems (o1-preview: 25% on 2017 → 7.7% on 2024), suggesting significant data contamination on earlier years. Chain-of-thought reasoning capabilities correlate with problem-solving success, though training methodologies are likely confounded with model architecture and inference-time compute differences.",
    413   "red_flags": [
    414     {
    415       "flag": "No statistical rigor",
    416       "detail": "All results reported as single percentages without confidence intervals, error bars, standard deviations, or indication of variance across runs. No significance tests reported."
    417     },
    418     {
    419       "flag": "Confounded causal claims",
    420       "detail": "Paper attributes performance to 'training methodologies' and 'CoT reasoning' but simultaneously varies model architecture, training data, inference compute budget, and size. No ablation studies isolate contributing factors."
    421     },
    422     {
    423       "flag": "Missing hyperparameters",
    424       "detail": "Temperature, top-p, max_tokens, and other generation parameters are not specified, making results non-reproducible."
    425     },
    426     {
    427       "flag": "No actual prompts provided",
    428       "detail": "The template structure is described, but the actual prompts are not shown anywhere in the paper. Reproducing the study requires finding or reconstructing the exact prompt wording from the GitHub repository."
    429     },
    430     {
    431       "flag": "Small category sample sizes",
    432       "detail": "Some ICPC categories have only 1-5 problems. Category-level conclusions lack statistical power. Geometry and Greedy conclusions based on 1 problem each."
    433     },
    434     {
    435       "flag": "Incomplete model specification",
    436       "detail": "Model versions lack snapshot dates (GPT-4o without API date, o1 models without access date). Different models may have been accessed at different times during evaluation period."
    437     },
    438     {
    439       "flag": "Missing cost analysis",
    440       "detail": "No API costs, token usage, or computational budget reported. Practical adoption assessment impossible without cost transparency."
    441     },
    442     {
    443       "flag": "Resource efficiency claimed but not shown",
    444       "detail": "Paper claims o1 models demonstrate 'superior computational efficiency' but provides no actual runtime or memory measurements in results."
    445     },
    446     {
    447       "flag": "Generalization overstated",
    448       "detail": "Titled broadly ('Analyzing LLM Performance') but scope narrow (ICPC only, pass@1 zero-shot). Generalizability to other domains not supported."
    449     },
    450     {
    451       "flag": "Single-run evaluation",
    452       "detail": "No indication of multiple runs or aggregation. Accuracy values (15.4%) appear to be single-run measurements without variance estimates."
    453     }
    454   ],
    455   "cited_papers": [
    456     {
    457       "title": "Competition-level problems are effective llm evaluators",
    458       "authors": "Huang et al.",
    459       "relevance": "Prior work establishing ICPC-style problems as rigorous LLM benchmarks; directly cited motivation for this study."
    460     },
    461     {
    462       "title": "Towards more realistic evaluation of LLM-based code generation",
    463       "authors": "Zheng et al.",
    464       "relevance": "Addresses realistic evaluation methodologies and avoids inflated metrics; cited for context-leakage and evolving-ignored problem concerns."
    465     },
    466     {
    467       "title": "LLM-Based test-driven interactive code generation",
    468       "authors": "Fakhoury et al.",
    469       "relevance": "Code refinement and iterative improvement techniques; cited for improving LLM solution quality through test feedback."
    470     },
    471     {
    472       "title": "A Performance Study of LLM-Generated Code on Leetcode",
    473       "authors": "Coignion et al.",
    474       "relevance": "Related benchmark evaluation of LLM code efficiency on competitive programming problems; parallel methodology."
    475     },
    476     {
    477       "title": "The Llama 3 Herd of Models",
    478       "authors": "AI Research",
    479       "relevance": "Documents Llama-3.1-405B architecture and training; evaluated in this study."
    480     },
    481     {
    482       "title": "OpenAI o1 System Card",
    483       "authors": "OpenAI Research",
    484       "relevance": "o1 model design and chain-of-thought reasoning capabilities; technical basis for o1 superiority hypothesis."
    485     },
    486     {
    487       "title": "Quantifying memorization across neural language models",
    488       "authors": "Carlini et al.",
    489       "relevance": "Data contamination and memorization issues in LLMs; cited motivation for controlling for training data overlap."
    490     },
    491     {
    492       "title": "Grammar-aware large language models for code generation",
    493       "authors": "Zhang et al.",
    494       "relevance": "Training improvements for code generation; cited as factor in model performance variability."
    495     }
    496   ],
    497   "engagement_factors": {
    498     "practical_relevance": {
    499       "score": 2,
    500       "justification": "Practitioners care about LLM coding performance, but results use pass@1 zero-shot (unrealistic), narrow to ICPC World Finals only, and don't assess production scenarios (iterative refinement, debugging, partial credit)."
    501     },
    502     "surprise_contrarian": {
    503       "score": 1,
    504       "justification": "o1 > general models result is expected given o1 is explicitly designed for chain-of-thought reasoning. No surprising inversions or counterintuitive findings."
    505     },
    506     "fear_safety": {
    507       "score": 0,
    508       "justification": "Paper is purely performance benchmarking. No safety analysis, deceptiveness concerns, or risk discussion."
    509     },
    510     "drama_conflict": {
    511       "score": 0,
    512       "justification": "Straightforward benchmark paper. Results align with model design expectations. No controversy or competing narratives."
    513     },
    514     "demo_ability": {
    515       "score": 1,
    516       "justification": "Difficult to try immediately. Requires access to proprietary LLM APIs, ICPC problem set (public but requires setup), and Codeforces account. Results are read-only statistics."
    517     },
    518     "brand_recognition": {
    519       "score": 2,
    520       "justification": "Authors from University of Dhaka and UMBC (not top tier), but evaluates well-known models (OpenAI, Meta, Mistral). Models are recognizable; authors are not."
    521     }
    522   },
    523   "hn_data": {
    524     "threads": [],
    525     "top_points": 0,
    526     "total_points": 0,
    527     "total_comments": 0
    528   }
    529 }

Impressum · Datenschutz