scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (16916B)
      1 {
      2   "paper": {
      3     "title": "Automatically Surfacing Opportunities for Improvements In Internet-Scale Applications",
      4     "authors": ["Vipul Harsh", "Sayan Sinha", "Henry Milner", "Haijie Wu", "B Aditya Prakash", "Vyas Sekar", "Hui Zhang"],
      5     "year": 2025,
      6     "venue": "HotNets '25 (24th ACM Workshop on Hot Topics in Networks)",
      7     "doi": "10.1145/3772356.3772423"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided anywhere in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper uses anonymized production data from a monitoring service provider but does not release it."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, dependency lists, or setup details are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions are included. The proof-of-concept implementation is described at a high level only."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars are reported for any results."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper uses heuristic thresholds (2x rate, 25% absolute difference) for opportunity detection but no formal significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No effect sizes are reported. Results are presented as counts of useful opportunities (e.g., '5 out of 18 scenarios') without quantified magnitudes."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "One week of production data from 3 services is used with no justification for why this duration or number of services is sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance or standard deviation is reported for any results."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "Table 1 compares categories of related work qualitatively but no quantitative baseline comparison is performed against any existing system."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No quantitative baselines are included at all."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The system has multiple components (hypothesis generator, attribute computation, opportunity finder with two experts) but no ablation is performed."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Evaluation is limited to counting how many opportunities were manually deemed useful. No other metrics (precision, recall, latency per hypothesis, etc.) are formally reported."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Results were manually inspected: 'we (manually) deemed to be insightful' (Section 5.2). However, the evaluation protocol is informal."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a benchmark evaluation paper; the system discovers opportunities in production data, so a held-out test set is not structurally applicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-KPI breakdown of identified opportunities. Results are also broken down by Expert 1 (18 scenarios) and Expert 2 (20 scenarios)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper acknowledges false positives: 'the false positive rate may seem high at first' and notes 13 of 18 Expert 1 scenarios and ~12 of 20 Expert 2 scenarios were not useful."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that many surfaced opportunities were false positives (only 5/18 from Expert 1 and 8/20 from Expert 2 were useful)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 'early promise from a proof-of-concept system' with 'evaluation on three real-world services,' which is appropriately hedged and matches the results in Section 5."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal-sounding claims (e.g., 'CORS error triggered client-side retries that led to the timeout') but acknowledges in Section 6 that causal inference methods are future work. The opportunity finder uses correlation-based heuristics only."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper is careful to frame results as preliminary: 'proof of concept,' 'early promise,' and explicitly lists open challenges in Section 6. Claims are bounded to the three tested services."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No substantive discussion of alternative explanations for the observed opportunities or why the heuristic thresholds might produce misleading results beyond acknowledging false positives."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies 'Llama 3.2' for hypothesis generation (Section 5.1) and cites the Llama 3 model paper [18], which provides traceability. However, no specific parameter count or snapshot is given, so this is borderline."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes prompt intent ('find patterns associated with high rebuffering') but does not provide the actual prompt text used with the LLM."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No LLM hyperparameters (temperature, top-p, etc.) are reported. Heuristic thresholds (10%, 2x, 25%) are stated but LLM settings are not."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The three-component pipeline (hypothesis generator, attribute computation engine, opportunity finder) is described with data flow in Figure 2 and Sections 4.1-4.3."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper mentions 'anonymized data' and 'one week of production data' but does not describe how the data was preprocessed, filtered, or anonymized."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 'Discussion and future work' identifies multiple open challenges and limitations of the current approach."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "Section 6 discusses future work directions but does not identify specific threats to validity of the current evaluation (e.g., manual evaluation bias, threshold sensitivity, generalizability to other domains)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper clearly states it presents a 'vision' and 'proof of concept' and identifies specific things not yet addressed: causal inference, complex event patterns, scalable data processing, privacy."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw production data is not available; it is proprietary and anonymized."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2 describes the data source: 'anonymized data from a large application-level monitoring and analytics service provider' collecting client-side events from end-user devices."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; data comes from production telemetry systems, not recruited subjects."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline from raw events to opportunities is described architecturally but specifics of data transformations, filtering steps, and intermediate counts are not documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Conviva (industry), Georgia Tech, and Carnegie Mellon University."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Multiple authors are affiliated with Conviva, a commercial monitoring/analytics company. The system is evaluated on production data from what appears to be Conviva's service. This conflict is not acknowledged."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is provided. Conviva-affiliated authors evaluating a system in their own production environment is a potential conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. The LLM is used as a component for hypothesis generation, not evaluated for its knowledge."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not a benchmark evaluation of model capabilities."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not a benchmark evaluation of model capabilities."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in the study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in the study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Section 5.2 reports processing times: Expert 1 processed >1000 hypotheses in <10 minutes; Expert 2 processed 250 hypotheses in <3 minutes per KPI."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total compute budget, hardware specifications, or LLM API costs are reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The proof-of-concept system can surface novel opportunities for improvement in real-world production services.",
    286       "evidence": "Table 2 lists 8 specific opportunity leads across 3 production services, identified from one week of data (Section 5.2).",
    287       "supported": "moderate"
    288     },
    289     {
    290       "claim": "Expert 1 surfaced useful opportunities in 5 out of 18 scenarios; Expert 2 in 8 out of 20 scenarios on average.",
    291       "evidence": "Section 5.2 reports these counts, with manual inspection determining usefulness.",
    292       "supported": "weak"
    293     },
    294     {
    295       "claim": "The system is scalable: Expert 1 processed >1000 hypotheses in <10 minutes, Expert 2 processed 250 in <3 minutes per KPI.",
    296       "evidence": "Section 5.2 'Scalability' paragraph provides these timing numbers.",
    297       "supported": "weak"
    298     }
    299   ],
    300   "methodology_tags": ["case-study"],
    301   "key_findings": "The paper presents a vision and proof-of-concept for automatically surfacing improvement opportunities in Internet-scale services by generating and testing hypotheses based on derived attributes (indirect, stateful, non-local). Evaluated on production data from 3 services, the prototype identified actionable opportunities such as event sequences leading to payment failures and differential error rates for returning users. The system uses LLM-assisted hypothesis generation (Llama 3.2), efficient stateful attribute computation, and a mixture-of-experts validation approach, though the current evaluation is preliminary with high false positive rates (5/18 and 8/20 useful).",
    302   "red_flags": [
    303     {
    304       "flag": "Undisclosed conflict of interest",
    305       "detail": "Multiple authors are Conviva employees evaluating a system on what appears to be Conviva's production data. This commercial interest in the system's success is not acknowledged."
    306     },
    307     {
    308       "flag": "No quantitative baselines",
    309       "detail": "Despite Table 1 listing four categories of related work, no quantitative comparison against any existing system is performed."
    310     },
    311     {
    312       "flag": "Informal evaluation methodology",
    313       "detail": "The evaluation relies on manual inspection by the authors themselves to determine which opportunities are 'insightful.' No inter-rater agreement, blinding, or systematic evaluation criteria are described."
    314     },
    315     {
    316       "flag": "Selective reporting of opportunities",
    317       "detail": "Table 2 reports 'a subset of the opportunities' that were manually deemed insightful, without systematic criteria for selection."
    318     }
    319   ],
    320   "cited_papers": [
    321     {
    322       "title": "InsightPilot: An LLM-Empowered Automated Data Exploration System",
    323       "authors": ["P. Ma", "R. Ding", "S. Wang", "S. Han", "D. Zhang"],
    324       "year": 2023,
    325       "relevance": "LLM-based automated data analysis system, relevant to AI-assisted analytics and software tooling."
    326     },
    327     {
    328       "title": "Why do multi-agent LLM systems fail?",
    329       "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"],
    330       "year": 2025,
    331       "arxiv_id": "2503.13657",
    332       "relevance": "Directly relevant to understanding failure modes in multi-agent LLM systems."
    333     },
    334     {
    335       "title": "Automatic root cause analysis via large language models for cloud incidents",
    336       "authors": ["Y. Chen", "H. Xie", "M. Ma"],
    337       "year": 2024,
    338       "relevance": "LLM-based root cause analysis for cloud systems, relevant to AI-assisted software engineering and operations."
    339     },
    340     {
    341       "title": "The Llama 3 herd of models",
    342       "authors": ["A. Grattafiori", "A. Dubey"],
    343       "year": 2024,
    344       "arxiv_id": "2407.21783",
    345       "relevance": "Foundation model used in this system's hypothesis generation component."
    346     },
    347     {
    348       "title": "A survey on multimodal large language models",
    349       "authors": ["S. Yin", "C. Fu", "S. Zhao"],
    350       "year": 2024,
    351       "relevance": "Survey of multimodal LLMs referenced as future direction for hypothesis generation."
    352     }
    353   ]
    354 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs