scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (17656B)
      1 {
      2   "paper": {
      3     "title": "Compilation Quotient (CQ): A Metric for the Compilation Hardness of Programming Languages",
      4     "authors": ["Violet Szabó", "Dominik Winterer", "Zhendong Su"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2406.04778"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "The paper states 'We will open-source source cq-test and plan to submit it to artifact evaluation' and 'the experimental data is attached to this submission.' A promise of future release counts as NO per schema rules. No repository URL is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper says experimental data 'is attached to this submission' but no public URL or download link is provided in the paper text."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions the hardware (AMD Ryzen Threadripper 2990WX, 128 GB RAM, Ubuntu 22.04) and lists compilers used (GCC, G++, etc.) but does not provide a requirements.txt, Dockerfile, or detailed dependency/version listing sufficient to recreate the environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or scripts are described. The methodology is described at an algorithmic level but not with runnable commands."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "CQ values are reported as point estimates (e.g., 'C has a CQ of 48.11') without confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "The paper does not make comparative claims that one language is statistically significantly different from another; it reports CQ values as descriptive metrics rather than making inferential claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "The paper introduces a descriptive metric (CQ) and reports its values. It does not test treatments or interventions where effect sizes would be relevant."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper samples 100,000 programs per bucket (10,000 for Java/Kotlin 'due to performance issues') but does not justify why these sample sizes are sufficient or provide power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 2.5 states: 'We repeat all experiments three times. The relative standard deviations of CQ were at 3.54%, 3.24%, and 2.46% for Go, Swift, and Java respectively, and below 2% for all other languages.'"
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This paper introduces a new metric (CQ) with no prior work defining a comparable metric. There is no prior baseline to compare against."
     64       },
     65       "baselines_contemporary": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "No prior CQ-like metric exists, so contemporary baselines are not applicable."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 4.1 and Fig. 7 present ablation-style analysis: 'we calculated the LCQ of two variants of C: one without pointer declarations, and one without any declarations' to isolate the contribution of pointer declarations to C's high CQ."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports both CQ (overall compilation quotient) and LCQ (local compilation quotient as a function of program size), providing complementary views."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation is not relevant to measuring compilation success rates of generated programs."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning study; there is no train/test split. Programs are sampled and compiled."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per language (12 languages) with individual CQ values, and LCQ curves show per-size breakdowns for each language (Figs. 3, 4)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper extensively discusses why programs fail to compile in each language, with specific examples of invalid programs and analysis of error causes (Sections 4.1-4.3)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that Rust's CQ is nearly 0 (only 6 out of 1.5 million programs compile) and discusses limitations of the metric openly."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims about CQ values (C=48.11, C++=0.60, Java=0.27, Haskell=0.13, Rust nearly 0) are all supported by results in Section 3 and Fig. 3."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about why CQ varies (e.g., pointer declarations cause C's high CQ) and supports them with ablation experiments (Fig. 7, removing pointer declarations causes LCQ to plummet)."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper claims CQ 'can help understand the differences of compiled programming languages better and help language designers' but the metric only measures compilation of single-function programs without standard library imports, which is a narrow slice of real programming. The limitations section acknowledges this but the abstract and conclusion do not bound the claims accordingly."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 5 discusses alternative interpretations: CQ measures syntax-level compilation, not real programmer difficulty. The Limitations subsection explicitly acknowledges that single entry point and no standard library use limits the metric. Section 4 discusses multiple factors affecting CQ per language."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "This paper does not use LLMs or pre-trained models."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting is used in this paper."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Key parameters are reported: bucket size (16 bytes), number of buckets (16), sample target (100,000 per bucket, 10,000 for Java/Kotlin), max_tries=16, oversampling factor alpha=8, beta=2, epsilon=5 for LCQ, size bound 256 bytes."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 2.2 thoroughly documents grammar modifications per language (bounding identifiers to 2, restricting literals, fixing modifier orders, removing extensions). Section 2.4 documents the sampling algorithm in detail with pseudocode."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "A 'Limitations' subsection appears at the end of Section 5 (Discussion), discussing restrictions of the CQ metric."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The limitations are specific: 'we measured CQ only on programs with a single entry point and forbade use of the language's standard library,' 'this ignores other important features contributing to a language's complexity such as includes in C/C++ and traits in Rust,' and 'CQ does not consider the complexity of the language's runtime semantics.'"
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states what CQ does NOT measure: no standard library usage, single entry point only, no runtime semantics. These are specific scope boundaries."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper mentions data is 'attached to this submission' but no public URL is provided for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection procedure is described in detail: programs are sampled from ANTLR grammars using the FEAT enumerator with a bucket-based algorithm (Section 2.4), compiled with specific compilers (Section 2.5)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; data source is algorithmically generated programs from grammars."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline is documented: CFG → regular tree grammar → FEAT enumeration → bucket-based sampling → compilation → CQ computation. Algorithms 1 and 2 provide pseudocode."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: TU Delft and ETH Zurich."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No funding is disclosed; the work appears to be academic research with no commercial interest in the outcome."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate a pre-trained model on any benchmark."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "This paper does not evaluate a pre-trained model on any benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper does not evaluate a pre-trained model on any benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No wall-clock time, computation cost, or resource usage per language is reported despite sampling over 12 million programs."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is mentioned (AMD Ryzen Threadripper 2990WX, 64 cores, 128 GB RAM) but total compute time or budget is not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "C has the highest CQ (48.11) among 12 tested compiled languages, far exceeding all others.",
    286       "evidence": "Section 3, Fig. 3a: CQ values reported for all 12 languages. C leads with 48.11, followed by Erlang at 6.51.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Rust has the lowest CQ (0.0004), with only 6 out of 1.5 million programs compiling.",
    291       "evidence": "Section 3 and Section 4.1: 'For Rust, only 6 out of 1.5 million programs compile.'",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "C's non-collapsing LCQ curve is caused by pointer declarations that impose no additional constraints as they are nested.",
    296       "evidence": "Section 4.1 and Fig. 7: Ablation removing pointer declarations causes C's LCQ to plummet to zero, confirming the causal role of pointer declarations.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The four most popular languages on TIOBE are all in the top half of the CQ ranking, suggesting high CQ correlates with language popularity.",
    301       "evidence": "Section 5: Observation based on TIOBE ranking (Fig. 2a) and CQ ranking (Fig. 3a). However, this is a correlation across 12 languages with no statistical test.",
    302       "supported": "weak"
    303     },
    304     {
    305       "claim": "CQ measures a part of the semantic complexity experienced by non-novice programmers.",
    306       "evidence": "Section 5: Argumentative reasoning that frequently-used features likely to compile at random correspond to lower effort for programmers. No empirical validation with actual programmers.",
    307       "supported": "weak"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper introduces the Compilation Quotient (CQ), measuring the fraction of syntactically valid programs that also compile, across 12 languages using over 12 million grammar-sampled programs. C dominates with CQ=48.11 due to pointer declarations that remain likely valid at any nesting depth, while Rust has the lowest CQ (0.0004) due to its strict type system rejecting nearly all generated expressions. An ablation study confirms pointer declarations are the key driver of C's uniquely non-collapsing LCQ curve.",
    312   "red_flags": [
    313     {
    314       "flag": "Speculative claims about programmer experience",
    315       "detail": "Section 5 claims CQ relates to 'semantic complexity experienced by non-novice programmers' and language adoption, but provides no empirical evidence linking CQ to actual programmer difficulty or language popularity trends."
    316     },
    317     {
    318       "flag": "Grammar modifications may bias results",
    319       "detail": "Extensive per-language grammar modifications (bounding identifiers to 2, restricting literals, removing features like macros, attributes, standard library) could systematically favor or disadvantage certain languages. The paper does not assess sensitivity to these choices."
    320     }
    321   ],
    322   "cited_papers": [
    323     {
    324       "title": "A Large Scale Study of Programming Languages and Code Quality in Github",
    325       "authors": ["Baishakhi Ray", "Daryl Posnett", "Vladimir Filkov", "Premkumar Devanbu"],
    326       "year": 2014,
    327       "relevance": "Empirical study of how programming language choice affects software quality, relevant to understanding language impact on code."
    328     },
    329     {
    330       "title": "Finding and Understanding Bugs in C Compilers",
    331       "authors": ["Xuejun Yang", "Yang Chen", "Eric Eide", "John Regehr"],
    332       "year": 2011,
    333       "relevance": "Foundational compiler testing work (Csmith) directly related to grammar-based program generation for testing."
    334     },
    335     {
    336       "title": "Compiler validation via equivalence modulo inputs",
    337       "authors": ["Vu Le", "Mehrdad Afshari", "Zhendong Su"],
    338       "year": 2014,
    339       "relevance": "Compiler validation technique relevant to automated program generation and compiler testing."
    340     },
    341     {
    342       "title": "Empirical Analysis of Programming Language Adoption",
    343       "authors": ["Leo A. Meyerovich", "Ariel S. Rabkin"],
    344       "year": 2013,
    345       "relevance": "Studies factors in programming language adoption, related to the paper's claims about CQ and language popularity."
    346     }
    347   ]
    348 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs