scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (28732B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Code Generation for Low-Resource Languages: No Silver Bullet",
      6     "authors": [
      7       "Alessandro Giagnorio",
      8       "Alberto Martin-Lopez",
      9       "Gabriele Bavota"
     10     ],
     11     "year": 2025,
     12     "venue": "IEEE International Conference on Program Comprehension",
     13     "arxiv_id": "2501.19085",
     14     "doi": "10.1109/ICPC66645.2025.00058"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims about fine-tuning helping small models, in-context learning being a safe bet, and large models degrading with fine-tuning are all supported by Tables I and III.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper makes causal claims ('fine-tuning [...] helps in substantially boosting performance', 'possibly due to the fact that even a small dataset is sufficient') but the study design is observational across models — confounds between model architecture, training data, and size are not controlled.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The title says 'No Silver Bullet' and the paper consistently qualifies findings by model size and language. The threats-to-validity section explicitly acknowledges that findings 'may not generalize to other settings' (Section V).",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper discusses alternative explanations for performance gaps: language similarity to high-resource languages, programming paradigm differences, domain-specificity of R, repository size vs count (Section III-E).",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Claims match measurement granularity: the paper measures pass@1 on HumanEval and reports it as pass@1 on HumanEval, without inflating to broader 'code quality' or 'developer productivity' claims.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section V 'Threats to Validity' provides a dedicated discussion of construct, internal, and external validity threats.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats discussed: pass@k metric bias and mitigation via n=50, no hyperparameter tuning performed, training limited to 3 epochs which may cap results, specific prompt choices may be suboptimal (Section V).",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "External validity explicitly states: 'We decided to focus our study on four low-resource languages...Our findings may not generalize to other settings' and notes the specific models and sizes tested (Section V).",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section VII acknowledges 'the financial support of the Swiss National Science Foundation for the PARSED project (SNF Project No. 219294)' and CHOOSE sponsorship.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All authors are affiliated with Software Institute, USI Università della Svizzera italiana, Switzerland. No conflict with evaluated tools.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Swiss National Science Foundation is an independent government funding agency with no stake in whether fine-tuning or in-context learning performs better.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement is present in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "'Low-resource language' is explicitly defined as niche languages characterized by scarcity of training data; 'high-resource' is defined by contrast; pass@k is explained with formula and repetition procedure.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper explicitly frames its contribution as a comparative empirical study of five techniques (three in-context learning, two fine-tuning) for boosting LLM code generation on low-resource languages across six models.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section II explicitly positions the work relative to Cassano et al., Athiwaratkun et al., Van Dam et al., and others, explaining what this study adds (comparative analysis, model size dimension) that prior work lacked.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "A replication package is provided via Zenodo (ref [27], https://doi.org/10.5281/zenodo.13128630).",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The study uses publicly available MultiPL-E benchmark [19] and MultiPL-T datasets [2]. The replication package is released via Zenodo.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Hardware is mentioned (NVIDIA A30/A40/A100 GPUs) but no requirements.txt, Dockerfile, or detailed software dependency listing is provided in the paper.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided in the paper. The replication package is referenced but no README or commands are described.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Only point estimates of pass@1 are reported in Tables I and III. No confidence intervals or error bars are shown.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "McNemar's test is used for pairwise comparisons of dichotomous results, with Benjamini-Hochberg correction for multiple comparisons (Section III-D).",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Odds Ratios are reported alongside significance tests (Table II and Section IV-D), providing effect size magnitude for all comparisons.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No power analysis or justification for the choice of n=50 repetitions beyond citing prior work that 'this rate appears to stabilize at n=20'.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Only average pass@1 rates are reported. No standard deviations, interquartile ranges, or spread measures across the 50 repetitions are provided.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Each model's baseline (out-of-the-box) performance is compared against all techniques (Table III white rows).",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "DeepSeek Coder and Code Llama were reasonable at submission time, but GitHub Copilot is a black box with unknown model version. More critically, newer models (e.g., GPT-4, Claude) are not included despite being available.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The study systematically compares five techniques (three in-context learning variants, fine-tuning, pre-training+fine-tuning) across models and sizes, effectively serving as an ablation of technique components.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "Only pass@1 is used as the evaluation metric. No additional metrics (e.g., pass@10, CodeBLEU, syntactic correctness rate) are reported.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "The authors manually analyzed a sample of generated programs to understand failure reasons (Section III-E): 'We analyzed a sample of the generated programs to understand the reasons behind the performance gap.'",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "MultiPL-E HumanEval is used as the test set, separate from the MultiPL-T fine-tuning data. The 157/161 programs used for evaluation are distinct from training data.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down per model, per model size, per language, and per technique in Tables I and III.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Specific failure examples are analyzed: R returning null instead of empty list, vector instead of list (Fig. 1), Julia push vs push! API errors (Section III-E).",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports that fine-tuning worsens performance for DeepSeek Coder 33B (ORs of 1.64 and 1.42 for degradation), translation rules sometimes hurt, and pre-training doesn't consistently help.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Model families and sizes are specified (DeepSeek Coder 1B/7B/33B, Code Llama 7B/13B) but no specific version snapshots or dates. Copilot version is completely unspecified.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Prompt templates are shown in Listings 1-3 with structural detail. Full prompts are stated to be in the replication package [27]. The few-shot examples and translation rules are described with enough detail.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Temperature (0.2), learning rates (2×10⁻⁵ for DeepSeek, 5×10⁻⁵ for Code Llama), optimizers (AdamW), schedulers, max sequence lengths (1024/2048/3072), mixed precision (bfloat16), and epochs (3) are all reported (Sections III-C, IV-B).",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used. Models are prompted directly for code generation.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The paper documents how fine-tuning datasets were constructed from Cassano et al.'s data, the matching process for pre-training pairs (name+docstring matching with quality filtering), and filtering from 161→157 common programs (Sections III-C, IV-A).",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Replication package released via Zenodo [27] (doi: 10.5281/zenodo.13128630) containing experimental results.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Data sources are well-described: MultiPL-E benchmark with 157 common programs, MultiPL-T datasets (37,592 R functions, 40,489 Racket functions), and the matching process for pre-training pairs (Sections III-C, IV-A).",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Data source is a standard public benchmark (MultiPL-E/HumanEval).",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The pipeline from 164 HumanEval → 161 MultiPL-E → 157 common programs is documented with reasons for each filtering step. Fine-tuning dataset construction and pre-training pair matching are documented with counts (Section IV-A.5).",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "No training data cutoff dates are stated for any of the models used. DeepSeek Coder and Code Llama training data temporal boundaries are not discussed.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether HumanEval problems appeared in the training data of any evaluated model.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "HumanEval was published in 2021. All models were trained after 2021 and could have seen these problems. This contamination risk is not discussed.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "No inference costs, API costs, or wall-clock times are reported despite running 50 repetitions × 157-161 problems × 6 models × multiple techniques.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "Hardware is mentioned (NVIDIA A30/A40/A100 GPUs) but no GPU hours, training time, or total compute budget is stated.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "Results are averaged over 50 repetitions (via temperature=0.2 sampling) but no seed sensitivity analysis or variance across seeds is reported.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": true,
    378           "justification": "Explicitly stated: 'We compute pass@1 with n = 50 repetitions' (Section III-C).",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "The paper explicitly states 'we did not perform hyperparameter tuning' (Section V) and used default configurations. No search budget reported.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": true,
    390           "justification": "For fine-tuned models, 'we evaluate each epoch on the MultiPL-E benchmark and only report the best model's results' — selection on the evaluation benchmark, which is documented (Section IV-C). All epoch results in replication package.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": true,
    396           "justification": "Benjamini-Hochberg procedure is applied to adjust p-values for multiple comparisons (Sections III-D, IV-C).",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors devised the translation examples and translation rules prompts and compare them against existing techniques without acknowledging author-evaluation bias.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "Fine-tuning vs in-context learning have vastly different compute costs but performance is not compared at matched compute budgets.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": false,
    414           "justification": "HumanEval is used without discussing whether it adequately represents real-world code generation tasks for low-resource languages. The paper notes R is used for data analysis but HumanEval tests general programming — this construct validity gap is not addressed.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding is involved; models are directly prompted.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "HumanEval (2021) predates all models' training. No discussion of temporal leakage.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether evaluation setup leaks information (e.g., function signatures providing too much structure).",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether fine-tuning data (MultiPL-T) shares structural similarities with test data (MultiPL-E HumanEval), despite both being derived from similar Python function collections.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No concrete leakage detection or prevention methods are applied.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "Modern LLMs achieve near-high-resource performance on Julia and Lua, with performance gaps often below 10pp from Python/Java",
    455       "evidence": "Table I: DeepSeek 33B scores 43.3% Julia vs 57.8% Java (14.5pp gap) and 53.8% Lua; Copilot achieves 61.4% Lua vs 61.7% Python",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "R and Racket remain problematic low-resource languages with pass@1 gaps of 24-39pp below Python",
    460       "evidence": "Table I: average pass@1 for R is 23.1% vs Python 57.3%; for Racket 18.7% vs Python 57.3%; all differences statistically significant",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Fine-tuning is best for small models (~1B parameters) but hurts very large models (~33B parameters)",
    465       "evidence": "Table III: DeepSeek 1B improves from 7.0% to 18.4% on Racket with pre-training+fine-tuning; DeepSeek 33B drops from 30.2% to 25.3% on R after fine-tuning (OR=1.64)",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "In-context learning with translation examples is a 'safe bet' — consistently improves performance across all model sizes and both languages",
    470       "evidence": "Table III: translation examples improves over baseline for all 5 models on R (excluding 1B) and in-context learning is the best family for 33B; OR range 1.28-2.27 for significant differences",
    471       "supported": "moderate"
    472     },
    473     {
    474       "claim": "Pre-training on code translation before fine-tuning provides no consistent additional benefit over fine-tuning alone",
    475       "evidence": "Fine-tuning-only outperforms pre-training+fine-tuning for 3/5 models on R and 4/5 on Racket; average pass@1 difference is 19.92% vs 19.62% across models on R",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "No single technique is best across all combinations of model size and language",
    480       "evidence": "Table III shows mixed results: for 7B models, in-context learning wins on R but fine-tuning wins on Racket; the paper title reflects this 'no silver bullet' conclusion",
    481       "supported": "strong"
    482     },
    483     {
    484       "claim": "GitHub Copilot substantially benefits from few-shot in-context learning on R (+8.4% pass@1, OR=1.94)",
    485       "evidence": "Table III: Copilot baseline 32.7% R, few-shot 41.1% R; stated as statistically significant with OR=1.94",
    486       "supported": "strong"
    487     }
    488   ],
    489   "methodology_tags": [
    490     "benchmark-eval",
    491     "observational"
    492   ],
    493   "key_findings": "Modern LLMs have effectively closed the gap for some nominally 'low-resource' languages (Julia, Lua), achieving pass@1 rates within 10-15pp of Python/Java, suggesting data volume is not the only determinant of LLM performance. For R and Racket, performance gaps of 25-40pp below Python persist, motivating enhancement techniques. Among five evaluated strategies, no single approach dominates: fine-tuning with limited data helps small models (1B) substantially but degrades performance for the largest model (33B), while in-context learning with translation examples consistently improves all model sizes and is the recommended 'safe bet.' The pre-training-on-translation step before fine-tuning provides no reliable additional benefit.",
    494   "red_flags": [
    495     {
    496       "flag": "No variance reported",
    497       "detail": "Tables I and III report only average pass@1 across 50 repetitions with no standard deviations or confidence intervals, making it impossible to assess the reliability of small performance differences."
    498     },
    499     {
    500       "flag": "Model versions unspecified",
    501       "detail": "GitHub Copilot version is explicitly 'unknown'; DeepSeek Coder and Code Llama are identified only by arXiv citation without checkpoint hashes or snapshot dates, undermining reproducibility."
    502     },
    503     {
    504       "flag": "Benchmark contamination unaddressed",
    505       "detail": "HumanEval (2021) and MBPP were published before training of the evaluated models; the possibility that test problems were in training data is not discussed."
    506     },
    507     {
    508       "flag": "Single evaluation metric",
    509       "detail": "Only pass@1 is reported; no complementary metrics (compilation rate, pass@5, partial correctness) are used, limiting insight into how techniques affect different aspects of code quality."
    510     }
    511   ],
    512   "cited_papers": [
    513     {
    514       "title": "Knowledge transfer from high-resource to low-resource programming languages for code LLMs (MultiPL-T)",
    515       "relevance": "Provides the fine-tuning datasets used in this study and the framework for low-resource language benchmarking"
    516     },
    517     {
    518       "title": "MultiPL-E: A scalable and polyglot approach to benchmarking neural code generation",
    519       "relevance": "Provides the primary evaluation benchmark (HumanEval translated to 18 languages) used throughout"
    520     },
    521     {
    522       "title": "DeepSeek-Coder: When the large language model meets programming",
    523       "relevance": "One of the two open-source model families evaluated across three size variants (1B, 7B, 33B)"
    524     },
    525     {
    526       "title": "Code Llama: Open foundation models for code",
    527       "relevance": "Second open-source model family evaluated (7B and 13B variants)"
    528     },
    529     {
    530       "title": "Multi-lingual evaluation of code generation models",
    531       "relevance": "Prior work on few-shot learning for low-resource languages that this study replicates and extends"
    532     },
    533     {
    534       "title": "Measuring the impact of programming language distribution",
    535       "relevance": "Establishes that training data distribution across languages affects model performance disparities"
    536     },
    537     {
    538       "title": "On the transferability of pre-trained language models for low-resource programming languages",
    539       "relevance": "Prior work showing multilingual fine-tuning benefits for low-resource languages including Ruby"
    540     },
    541     {
    542       "title": "A survey on LLM-based code generation for low-resource and domain-specific programming languages",
    543       "relevance": "Contemporary survey that contextualizes the importance of this research direction"
    544     }
    545   ],
    546   "engagement_factors": {
    547     "practical_relevance": {
    548       "score": 2,
    549       "justification": "Directly actionable guidance for developers using LLMs with niche languages: use in-context learning with translation examples as a safe default, fine-tune only for small models."
    550     },
    551     "surprise_contrarian": {
    552       "score": 2,
    553       "justification": "The finding that Julia and Lua are no longer practically low-resource for modern LLMs challenges prior characterizations and the assumption that data volume is the primary driver."
    554     },
    555     "fear_safety": {
    556       "score": 0,
    557       "justification": "No AI safety or risk concerns raised."
    558     },
    559     "drama_conflict": {
    560       "score": 1,
    561       "justification": "Mild contrarian angle in the 'no silver bullet' title, but no significant controversy with prior work."
    562     },
    563     "demo_ability": {
    564       "score": 1,
    565       "justification": "Reproducible with public benchmarks and open-source models, but requires significant GPU compute for fine-tuning experiments."
    566     },
    567     "brand_recognition": {
    568       "score": 1,
    569       "justification": "USI is a reputable but not top-brand research institution; GitHub Copilot evaluation adds recognition but is only one of six models."
    570     }
    571   },
    572   "hn_data": {
    573     "threads": [],
    574     "top_points": 0,
    575     "total_points": 0,
    576     "total_comments": 0
    577   }
    578 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs