scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25782B)
      1 {
      2   "paper": {
      3     "title": "Attention Pruning: Automated Fairness Repair of Language Models via Surrogate Simulated Annealing",
      4     "authors": [
      5       "Vishnu Asutosh Dasu",
      6       "Md Rafi ur Rashid",
      7       "Vipul Gupta",
      8       "Saeid Tizpaz-Niari",
      9       "Gang Tan"
     10     ],
     11     "year": 2025,
     12     "venue": "ICSE '26",
     13     "arxiv_id": "2503.15815",
     14     "doi": "10.1145/3744916.3773115"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper states 'We open-source our implementations' and provides a Bitbucket link: https://bitbucket.org/psu_soslab/attention_pruning (Section 5, footnote 2)."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper uses publicly available datasets: HolisticBias and WikiText-2. Both are standard public benchmarks that the authors did not modify, making them accessible to reproducers."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Section 5 (Technical Details) specifies python=3.10, transformers==4.29.0, two torch versions (torch==2.2.2 and torch==2.4.1), two operating systems (Ubuntu 20.04.6 LTS and Ubuntu 22.04.3 LTS), Intel Xeon Gold 6336Y CPU, NVIDIA RTX A6000 GPUs, and Intel Core i7-7700 CPU. This is sufficiently detailed."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While the code is released and environment details are given, the paper itself does not provide step-by-step reproduction instructions or describe a README with commands. The algorithmic details are described at a high level, but specific reproduction steps (e.g., commands to run, configuration files) are not provided in the paper."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Table 6 reports bias values with ± standard deviation (e.g., '0.264 ± 0.01'). The Discussion section reports 95% CIs for Cliff's delta (e.g., '95% CI ≈ [−0.994, −0.847]')."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The Discussion section reports Cliff's delta and Mann-Whitney U tests with p-values (e.g., p = 2.319 × 10^-9 for GPT-J-6B, p = 6.539 × 10^-9 for Llama-2-7B) to validate the practical significance of AP over FASP."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Cliff's delta is reported as an effect size measure (δ = −0.968 for GPT-J-6B, δ = −0.9392 for Llama-2-7B). Relative improvements are also reported (e.g., '40.8% improvement', '65.21% improvement')."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper uses 6 LLMs and 25 random seeds for the statistical tests but provides no justification for why these specific numbers were chosen. No power analysis is discussed."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 6 reports standard deviations across 3 seeds (e.g., '0.264 ± 0.01'). Table 3 reports min, max, median, mean, and standard deviation for perplexity distributions in training datasets."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Table 6 compares AP against four baselines: FASP (state-of-the-art), Weight Magnitude pruning, Gradient Magnitude pruning, and Random pruning, plus the unpruned baseline LLM."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "FASP (Zayed et al., 2024) is the state-of-the-art fairness-aware pruning method published the year before. Weight Magnitude (Han et al., 2016) and Gradient Magnitude (Michel et al., 2019) are standard general-purpose pruning baselines. The choice of baselines mirrors the setup of the state-of-the-art method."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Table 5 presents ablation studies using only one surrogate DNN in the cost function (only bias DNN vs only perplexity DNN vs combined). RQ3 examines the effect of varying epsilon, eta_bias, and running time on results."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper reports both bias (HolisticBias metric) and perplexity (WikiText-2) as primary metrics. MSE is used for surrogate DNN validation. Multiple bias types (gender, race, nationality, sexual orientation, age) are also evaluated."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "The paper evaluates bias and utility entirely through automated metrics (toxicity scores from a BERT classifier and perplexity). No human evaluation of the generated text quality or fairness is included. Given the paper makes claims about fairness in language generation, human evaluation of whether the outputs are actually perceived as less biased would be relevant."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The surrogate DNNs use a 95%-5% train-validation split. Final bias results in Table 6 are reported on '33,789 test set prompts' from HolisticBias. Perplexity is evaluated on the WikiText-2 test dataset. Surrogate training uses validation subsets, while final evaluation uses separate test data."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Table 6 provides per-model breakdowns for all 6 LLMs. Table 7 breaks down results by social bias type (race, nationality, sexual orientation, age). Table 4 provides per-model MSE for the surrogate DNNs."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper discusses that for GPT-Neo-125M, AP's bias is worse than FASP (Table 6). The Limitations section discusses sub-optimality due to surrogate DNN errors and SA convergence issues. The anomalous Llama-2-7B behavior in Table 5 is discussed."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "GPT-Neo-125M shows AP performing worse on bias than FASP (0.241 vs 0.221). The paper notes that for Llama-2-7B, using only the bias DNN in the cost function yields higher bias than the combined cost function, which is an unexpected negative result they explicitly discuss."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims 'up to 40% reduction in gender bias' which is supported by Table 6 (GPT-J-6B: 40.8% relative improvement). The claim that AP 'outperforms state-of-the-art bias mitigation strategies' is supported in 4/6 cases for combined bias+perplexity, which the paper honestly reports."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims about pruning attention heads reducing bias (ablation-style claims). The experimental design is adequate: they systematically prune different subsets of attention heads and measure the resulting bias/perplexity changes. The ablation in Table 5 validates the contribution of each surrogate DNN component."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The Limitations section explicitly states: 'it is unknown whether our method will generalize to all LLMs and bias metrics.' The title and paper scope focus on attention head pruning specifically. They acknowledge limitation to HolisticBias and WikiText-2 datasets due to computational resources."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The Threats to Validity section discusses DNN architecture sensitivity, randomness in SA, and dataset limitations. The Discussion section considers that the relationship between bias and perplexity may be complex (Llama-2-7B anomaly). They discuss surrogate DNN accuracy as an alternative explanation for sub-optimal results."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper specifies exact model names: Distilgpt-2, GPT-2, GPT-Neo-125M, GPT-Neo-1.3B, GPT-J-6B, Llama-2-7B. These are specific open-source models with fixed weights (not API-accessed models that change over time), so the exact version is deterministic from the model name. transformers==4.29.0 is specified."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The prompts come from the publicly available HolisticBias dataset. Table 9 shows example prompts used (e.g., 'I don't like intersex boys.'). The bias evaluation procedure using HolisticBias prompts is well-documented via reference to prior work [14, 72]."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The paper reports: epsilon values (0.5, 0.3-0.7 range), n_l and n_u bounds, eta_bias ∈ [0.1, 0.2], eta_ppl = 1, DNN architectures (Table 2), dataset sizes (Table 2), train-validation split (95%-5%), 3 seeds for text generation, 25 seeds for statistical tests, SA time limit of 3 hours, sigma ≤ 10 for clamping, token sampling of 40 tokens."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. The approach is a traditional ML pipeline (surrogate DNN training + simulated annealing optimization), not an LLM agent system."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5 documents data preprocessing in detail: perplexity clamping with sigma threshold (Equation 22), bias scaling by maximum, biased subsampling to maintain subgroup distribution, and the rationale for each step."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The Discussion section (Section 7) contains a dedicated 'Limitations' subsection and a 'Threats to validity' subsection, both providing substantive discussion."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The Threats to Validity section discusses specific threats: (1) DNN architecture sensitivity ('The results might vary with different architectures'), (2) outlier behavior for specific models (Llama-2-7B in Table 5), (3) limited computational resources restricting to HolisticBias and WikiText-2, (4) unknown generalizability to all LLMs and bias metrics."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper explicitly states scope boundaries: limited to HolisticBias and WikiText-2 datasets, focused on 6 specific LLMs, primarily studies gender bias with extension to race, nationality, sexual orientation, and age. The Limitations section states 'it is unknown whether our method will generalize to all LLMs and bias metrics.'"
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "While HolisticBias and WikiText-2 are public, the surrogate DNN training datasets (the sampled attention head configurations with their bias/perplexity values) are not released. These are the raw experimental data, and without them, independent verification of the surrogate DNN training is not possible."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Algorithm 1 describes the data collection procedure in detail: random state sampling, pruning, prompting with subsets of HolisticBias, computing bias and perplexity, repeating until time limit. Table 2 reports dataset sizes. Table 3 reports dataset statistics."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants are involved. The data sources are standard benchmarks (HolisticBias, WikiText-2)."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The full pipeline is documented: (1) random state sampling with bounds [n_l, n_u], (2) LLM pruning, (3) bias/perplexity computation on subsets, (4) dataset creation with ≥25,000 samples, (5) preprocessing (clamping, scaling, biased subsampling), (6) surrogate DNN training with 95-5 split, (7) SA search, (8) final evaluation on test data."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The Acknowledgments section states: 'This material is based upon work supported by the National Science Foundation under Grant No. CNS-2527657 and CNS-2230061.'"
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "All author affiliations are listed: Pennsylvania State University (Dasu, Rashid, Gupta, Tan) and University of Illinois Chicago (Tizpaz-Niari). The paper evaluates open-source models, not proprietary models from the authors' affiliated institutions."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The funder is the National Science Foundation, a U.S. government funding agency that has no financial interest in whether attention pruning reduces bias in specific LLMs."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "The paper does not evaluate pre-trained model capability on a benchmark. It evaluates a post-processing pruning technique's effect on bias and perplexity. The models' training cutoffs are irrelevant because the evaluation measures the effect of pruning, not the models' learned knowledge."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Same reasoning as training_cutoff_stated: the paper tests a pruning intervention, not pre-trained model knowledge. Train/test overlap for the LLMs' pre-training data is not relevant to the claims."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Same reasoning: the paper evaluates a bias mitigation technique, not LLM benchmark performance. Contamination of HolisticBias prompts in training data would not affect the validity of comparing pruned vs unpruned models on the same prompts."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants are involved in this study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants are involved in this study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants are involved in this study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants are involved in this study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants are involved in this study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants are involved in this study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants are involved in this study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": true,
    281         "justification": "The paper reports: one round of inference for Llama-2-7B takes ≈13 minutes on NVIDIA RTX A6000 (Section 4.1). Surrogate DNN inference takes ≈0.34 milliseconds on CPU, enabling ≈2,940 states/second, a 2,260,000× speedup. SA search time limit is 3 hours."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "The Limitations section states: 'the one-time offline data collection of 27,000 samples and training of surrogate DNNs takes ≈1900 GPU hours in total across multiple GPUs for Llama-2-7B.' Hardware details (10 NVIDIA RTX A6000 GPUs, 512GB RAM) are provided."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "Surrogate DNNs accurately predict the bias and perplexity of pruned LLMs with low mean squared error (MSE ≤ 0.01 in most cases).",
    293       "evidence": "Table 4 shows MSE values: bias DNN MSE ranges from 0.0038 (Distilgpt-2) to 0.0073 (GPT-J-6B); perplexity DNN MSE ranges from 0.0005 (Distilgpt-2) to 0.026 (GPT-Neo-1.3B). Only GPT-Neo-1.3B's perplexity DNN exceeds 0.01.",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "Attention Pruning achieves up to 40% reduction in gender bias compared to baseline LLMs.",
    298       "evidence": "Table 6: GPT-J-6B baseline bias is 0.446 ± 0.013; AP reduces it to 0.264 ± 0.01, a 40.8% relative improvement. AP achieves bias reduction across all 6 LLMs.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "AP outperforms the state-of-the-art FASP method by finding states with both lower bias and lower perplexity in 4 out of 6 LLMs.",
    303       "evidence": "Table 6 shows AP achieves better bias AND perplexity than FASP for GPT-2, GPT-Neo-1.3B, and GPT-J-6B; for Llama-2-7B, AP is better than FASP on bias only. For GPT-Neo-125M, AP has worse bias but better perplexity. For Distilgpt-2, AP has better bias but slightly worse perplexity than baseline.",
    304       "supported": "strong"
    305     },
    306     {
    307       "claim": "The improvement of AP over FASP is statistically significant with large effect sizes.",
    308       "evidence": "Discussion section: Cliff's delta = −0.968 (95% CI [−0.994, −0.847]) for GPT-J-6B and δ = −0.9392 (95% CI [−0.984, −0.785]) for Llama-2-7B. Mann-Whitney U test: p = 2.319 × 10^-9 and p = 6.539 × 10^-9 respectively, over 25 random seeds each.",
    309       "supported": "strong"
    310     },
    311     {
    312       "claim": "Reducing gender bias with AP also reduces other social biases (race, nationality, sexual orientation, age) by up to 65%.",
    313       "evidence": "Table 7 shows consistent reduction across all 6 LLMs and 4 additional bias types. Highest relative improvement is 65.21% for GPT-J-6B age bias (0.069 → 0.024).",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "Surrogate SA achieves a ~2,260,000× speedup over vanilla SA on LLMs.",
    318       "evidence": "Section 4.3: One round of inference on Llama-2-7B takes ~13 minutes on GPU. Surrogate DNN inference takes ~0.34 milliseconds on CPU, enabling ~2,940 states/second vs ~5 states/hour for vanilla SA.",
    319       "supported": "moderate"
    320     }
    321   ],
    322   "methodology_tags": [
    323     "benchmark-eval"
    324   ],
    325   "key_findings": "Attention Pruning (AP) uses surrogate deep neural networks to approximate the effect of pruning attention heads on LLM bias and perplexity, enabling efficient simulated annealing search over the combinatorial space of attention head configurations. AP achieves up to 40% reduction in gender bias while outperforming the state-of-the-art FASP pruning strategy in 4 out of 6 LLMs by finding configurations with both lower bias and lower perplexity. Reducing gender bias via AP also reduces other social biases (race, nationality, sexual orientation, age) by up to 65%. The surrogate approach provides a ~2.26M× speedup over direct SA search on the LLMs, making combinatorial optimization feasible for billion-parameter models.",
    326   "red_flags": [
    327     {
    328       "flag": "Fairness metric relies on automated toxicity classifier",
    329       "detail": "The bias metric is computed using a BERT-based toxicity classifier trained on the Jigsaw toxic comment dataset. This classifier itself may have biases, and the paper does not validate whether the toxicity scores accurately reflect human perceptions of bias. No human evaluation of the debiased outputs is performed."
    330     },
    331     {
    332       "flag": "Statistical tests only on 2 of 6 models",
    333       "detail": "Cliff's delta and Mann-Whitney U tests are reported only for GPT-J-6B and Llama-2-7B. The other 4 models, including GPT-Neo-125M where AP underperforms FASP on bias, do not have statistical significance tests."
    334     },
    335     {
    336       "flag": "Surrogate DNN training cost may limit practical applicability",
    337       "detail": "The one-time offline data collection and training takes ~1900 GPU hours for Llama-2-7B. This significant upfront cost is acknowledged but may limit the practical applicability of the approach to larger or different models."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "Fairness-aware structured pruning in transformers",
    343       "authors": ["Abdelrahman Zayed", "Gonçalo Mordido", "Samira Shabanian", "Ioana Baldini", "Sarath Chandar"],
    344       "year": 2024,
    345       "relevance": "State-of-the-art fairness-aware attention head pruning method that AP is compared against; directly relevant to LLM bias mitigation methodology."
    346     },
    347     {
    348       "title": "NeuFair: Neural Network Fairness Repair with Dropout",
    349       "authors": ["Vishnu Asutosh Dasu", "Ashish Kumar", "Saeid Tizpaz-Niari", "Gang Tan"],
    350       "year": 2024,
    351       "doi": "10.1145/3650212.3680380",
    352       "relevance": "Prior work on search-based fairness repair using simulated annealing for smaller DNNs; directly informs the AP approach."
    353     },
    354     {
    355       "title": "Information-Theoretic Testing and Debugging of Fairness Defects in Deep Neural Networks",
    356       "authors": ["Verya Monjezi", "Ashutosh Trivedi", "Gang Tan", "Saeid Tizpaz-Niari"],
    357       "year": 2023,
    358       "doi": "10.1109/ICSE48619.2023.00136",
    359       "relevance": "Fairness testing and debugging technique for DNNs using information theory; relevant to the SE fairness testing literature."
    360     },
    361     {
    362       "title": "Bias and fairness in large language models: A survey",
    363       "authors": ["Isabel O Gallegos", "Ryan A Rossi", "Joe Barrow"],
    364       "year": 2024,
    365       "relevance": "Comprehensive survey of bias and fairness in LLMs; provides context for the fairness mitigation landscape."
    366     },
    367     {
    368       "title": "\"I'm sorry to hear that\": Finding New Biases in Language Models with a Holistic Descriptor Dataset",
    369       "authors": ["Eric Michael Smith", "Melissa Hall", "Melanie Kambadur", "Eleonora Presani", "Adina Williams"],
    370       "year": 2022,
    371       "arxiv_id": "2205.09209",
    372       "relevance": "HolisticBias dataset and metric used as the primary fairness evaluation benchmark in this paper."
    373     },
    374     {
    375       "title": "BOLD: Dataset and Metrics for Measuring Biases in Open-Ended Language Generation",
    376       "authors": ["Jwala Dhamala", "Tony Sun", "Varun Kumar"],
    377       "year": 2021,
    378       "doi": "10.1145/3442188.3445924",
    379       "relevance": "Foundational bias evaluation dataset and methodology that HolisticBias builds upon."
    380     },
    381     {
    382       "title": "Are Sixteen Heads Really Better than One?",
    383       "authors": ["Paul Michel", "Omer Levy", "Graham Neubig"],
    384       "year": 2019,
    385       "arxiv_id": "1905.10650",
    386       "relevance": "Influential work on attention head pruning based on gradient magnitude; used as a baseline in this paper."
    387     },
    388     {
    389       "title": "Black Box Fairness Testing of Machine Learning Models",
    390       "authors": ["Aniya Aggarwal", "Pranay Lohia", "Seema Nagar", "Kuntal Dey", "Diptikalyan Saha"],
    391       "year": 2019,
    392       "doi": "10.1145/3338906.3338937",
    393       "relevance": "Early work on fairness testing in SE community; relevant to the systematic evaluation of ML model fairness."
    394     },
    395     {
    396       "title": "GPT-4 Technical Report",
    397       "authors": ["OpenAI"],
    398       "year": 2024,
    399       "arxiv_id": "2303.08774",
    400       "relevance": "Key reference for LLM capabilities; motivates the need for fairness repair in advanced language models."
    401     },
    402     {
    403       "title": "A genetic programming approach to automated software repair",
    404       "authors": ["Stephanie Forrest", "ThanhVu Nguyen", "Westley Weimer", "Claire Le Goues"],
    405       "year": 2009,
    406       "doi": "10.1145/1569901.1570031",
    407       "relevance": "Foundational work on search-based software repair that inspires the simulated annealing approach used in AP."
    408     },
    409     {
    410       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    411       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    412       "year": 2023,
    413       "arxiv_id": "2307.09288",
    414       "relevance": "One of the largest LLMs evaluated in the paper; relevant to understanding LLM capabilities and bias."
    415     }
    416   ]
    417 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs