diff --git a/downloads-generation/allele_sequences/GENERATE.sh b/downloads-generation/allele_sequences/GENERATE.sh index 47791a8bdd16664229cb2d0059bcc89071021cc4..b87d8a36f99c296de0966d54959ac87456a266fa 100755 --- a/downloads-generation/allele_sequences/GENERATE.sh +++ b/downloads-generation/allele_sequences/GENERATE.sh @@ -40,10 +40,9 @@ cp $SCRIPT_ABSOLUTE_PATH . # Training data is used to decide which additional positions to include in the # allele sequences to differentiate alleles that have identical traditional # pseudosequences but have associated training data -TRAINING_DATA="$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" +TRAINING_DATA="$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" -bzcat "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ - | cut -f 1 -d , | uniq | sort | uniq | grep -v allele > training_data.alleles.txt +python select_alleles_to_disambiguate.py "$TRAINING_DATA" --out training_data.alleles.txt # Human wget -q ftp://ftp.ebi.ac.uk/pub/databases/ipd/imgt/hla/fasta/A_prot.fasta diff --git a/downloads-generation/allele_sequences/make_allele_sequences.py b/downloads-generation/allele_sequences/make_allele_sequences.py index cfd87efca5101b0ff58ab6c0f4601180c09b367e..a15023e98c7f832b16195e454210973137f9b64a 100644 --- a/downloads-generation/allele_sequences/make_allele_sequences.py +++ b/downloads-generation/allele_sequences/make_allele_sequences.py @@ -141,22 +141,89 @@ def run(): "Read %d alleles to differentiate:" % len(differentiate_alleles), differentiate_alleles) - allele_sequences_to_differentiate = allele_sequences.loc[ + to_differentiate = allele_sequences.loc[ allele_sequences.index.isin(differentiate_alleles) - ] - print(allele_sequences_to_differentiate.shape) + ].copy() + print(to_differentiate.shape) additional_positions = [] - for (_, sub_df) in allele_sequences_to_differentiate.groupby("recapitulated"): + + # Greedy search, looking ahead 3 positions at a time. + possible_additional_positions = set() + for (_, sub_df) in to_differentiate.groupby("recapitulated"): if sub_df.aligned.nunique() > 1: differing = pandas.DataFrame( dict([(pos, chars) for (pos, chars) in enumerate(zip(*sub_df.aligned.values)) if any(c != chars[0] for c in chars) and "X" not in chars])).T - print(sub_df) - print(differing) - print() - additional_positions.extend(differing.index) + possible_additional_positions.update(differing.index.values) + + def disambiguation_score(sequences): + counts = pandas.Series(sequences, copy=False).value_counts() + score = -1 * (counts[counts > 1] - 1).sum() + return score + + possible_additional_positions = sorted(possible_additional_positions) + current_sequences = to_differentiate.recapitulated + while current_sequences.value_counts().max() > 1: + to_differentiate["equivalence_class_size"] = ( + current_sequences.map(current_sequences.value_counts()) + ) + print("Ambiguous alleles", " ".join( + to_differentiate.loc[ + to_differentiate.equivalence_class_size > 1 + ].index)) + position1s = [] + position2s = [] + position3s = [] + negative_position1_distances = [] + possible_additional_positions_scores = [] + position1_scores = [] + for position1 in possible_additional_positions: + new_sequence1 = ( + current_sequences + + to_differentiate.aligned.str.get(position1)) + negative_position1_distance = -1 * min( + abs(position1 - selected) for selected in selected_positions) + position1_score = disambiguation_score(new_sequence1) + + for (i, position2) in enumerate(possible_additional_positions): + new_sequence2 = ( + new_sequence1 + + to_differentiate.aligned.str.get(position2)) + for position3 in possible_additional_positions: + new_sequence3 = ( + new_sequence2 + + to_differentiate.aligned.str.get(position3)) + + score = disambiguation_score(new_sequence3) + position1s.append(position1) + position2s.append(position2) + position3s.append(position3) + possible_additional_positions_scores.append(score) + negative_position1_distances.append( + negative_position1_distance) + position1_scores.append(position1_score) + + scores_df = pandas.DataFrame({ + "position1": position1s, + "position2": position2s, + "position3": position3s, + "negative_position1_distance": negative_position1_distances, + "tuple_score": possible_additional_positions_scores, + "position1_score": position1_scores, + }).sort_values( + ["tuple_score", "position1_score", "negative_position1_distance"], + ascending=False) + print(scores_df) + selected_additional_position = scores_df.iloc[0].position1 + print("Selected additional position", selected_additional_position) + additional_positions.append(selected_additional_position) + current_sequences = ( + current_sequences + + to_differentiate.aligned.str.get( + selected_additional_position)) + possible_additional_positions.remove(selected_additional_position) additional_positions = sorted(set(additional_positions)) print( diff --git a/downloads-generation/allele_sequences/select_alleles_to_disambiguate.py b/downloads-generation/allele_sequences/select_alleles_to_disambiguate.py new file mode 100644 index 0000000000000000000000000000000000000000..649df45b31d860df42d0d1275b6979b4cd811de1 --- /dev/null +++ b/downloads-generation/allele_sequences/select_alleles_to_disambiguate.py @@ -0,0 +1,45 @@ +""" +Select alleles to disambiguate + +""" +from __future__ import print_function + +import sys +import argparse + +import pandas + +parser = argparse.ArgumentParser(usage=__doc__) + +parser.add_argument( + "train_data", + help="Path to training data CSV. Must have column: allele") +parser.add_argument( + "--min-count", + type=int, + default=50, + metavar="N", + help="Keep only alleles with at least N measurements") +parser.add_argument( + "--out", + help="Result file.") + + +def run(): + args = parser.parse_args(sys.argv[1:]) + print(args) + + df = pandas.read_csv(args.train_data) + if args.min_count: + allele_counts = df.allele.value_counts() + df = df.loc[ + df.allele.map(allele_counts) > args.min_count + ] + + df.drop_duplicates("allele").allele.to_csv( + args.out, header=False, index=False) + print("Wrote: ", args.out) + + +if __name__ == '__main__': + run()