Skip to content
Snippets Groups Projects
Commit e4b4dbb0 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

calibrate percentile ranks for all alleles

parent 9cce08f2
No related branches found
No related tags found
No related merge requests found
...@@ -99,11 +99,10 @@ for kind in combined ...@@ -99,11 +99,10 @@ for kind in combined
do do
MODELS_DIR="models.unselected.${kind}" MODELS_DIR="models.unselected.${kind}"
# For now we calibrate percentile ranks only for alleles for which there # Older method calibrated only particular alleles. We are now calibrating
# is training data. Calibrating all alleles would be too slow. # all alleles, so this is commented out.
# This could be improved though. #ALLELE_LIST=$(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
ALLELE_LIST=$(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) #ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
mhcflurry-class1-select-pan-allele-models \ mhcflurry-class1-select-pan-allele-models \
--data "$MODELS_DIR/train_data.csv.bz2" \ --data "$MODELS_DIR/train_data.csv.bz2" \
...@@ -114,17 +113,17 @@ do ...@@ -114,17 +113,17 @@ do
$PARALLELISM_ARGS $PARALLELISM_ARGS
cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/train_data.csv.bz2" cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/train_data.csv.bz2"
# For now we calibrate percentile ranks only for alleles for which there # We are now calibrating all alleles.
# is training data. Calibrating all alleles would be too slow. # Previously had argument: --allele $ALLELE_LIST \
# This could be improved though.
time mhcflurry-calibrate-percentile-ranks \ time mhcflurry-calibrate-percentile-ranks \
--models-dir models.${kind} \ --models-dir models.${kind} \
--match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \ --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
--motif-summary \ --motif-summary \
--num-peptides-per-length 100000 \ --num-peptides-per-length 100000 \
--allele $ALLELE_LIST \ --alleles-per-work-chunk 10 \
--verbosity 1 \ --verbosity 1 \
$PARALLELISM_ARGS $PARALLELISM_ARGS
done done
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
......
...@@ -85,6 +85,12 @@ parser.add_argument( ...@@ -85,6 +85,12 @@ parser.add_argument(
type=int, type=int,
default=4096, default=4096,
help="Keras batch size for predictions") help="Keras batch size for predictions")
# Batching several alleles into one work chunk amortizes per-task dispatch
# overhead when calibrating many alleles in parallel; 1 preserves the old
# one-allele-per-task behavior.
parser.add_argument(
    "--alleles-per-work-chunk",
    type=int,
    metavar="N",
    default=1,
    help="Number of alleles per work chunk. Default: %(default)s.")
parser.add_argument( parser.add_argument(
"--verbosity", "--verbosity",
type=int, type=int,
...@@ -120,7 +126,26 @@ def run(argv=sys.argv[1:]): ...@@ -120,7 +126,26 @@ def run(argv=sys.argv[1:]):
else: else:
alleles = predictor.supported_alleles alleles = predictor.supported_alleles
alleles = sorted(set(alleles)) allele_set = set(alleles)
if predictor.allele_to_sequence:
# Remove alleles that have the same sequence.
new_allele_set = set()
sequence_to_allele = collections.defaultdict(set)
for allele in list(allele_set):
sequence_to_allele[predictor.allele_to_sequence[allele]].add(allele)
for equivalent_alleles in sequence_to_allele.values():
equivalent_alleles = sorted(equivalent_alleles)
keep = equivalent_alleles.pop(0)
new_allele_set.add(keep)
print(
"Sequence comparison reduced num alleles from",
len(allele_set),
"to",
len(new_allele_set))
allele_set = new_allele_set
alleles = sorted(allele_set)
distribution = None distribution = None
if args.match_amino_acid_distribution_data: if args.match_amino_acid_distribution_data:
...@@ -171,7 +196,14 @@ def run(argv=sys.argv[1:]): ...@@ -171,7 +196,14 @@ def run(argv=sys.argv[1:]):
serial_run = not args.cluster_parallelism and args.num_jobs == 0 serial_run = not args.cluster_parallelism and args.num_jobs == 0
worker_pool = None worker_pool = None
start = time.time() start = time.time()
work_items = [{"allele": allele} for allele in alleles]
work_items = []
for allele in alleles:
if not work_items or len(
work_items[-1]['alleles']) >= args.alleles_per_work_chunk:
work_items.append({"alleles": []})
work_items[-1]['alleles'].append(allele)
if serial_run: if serial_run:
# Serial run # Serial run
print("Running in serial.") print("Running in serial.")
...@@ -197,12 +229,13 @@ def run(argv=sys.argv[1:]): ...@@ -197,12 +229,13 @@ def run(argv=sys.argv[1:]):
chunksize=1) chunksize=1)
summary_results_lists = collections.defaultdict(list) summary_results_lists = collections.defaultdict(list)
for (transforms, summary_results) in tqdm.tqdm(results, total=len(work_items)): for work_item in tqdm.tqdm(results, total=len(work_items)):
predictor.allele_to_percent_rank_transform.update(transforms) for (transforms, summary_results) in work_item:
if summary_results is not None: predictor.allele_to_percent_rank_transform.update(transforms)
for (item, value) in summary_results.items(): if summary_results is not None:
summary_results_lists[item].append(value) for (item, value) in summary_results.items():
print("Done calibrating %d alleles." % len(work_items)) summary_results_lists[item].append(value)
print("Done calibrating %d alleles." % len(alleles))
if summary_results_lists: if summary_results_lists:
for (name, lst) in summary_results_lists.items(): for (name, lst) in summary_results_lists.items():
df = pandas.concat(lst, ignore_index=True) df = pandas.concat(lst, ignore_index=True)
...@@ -223,12 +256,17 @@ def run(argv=sys.argv[1:]): ...@@ -223,12 +256,17 @@ def run(argv=sys.argv[1:]):
print("Predictor written to: %s" % args.models_dir) print("Predictor written to: %s" % args.models_dir)
def do_calibrate_percentile_ranks(alleles, constant_data=GLOBAL_DATA):
    """
    Worker entry point: calibrate percentile ranks for a chunk of alleles.

    Parameters
    ----------
    alleles : list of string
        Alleles to process in this work chunk.
    constant_data : dict
        Shared worker state; this function reads the 'predictor',
        'calibration_peptides', and 'args' keys.

    Returns
    -------
    list
        One result per allele, each as returned by
        `calibrate_percentile_ranks` (a (transforms, summary_results) pair).
    """
    result_list = []
    for (i, allele) in enumerate(alleles):
        # Progress output is useful since a chunk may take a long time.
        print("Processing allele", i + 1, "of", len(alleles))
        result_item = calibrate_percentile_ranks(
            allele,
            constant_data['predictor'],
            peptides=constant_data['calibration_peptides'],
            **constant_data["args"])
        result_list.append(result_item)
    return result_list
def calibrate_percentile_ranks( def calibrate_percentile_ranks(
......
...@@ -897,6 +897,20 @@ class Class1AffinityPredictor(object): ...@@ -897,6 +897,20 @@ class Class1AffinityPredictor(object):
transform = self.allele_to_percent_rank_transform[normalized_allele] transform = self.allele_to_percent_rank_transform[normalized_allele]
return transform.transform(affinities) return transform.transform(affinities)
except KeyError: except KeyError:
if self.allele_to_sequence:
# See if we have information for an equivalent allele
sequence = self.allele_to_sequence[normalized_allele]
other_alleles = [
other_allele for (other_allele, other_sequence)
in self.allele_to_sequence.items()
if other_sequence == sequence
]
for other_allele in other_alleles:
if other_allele in self.allele_to_percent_rank_transform:
transform = self.allele_to_percent_rank_transform[
other_allele]
return transform.transform(affinities)
msg = "Allele %s has no percentile rank information" % ( msg = "Allele %s has no percentile rank information" % (
allele + ( allele + (
"" if allele == normalized_allele "" if allele == normalized_allele
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment