diff --git a/downloads-generation/models_class1_pan_refined/GENERATE.sh b/downloads-generation/models_class1_pan_refined/GENERATE.sh
index 17d63e6fbb5175efcf19faa198fbe2f277d2e64d..a0cf2a4b7ba958a0c9cbb20b216561e3665d8513 100755
--- a/downloads-generation/models_class1_pan_refined/GENERATE.sh
+++ b/downloads-generation/models_class1_pan_refined/GENERATE.sh
@@ -98,7 +98,6 @@ else
         --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
         --decoys-per-hit 1 \
-        --out train.multiallelic.csv \
-        --alleles "HLA-A*02:01" "HLA-B*27:01" "HLA-C*07:01" "HLA-A*03:01" "HLA-B*15:01" "HLA-C*01:02"
+        --out train.multiallelic.csv
 fi
 
 ALLELE_LIST=$(bzcat "$MONOALLELIC_TRAIN" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
@@ -113,6 +112,7 @@ time mhcflurry-multiallelic-refinement \
     --out-affinity-predictor-dir $(pwd)/models.affinity \
     --out-presentation-predictor-dir $(pwd)/models.presentation \
     --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+    --only-alleles-with-mass-spec \
     $PARALLELISM_ARGS
 
 time mhcflurry-calibrate-percentile-ranks \
diff --git a/mhcflurry/multiallelic_refinement_command.py b/mhcflurry/multiallelic_refinement_command.py
index ac302861f67e30075375763068b4ad58a8566830..63371e58d18929cd768c4595b1dbab5d24c5ac90 100644
--- a/mhcflurry/multiallelic_refinement_command.py
+++ b/mhcflurry/multiallelic_refinement_command.py
@@ -72,6 +72,11 @@ parser.add_argument(
     "--max-models",
     type=int,
     default=None)
+parser.add_argument(
+    "--only-alleles-with-mass-spec",
+    action="store_true",
+    default=False,
+    help="Restrict monoallelic data to alleles present in multiallelic data")
 parser.add_argument(
     "--verbosity",
     type=int,
@@ -106,6 +111,25 @@ def run(argv=sys.argv[1:]):
     monoallelic_df = pandas.read_csv(args.monoallelic_data)
     print("Loaded monoallelic data: %s" % (str(monoallelic_df.shape)))
 
+    if args.only_alleles_with_mass_spec:
+        # Each multiallelic "hla" value is a whitespace-separated allele list;
+        # keep only monoallelic rows whose allele occurs in one of them.
+        multiallelic_alleles = set()
+        for hla in multiallelic_df.hla.unique():
+            multiallelic_alleles.update(hla.split())
+        print(
+            "Multiallelic alleles (%d)" % len(multiallelic_alleles),
+            multiallelic_alleles)
+        new_monoallelic_df = monoallelic_df.loc[
+            monoallelic_df.allele.isin(multiallelic_alleles)
+        ].copy()
+        print(
+            "Allele selection reduced monoallelic data from",
+            len(monoallelic_df),
+            "to",
+            len(new_monoallelic_df))
+        monoallelic_df = new_monoallelic_df
+
     input_predictor = Class1AffinityPredictor.load(
         args.models_dir, optimization_level=0, max_models=args.max_models)
     print("Loaded: %s" % input_predictor)