Commit 42ba21de authored by Tim O'Donnell

remove obsolete downloads

parent 7159ba9b
Showing 8 additions and 843 deletions
#!/bin/bash
#
# Cross validation using the standard class I models.
# Splits training data into 5 folds (stratified by allele and binder status),
# trains and tests a predictor on each (train, test) fold, and writes a
# summary CSV giving performance for each allele on each fold.
#
set -e
set -x
DOWNLOAD_NAME=cross_validation_class1
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
NFOLDS=5
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml
cp $SCRIPT_DIR/split_folds.py .
cp $SCRIPT_DIR/score.py .
time python split_folds.py \
"$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
--min-measurements-per-allele 75 \
--folds $NFOLDS \
--random-state 1 \
--output-pattern-test "./test.fold_{}.csv" \
--output-pattern-train "./train.fold_{}.csv"
# Kill child processes if parent exits:
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT
for fold in $(seq 0 $((NFOLDS - 1)))
do
mhcflurry-class1-train-allele-specific-models \
--data train.fold_${fold}.csv \
--hyperparameters hyperparameters.yaml \
--out-models-dir models.fold_${fold} \
--min-measurements-per-allele 0 \
--num-jobs 8 \
--percent-rank-calibration-num-peptides-per-length 0 \
2>&1 | tee -a LOG.train.fold_${fold}.txt &
done
wait
echo "DONE TRAINING. NOW PREDICTING."
for fold in $(seq 0 $((NFOLDS - 1)))
do
mhcflurry-predict \
test.fold_${fold}.csv \
--models models.fold_${fold} \
--no-throw \
--include-individual-model-predictions \
--out predictions.fold_${fold}.csv &
done
wait
time python score.py \
predictions.fold_*.csv \
--out-combined predictions.combined.csv \
--out-scores scores.csv \
--out-summary summary.all.csv
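# "single" rows report individual-model performance; excluding them leaves
# only the ensemble results.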
grep -v single summary.all.csv > summary.ensemble.csv
cp $SCRIPT_ABSOLUTE_PATH .
for i in *.txt
do
    bzip2 "$i"
done
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Cross validation of standard Class I models
This download contains cross-validation results and intermediate data for
class I allele-specific MHCflurry models.
It exists to track the exact steps used to generate the cross-validation
results. Users will probably not interact with it directly.
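Once the archive is unpacked, the per-allele summary written by the script
above can be inspected with pandas. A minimal sketch (the filename and columns
follow score.py and the --out-summary argument used above):

    import pandas
    # Columns: allele, kind, train_size, test_size, auc, f1, tau.
    summary = pandas.read_csv("summary.all.csv")
    print(summary.groupby("kind")[["auc", "f1", "tau"]].mean())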
../models_class1/generate_hyperparameters.py
"""
Scoring script for cross-validation.
"""
import argparse
import sys
import collections
import pandas
from mhcflurry.scoring import make_scores
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"input", metavar="INPUT.csv", help="Input CSV", nargs="+")
parser.add_argument(
"--out-scores",
metavar="RESULT.csv")
parser.add_argument(
"--out-combined",
metavar="COMBINED.csv")
parser.add_argument(
"--out-summary",
metavar="RESULT.csv")
def run(argv):
    args = parser.parse_args(argv)
    df = None
    for (i, filename) in enumerate(args.input):
        input_df = pandas.read_csv(filename)
        assert not input_df.mhcflurry_prediction.isnull().any()
        cols_to_merge = []
        input_df["prediction_%d" % i] = input_df.mhcflurry_prediction
        cols_to_merge.append(input_df.columns[-1])
        if 'mhcflurry_model_single_0' in input_df.columns:
            input_df["prediction_single_%d" % i] = input_df.mhcflurry_model_single_0
            cols_to_merge.append(input_df.columns[-1])
        if df is None:
            df = input_df[
                ["allele", "peptide", "measurement_value"] + cols_to_merge
            ].copy()
        else:
            df = pandas.merge(
                df,
                input_df[['allele', 'peptide'] + cols_to_merge],
                on=['allele', 'peptide'],
                how='outer')
    print("Loaded data:")
    print(df.head(5))
    if args.out_combined:
        df.to_csv(args.out_combined, index=False)
        print("Wrote: %s" % args.out_combined)
    prediction_cols = [
        c
        for c in df.columns
        if c.startswith("prediction_")
    ]
    scores_rows = []
    for (allele, allele_df) in df.groupby("allele"):
        for prediction_col in prediction_cols:
            sub_df = allele_df.loc[~allele_df[prediction_col].isnull()]
            scores = collections.OrderedDict()
            scores['allele'] = allele
            scores['fold'] = prediction_col.replace("prediction_", "").replace("single_", "")
            scores['kind'] = "single" if "single" in prediction_col else "ensemble"
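            # Rows missing a prediction for this fold were not in its test
            # split, so their count gives the fold's training-set size.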
            scores['train_size'] = allele_df[prediction_col].isnull().sum()
            scores['test_size'] = len(sub_df)
            # make_scores returns a dict with entries "auc", "f1", "tau"
            scores.update(
                make_scores(
                    sub_df.measurement_value, sub_df[prediction_col]))
            scores_rows.append(scores)
    scores_df = pandas.DataFrame(scores_rows)
    print(scores_df)
    if args.out_scores:
        scores_df.to_csv(args.out_scores, index=False)
        print("Wrote: %s" % args.out_scores)
    summary_df = scores_df.groupby(["allele", "kind"])[
        ["train_size", "test_size", "auc", "f1", "tau"]
    ].mean().reset_index()
    print("Summary:")
    print(summary_df)
    if args.out_summary:
        summary_df.to_csv(args.out_summary, index=False)
        print("Wrote: %s" % args.out_summary)

if __name__ == '__main__':
    run(sys.argv[1:])
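make_scores is imported from mhcflurry.scoring above. For readers without the
package at hand, the following is a minimal, hypothetical stand-in sketching
comparable metrics; the 500 nM binder cutoff, the negation of predictions for
ranking, and the function name are assumptions, not the actual mhcflurry
implementation.

import numpy
from scipy import stats
import sklearn.metrics

def make_scores_sketch(measurements, predictions, threshold=500.0):
    # Hypothetical illustration only; mhcflurry.scoring.make_scores may differ.
    # Affinities are in nM, so lower values mean stronger binding; predictions
    # are negated to act as a "higher is better" score for AUC.
    measurements = numpy.asarray(measurements, dtype=float)
    predictions = numpy.asarray(predictions, dtype=float)
    true_binder = measurements < threshold
    return {
        "auc": sklearn.metrics.roc_auc_score(true_binder, -predictions),
        "f1": sklearn.metrics.f1_score(true_binder, predictions < threshold),
        "tau": stats.kendalltau(measurements, predictions)[0],
    }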
"""
Split training data into CV folds.
"""
import argparse
import sys
from os.path import abspath
import pandas
import numpy
from sklearn.model_selection import StratifiedKFold
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"input", metavar="INPUT.csv", help="Input CSV")
parser.add_argument(
"--folds", metavar="N", type=int, default=5)
parser.add_argument(
"--allele",
nargs="+",
help="Include only the specified allele(s)")
parser.add_argument(
"--min-measurements-per-allele",
type=int,
metavar="N",
help="Use only alleles with >=N measurements.")
parser.add_argument(
"--subsample",
type=int,
metavar="N",
help="Subsample to first N rows")
parser.add_argument(
"--random-state",
metavar="N",
type=int,
help="Specify an int for deterministic splitting")
parser.add_argument(
"--output-pattern-train",
default="./train.fold_{}.csv",
help="Pattern to use to generate output filename. Default: %(default)s")
parser.add_argument(
"--output-pattern-test",
default="./test.fold_{}.csv",
help="Pattern to use to generate output filename. Default: %(default)s")
def run(argv):
    args = parser.parse_args(argv)
    df = pandas.read_csv(args.input)
    print("Loaded data with shape: %s" % str(df.shape))
    df = df.loc[
        (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
    ]
    print("Subselected to 8-15mers: %s" % (str(df.shape)))
    allele_counts = df.allele.value_counts()
    if args.allele:
        alleles = args.allele
    else:
        alleles = list(
            allele_counts.loc[
                allele_counts >= args.min_measurements_per_allele
            ].index)
    df = df.loc[df.allele.isin(alleles)].copy()
    print("Potentially subselected by allele to: %s" % str(df.shape))
    print("Data has %d alleles: %s" % (
        df.allele.nunique(), " ".join(df.allele.unique())))
    print(df.head())
    # Take the median in log space so that, for peptide/allele pairs with an
    # even number of measurements, the two middle values are averaged
    # geometrically rather than arithmetically.
    df["measurement_value"] = numpy.log1p(df.measurement_value)
    df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index()
    df["measurement_value"] = numpy.expm1(df.measurement_value)
    print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape))
    print(df.head())
    if args.subsample:
        df = df.head(args.subsample)
        print("Subsampled to: %s" % str(df.shape))
    kf = StratifiedKFold(
        n_splits=args.folds,
        shuffle=True,
        random_state=args.random_state)
    # Stratify by both allele and binder vs. nonbinder.
    df["key"] = [
        "%s_%s" % (
            row.allele,
            "binder" if row.measurement_value < 500 else "nonbinder")
        for (_, row) in df.iterrows()
    ]
    for i, (train, test) in enumerate(kf.split(df, df.key)):
        train_filename = args.output_pattern_train.format(i)
        test_filename = args.output_pattern_test.format(i)
        df.iloc[train].to_csv(train_filename, index=False)
        print("Wrote: %s" % abspath(train_filename))
        df.iloc[test].to_csv(test_filename, index=False)
        print("Wrote: %s" % abspath(test_filename))

if __name__ == '__main__':
    run(sys.argv[1:])
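The combined allele/binder "key" column above makes StratifiedKFold balance
both factors at once: each fold receives roughly its proportional share of
binders and nonbinders for every allele. A small self-contained illustration
with synthetic data (not part of the pipeline):

import pandas
from sklearn.model_selection import StratifiedKFold

toy = pandas.DataFrame({
    "allele": ["HLA-A*02:01"] * 60 + ["HLA-B*07:02"] * 40,
    "measurement_value": [100.0] * 30 + [5000.0] * 30 + [100.0] * 10 + [5000.0] * 30,
})
toy["key"] = toy.allele + "_" + (toy.measurement_value < 500).map(
    {True: "binder", False: "nonbinder"})
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for i, (train, test) in enumerate(kf.split(toy, toy.key)):
    # Every test fold preserves the overall proportions of all four strata.
    print(i, toy.iloc[test].key.value_counts().to_dict())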
#!/bin/bash
#
# Train "experimental" models using various hyperparameter combinations.
# This trains models only for a small number of alleles for which we have good
# mass-spec validation data.
#
set -e
set -x
DOWNLOAD_NAME=models_class1_experiments1
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
# Terminate children on exit
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
ALLELES="HLA-A*01:01 HLA-A*02:01 HLA-A*02:03 HLA-A*02:07 HLA-A*03:01 HLA-A*11:01 HLA-A*24:02 HLA-A*29:02 HLA-A*31:01 HLA-A*68:02 HLA-B*07:02 HLA-B*15:01 HLA-B*35:01 HLA-B*44:02 HLA-B*44:03 HLA-B*51:01 HLA-B*54:01 HLA-B*57:01"
# Standard architecture on quantitative only
cp $SCRIPT_DIR/hyperparameters-standard.yaml .
mkdir models-standard-quantitative
time mhcflurry-class1-train-allele-specific-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
--only-quantitative \
--hyperparameters hyperparameters-standard.yaml \
--out-models-dir models-standard-quantitative \
--percent-rank-calibration-num-peptides-per-length 0 \
--allele $ALLELES 2>&1 | tee -a LOG.standard.txt &
# Model variations on qualitative + quantitative
for mod in 0local_noL1 0local 2local widelocal dense8 dense32 noL1 onehot embedding
do
cp $SCRIPT_DIR/hyperparameters-${mod}.yaml .
mkdir models-${mod}
time mhcflurry-class1-train-allele-specific-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
--hyperparameters hyperparameters-${mod}.yaml \
--out-models-dir models-${mod} \
--percent-rank-calibration-num-peptides-per-length 0 \
--allele $ALLELES 2>&1 | tee -a LOG.${mod}.txt &
done
wait
cp $SCRIPT_ABSOLUTE_PATH .
for i in *.txt
do
    bzip2 "$i"
done
tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" *
echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2"
# Experimental class I allele-specific models (ensemble)
This download contains trained MHC Class I allele-specific MHCflurry models
using a variety of experimental architectures. These were generated for a
publication and are not intended for production use.
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
16
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
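Despite the JSON-like flow syntax, these hyperparameter files are valid YAML
(the "#" comments included) and parse to a list of model specifications. A
minimal loading sketch, assuming PyYAML is installed and using one of the
filenames referenced in the script above:

import yaml

with open("hyperparameters-standard.yaml") as fd:
    specs = yaml.safe_load(fd)
# One dict per architecture; each describes an ensemble of n_models networks.
print(specs[0]["n_models"], specs[0]["peptide_amino_acid_encoding"])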
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
16
],
"dense_layer_l1_regularization": 0.0,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
},
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
16
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
32
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
8
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "embedding",
"use_embedding": true, # maintained for backward compatability
"embedding_output_dim": 8, # only used if using embedding
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
16
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
16
],
"dense_layer_l1_regularization": 0.0,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "one-hot",
"use_embedding": false, # maintained for backward compatability
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
16
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
../models_class1/hyperparameters.yaml
[{
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 8,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 10,
"early_stopping": true,
"validation_split": 0.2,
"minibatch_size": 128,
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.0,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": false, # maintained for backward compatability
"embedding_output_dim": 8, # only used if using embedding
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 5
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [
16
],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": false,
"dropout_probability": 0.0,
}]
@@ -10,6 +10,7 @@ import traceback
 import random
 from functools import partial
+import numpy
 import pandas
 import yaml
 from sklearn.metrics.pairwise import cosine_similarity
@@ -413,6 +414,8 @@ def train_model(
 def subselect_df_held_out(df, recriprocal_held_out_fraction=10, seed=0):
+    df["allele_peptide"] = df.allele + "_" + df.peptide
     kf = StratifiedKFold(
         n_splits=recriprocal_held_out_fraction,
         shuffle=True,
@@ -425,8 +428,12 @@ def subselect_df_held_out(df, recriprocal_held_out_fraction=10, seed=0):
             "binder" if row.measurement_value <= 500 else "nonbinder")
         for (_, row) in df.iterrows()
     ]
     (train, test) = next(kf.split(df, df.key))
-    return df.iloc[train]
+    selected_allele_peptides = df.iloc[train].allele_peptide.unique()
+    result_df = df.loc[df.allele_peptide.isin(selected_allele_peptides)].copy()
+    del result_df["allele_peptide"]
+    return result_df
 if __name__ == '__main__':
     run()