Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions 1_sequence_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
Description:
CLI for sequence prediction using the Sei deep learning model,
given an input FASTA or BED file.
Outputs Sei chromatin profile predictions.

Usage:
1_sequence_prediction.py <seq-input> <output-dir> [--genome=<hg>] [--cuda]
1_sequence_prediction.py -h | --help

Options:
-h --help Show this screen.
<seq-input> Input FASTA or BED file.
<output-dir> Output directory
--genome=<hg> If <seq-input> is a BED file, specify the reference
genome hg38 or hg19 [default: hg19]
--cuda Run variant effect prediction on a CUDA-enabled
GPU

"""
import os

from docopt import docopt

from selene_sdk.sequences import Genome
from selene_sdk.utils import load_path
from selene_sdk.utils import parse_configs_and_run


def _finditem(obj, val):
for k, v in obj.items():
if hasattr(v, 'keywords'):
_finditem(v.keywords, val)
elif isinstance(v, dict):
_finditem(v, val)
elif isinstance(v, str) and '<PATH>' in v:
obj[k] = v.replace('<PATH>', val)


if __name__ == "__main__":
arguments = docopt(
__doc__,
version='0.0.0')

seq_input = arguments["<seq-input>"]

os.makedirs(arguments["<output-dir>"], exist_ok=True)

# Assumes that the `models` directory is in the same directory as this
# script. Please update this line if not.
use_dir = os.path.dirname(os.path.abspath(__file__))
use_cuda = arguments["--cuda"]

sei_out = os.path.join(arguments["<output-dir>"], "chromatin-profiles-hdf5")
os.makedirs(sei_out, exist_ok=True)

configs = load_path("./model/sei_seq_prediction.yml", instantiate=False)
_finditem(configs, use_dir)

configs["prediction"].update(input_path=seq_input, output_dir=sei_out)
configs["analyze_sequences"].bind(use_cuda=use_cuda)

# Assumes BED file coordinates input if file doesn't end with .fa or .fasta
if not seq_input.endswith('.fa') and not seq_input.endswith('.fasta'):
hg_version = arguments["--genome"]
genome = None
if hg_version == 'hg38' or hg_version == 'hg19':
genome = Genome(
os.path.join('.', 'resources', '{0}_UCSC.fa'.format(hg_version)))
configs["analyze_sequences"].bind(reference_sequence=genome)
else:
raise ValueError("--genome=<hg> must be 'hg19' or 'hg38'")
parse_configs_and_run(configs)

49 changes: 49 additions & 0 deletions 1_sequence_prediction.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash

#####################################################################
# Example script for running Sei deep learning model sequence
# prediction with Selene

# Usage:
# sh 1_sequence_prediction.sh <input-file> <genome> <output-dir> --cuda

# Please only specify hg38 or hg19 as input for <genome> if <input-file> is
# a BED file. If you are using a FASTA file of sequences, you can specify
# whatever genome version you wish for your own reference (will be printed
# as part of output for this script) but do not leave it empty.

# --cuda is optional, use if you are running on a CUDA-enabled
# GPU machine (see example_slurm_scripts/1_example_seqpred.slurm_gpu.sh)
#####################################################################

set -o errexit
set -o pipefail
set -o nounset

input_filepath="${1:-}"
hg_version="${2:-}"
out_dir="${3:-}"
cuda="${4:-}"

mkdir -p $out_dir

input_basename=$(basename $input_filepath)
cp $input_filepath $out_dir/

echo "Input argments: $input_filepath $out_dir $hg_version $cuda"

if [ "$cuda" = "--cuda" ]
then
echo "use_cuda: True"
python -u 1_sequence_prediction.py \
"$out_dir/$input_basename" \
$out_dir \
--genome=${hg_version} \
--cuda
else
echo "use_cuda: False"
python -u 1_sequence_prediction.py \
"$out_dir/$input_basename" \
$out_dir \
--genome=${hg_version}
fi
11 changes: 6 additions & 5 deletions vep_cli.py → 1_variant_effect_prediction.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
"""
Description:
CLI for variant effect prediction using the Sei deep learning model.
Outputs both Sei chromatin profile predictions and sequence class scores.
CLI for variant effect prediction using the Sei deep learning model,
given an input VCF file.
Outputs Sei chromatin profile predictions.

Usage:
vep_cli.py <vcf> <output-dir> [--genome=<hg>] [--cuda]
vep_cli.py -h | --help
1_variant_effect_prediction.py <vcf> <output-dir> [--genome=<hg>] [--cuda]
1_variant_effect_prediction.py -h | --help

Options:
-h --help Show this screen.
Expand Down Expand Up @@ -69,5 +70,5 @@ def run_config(config_yml, output_dir):

sei_out = os.path.join(arguments["<output-dir>"], "chromatin-profiles-hdf5")
os.makedirs(sei_out, exist_ok=True)
run_config("./model/sei_prediction.yml", sei_out)
run_config("./model/sei_varianteffect_prediction.yml", sei_out)

29 changes: 16 additions & 13 deletions run_pipeline.sh → 1_variant_effect_prediction.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#!/usr/bin/env bash

###############################################################
# Example script for running the variant effect prediction
# pipeline for Sei and sequence classes.
# Example script for running Sei variant effect prediction
# using Selene.

# sh run_pipeline.sh <vcf> <hg> <output-dir> --cuda
# Usage:
# sh 1_variant_effect_prediction.sh <vcf> <hg> <output-dir> [--cuda]

# Please only specify hg38 or hg19 as input for <hg>.

# --cuda is optional, use if you are running on a CUDA-enabled
# GPU machine
# GPU machine (see example_slurm_scripts/1_example_vep.slurm_gpu.sh)
###############################################################

set -o errexit
Expand All @@ -23,21 +24,23 @@ cuda="${4:-}"

mkdir -p $outdir

echo "Input arguments: $vcf_filepath $hg_version $outdir $cuda"

vcf_basename=$(basename $vcf_filepath)
cp $vcf_filepath $outdir/

if [ "$cuda" = "--cuda" ]
then
echo "use_cuda: True"
python vep_cli.py "$outdir/$vcf_basename" \
$outdir \
--genome=${hg_version} \
--cuda
python -u 1_variant_effect_prediction.py \
"$outdir/$vcf_basename" \
$outdir \
--genome=${hg_version} \
--cuda
else
echo "use_cuda: False"
python vep_cli.py "$outdir/$vcf_basename" \
$outdir \
--genome=${hg_version}
python -u 1_variant_effect_prediction.py \
"$outdir/$vcf_basename" \
$outdir \
--genome=${hg_version}
fi

python sequence_class.py $outdir "$vcf_basename"
67 changes: 67 additions & 0 deletions 2_raw_sc_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""
Description:
This script is run after `1_sequence_prediction.py`. It computes the
raw sequence class scores based on Sei chromatin profile predictions
of sequences (i.e. no variants).

Usage:
2_raw_sc_score.py <input-fp> <output-dir>
[--out-name=<out-name>]
2_raw_sc_score.py -h | --help

Options:
<input-fp> Path to Sei sequence predictions to compute raw
sequence class projection scores (NO variant effect).
<output-dir> The directory to output the sequence class scores
as an .NPY file.
--out-name=<out-name> Specify an output filename prefix. Otherwise, output
filenames will be based on <input-fp>.

"""
import os

from docopt import docopt
import h5py
import numpy as np
import pandas as pd

from utils import get_filename_prefix, get_data, get_targets
from utils import sc_projection


if __name__ == "__main__":
arguments = docopt(
__doc__,
version='1.0.0')
output_dir = arguments['<output-dir>']
os.makedirs(output_dir, exist_ok=True)

input_pred_file = arguments['<input-fp>']
input_preds = get_data(input_pred_file)
input_dir, input_fn = os.path.split(input_pred_file)

input_prefix = get_filename_prefix(input_fn)
rowlabels_file = os.path.join(input_dir, '{0}_row_labels.txt'.format(
input_prefix))
rowlabels = pd.read_csv(rowlabels_file, sep='\t')
if len(rowlabels) != len(input_preds):
raise ValueError(("Rowlabels file '{0}' does not have the same number "
"of rows as '{1}'").format(rowlabels_file,
input_pred_file))

output_prefix = arguments['--out-name']
if output_prefix is None:
output_prefix = input_fn.split('_predictions')[0]
print("Output files will start with prefix '{0}'".format(output_prefix))

sei_dir = "./model"
chromatin_profiles = get_targets(os.path.join(sei_dir, "target.names"))
seqclass_names = get_targets(os.path.join(sei_dir, "seqclass.names"))

clustervfeat = np.load(os.path.join(sei_dir, 'projvec_targets.npy'))

projscores = sc_projection(input_preds, clustervfeat)
np.save(os.path.join(
output_dir, "{0}.raw_sequence_class_scores.npy".format(output_prefix)),
projscores)

22 changes: 22 additions & 0 deletions 2_raw_sc_score.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

#####################################################################
# Example script for computing the raw sequence class scores
# given Sei chromatin profile sequence predictions

# Usage:
# sh 2_raw_sc_score.sh <input-file> <output-dir>
#####################################################################

set -o errexit
set -o pipefail
set -o nounset

input_filepath="${1:-}"
out_dir="${2:-}"

mkdir -p $out_dir

echo "Input argments: $input_filepath $out_dir"

python -u 2_raw_sc_score.py $input_filepath $out_dir
Loading