FunctionLab · kathyxchen · Oct 10, 2022 · Sep 1, 2022 · Sep 1, 2022 · Sep 1, 2022
diff --git a/1_sequence_prediction.py b/1_sequence_prediction.py
@@ -0,0 +1,74 @@
+"""
+Description:
+    CLI for sequence prediction using the Sei deep learning model,
+    given an input FASTA or BED file.
+    Outputs Sei chromatin profile predictions.
+
+Usage:
+    1_sequence_prediction.py <seq-input> <output-dir> [--genome=<hg>] [--cuda]
+    1_sequence_prediction.py -h | --help
+
+Options:
+    -h --help               Show this screen.
+    <seq-input>             Input FASTA or BED file.
+    <output-dir>            Output directory
+    --genome=<hg>           If <seq-input> is a BED file, specify the reference
+                            genome hg38 or hg19 [default: hg19]
+    --cuda                  Run variant effect prediction on a CUDA-enabled
+                            GPU
+
+"""
+import os
+
+from docopt import docopt
+
+from selene_sdk.sequences import Genome
+from selene_sdk.utils import load_path
+from selene_sdk.utils import parse_configs_and_run
+
+
+def _finditem(obj, val):
+    for k, v in obj.items():
+        if hasattr(v, 'keywords'):
+            _finditem(v.keywords, val)
+        elif isinstance(v, dict):
+            _finditem(v, val)
+        elif isinstance(v, str) and '<PATH>' in v:
+            obj[k] = v.replace('<PATH>', val)
+
+
+if __name__ == "__main__":
+    arguments = docopt(
+        __doc__,
+        version='0.0.0')
+
+    seq_input = arguments["<seq-input>"]
+
+    os.makedirs(arguments["<output-dir>"], exist_ok=True)
+
+    # Assumes that the `models` directory is in the same directory as this
+    # script. Please update this line if not.
+    use_dir = os.path.dirname(os.path.abspath(__file__))
+    use_cuda = arguments["--cuda"]
+
+    sei_out = os.path.join(arguments["<output-dir>"], "chromatin-profiles-hdf5")
+    os.makedirs(sei_out, exist_ok=True)
+
+    configs = load_path("./model/sei_seq_prediction.yml", instantiate=False)
+    _finditem(configs, use_dir)
+
+    configs["prediction"].update(input_path=seq_input, output_dir=sei_out)
+    configs["analyze_sequences"].bind(use_cuda=use_cuda)
+
+    # Assumes BED file coordinates input if file doesn't end with .fa or .fasta
+    if not seq_input.endswith('.fa') and not seq_input.endswith('.fasta'):
+        hg_version = arguments["--genome"]
+        genome = None
+        if hg_version == 'hg38' or hg_version == 'hg19':
+            genome = Genome(
+                os.path.join('.', 'resources', '{0}_UCSC.fa'.format(hg_version)))
+            configs["analyze_sequences"].bind(reference_sequence=genome)
+        else:
+            raise ValueError("--genome=<hg> must be 'hg19' or 'hg38'")
+    parse_configs_and_run(configs)
+
diff --git a/1_sequence_prediction.sh b/1_sequence_prediction.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+
+#####################################################################
+# Example script for running Sei deep learning model sequence
+# prediction with Selene
+
+# Usage:
+# sh 1_sequence_prediction.sh <input-file> <genome> <output-dir> --cuda
+
+# Please only specify hg38 or hg19 as input for <genome> if <input-file> is
+# a BED file. If you are using a FASTA file of sequences, you can specify
+# whatever genome version you wish for your own reference (will be printed
+# as part of output for this script) but do not leave it empty.
+
+# --cuda is optional, use if you are running on a CUDA-enabled
+# GPU machine (see example_slurm_scripts/1_example_seqpred.slurm_gpu.sh)
+#####################################################################
+
+set -o errexit
+set -o pipefail
+set -o nounset
+
+input_filepath="${1:-}"
+hg_version="${2:-}"
+out_dir="${3:-}"
+cuda="${4:-}"
+
+mkdir -p $out_dir
+
+input_basename=$(basename $input_filepath)
+cp $input_filepath $out_dir/
+
+echo "Input argments: $input_filepath $out_dir $hg_version $cuda"
+
+if [ "$cuda" = "--cuda" ]
+then
+    echo "use_cuda: True"
+    python -u 1_sequence_prediction.py \
+        "$out_dir/$input_basename" \
+        $out_dir \
+        --genome=${hg_version} \
+        --cuda
+else
+    echo "use_cuda: False"
+    python -u 1_sequence_prediction.py \
+        "$out_dir/$input_basename" \
+        $out_dir \
+        --genome=${hg_version}
+fi
diff --git a/vep_cli.py → 1_variant_effect_prediction.py b/vep_cli.py → 1_variant_effect_prediction.py
@@ -1,11 +1,12 @@
 """
 Description:
-    CLI for variant effect prediction using the Sei deep learning model.
-    Outputs both Sei chromatin profile predictions and sequence class scores.
+    CLI for variant effect prediction using the Sei deep learning model,
+    given an input VCF file.
+    Outputs Sei chromatin profile predictions.
 
 Usage:
-    vep_cli.py <vcf> <output-dir> [--genome=<hg>] [--cuda]
-    vep_cli.py -h | --help
+    1_variant_effect_prediction.py <vcf> <output-dir> [--genome=<hg>] [--cuda]
+    1_variant_effect_prediction.py -h | --help
 
 Options:
     -h --help               Show this screen.
@@ -69,5 +70,5 @@ def run_config(config_yml, output_dir):
 
     sei_out = os.path.join(arguments["<output-dir>"], "chromatin-profiles-hdf5")
     os.makedirs(sei_out, exist_ok=True)
-    run_config("./model/sei_prediction.yml", sei_out)
+    run_config("./model/sei_varianteffect_prediction.yml", sei_out)
 
diff --git a/run_pipeline.sh → 1_variant_effect_prediction.sh b/run_pipeline.sh → 1_variant_effect_prediction.sh
@@ -1,15 +1,16 @@
 #!/usr/bin/env bash
 
 ###############################################################
-# Example script for running the variant effect prediction
-# pipeline for Sei and sequence classes.
+# Example script for running Sei variant effect prediction
+# using Selene.
 
-# sh run_pipeline.sh <vcf> <hg> <output-dir> --cuda
+# Usage:
+# sh 1_variant_effect_prediction.sh <vcf> <hg> <output-dir> [--cuda]
 
 # Please only specify hg38 or hg19 as input for <hg>.
 
 # --cuda is optional, use if you are running on a CUDA-enabled
-# GPU machine
+# GPU machine (see example_slurm_scripts/1_example_vep.slurm_gpu.sh)
 ###############################################################
 
 set -o errexit
@@ -23,21 +24,23 @@ cuda="${4:-}"
 
 mkdir -p $outdir
 
+echo "Input arguments: $vcf_filepath $hg_version $outdir $cuda"
+
 vcf_basename=$(basename $vcf_filepath)
 cp $vcf_filepath $outdir/
 
 if [ "$cuda" = "--cuda" ]
 then
     echo "use_cuda: True"
-    python vep_cli.py "$outdir/$vcf_basename" \
-                      $outdir \
-                      --genome=${hg_version} \
-                      --cuda
+    python -u 1_variant_effect_prediction.py \
+        "$outdir/$vcf_basename" \
+        $outdir \
+        --genome=${hg_version} \
+        --cuda
 else
     echo "use_cuda: False"
-    python vep_cli.py "$outdir/$vcf_basename" \
-                      $outdir \
-                      --genome=${hg_version}
+    python -u 1_variant_effect_prediction.py \
+        "$outdir/$vcf_basename" \
+        $outdir \
+        --genome=${hg_version}
 fi
-
-python sequence_class.py $outdir "$vcf_basename"
diff --git a/2_raw_sc_score.py b/2_raw_sc_score.py
@@ -0,0 +1,67 @@
+"""
+Description:
+    This script is run after `1_sequence_prediction.py`. It computes the
+    raw sequence class scores based on Sei chromatin profile predictions
+    of sequences (i.e. no variants).
+
+Usage:
+    2_raw_sc_score.py <input-fp> <output-dir>
+                      [--out-name=<out-name>]
+    2_raw_sc_score.py -h | --help
+
+Options:
+    <input-fp>             Path to Sei sequence predictions to compute raw
+                           sequence class projection scores (NO variant effect).
+    <output-dir>           The directory to output the sequence class scores
+                           as an .NPY file.
+    --out-name=<out-name>  Specify an output filename prefix. Otherwise, output
+                           filenames will be based on <input-fp>.
+
+"""
+import os
+
+from docopt import docopt
+import h5py
+import numpy as np
+import pandas as pd
+
+from utils import get_filename_prefix, get_data, get_targets
+from utils import sc_projection
+
+
+if __name__ == "__main__":
+    arguments = docopt(
+        __doc__,
+        version='1.0.0')
+    output_dir = arguments['<output-dir>']
+    os.makedirs(output_dir, exist_ok=True)
+
+    input_pred_file = arguments['<input-fp>']
+    input_preds = get_data(input_pred_file)
+    input_dir, input_fn = os.path.split(input_pred_file)
+
+    input_prefix = get_filename_prefix(input_fn)
+    rowlabels_file = os.path.join(input_dir, '{0}_row_labels.txt'.format(
+        input_prefix))
+    rowlabels = pd.read_csv(rowlabels_file, sep='\t')
+    if len(rowlabels) != len(input_preds):
+        raise ValueError(("Rowlabels file '{0}' does not have the same number "
+                          "of rows as '{1}'").format(rowlabels_file,
+                                                     input_pred_file))
+
+    output_prefix = arguments['--out-name']
+    if output_prefix is None:
+        output_prefix = input_fn.split('_predictions')[0]
+    print("Output files will start with prefix '{0}'".format(output_prefix))
+
+    sei_dir = "./model"
+    chromatin_profiles = get_targets(os.path.join(sei_dir, "target.names"))
+    seqclass_names = get_targets(os.path.join(sei_dir, "seqclass.names"))
+
+    clustervfeat = np.load(os.path.join(sei_dir, 'projvec_targets.npy'))
+
+    projscores = sc_projection(input_preds, clustervfeat)
+    np.save(os.path.join(
+        output_dir, "{0}.raw_sequence_class_scores.npy".format(output_prefix)),
+        projscores)
+
diff --git a/2_raw_sc_score.sh b/2_raw_sc_score.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+#####################################################################
+# Example script for computing the raw sequence class scores
+# given Sei chromatin profile sequence predictions
+
+# Usage:
+# sh 2_raw_sc_score.sh <input-file> <output-dir>
+#####################################################################
+
+set -o errexit
+set -o pipefail
+set -o nounset
+
+input_filepath="${1:-}"
+out_dir="${2:-}"
+
+mkdir -p $out_dir
+
+echo "Input argments: $input_filepath $out_dir"
+
+python -u 2_raw_sc_score.py $input_filepath $out_dir