GitHub: emi80/lab-meeting-20140625
Emilio Palumbo, June 25, 2014
#!/bin/bash
# Force bash shell
#$ -S /bin/bash
#
# Export environment variables
#$ -V
#
# Use current working directory as job working directory
#$ -cwd
#
# Rename stdout and stderr files
#$ -o $JOB_NAME.out
#$ -e $JOB_NAME.err
# get the pipeline steps to be executed (default: all)
steps=(mapping bigwig contig flux)
if [[ $# -gt 0 ]]; then
    read -ra steps <<< "$@"
fi

for step in "${steps[@]}"; do
    case $step in
        mapping)
            doMapping="true"
            ;;
        bigwig)
            doBigWig="true"
            ;;
        contig)
            doContig="true"
            ;;
        quant|flux)
            doFlux="true"
            ;;
    esac
done
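Each flag later guards the corresponding section of the pipeline; a minimal sketch of the pattern (function names hypothetical):

if [[ $doMapping == "true" ]]; then
    run_mapping    # hypothetical: run the mapping step
fi
if [[ $doFlux == "true" ]]; then
    run_flux       # hypothetical: run the Flux Capacitor quantification
fi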
Run specific steps:
blueprint.pipeline.sh ... -- contig flux
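Since the SGE directives live in the script header, the same invocation can also be submitted directly; a sketch (job name hypothetical):

# the #$ header directives supply shell, environment and cwd settings
$ qsub -N test.pipeline blueprint.pipeline.sh ... -- contig flux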
Some extra code is required to submit one job per sample:
#!/bin/bash

base="/users/rg/epalumbo/projects/BluePrint"
threads="8"
vmem="64G"
# half of the job memory for the Flux Capacitor, keeping the unit suffix (e.g. 64G -> 32G)
fluxMem=$((${vmem:0:$((${#vmem}-1))}/2))${vmem:$((${#vmem}-1)):1}
rtime="72:00:00"
# command runner: defaults to "echo" for a dry run; pass "eval" to actually execute
do=${1:-echo}

while read lab id path strand sex; do
    quality="33"
    genome="$base/genome/Homo_sapiens.GRCh37.chromosomes.chr.fa"
    annotation="$base/annotation/gencode.v15.annotation.gtf"
    if [[ $sex == 'Female' ]]; then
        genome="$base/genome/Homo_sapiens.GRCh37.chromosomes.female.chr.fa"
        annotation="$base/annotation/gencode.v15.annotation.female.gtf"
    fi
    sample=$(echo "$id" | cut -d"_" -f1)
    readGroup="ID=$id,PL=ILLUMINA,PU=$id,LB=Blueprint,SM=$sample,DT=$(date +"%Y-%m-%dT%H:%M:%S%z"),CN=$lab,DS=Blueprint"
    command="qsub -pe smp $threads -l virtual_free=$vmem,h_rt=$rtime -q rg-el6,long -N $id.pipeline"
    command=$command" ../blueprint.pipeline.sh -i $(basename $path) -g $genome -a $annotation -q $quality -t $threads --paired --read-group $readGroup -r 150 --read-strand ${strand^^} --flux-mem $fluxMem"
    if [[ ! -d $id ]]; then
        $do "mkdir $id"
        $do "ln -s $path $id"
        $do "ln -s ${path//_1/_2} $id"
    fi
    $do "cd $id"
    $do "$command"
    $do "cd .."
done # sample records are read from stdin: <lab> <id> <path> <strand> <sex>
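Sample records are read one per line from standard input. A usage sketch (script and file names hypothetical):

# dry run: only print the commands
$ ./submit_samples.sh < samples.tsv
# real run: execute them
$ ./submit_samples.sh eval < samples.tsv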
The user has to keep metadata and file information in a simple text format:
<filepath>TAB<attributes_list>
with <attributes_list> being a semicolon-separated list of key=value strings:
/path/to/file size=100; id=1; class=MyClass; type=MyType
Metadata:
. id=test1; quality=33; sex=female;
Files:
/data/test1_1.fastq.gz    id=test1; quality=33; sex=female; type=fastq; view=FqRd1;
/data/test1.bam           id=test1; quality=33; sex=female; type=bam; view=Alignments;
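Since the index is plain text, it can also be queried with standard Unix tools; a quick sketch outside idxtools:

# paths of all BAM files in the index
$ awk -F'\t' '$2 ~ /type=bam;/ { print $1 }' index.txt

The index format itself is described in a JSON file (the format.json passed to idxtools below):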
{
"colsep": "\t",
"id": "labExpId",
"kw_sep": " ",
"sep": "=",
"trail": ";",
"fileinfo": [
"path",
"size",
"md5",
"type",
"view"
]
}
The format can also map input column names to attribute names, which is useful when importing from CSV/TSV:
{
"colsep": "\t",
"sep": "=",
"trail": ";",
"kw_sep": " ",
"id": "labExpId",
"fileinfo": [
"path",
"size",
"md5",
"type",
"view",
"submittedDataVersion"
],
"map": {
"BASE_COUNT": "baseCount",
"BIOMATERIAL_PROVIDER": "",
"BIOMATERIAL_TYPE": "localization",
"CELL_TYPE": "cell",
"CENTER_NAME": "lab",
"DISEASE": "",
"DONOR_AGE": "age",
"DONOR_ETHNICITY": "ethnicity",
"DONOR_HEALTH_STATUS": "",
"DONOR_ID": "donorId",
"DONOR_REGION_OF_RESIDENCE": "",
"DONOR_SEX": "sex",
"EXPERIMENT_ID": "",
"EXPERIMENT_TYPE": "",
"_FILE": "path",
"FILE_MD5": "md5",
"FILE_SIZE": "size",
"FIRST_SUBMISSION_DATE": "dateSubmittedFirst",
"INSTRUMENT_MODEL": "",
"INSTRUMENT_PLATFORM": "seqPlatform",
"LIBRARY_LAYOUT": "",
"LIBRARY_NAME": "libProtocol",
"LIBRARY_STRATEGY": "dataType",
"MOLECULE": "rnaExtract",
"READ_COUNT": "readCount",
"READ_TYPE": "readType",
"READ_QUALITIES": "quality",
"READ_STRAND": "readStrand",
"RUN_ID": "labExpId",
"RUN_NAME": "",
"SAMPLE_ID": "sraSampleAccession",
"SAMPLE_NAME": "labProtocolId",
"SEQ_RUNS_COUNT": "seqRun",
"SPECIMEN_PROCESSING": "",
"SPECIMEN_STORAGE": "",
"STUDY_ID": "sraStudyAccession",
"STUDY_NAME": "",
"SUBMISSION_DATE": "dateSubmitted",
"SUBMISSION_ID": "",
"TISSUE": "tissue",
"TWIN_PAIR_ID": "",
"TYPE": "type",
"WITHDRAWN": ""
}
}
# command line
$ idxtools -i index.txt -f format.json [command]

# environment variables
$ export IDX_FILE=/path/to/index.txt
$ export IDX_FORMAT=/path/to/format.json
$ idxtools show
A CSV file, test.csv:
path,labExpId,quality,sex,type,view
/data/test1_1.fastq.gz,test1,33,female,fastq,FqRd1
/data/test1_2.fastq.gz,test1,33,female,fastq,FqRd2
/data/test2_1.fastq.gz,test2,64,male,fastq,FqRd1
/data/test2_2.fastq.gz,test2,64,male,fastq,FqRd2
/data/test2.bam,test2,64,male,bam,Alignments
# import csv and show
$ export IDX_FORMAT=$PWD/format.json
$ idxtools -i test.csv show
/data/test1_1.fastq.gz    labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
/data/test1_2.fastq.gz    labExpId=test1; type=fastq; view=FqRd2; quality=33; sex=female;
/data/test2_1.fastq.gz    labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;
/data/test2_2.fastq.gz    labExpId=test2; type=fastq; view=FqRd2; quality=64; sex=male;
/data/test2.bam           labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;

# import csv and save to file
$ idxtools -i test.csv -f format.json show -o index.txt
$ export IDX_FILE=$PWD/index.txt
# query by attributes
$ idxtools show id=test2
/data/test2_1.fastq.gz    labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;
/data/test2_2.fastq.gz    labExpId=test2; type=fastq; view=FqRd2; quality=64; sex=male;
/data/test2.bam           labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;

$ idxtools show view=FqRd1
./data/test1_1.fastq.gz   labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
./data/test2_1.fastq.gz   labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;

$ idxtools show type=bam
./data/test2.bam          labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;
# queries use regular expressions
# query by id starting with t
$ idxtools show id=t
# equivalent to: idxtools show id="^t"
/data/test1_1.fastq.gz    labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
/data/test1_2.fastq.gz    labExpId=test1; type=fastq; view=FqRd2; quality=33; sex=female;
/data/test2_1.fastq.gz    labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;
/data/test2_2.fastq.gz    labExpId=test2; type=fastq; view=FqRd2; quality=64; sex=male;
/data/test2.bam           labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;

# exact queries
$ idxtools show id=t --exact

# operations on numeric attributes
$ idxtools show quality=">40"
/data/test2_1.fastq.gz    labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;
/data/test2_2.fastq.gz    labExpId=test2; type=fastq; view=FqRd2; quality=64; sex=male;
/data/test2.bam           labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;

$ idxtools show quality="<40"
/data/test1_1.fastq.gz    labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
/data/test1_2.fastq.gz    labExpId=test1; type=fastq; view=FqRd2; quality=33; sex=female;
# TSV output - single attribute
$ idxtools show view=FqRd1 -t path
/data/test1_1.fastq.gz
/data/test2_1.fastq.gz

# TSV output - multiple attributes
$ idxtools show view=FqRd1 -t id,path
test1    ./data/test1_1.fastq.gz
test2    ./data/test2_1.fastq.gz

# with header
$ idxtools show view=FqRd1 -t id,path --header
id       path
test1    ./data/test1_1.fastq.gz
test2    ./data/test2_1.fastq.gz
# add /data/test1.bam
$ idxtools add path=/data/test1.bam id=test1 type=bam view=Alignments
# check
$ idxtools show type=bam
/data/test1.bam    labExpId=test1; type=bam; view=Alignments; quality=33; sex=female;
/data/test2.bam    labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;
# remove /data/test1.bam
$ idxtools remove /data/test1.bam
# check
$ idxtools show type=bam
/data/test2.bam    labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;
# remove test2
$ idxtools remove id=test2
# check
$ idxtools show
./data/test1_1.fastq.gz   labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
./data/test1_2.fastq.gz   labExpId=test1; type=fastq; view=FqRd2; quality=33; sex=female;
Tools can be implemented in various ways. JIP itself is configured through a JSON file that defines the cluster engine and resource profiles:
{
"cluster": "jip.cluster.SGE",
"sge" : {
"threads_pe": "smp",
"time_limit": "h_rt"
},
"profiles": {
"default": {
"queue": "short,long,rg-el6",
"time": "3h"
},
"long": {
"queue": "long,rg-el6",
"time": "24h",
"mem": "64G"
},
"short": {
"queue": "short",
"time": "6h",
"mem": "16G"
}
}
}
The jip bash command:
# run `hostname` locally
$ jip bash -c hostname
# submit
$ jip bash -s -c hostname
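Resource profiles from the configuration can be selected at submission time; a sketch, assuming jip supports a -P/--profile option:

# submit using the 'long' profile defined above (24h, 64G)
$ jip bash -P long -s -c hostname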
Check jobs with jip jobs:
# check the status of the job
$ jip jobs
JIP scripts are extended Bash scripts:
#!/usr/bin/env jip
# Greetings
# usage:
# hello.jip <name>
echo "Hello ${name}"
Make the file executable, then use the JIP interpreter to run or submit it:
$ chmod +x hello.jip
# run the script
$ ./hello.jip Joe
# show dry run and command only
$ ./hello.jip Joe -- --dry --show
# submit the script run
$ ./hello.jip Joe -- submit
# make local tools discoverable through JIP_PATH
$ export JIP_PATH=$PWD
$ jip tools
...
Name Path
======================================================================================
hello.jip /users/rg/epalumbo/jip/hello.jip
...
#!/usr/bin/env jip
# Sort fastq file by read id
#
# Usage:
# fastq-sort.jip [-i <fastq>] [-o <sorted_fastq>]
#
# Inputs:
# -i, --input <input> Input fastq [default: stdin]
#
# Outputs:
# -o, --output <output> Output sorted fastq [default: stdout]
#
${input|arg("cat ")|suf(" | ")}paste - - - - | sort -k1,1 | tr '\t' '\n' ${output|arg(">")}
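Thanks to the stdin/stdout defaults, the tool works both on files and as a filter in a pipe; a usage sketch (file names hypothetical):

$ ./fastq-sort.jip -i reads.fastq -o reads.sorted.fastq
$ cat reads.fastq | ./fastq-sort.jip > reads.sorted.fastq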
#!/usr/bin/env jip
# Run pigz on an input file
#
# Usage:
# pigz.jip [-i <INPUT>] [-d] [-o <OUTPUT>]
#
# Inputs:
# -i, --input <INPUT> The input file [default: stdin]
#
# Outputs:
# -o, --output <OUTPUT> The output file [default: stdout]
#
# Options:
# -d, --decompress Decompress data
#%begin setup
# expose pigz's -c (write to stdout) flag as an extra option
add_option('stdout', False, hidden=False, short='-c')
# infer compress/decompress mode from the input file extension
if input.get():
    if opts['input'].raw().endswith('gz'):
        opts['decompress'] = True
    else:
        opts['decompress'] = False
# default to stdout when no output file is given
if not output.get():
    opts['stdout'] = True
else:
    # hide the output option when it matches the name pigz would produce anyway
    if output.get() == '${input}.gz' or output.get().replace('.gz', '') == input.get():
        opts['output'].hidden = True
#%end
pigz -p ${JIP_THREADS} ${stdout|arg} ${decompress|arg} ${input|arg("")} ${output|arg(">")}
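A usage sketch (file names hypothetical):

# compress; the output option is hidden when it matches <input>.gz
$ ./pigz.jip -i file.txt -o file.txt.gz
# decompress: mode inferred in the setup block from the .gz extension
$ ./pigz.jip -i file.txt.gz -o file.txt
# no output given: write to stdout (-c is enabled by the setup block)
$ ./pigz.jip -i file.txt > file.txt.gz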
#!/usr/bin/env jip
#
# Sort and compress fastq
#
# Usage:
# fq-pipe [-i <INPUT>] [-o <OUTPUT>]
#
# Inputs:
# -i, --input <INPUT> Input file [default: stdin]
#
# Outputs:
# -o, --output <OUTPUT> Output file [default: stdout]
#
#%begin pipeline
# decompress gzipped input first, as a temporary job
decomp_input = input
if type(input.raw()) == unicode and input.raw().endswith('gz'):
    decomp_input = job(temp=True).run('pigz', input=decomp_input)
# sort, then compress the result
sorted_fastq = job(temp=True).run('fastq-sort', input=decomp_input)
run('pigz', input=sorted_fastq, output=output)
#%end
$ ./sort_fastq.jip -i *_1.fastq -- --dry --show
...
#####################
|   Job hierarchy   |
#####################
sort_fastq.0
sort_fastq.1
#####################
|  Tasks: 2         |
|  Jobs: 2          |
|  Named Groups: 1  |
|  Job Groups: 2    |
#####################

Job commands
============
### sort_fastq.0 -- Interpreter: bash
### stdout: <default>
### stderr: <default>
cat /home/epalumbo/testA_1.fastq | paste - - - - | sort -k1,1 | tr '\t' '\n' >/home/epalumbo/testA_1_sorted.fastq
###
### sort_fastq.1 -- Interpreter: bash
### stdout: <default>
### stderr: <default>
cat /home/epalumbo/testB_1.fastq | paste - - - - | sort -k1,1 | tr '\t' '\n' >/home/epalumbo/testB_1_sorted.fastq
###
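Once the dry run looks right, the same invocation can be submitted instead:

$ ./sort_fastq.jip -i *_1.fastq -- submit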
(shrimp + rsem + flux-capacitor) * 96 samples:
SQLite DB limitation (JIP keeps job state in a single SQLite database, which copes poorly with this many concurrent jobs)
// sample configuration
process {
executor = 'sge'
queue = 'rg-el6'
}
env {
PATH="$PWD/gemtools-1.6.2-i3/bin:$PWD/flux-capacitor-1.2.4/bin:$PATH"
}
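Saved as nextflow.config in the launch directory, the configuration is picked up automatically (script name hypothetical):

# nextflow reads ./nextflow.config at startup
$ nextflow run pipeline.nf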
Nextflow scripts are extended Groovy scripts:
#!/usr/bin/env nextflow
str = Channel.from('hello', 'hola', 'bonjour', 'ciao')
process printHello {
    input:
    val str

    output:
    stdout into result

    """
    echo $str
    """
}

result.subscribe { print it }
$ chmod +x hello.nf
$ ./hello.nf
N E X T F L O W ~ version 0.8.2
[warm up] executor > local
[5a/d7e7d3] Submitted process > printHello (2)
[42/e78174] Submitted process > printHello (1)
[c3/88c277] Submitted process > printHello (4)
[7c/55887c] Submitted process > printHello (3)
bonjour
hola
ciao
hello
#!/usr/bin/env nextflow
/*
 * Pipeline parameters that can be overridden on the command line
 */
params.query = "$HOME/*.fa"
params.db = "$HOME/tools/blast-db/pdb/pdb"
params.out = "./result.txt"
params.chunkSize = 100
db = file(params.db)
fasta = Channel
.fromPath(params.query)
.splitFasta(by: params.chunkSize)
process blast {
    input:
    file 'query.fa' from fasta

    output:
    file 'top_hits'

    """
    blastp -db ${db} -query query.fa -outfmt 6 > blast_result
    cat blast_result | head -n 10 | cut -f 2 > top_hits
    """
}
process extract {
    input:
    file top_hits

    output:
    file 'sequences'

    "blastdbcmd -db ${db} -entry_batch top_hits | head -n 10 > sequences"
}
sequences
    .collectFile(name: params.out)
    .subscribe { println "Result saved at file: $it" }
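Every params.* value can be overridden on the command line with a double-dash option; a sketch (script name hypothetical):

# override the query file and chunk size
$ nextflow run blast.nf --query "$HOME/sample.fa" --chunkSize 200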
$ echo $MODULEPATH
/software/rg/el6.3/modulefiles:/usr/share/Modules/modulefiles:/etc/modulefiles

# modulefile folder structure
$ tree /software/rg/el6.3/modulefiles
/software/rg/el6.3/modulefiles/
├── aspera
│   ├── 3.0.1
│   └── 3.3.3
├── bedtools
│   ├── 2.17.0
│   └── 2.19.1
...
#%Module1.0
########################################################
#
# Author: Emilio Palumbo (emilio.palumbo@crg.es)
#
########################################################
set PROG_NAME samtools
set PROG_VERSION 0.1.19
set PROG_HOME /software/rg/el6.3/$PROG_NAME-$PROG_VERSION
proc ModulesHelp { } {
    puts stderr "$PROG_NAME version $PROG_VERSION"
}
module-whatis "loads the $PROG_NAME $PROG_VERSION module"
module-verbosity {on}
# Tests of consistency
# --------------------
# This application cannot be loaded if another $PROG_NAME modulefile was previously loaded
set namelow [string tolower $PROG_NAME]
conflict $namelow
### This shows info about loaded/unloaded module
if { [module-info mode] != "whatis" } {
    puts stderr "[module-info mode] [module-info name] (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)"
}
prepend-path PATH $PROG_HOME/bin
prepend-path MANPATH $PROG_HOME/man
prepend-path LD_LIBRARY_PATH $PROG_HOME/lib
prepend-path C_INCLUDE_PATH $PROG_HOME/include
$ module avail
------------------------ /software/rg/el6.3/modulefiles ------------------------
aspera/3.0.1(default)              htslib/0.2.0-rc8(default)
aspera/3.3.3                       ipsa/1.0(default)
bedtools/2.17.0(default)           ipsa/1.1
bedtools/2.19.1                    jip-tools/1.0(default)
bowtie/1.0.1(default)              picard/1.81(default)
cufflinks/2.0.2                    pigz/2.2.5(default)
cufflinks/2.1.1                    pigz/2.3.1
cufflinks/2.2.1(default)           plink/1.07(default)
edirect/1.50(default)              python/2.7/2.7.3
emboss/6.6.0(default)              python/2.7/2.7.5
fastqc/0.10.1(default)             python/2.7/2.7.6(default)
flux/1.2.3                         python/2.7/2.7.6-sqlite
flux/1.2.4(default)                python/3/3.3.4
flux/1.2.4-SNAPSHOT                python-modules/2.6(default)
flux/1.3                           rsem/1.1.17
flux/1.4                           rsem/1.2.12(default)
flux/1.5.1                         samtools/0.1.18
flux/1.6                           samtools/0.1.19(default)
flux/1.6.1                         shrimp/2.2.3(default)
gemtools/1.6.1-i3                  sickle/1.210(default)
gemtools/1.6.2-i3(default)         sratoolkit/2.3.5(default)
gemtools/1.7.1-i3                  sublime-text/3-build-3059(default)
gemtools/1.7-i3                    texlive/2012(default)
glimmps                            ucsc/2013.02(default)
htop/1.0.2(default)                vcftools/0.2.12a(default)
------------------------ /usr/share/Modules/modulefiles ------------------------
dot          module-cvs   module-info  modules      null         use.own
------------------------------- /etc/modulefiles -------------------------------
compat-openmpi-x86_64  openmpi-x86_64
# load default module version
$ module load samtools
load samtools/0.1.19 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)
$ which samtools
/software/rg/el6.3/samtools-0.1.19/bin/samtools

# load specific version
$ module load samtools/0.1.18
load samtools/0.1.18 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)
$ which samtools
/software/rg/el6.3/samtools-0.1.18/bin/samtools

# switch module version
$ module switch samtools samtools/0.1.19
switch1 samtools/0.1.18 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)
...
$ which samtools
/software/rg/el6.3/samtools-0.1.19/bin/samtools
# unload module
# equivalent to 'module unload samtools' but shorter!
$ module rm samtools
remove samtools/0.1.19 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)

# unload all loaded modules
$ module purge
remove samtools/0.1.19 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)
remove bedtools/2.17.0 (PATH)
# list loaded modules
$ module list
Currently Loaded Modulefiles:
  1) samtools/0.1.19   2) bedtools/2.17.0