emi80 / lab-meeting-20140625
Emilio Palumbo, June 25, 2014
#!/bin/bash
#
# Force bash shell
#$ -S /bin/bash
#
# Export environment variables
#$ -V
#
# Use current working directory as job working directory
#$ -cwd
#
# Rename stdout and stderr files
#$ -o $JOB_NAME.out
#$ -e $JOB_NAME.err
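With these directives embedded, the script can be handed straight to qsub, and stdout/stderr land in files named after the job. A minimal sketch (the job and script names are hypothetical):

$ qsub -N myjob pipeline.sh
# once the job finishes:
$ ls myjob.*
myjob.err  myjob.out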
# getting pipeline steps to be executed
steps=(mapping bigwig contig flux)
if [[ "$@" ]]; then
    read -ra steps <<< "$@"
fi

for step in ${steps[@]}; do
    case $step in
        mapping)
            doMapping="true"
            ;;
        bigwig)
            doBigWig="true"
            ;;
        contig)
            doContig="true"
            ;;
        quant|flux)
            doFlux="true"
            ;;
    esac
done
Run specific steps:
blueprint.pipeline.sh ... -- contig flux
Some extra code required:
#!/bin/bash

base="/users/rg/epalumbo/projects/BluePrint"
threads="8"
vmem="64G"
# give the Flux Capacitor half of the job's virtual memory, keeping the unit (e.g. 64G -> 32G)
fluxMem=$((${vmem:0:$((${#vmem}-1))}/2))${vmem:$((${#vmem}-1)):1}
rtime="72:00:00"
# dry run by default: commands are echoed unless an executor is passed as the first argument
do=${1-"echo"}

while read lab id path strand sex; do
    quality="33"
    genome="$base/genome/Homo_sapiens.GRCh37.chromosomes.chr.fa"
    annotation="$base/annotation/gencode.v15.annotation.gtf"
    if [[ $sex == 'Female' ]]; then
        genome="$base/genome/Homo_sapiens.GRCh37.chromosomes.female.chr.fa"
        annotation="$base/annotation/gencode.v15.annotation.female.gtf"
    fi
    sample=`echo $id | cut -d"_" -f1`
    readGroup="ID=$id,PL=ILLUMINA,PU=$id,LB=Blueprint,SM=$sample,DT=`date +"%Y-%m-%dT%H:%M:%S%z"`,CN=$lab,DS=Blueprint"
    command="qsub -pe smp $threads -l virtual_free=$vmem,h_rt=$rtime -q rg-el6,long -N $id.pipeline"
    command=$command" ../blueprint.pipeline.sh -i `basename $path` -g $genome -a $annotation -q $quality -t $threads --paired --read-group $readGroup -r 150 --read-strand ${strand^^} --flux-mem $fluxMem"
    if [ ! -d $id ]; then
        $do "mkdir $id"
        $do "ln -s $path $id"
        $do "ln -s ${path//_1/_2} $id"
    fi
    $do "cd $id"
    $do "$command"
    $do "cd .."
done
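Given the do=${1-"echo"} default, running the script with no arguments only prints the commands. A possible invocation (the script and metadata file names are hypothetical, and passing eval as the executor is an assumption based on how $do is applied):

# dry run: print the qsub command for every sample read from stdin
$ ./submit_pipeline.sh < samples.txt
# actually execute the commands
$ ./submit_pipeline.sh eval < samples.txt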
The user has to keep track of metadata and file information:
<filepath>TAB<attributes_list>
where attributes_list is a semicolon-separated list of key=value strings:
/path/to/file size=100; id=1; class=MyClass; type=MyType
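Because the index is plain tab-separated text, it can also be queried with standard Unix tools; a minimal awk sketch (using the index.txt file from the examples below):

# print the path of every entry whose attributes contain type=bam
awk -F'\t' '$2 ~ /type=bam;/ { print $1 }' index.txt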
Metadata:
. id=test1; quality=33; sex=female;
Files:
/data/test1_1.fastq.gz    id=test1; quality=33; sex=female; type=fastq; view=FqRd1;
/data/test1.bam           id=test1; quality=33; sex=female; type=bam; view=Alignments;
{ "colsep": "\t", "id": "labExpId", "kw_sep": " ", "sep": "=", "trail": ";", "fileinfo": [ "path", "size", "md5", "type", "view" ] }
A format file with a map section is useful when importing from CSV/TSV, since it renames the source columns to index attribute names:
{ "colsep": "\t", "sep": "=", "trail": ";", "kw_sep": " ", "id": "labExpId", "fileinfo": [ "path", "size", "md5", "type", "view", "submittedDataVersion" ], "map": { "BASE_COUNT": "baseCount", "BIOMATERIAL_PROVIDER": "", "BIOMATERIAL_TYPE": "localization", "CELL_TYPE": "cell", "CENTER_NAME": "lab", "DISEASE": "", "DONOR_AGE": "age", "DONOR_ETHNICITY": "ethnicity", "DONOR_HEALTH_STATUS": "", "DONOR_ID": "donorId", "DONOR_REGION_OF_RESIDENCE": "", "DONOR_SEX": "sex", "EXPERIMENT_ID": "", "EXPERIMENT_TYPE": "", "_FILE": "path", "FILE_MD5": "md5", "FILE_SIZE": "size", "FIRST_SUBMISSION_DATE": "dateSubmittedFirst", "INSTRUMENT_MODEL": "", "INSTRUMENT_PLATFORM": "seqPlatform", "LIBRARY_LAYOUT": "", "LIBRARY_NAME": "libProtocol", "LIBRARY_STRATEGY": "dataType", "MOLECULE": "rnaExtract", "READ_COUNT": "readCount", "READ_TYPE": "readType", "READ_QUALITIES": "quality", "READ_STRAND": "readStrand", "RUN_ID": "labExpId", "RUN_NAME": "", "SAMPLE_ID": "sraSampleAccession", "SAMPLE_NAME": "labProtocolId", "SEQ_RUNS_COUNT": "seqRun", "SPECIMEN_PROCESSING": "", "SPECIMEN_STORAGE": "", "STUDY_ID": "sraStudyAccession", "STUDY_NAME": "", "SUBMISSION_DATE": "dateSubmitted", "SUBMISSION_ID": "", "TISSUE": "tissue", "TWIN_PAIR_ID": "", "TYPE": "type", "WITHDRAWN": "" } }
# command line
$ idxtools -i index.txt -f format.json [command]

# environment variables
$ export IDX_FILE=/path/to/index.txt
$ export IDX_FORMAT=/path/to/format.json
$ idxtools show
A CSV file, test.csv:
path,labExpId,quality,sex,type,view
/data/test1_1.fastq.gz,test1,33,female,fastq,FqRd1
/data/test1_2.fastq.gz,test1,33,female,fastq,FqRd2
/data/test2_1.fastq.gz,test2,64,male,fastq,FqRd1
/data/test2_2.fastq.gz,test2,64,male,fastq,FqRd2
/data/test2.bam,test2,64,male,bam,Alignments
# import csv and show
$ export IDX_FORMAT=$PWD/format.json
$ idxtools -i test.csv show
/data/test1_1.fastq.gz    labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
/data/test1_2.fastq.gz    labExpId=test1; type=fastq; view=FqRd2; quality=33; sex=female;
/data/test2_1.fastq.gz    labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;
/data/test2_2.fastq.gz    labExpId=test2; type=fastq; view=FqRd2; quality=64; sex=male;
/data/test2.bam           labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;

# import csv and save to file
$ idxtools -i test.csv -f format.json show -o index.txt
$ export IDX_FILE=$PWD/index.txt
# query by attributes
$ idxtools show id=test2
/data/test2_1.fastq.gz    labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;
/data/test2_2.fastq.gz    labExpId=test2; type=fastq; view=FqRd2; quality=64; sex=male;
/data/test2.bam           labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;

$ idxtools show view=FqRd1
./data/test1_1.fastq.gz   labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
./data/test2_1.fastq.gz   labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;

$ idxtools show type=bam
./data/test2.bam          labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;
# queries use regular expressions
# query by id starting with t
$ idxtools show id=t    # equivalent to: idxtools show id="^t"
/data/test1_1.fastq.gz    labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
/data/test1_2.fastq.gz    labExpId=test1; type=fastq; view=FqRd2; quality=33; sex=female;
/data/test2_1.fastq.gz    labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;
/data/test2_2.fastq.gz    labExpId=test2; type=fastq; view=FqRd2; quality=64; sex=male;
/data/test2.bam           labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;

# exact queries
$ idxtools show id=t --exact

# operations on numeric attributes
$ idxtools show quality=">40"
/data/test2_1.fastq.gz    labExpId=test2; type=fastq; view=FqRd1; quality=64; sex=male;
/data/test2_2.fastq.gz    labExpId=test2; type=fastq; view=FqRd2; quality=64; sex=male;
/data/test2.bam           labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;

$ idxtools show quality="<40"
/data/test1_1.fastq.gz    labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
/data/test1_2.fastq.gz    labExpId=test1; type=fastq; view=FqRd2; quality=33; sex=female;
# TSV output - single attribute
$ idxtools show view=FqRd1 -t path
/data/test1_1.fastq.gz
/data/test2_1.fastq.gz

# TSV output - multiple attributes
$ idxtools show view=FqRd1 -t id,path
test1    ./data/test1_1.fastq.gz
test2    ./data/test2_1.fastq.gz

# with header
$ idxtools show view=FqRd1 -t id,path --header
id       path
test1    ./data/test1_1.fastq.gz
test2    ./data/test2_1.fastq.gz
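The TSV output is handy for driving shell loops; a small sketch using only the options shown above:

# run a command for every fastq entry, one (id, path) pair per line
idxtools show type=fastq -t id,path | while read id path; do
    echo "processing $id: $path"
done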
# add /data/test1.bam
$ idxtools add path=/data/test1.bam id=test1 type=bam view=Alignments

# check
$ idxtools show type=bam
/data/test1.bam    labExpId=test1; type=bam; view=Alignments; quality=33; sex=female;
/data/test2.bam    labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;
# remove /data/test1.bam
$ idxtools remove /data/test1.bam

# check
$ idxtools show type=bam
/data/test2.bam    labExpId=test2; type=bam; view=Alignments; quality=64; sex=male;
# remove test2
$ idxtools remove id=test2

# check
$ idxtools show
./data/test1_1.fastq.gz    labExpId=test1; type=fastq; view=FqRd1; quality=33; sex=female;
./data/test1_2.fastq.gz    labExpId=test1; type=fastq; view=FqRd2; quality=33; sex=female;
Tools can be implemented in various ways:
{ "cluster": "jip.cluster.SGE", "sge" : { "threads_pe": "smp", "time_limit": "h_rt" }, "profiles": { "default": { "queue": "short,long,rg-el6", "time": "3h" }, "long": { "queue": "long,rg-el6", "time": "24h", "mem": "64G" }, "short": { "queue": "short", "time": "6h", "mem": "16G" } } }
The jip bash command:
# run `hostname` locally
$ jip bash -c hostname

# submit
$ jip bash -s -c hostname
Check jobs with jip jobs:
# check the status of the job
$ jip jobs
JIP scripts are extended Bash scripts:
#!/usr/bin/env jip
# Greetings
#
# usage:
#     hello.jip <name>

echo "Hello ${name}"
Make the file executable and use the JIP interpreter to run or submit it:
$ chmod +x hello.jip

# run the script
$ ./hello.jip Joe

# show dry run and command only
$ ./hello.jip Joe -- --dry --show

# submit the script
$ ./hello.jip Joe -- submit
$ export JIP_PATH=$PWD
$ jip tools
...
Name                    Path
======================================================================================
hello.jip               /users/rg/epalumbo/jip/hello.jip
...
#!/usr/bin/env jip
# Sort fastq file by read id
#
# Usage:
#     fastq-sort.jip [-i <fastq>] [-o <sorted_fastq>]
#
# Inputs:
#     -i, --input <input>      Input fastq [default: stdin]
#
# Outputs:
#     -o, --output <output>    Output sorted fastq [default: stdout]

# merge each 4-line fastq record onto one line, sort by read id, then restore the layout
${input|arg("cat ")|suf(" | ")}paste - - - - | sort -k1,1 | tr '\t' '\n' ${output|arg(">")}
#!/usr/bin/env jip
# Run pigz on an input file
#
# Usage:
#     pigz.jip [-i <INPUT>] [-d] [-o <OUTPUT>]
#
# Inputs:
#     -i, --input <INPUT>      The input file [default: stdin]
#
# Outputs:
#     -o, --output <OUTPUT>    The output file [default: stdout]
#
# Options:
#     -d, --decompress         Decompress data

#%begin setup
# expose pigz's -c flag as a dynamic option
add_option('stdout', False, hidden=False, short='-c')
# decompress automatically when the input is already gzipped
if input.get():
    if opts['input'].raw().endswith('gz'):
        opts['decompress'] = True
    else:
        opts['decompress'] = False
# write to stdout when no output file is given
if not output.get():
    opts['stdout'] = True
else:
    # hide the output option when pigz derives the file name itself
    if output.get() == '${input}.gz' or output.get().replace('.gz','') == input.get():
        opts['output'].hidden = True
#%end

pigz -p ${JIP_THREADS} ${stdout|arg} ${decompress|arg} ${input|arg("")} ${output|arg(">")}
#!/usr/bin/env jip
#
# Sort and compress fastq
#
# Usage:
#     fq-pipe [-i <INPUT>] [-o <OUTPUT>]
#
# Inputs:
#     -i, --input <INPUT>      Input file [default: stdin]
#
# Outputs:
#     -o, --output <OUTPUT>    Output file [default: stdout]

#%begin pipeline
# decompress gzipped input in a temporary job first
decomp_input = input
if type(input.raw()) == unicode and input.raw().endswith('gz'):
    decomp_input = job(temp=True).run('pigz', input=decomp_input)
# sort into a temporary file, then compress the final result
sorted_fastq = job(temp=True).run('fastq-sort', input=decomp_input)
run('pigz', input=sorted_fastq, output=output)
#%end
$ ./sort_fastq.jip -i *_1.fastq -- --dry --show
...
#####################
|   Job hierarchy   |
#####################
sort_fastq.0
sort_fastq.1
#####################
|  Tasks: 2         |
|  Jobs: 2          |
|  Named Groups: 1  |
|  Job Groups: 2    |
#####################

Job commands
============
### sort_fastq.0 -- Interpreter: bash
### stdout: <default>
### stderr: <default>
cat /home/epalumbo/testA_1.fastq | paste - - - - | sort -k1,1 | tr '\t' '\n' >/home/epalumbo/testA_1_sorted.fastq
###
### sort_fastq.1 -- Interpreter: bash
### stdout: <default>
### stderr: <default>
cat /home/epalumbo/testB_1.fastq | paste - - - - | sort -k1,1 | tr '\t' '\n' >/home/epalumbo/testB_1_sorted.fastq
###
(shrimp + rsem + flux-capacitor) * 96:
SQLite DB limitation
// sample configuration (e.g. in the project's nextflow.config)
process {
    executor = 'sge'
    queue = 'rg-el6'
}

env {
    PATH = "$PWD/gemtools-1.6.2-i3/bin:$PWD/flux-capacitor-1.2.4/bin:$PATH"
}
Nextflow scripts are extended Groovy scripts:
#!/usr/bin/env nextflow

str = Channel.from('hello', 'hola', 'bonjour', 'ciao')

process printHello {
    input:
    val str

    output:
    stdout into result

    """
    echo $str
    """
}

result.subscribe { print it }
$ chmod +x hello.nf
$ ./hello.nf
N E X T F L O W ~ version 0.8.2
[warm up] executor > local
[5a/d7e7d3] Submitted process > printHello (2)
[42/e78174] Submitted process > printHello (1)
[c3/88c277] Submitted process > printHello (4)
[7c/55887c] Submitted process > printHello (3)
bonjour
hola
ciao
hello
#!/usr/bin/env nextflow

/*
 * Pipeline parameters that can be overridden from the command line
 */
params.query = "$HOME/*.fa"
params.db = "$HOME/tools/blast-db/pdb/pdb"
params.out = "./result.txt"
params.chunkSize = 100

db = file(params.db)
fasta = Channel
    .fromPath(params.query)
    .splitFasta(by: params.chunkSize)

process blast {
    input:
    file 'query.fa' from fasta

    output:
    file 'top_hits'

    """
    blastp -db ${db} -query query.fa -outfmt 6 > blast_result
    cat blast_result | head -n 10 | cut -f 2 > top_hits
    """
}

process extract {
    input:
    file top_hits

    output:
    file 'sequences'

    "blastdbcmd -db ${db} -entry_batch top_hits | head -n 10 > sequences"
}

sequences
    .collectFile(name: params.out)
    .subscribe { println "Result saved at file: $it" }
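Every params.* value can be overridden at launch with a double-dash option; a hypothetical invocation (blast.nf is an assumed file name):

$ ./blast.nf --query "$HOME/sequences/*.fa" --chunkSize 50 --out blast_hits.txt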
$ echo $MODULEPATH
/software/rg/el6.3/modulefiles:/usr/share/Modules/modulefiles:/etc/modulefiles

# modulefile folder structure
$ tree /software/rg/el6.3/modulefiles
/software/rg/el6.3/modulefiles/
├── aspera
│   ├── 3.0.1
│   └── 3.3.3
├── bedtools
│   ├── 2.17.0
│   └── 2.19.1
...
#%Module1.0
########################################################
#
# Author: Emilio Palumbo (emilio.palumbo@crg.es)
#
########################################################

set PROG_NAME    samtools
set PROG_VERSION 0.1.19
set PROG_HOME    /software/rg/el6.3/$PROG_NAME-$PROG_VERSION

proc ModulesHelp { } {
    puts stderr "$PROG_NAME version $PROG_VERSION"
}

module-whatis "loads the $PROG_NAME $PROG_VERSION module"
module-verbosity {on}

# Tests of consistency
# --------------------
# This application cannot be loaded if another $PROG_NAME modulefile was previously loaded
set namelow [string tolower $PROG_NAME]
conflict $namelow

# This shows info about loaded/unloaded modules
if { [module-info mode] != "whatis" } {
    puts stderr "[module-info mode] [module-info name] (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)"
}

prepend-path PATH            $PROG_HOME/bin
prepend-path MANPATH         $PROG_HOME/man
prepend-path LD_LIBRARY_PATH $PROG_HOME/lib
prepend-path C_INCLUDE_PATH  $PROG_HOME/include
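The ModulesHelp procedure defined above is what module help prints; a quick check (output trimmed):

$ module help samtools
samtools version 0.1.19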
$ module avail
------------------------ /software/rg/el6.3/modulefiles ------------------------
aspera/3.0.1(default)              htslib/0.2.0-rc8(default)
aspera/3.3.3                       ipsa/1.0(default)
bedtools/2.17.0(default)           ipsa/1.1
bedtools/2.19.1                    jip-tools/1.0(default)
bowtie/1.0.1(default)              picard/1.81(default)
cufflinks/2.0.2                    pigz/2.2.5(default)
cufflinks/2.1.1                    pigz/2.3.1
cufflinks/2.2.1(default)           plink/1.07(default)
edirect/1.50(default)              python/2.7/2.7.3
emboss/6.6.0(default)              python/2.7/2.7.5
fastqc/0.10.1(default)             python/2.7/2.7.6(default)
flux/1.2.3                         python/2.7/2.7.6-sqlite
flux/1.2.4(default)                python/3/3.3.4
flux/1.2.4-SNAPSHOT                python-modules/2.6(default)
flux/1.3                           rsem/1.1.17
flux/1.4                           rsem/1.2.12(default)
flux/1.5.1                         samtools/0.1.18
flux/1.6                           samtools/0.1.19(default)
flux/1.6.1                         shrimp/2.2.3(default)
gemtools/1.6.1-i3                  sickle/1.210(default)
gemtools/1.6.2-i3(default)         sratoolkit/2.3.5(default)
gemtools/1.7.1-i3                  sublime-text/3-build-3059(default)
gemtools/1.7-i3                    texlive/2012(default)
glimmps                            ucsc/2013.02(default)
htop/1.0.2(default)                vcftools/0.2.12a(default)

------------------------ /usr/share/Modules/modulefiles ------------------------
dot           module-cvs    module-info   modules       null          use.own

------------------------------- /etc/modulefiles -------------------------------
compat-openmpi-x86_64   openmpi-x86_64
# load default module version
$ module load samtools
load samtools/0.1.19 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)
$ which samtools
/software/rg/el6.3/samtools-0.1.19/bin/samtools

# load specific version
$ module load samtools/0.1.18
load samtools/0.1.18 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)
$ which samtools
/software/rg/el6.3/samtools-0.1.18/bin/samtools

# switch module version
$ module switch samtools samtools/0.1.19
switch1 samtools/0.1.18 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)
...
$ which samtools
/software/rg/el6.3/samtools-0.1.19/bin/samtools
# unload module
# equivalent to 'module unload samtools' but shorter!
$ module rm samtools
remove samtools/0.1.19 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)

# unload all loaded modules
$ module purge
remove samtools/0.1.19 (PATH, MANPATH, LD_LIBRARY_PATH, C_INCLUDE_PATH)
remove bedtools/2.17.0 (PATH)
# list loaded modules
$ module list
Currently Loaded Modulefiles:
  1) samtools/0.1.19   2) bedtools/2.17.0
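Modules combine naturally with the batch scripts shown at the beginning; a minimal sketch of an SGE job that pins its tool versions (file names are hypothetical):

#!/bin/bash
#$ -S /bin/bash
#$ -cwd
# pin exact tool versions for reproducibility
module load samtools/0.1.19
module load bedtools/2.17.0
samtools view -bS input.sam > input.bam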