Bionode.io - Modular and universal bioinformatics
Pipeable UNIX command line tools and JavaScript / Node.js APIs for bioinformatic analysis workflows on the server and browser. #bionode gitter.im/bionode/bionode
Difficulty getting relevant description and datasets from NCBI API using bio* libs
Python example: URL for the Acromyrmex assembly?
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000188075.1_Si_gnG
import xml.etree.ElementTree as ET
from Bio import Entrez

# Contact address sent along with the Entrez requests.
Entrez.email = "mail@bmpvieira.com"

# Search the "assembly" database for the term and parse the reply.
esearch_handle = Entrez.esearch(db="assembly", term="Acromyrmex")
esearch_record = Entrez.read(esearch_handle)

# Fetch the summary of every hit and print the text of one metadata section.
for id in esearch_record['IdList']:
    esummary_handle = Entrez.esummary(db="assembly", id=id)
    esummary_record = Entrez.read(esummary_handle)
    documentSummarySet = esummary_record['DocumentSummarySet']
    document = documentSummarySet['DocumentSummary'][0]
    # NOTE(review): the original concatenated two EMPTY strings around the
    # metadata, which looks like wrapper tags lost in extraction. A synthetic
    # <root> element is restored here on the assumption that 'Meta' holds
    # several sibling elements (not parseable without one root) - confirm.
    metadata_XML = '<root>' + document['Meta'] + '</root>'
    # Encode after concatenation so the same line works on Python 2 and 3.
    metadata = ET.fromstring(metadata_XML.encode('utf-8'))
    # Second child of the wrapper; presumably the section listing the FTP
    # paths - verify against a real esummary response.
    for entry in metadata[1]:
        print(entry.text)
Solution: bionode-ncbi
Better way with Bionode - 4 approaches
JavaScript
var bio = require('bionode')
// Callback pattern: the callback is handed `urls`; print the genomic FASTA
// URL of its first entry.
bio.ncbi.urls('assembly', 'Acromyrmex', function(urls) {
  console.log(urls[0].genomic.fna)
})
// Event pattern: run the handler once for every 'data' event.
bio.ncbi.urls('assembly', 'Acromyrmex').on('data', printGenomeURL)

// Prints the genomic FASTA URL of a single emitted object.
function printGenomeURL(url) {
  console.log(url.genomic.fna)
}
// Pipe pattern: stream the results through a property extractor and on to
// standard output.
var tool = require('tool-stream')

bio.ncbi.urls('assembly', 'Acromyrmex')
  .pipe(tool.extractProperty('genomic.fna'))
  .pipe(process.stdout)
BASH
# Shell form of the same query: pipe the `urls` output into tool-stream to extract the genomic.fna property.
bionode ncbi urls assembly Acromyrmex | tool-stream extractProperty genomic.fna
Complex pipelines with forks
// NOTE(review): `fork1`, `fork2`, `dat` and `tool` are not defined in this
// snippet - they are assumed to be created elsewhere.

// Main branch: SRA search results go through fork1 and into dat.reads.
ncbi
  .search('sra', 'Solenopsis invicta')
  .pipe(fork1)
  .pipe(dat.reads)

// Second branch off fork1: biosample ids -> biosample search -> dat.samples.
fork1
  .pipe(tool.extractProperty('expxml.Biosample.id'))
  .pipe(ncbi.search('biosample'))
  .pipe(dat.samples)

// Third branch off fork1: uids -> linked pubmed records -> dat.papers.
// fork2 is piped through but nothing visible here reads from it.
fork1
  .pipe(tool.extractProperty('uid'))
  .pipe(ncbi.link('sra', 'pubmed'))
  .pipe(ncbi.search('pubmed'))
  .pipe(fork2)
  .pipe(dat.papers)
# Mac
brew install n
n stable

# Ubuntu
sudo apt-get install npm
npm install -g n
n stable

# Windows
# Go to http://nodejs.org

# Install the command-line tools globally
npm install -g bionode-ncbi bionode-fasta json
# Same search, inspected four ways: raw output, counted with wc,
# first line passed through json, and a single field per record.
bionode-ncbi search genome spiders
bionode-ncbi search genome spiders | wc
bionode-ncbi search genome spiders | head -n 1 | json
bionode-ncbi search genome spiders | json -ga organism_name
# Genome search -> uids -> linked pubmed ids -> pubmed search -> titles.
# Each backslash must be the last character on its line.
bionode-ncbi search genome spiders | \
  json -ga uid | \
  bionode-ncbi link genome pubmed - | \
  json -ga destUID | \
  bionode-ncbi search pubmed - | \
  json -ga title
# Download the assembly, keep only records whose status is "completed",
# pass their paths to bionode-fasta, keep sequences longer than 10000,
# and write the result to a FASTA file.
bionode-ncbi download assembly Guillardia theta | \
  json -ga -c 'this.status === "completed"' | \
  json -ga path | \
  bionode-fasta -f | \
  json -ga -c 'this.seq.length > 10000' | \
  bionode-fasta --write > gtheta-big-scaffolds.fasta
var through = require('through2')
// The module is bound to `through`; the original called the undefined
// name `through2` here.
var stream = through.obj(transform)

// Synchronous transform: upper-cases `obj.name` and forwards the object.
function transform (obj, enc, next) {
  // do things, example:
  obj.name = obj.name.toUpperCase()
  // Push downstream
  this.push(obj)
  // Callback to fetch next object
  next()
}
var through = require('through2')
// The module is bound to `through`; the original called the undefined
// name `through2` here.
var stream = through.obj(transform)

// Asynchronous transform: attaches the looked-up data to the object and
// only then pushes it and asks for the next one.
function transform (obj, enc, next) {
  // do things, example:
  // Keep a reference to the stream: `this` differs inside the callback.
  var self = this
  requestSomethingFromDB(obj.name, function(data) {
    obj.data = data
    self.push(obj)
    next()
  })
}
Bash
mkdir project
cd project
# ndjson is required by the script that follows, so install it as well.
npm install bionode-ncbi through2 ndjson
JavaScript
var ncbi = require('bionode-ncbi')
var through = require('through2')
var json = require('ndjson')

var myStream = through.obj(transform)

// Reduces each search result to two fields before passing it downstream.
function transform (obj, enc, next) {
  var result = {
    // Output keys were misspelled as `specie` / `organisazation`.
    species: obj.organism,
    organization: obj.meta['submitter-organization']
  }
  this.push(result)
  next()
}

// Search -> reduce -> serialise with ndjson -> standard output.
ncbi.search('assembly', 'spiders')
  .pipe(myStream)
  .pipe(json.stringify())
  .pipe(process.stdout)
// Count 'data' events on myStream and report the total on 'end',
// using inline anonymous handlers.
var counter = 0
myStream
  .on('data', function (data) {
    counter++
  })
  .on('end', function () {
    console.log('Processed ' + counter)
  })
// Same counting logic with the handlers pulled out into named variables.
var counter = 0
var count = function (data) { counter++ }
var log = function () { console.log('Processed ' + counter) }
myStream.on('data', count).on('end', log)
Bionode
Hackday
Same code client/server side
CoffeeScript pipeline and a new format?
# Genome search, then a chain of expand/link stages, ending in a custom
# object stream that replaces obj.sra with the mapped result.
# Indentation is significant in CoffeeScript, so the flattened one-line
# form could not compile.
ncbi.search 'genome', 'rodentia'
.pipe ncbi.expand 'assembly'
.pipe ncbi.expand 'tax'
.pipe getLineage()
.pipe ncbi.link 'tax', 'sra'
.pipe ncbi.expand 'sra'
.pipe through.obj (obj, enc, next) ->
  # The fat arrow keeps `@` bound to the through stream inside the callback.
  # NOTE(review): `error` from async.map is never checked.
  async.map obj.sra, expandBiosample, (error, sra) =>
    obj.sra = sra
    @push obj
    next()
pipeline1 ncbi.search genome rodentia ncbi.expand assembly ncbi.expand tax getLineage ncbi.link tax sra ncbi.expand sra stream (obj, next) -> async.map obj.sra expandBiosample (sra) => obj.sra = sra @push obj next()
Pipelines and alternatives to Makefiles?