Bionode.io - Modular and universal bioinformatics
Pipeable UNIX command line tools and JavaScript / Node.js APIs for bioinformatic analysis workflows on the server and browser. #bionode gitter.im/bionode/bionode
Difficulty getting relevant description and datasets from NCBI API using bio* libs
Python example: URL for the Acromyrmex assembly?
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA_000188075.1_Si_gnG
# Look up NCBI "assembly" records matching "Acromyrmex" with Biopython and
# print the text of the entries found in each record's metadata.
import xml.etree.ElementTree as ET
from Bio import Entrez

Entrez.email = "mail@bmpvieira.com"

# Search the assembly database; the result carries the list of matching IDs.
esearch_handle = Entrez.esearch(db="assembly", term="Acromyrmex")
esearch_record = Entrez.read(esearch_handle)

for assembly_id in esearch_record['IdList']:
    # Fetch the summary document for this assembly ID.
    esummary_handle = Entrez.esummary(db="assembly", id=assembly_id)
    esummary_record = Entrez.read(esummary_handle)
    documentSummarySet = esummary_record['DocumentSummarySet']
    document = documentSummarySet['DocumentSummary'][0]

    # 'Meta' is parsed as XML below. ET.fromstring() needs a single root
    # element, so the value is wrapped in one (presumably 'Meta' is a fragment
    # with several sibling elements -- confirm against a real response).
    # Bytes literals keep the concatenation valid on both Python 2 and 3.
    metadata_XML = document['Meta'].encode('utf-8')
    metadata = ET.fromstring(b'<root>' + metadata_XML + b'</root>')

    # Print the text of every child of the second element under the wrapper.
    for entry in metadata[1]:
        print(entry.text)
Solution: bionode-ncbi
Better way with Bionode - 4 approaches
JavaScript
var bio = require('bionode')

// Callback pattern: the callback receives the whole result list at once
// and logs genomic.fna of its first element.
function onUrls (urls) {
  console.log(urls[0].genomic.fna)
}
bio.ncbi.urls('assembly', 'Acromyrmex', onUrls)

// Event pattern: printGenomeURL runs once per 'data' event.
var urlEmitter = bio.ncbi.urls('assembly', 'Acromyrmex')
urlEmitter.on('data', printGenomeURL)

// Logs the genomic.fna property of a single emitted object.
function printGenomeURL (url) {
  var fna = url.genomic.fna
  console.log(fna)
}

// Pipe pattern: extract the 'genomic.fna' property from each object and
// stream the result to standard output.
var tool = require('tool-stream')
var urlStream = bio.ncbi.urls('assembly', 'Acromyrmex')
var fnaStream = urlStream.pipe(tool.extractProperty('genomic.fna'))
fnaStream.pipe(process.stdout)
BASH
# Command-line form of the same lookup: pipe the urls output into
# tool-stream to extract the genomic.fna property.
bionode ncbi urls assembly Acromyrmex \
  | tool-stream extractProperty genomic.fna
Complex pipelines with forks
// Pipeline with forks: one SRA search feeds three branches.
// NOTE(review): fork1, fork2, tool and dat are not defined in this snippet;
// presumably fork1/fork2 are pass-through streams and dat.* are writable
// stores -- confirm before reusing this code.

// Branch 1: SRA search results for 'Solenopsis invicta' go into fork1,
// whose output is piped on to dat.reads.
ncbi
.search('sra', 'Solenopsis invicta')
.pipe(fork1)
.pipe(dat.reads)
// Branch 2: take 'expxml.Biosample.id' from each fork1 object, search the
// biosample database with it and pipe the results to dat.samples.
fork1
.pipe(tool.extractProperty('expxml.Biosample.id'))
.pipe(ncbi.search('biosample'))
.pipe(dat.samples)
// Branch 3: take 'uid' from each fork1 object, follow the sra -> pubmed
// link, search pubmed, then pass through fork2 into dat.papers.
// fork2 is not consumed anywhere else in the visible snippet.
fork1
.pipe(tool.extractProperty('uid'))
.pipe(ncbi.link('sra', 'pubmed'))
.pipe(ncbi.search('pubmed'))
.pipe(fork2)
.pipe(dat.papers)
# Install Node.js with the steps for your platform, then install the tools.

# Mac
brew install n
n stable

# Ubuntu
sudo apt-get install npm
npm install -g n
n stable

# Windows: go to http://nodejs.org

# Command-line tools installed globally: bionode-ncbi, bionode-fasta and json
npm install -g bionode-ncbi bionode-fasta json
# The same search, inspected four ways (one command per line):
# raw output
bionode-ncbi search genome spiders
# line/word/byte counts of the output
bionode-ncbi search genome spiders | wc
# only the first output line, passed through the json tool
bionode-ncbi search genome spiders | head -n 1 | json
# only the organism_name field of each result
bionode-ncbi search genome spiders | json -ga organism_name
# Genome search -> uid -> genome-to-pubmed link -> destUID -> pubmed search,
# finally printing the title field of each pubmed result.
bionode-ncbi search genome spiders \
  | json -ga uid \
  | bionode-ncbi link genome pubmed - \
  | json -ga destUID \
  | bionode-ncbi search pubmed - \
  | json -ga title
# Download the Guillardia theta assembly, keep only events whose status is
# "completed", feed their path to bionode-fasta, keep sequences longer than
# 10000 and write them to gtheta-big-scaffolds.fasta.
bionode-ncbi download assembly Guillardia theta \
  | json -ga -c 'this.status === "completed"' \
  | json -ga path \
  | bionode-fasta -f \
  | json -ga -c 'this.seq.length > 10000' \
  | bionode-fasta --write > gtheta-big-scaffolds.fasta
// The through2 module is bound to the name `through`, so the stream factory
// has to be called as through.obj (there is no `through2` variable).
var through = require('through2')
var stream = through.obj(transform)
// Synchronous transform: upper-cases obj.name, pushes the object downstream
// and then calls next() to ask for the following object.
// `enc` is accepted but not used.
function transform (obj, enc, next) {
  // do things, example:
  var upperCased = obj.name.toUpperCase()
  obj.name = upperCased
  // Push downstream
  this.push(obj)
  // Callback to fetch next object
  next()
}
// The through2 module is bound to the name `through`, so the stream factory
// has to be called as through.obj (there is no `through2` variable).
var through = require('through2')
var stream = through.obj(transform)
// Asynchronous transform: looks up obj.name with requestSomethingFromDB and,
// once the callback fires, stores the result on obj.data, pushes the object
// downstream and calls next(). `enc` is accepted but not used.
function transform (obj, enc, next) {
  // do things, example:
  // Keep a reference to the stream; `this` is different inside the callback.
  var stream = this

  function onData (data) {
    obj.data = data
    stream.push(obj)
    next()
  }

  requestSomethingFromDB(obj.name, onData)
}
Bash
# Create the project folder and install the modules the script requires.
# Each command is on its own line; ndjson is included because the script
# loads it with require('ndjson').
mkdir project
cd project
npm install bionode-ncbi through2 ndjson
JavaScript
// Third-party modules loaded by this script: bionode-ncbi, through2, ndjson.
var ncbi = require('bionode-ncbi')
var through = require('through2')
var json = require('ndjson')
// Stream built by through2 from the transform function declared in this script
// (see the through2 documentation for what through.obj configures).
var myStream = through.obj(transform)
// Reduces each incoming object to two fields and pushes the reduced object
// downstream:
//   species      <- obj.organism
//   organization <- obj.meta['submitter-organization']
// `enc` is accepted but not used; next() is called after the push.
function transform (obj, enc, next) {
  var result = {
    species: obj.organism,
    organization: obj.meta['submitter-organization']
  }
  this.push(result)
  next()
}
// Search the assembly database for 'spiders', run every result through
// myStream, serialise with json.stringify() and write to standard output.
var assemblies = ncbi.search('assembly', 'spiders')
var reduced = assemblies.pipe(myStream)
var serialised = reduced.pipe(json.stringify())
serialised.pipe(process.stdout)
// Count the objects emitted by myStream and report the total when it ends.
var counter = 0

// One 'data' event per object: bump the tally.
myStream.on('data', function (data) {
  counter += 1
})

// 'end' event: print the final tally.
myStream.on('end', function () {
  console.log('Processed ' + counter)
})
// Same counting logic, with the handlers declared as named functions.
var counter = 0

// 'data' handler: bump the tally (the emitted object itself is not used).
function count (data) {
  counter++
}

// 'end' handler: print the final tally.
function log () {
  console.log('Processed ' + counter)
}

myStream.on('data', count)
myStream.on('end', log)
Bionode
Hackday
Same code client/server side
CoffeeScript pipeline and a new format?
# Genome search for 'rodentia', passed through a chain of expand/link steps.
# Indentation is significant in CoffeeScript: the `->` and `=>` bodies must be
# indented under the line that opens them.
ncbi.search 'genome', 'rodentia'
  .pipe ncbi.expand 'assembly'
  .pipe ncbi.expand 'tax'
  .pipe getLineage()
  .pipe ncbi.link 'tax', 'sra'
  .pipe ncbi.expand 'sra'
  .pipe through.obj (obj, enc, next) ->
    # Map every entry of obj.sra through expandBiosample, then replace
    # obj.sra with the mapped list and pass the object downstream.
    # The fat arrow keeps `@` bound to the through stream in the callback.
    # NOTE(review): `error` is received but never checked -- confirm intended.
    async.map obj.sra, expandBiosample, (error, sra) =>
      obj.sra = sra
      @push obj
      next()
pipeline1
ncbi.search genome rodentia
ncbi.expand assembly
ncbi.expand tax
getLineage
ncbi.link tax sra
ncbi.expand sra
stream (obj, next) ->
async.map obj.sra expandBiosample (sra) =>
obj.sra = sra
@push obj
next()
Pipelines and alternatives to Makefiles?