Class 17: NCBI Entrez

ESearch -> ESummary; ESearch -> EFetch; EPost -> ESummary; EPost -> EFetch; ESearch -> ELink; EPost -> ELink; EPost -> ESearch; ELink -> ESearch; ESearch -> ELink -> ESummary; ESearch -> ELink -> EFetch; EPost -> ESearch -> ESummary; EPost -> ESearch -> EFetch; EPost -> ELink -> ESearch -> ESummary; EPost -> ELink -> ESearch -> EFetch;

       query        subject identity positives length
1 ACU21521.1 XP_003538059.1  100.000    100.00    291
2 ACU21521.1 XP_020225874.1   87.542     92.26    297
3 ACU21521.1 XP_007149035.1   87.629     94.16    291
4 ACU21521.1 XP_017425721.1   86.254     92.10    291
5 ACU21521.1 XP_006591036.1   88.660     88.66    291
6 ACU21521.1 XP_014501961.1   85.223     91.07    291

  mismatches gaps q.start q.end s.start s.end evalue
1          0    0       1   291      54   344      0
2         31    1       1   291      52   348      0
3         36    0       1   291      50   340      0
4         40    0       1   291      50   340      0
5          0    1       1   291      54   311      0
6         43    0       1   291      50   340      0

  score      q.gi      q.ref       s.gi          s.ref
1   612 255642515 ACU21521.1  356539142 XP_003538059.1
2   545 255642515 ACU21521.1 1150166268 XP_020225874.1
3   545 255642515 ACU21521.1  593697106 XP_007149035.1
4   526 255642515 ACU21521.1 1044577906 XP_017425721.1
5   526 255642515 ACU21521.1  571488796 XP_006591036.1
6   522 255642515 ACU21521.1  950979929 XP_014501961.1

These are the GenBank IDs (GI numbers) of all the proteins found:

  [1] "255642515"  "1045396645" "1045375294"
  [4] "1044582125" "1044577908" "1044577906"
  [7] "1021583843" "1021558720" "1012361995"
 [10] "1012338638" "1012260727" "1012202223"
 [13] "965665445"  "965609789"  "571507141" 
 [16] "571496646"  "571488798"  "571488796" 
 [19] "356539142"  "356501332"  "950995503" 
 [22] "950979935"  "950979929"  "950930754" 
 [25] "947065573"  "357493575"  "357458443" 
 [28] "920699279"  "920691060"  "502169256" 
 [31] "502098169"  "502090906"  "734430373" 
 [34] "734416564"  "593701573"  "593697106" 
 [37] "593562324"  "388522749"  "1150166268"
 [40] "1150166270" "1117517859" "1012225626"
 [43] "1150166272" "1117517861" "1012225630"
 [46] "1012225634" "1150128621" "1021534275"
 [49] "1117375272" "593489431"  "1150094071"
 [52] "1044545110" "1117563883" "571553627" 
 [55] "955389649"  "922350178"  "571553636" 
 [58] "922329305"  "922350180"  "1044556548"
 [61] "1044556546" "1044557823" "1044557821"
 [64] "1044557819" "951025059"  "1044557829"
 [67] "951025065"  "1044557825" "1044557827"
 [70] "951025063"  "571482571"  "571482569" 
 [73] "593689820"  "571482575"  "571482573" 
 [76] "1150095614" "502183121"  "1150095616"
 [79] "828339994"  "502183133"  "502183129" 
 [82] "922400539"  "357439909"  "828339999" 
 [85] "502183147"  "502183143"  "502183138" 
 [88] "571440442"  "571440440"  "571440444" 
 [91] "356500353"  "1117375772" "1117375759"
 [94] "1117375765" "1117375785" "1117375778"
 [97] "1117546227" "1117546205" "1117375768"
[100] "1117375787" "1117375775" "1117375781"
[103] "1117375790" "955307577"  "955307575" 
[106] "1117546230" "1117546224" "955307580" 
[109] "951017511"  "593689824"  "1117342058"
[112] "1117342051" "1117342038" "356537561" 
[115] "1117342048" "1117342055" "1021550847"
[118] "1021550849" "1150095590" "1012214348"
[121] "1044553727" "502103542"  "828298200" 
[124] "1150095620" "951017515"  "571440447" 
[127] "593689826"  "955384590"  "571546627" 
[130] "950962514"  "950962510"  "356567862" 
[133] "571546623"  "1044534197" "1044534201"
[136] "357505281"  "593441102"  "357505277" 
[139] "593689822"  "1150093585" "1021530819"
[142] "571474527"  "1012196136" "1021530815"
[145] "1012196130" "1021530817" "1012196133"
[148] "502132291"  "502132289"  "571509404" 
[151] "356552535"  "1150118542" "1150118544"
[154] "502132293"  "1044523131" "828313695" 
[157] "1150118546" "356552537"  "1150118548"
[160] "356503387"  "571474529"  "1012184691"
[163] "1012184695" "357509397"  "1150136062"
[166] "955314277"  "593562195"  "1044582959"
[169] "950983855"  "356553464"  "356499483" 
[172] "1150127909" "828296784"  "357495283" 
[175] "1117372388" "1012199286" "1021533997"

# Ask Entrez which Conserved Domain Database (CDD) records are linked to the
# proteins. The result is iterated below, one element per link result.
domains <- entrez_link(
  dbfrom = "protein",
  db = "cdd",
  id = proteins,
  by_id = TRUE
)

# From every link result keep only the "protein_cdd_concise_2" link set
# (NULL when a result carries no such links).
doms <- lapply(
  domains,
  function(link_result) link_result$links$protein_cdd_concise_2
)

# Collapse all per-protein link sets into one vector of distinct domain ids
# and display it.
udomains <- unique(unlist(doms))
udomains

# Fetch the CDD summary record of every distinct domain id.
# always_return_list = TRUE keeps the result a list of records even when
# `udomains` holds a single id, so `dom.title` has the same shape in every
# case (presumably named by domain id, which the lookup below relies on).
dom.summary <- entrez_summary(
  db = "cdd",
  id = udomains,
  always_return_list = TRUE
)

# Title of each domain record.
dom.title <- extract_from_esummary(dom.summary, "title")

# One string per protein: the titles of its domains separated by spaces.
# A protein without domain links gets the empty string "".
# vapply() guarantees a character vector (sapply() would silently return a
# list for empty input).
prot.domains <- vapply(
  doms,
  function(d) paste(dom.title[d], collapse = " "),
  character(1)
)

# Show the distinct domain architectures that were found.
unique(prot.domains)

 [1] "WD40 F-box-like"
 [2] "WD40 F-box"
 [3] "F-box-like WD40"
 [4] "WD40"
 [5] "WD40 F-box-like WD40"
 [6] ""
 [7] "SGNH_hydrolase WD40 F-box-like"
 [8] "WD40 LisH Dyp_perox"
 [9] "WD40 LisH"
[10] "WD40 Med15 LisH"
[11] "WD40 LisH Med15"
[12] "F-box"
[13] "WD40 Amelogenin LisH"

# Ids of the domains whose title is exactly "WD40", and of those titled
# "F-box" or "F-box-like". %in% never yields NA, so a missing title cannot
# leak an NA into the id vectors (which `==` would allow).
wd40.id <- names(dom.title)[dom.title %in% "WD40"]
fbox.id <- names(dom.title)[dom.title %in% c("F-box", "F-box-like")]

# For every protein: does its list of domain ids contain a WD40 domain?
# An F-box domain? vapply() pins the result to a logical vector.
has.wd40 <- vapply(doms, function(d) any(d %in% wd40.id), logical(1))
has.fbox <- vapply(doms, function(d) any(d %in% fbox.id), logical(1))

# Proteins carrying both kinds of domain, and how many there are.
has.both <- has.fbox & has.wd40
table(has.both)

# Follow the "protein_nuccore_mrna" link from every protein to its mRNA
# record(s) in the nucleotide database.
messn <- entrez_link(
  dbfrom = "protein",
  id = proteins,
  by_id = TRUE,
  linkname = "protein_nuccore_mrna"
)

# Keep only the linked mRNA ids. The result stays a list whenever a protein
# has no link (NULL) or more than one, so it must be flattened before use
# as an id vector.
messn <- sapply(messn, function(m) m$links$protein_nuccore_mrna)

# mRNA ids of the proteins that have both domains, as a plain vector.
# unlist() drops the NULL entries and expands multi-id entries; passing the
# raw list subset to entrez_fetch() would not give it a clean id vector.
mrna.ids <- unlist(messn[has.both], use.names = FALSE)
if (length(mrna.ids) == 0) {
  stop("No mRNA ids found for proteins with both domains", call. = FALSE)
}

# Download the sequences in FASTA format and save them to disk.
messn.seq <- entrez_fetch(db = "nuccore", id = mrna.ids, rettype = "fasta")
write(messn.seq, file = "ncbi-messngr-wd40-fbox.fna")

# Follow the "gene_geoprofiles" link from every gene to its GEO Profiles
# records.
geoprof <- entrez_link(
  dbfrom = "gene",
  id = ugenes,
  by_id = TRUE,
  linkname = "gene_geoprofiles"
)

# Per-gene GEO profile ids (NULL where a result has no such links).
profiles <- lapply(
  geoprof,
  function(link_result) link_result$links$gene_geoprofiles
)

# Distinct profile ids across all genes.
ugeoprof <- unique(unlist(profiles))

# Follow the "geoprofiles_gds" link from those profiles, in one request for
# the whole id set.
geoprofiles_gds <- entrez_link(
  dbfrom = "geoprofiles",
  id = ugeoprof,
  linkname = "geoprofiles_gds"
)

# Follow the "gene_pubmed" link from every gene to its PubMed records.
gene_pubmed <- entrez_link(
  dbfrom = "gene",
  id = ugenes,
  by_id = TRUE,
  linkname = "gene_pubmed"
)

# Distinct PubMed ids over all genes.
upubmed <- unique(
  unlist(
    lapply(gene_pubmed, function(link_result) link_result$links$gene_pubmed)
  )
)

# Download those PubMed records as XML and parse them.
recs <- entrez_fetch(db = "pubmed", id = upubmed, rettype = "xml")
papers <- parse_pubmed_xml(recs)

# Summary table ("sonuc" is Turkish for "result"), one row per protein:
# its id, its name, the first linked mRNA id ("" when there is none), its
# domain titles, and the three domain-presence flags.
sonuc <- data.frame(
  proteins,
  name = pnames[proteins],
  # Plain if/else instead of ifelse() on a scalar, and vapply() so the
  # column is guaranteed to be a character vector.
  messn = vapply(
    messn,
    function(m) if (is.null(m)) "" else m[1],
    character(1)
  ),
  domains = prot.domains,
  has.fbox,
  has.wd40,
  has.both,
  stringsAsFactors = FALSE
)

Limiting BLAST databases

NCBI Entrez queries

Entrez Examples in BLAST

Entrez Examples

What are the keywords for Entrez?

Databases

Assembly

Gene

Genome

Databases

EST

GSS

Nucleotide

Databases

SRA

Taxonomy

MeSH

Databases

Protein

Protein Clusters

Databases

Conserved Domains

Structure

Databases

HomoloGene

SNP

Databases

BioProject

BioSample

BioSystems

Databases

Bookshelf

NCBI Web Site Search

NLM Catalog

Databases

dbGaP

dbVar

Databases

GEO Datasets

GEO Profiles

Probe

Literature

PubMed

PubMed Central

Other databases

Combining queries

Example

Creating advanced queries

Dates and Other Ranges

“Clipboard” and “My Collections”

Pre-computed answers

Automation

Pipelines: putting it all together

F-Box protein domain

Protein domains according to http://pfam.xfam.org/

F-box

WD-40

Finding proteins with those domains

Results

Results

Filter results: only Legumes

Downloading

Finding more proteins

Save your search strategy

Process the output

Check domains

Next steps

It is tedious to do this one by one

E-tools: Entrez Pipelines

Map of E-tools

Use your favorite language

Example: analyzing BLAST output

Protein IDs

What are their domains?

But what do these IDs mean?

Keep only proteins that have both domains

Download the protein sequences

Find the messengers that code for the proteins

Find the genes that encode the proteins

Find expression data for those genes