diff --git a/DESCRIPTION b/DESCRIPTION index ec0b2bca..dab7829d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,7 @@ Description: If you have a set of genomic ranges, this package can help you with exons, or intergenic regions. It also makes it easy to compare one set of ranges to another. Authors@R: c( - person("Kristyna", "Kupkova", role=c("aut", "cre"), + person("Kristyna", "Kupkova", role=c("aut", "cre"), email = "kristynakupkova@gmail.com"), person("Jose", "Verdezoto", role="aut"), person("Tessa", "Danehy", role="aut"), @@ -20,8 +20,8 @@ Authors@R: c( person("Jose", "Verdezoto", role="aut"), person("Michal", "Stolarczyk", role="aut"), person("Jason", "Smith", role="aut"), - person("Bingjie", "Xue", role="aut"), - person("Sophia", "Rogers", role="aut"), + person("Bingjie", "Xue", role="aut"), + person("Sophia", "Rogers", role="aut"), person("John", "Stubbs", role="aut"), person(given=c("Nathan", "C."), "Sheffield", email = "nathan@code.databio.org", role="aut")) @@ -57,7 +57,7 @@ Enhances: LazyData: true VignetteBuilder: knitr License: BSD_2_clause + file LICENSE -biocViews: Software, GenomeAnnotation, GenomeAssembly, DataRepresentation, Sequencing, +biocViews: Software, GenomeAnnotation, GenomeAssembly, DataRepresentation, Sequencing, Coverage, FunctionalGenomics, Visualization RoxygenNote: 7.1.2 URL: http://code.databio.org/GenomicDistributions diff --git a/NAMESPACE b/NAMESPACE index 6be9b969..2a924a18 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(calcFeatureDist) export(calcFeatureDistRefTSS) export(calcGCContent) export(calcGCContentRef) +export(calcNearestGenes) export(calcNearestNeighbors) export(calcNeighborDist) export(calcPartitions) @@ -79,4 +80,4 @@ importFrom(utils,data) importFrom(utils,download.file) importFrom(utils,getAnywhere) importFrom(utils,globalVariables) -importFrom(utils,installed.packages) +importFrom(utils,installed.packages) \ No newline at end of file diff --git a/R/nearest-genes.R b/R/nearest-genes.R new file mode 100644 index 00000000..f2363853 --- /dev/null +++ b/R/nearest-genes.R @@ -0,0 +1,106 @@ +#' Returns the distance for each range in x to its nearest neighbor +#' in y with directionality. That is, if the nearest neighbor is +#' upstream, it is denoted with a negative distance. Likewise, +#' downstream distances are reported as positive. +#' +#' @param x A GRanges object +#' @param y A Granges object +#' +#' @return A vector of nearest distances with directionality +.directionalDistanceToNearest = function(x, y) { + # get distance to upsream and downstream + # with proper sign + distToUpstream = -1 * distance(x, y[precede(query, y)]) + distToDownstream = distance(x, y[follow(query, y)]) + + # calculate absolute distance and find nearest + nearestDist = pmin(abs(distToUpstream), abs(distToDownstream)) + + # coerce upstream back to negative by + # finding where the upstream distance was + # chosen and force it back to negative + nearestDist[nearestDist == abs(distToUpstream)] = -1 * nearestDist[nearestDist == abs(distToUpstream)] + + return(nearestDist) +} + +#' Given a query and set of annotations, this function will calculate +#' the nearest annotation to each region in the region set, as well +#' as the nearest gene type and the distance to the nearest gene. +#' +#' @param query A GRanges or GRangesList object with query sets +#' @param annotations A GRanges or GRangesList object with annotation sets +#' +#' @return A data table that contains observations for each genomic region +#' and the associated aforementioned annotations. +#' @export +#' @examples +#' queryFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") +#' query = rtracklayer::import(queryFile) +#' data(TSS_hg19) +#' +#' queryAnnotated = calcNearestGenes(query, TSS_hg19) +calcNearestGenes = function(query, annotations, gene_name_key="gene_id", gene_type_key="gene_biotype") { + .validateInputs(list(query=c("GRanges","GRangesList"))) + if (is(query, "GRangesList")) { + # Recurse over each GRanges object + annots = lapply( + query, + function(x) { + calcNearestGenes(x, annotations, gene_name_key=gene_name_key, gene_type_key=gene_type_key) + } + ) + return(annots) + } + # calculate the nearest annotations to given query + nearestIds = nearest(query, annotations) + + # annotate nearest gene and type + nearestGenes = annotations[nearestIds] + + # + # use mcols to get the metadata columns as a + # a data frame and dynamically access the column + # that way... + # this is used to circumvent the fact that we cannot + # dynamically access metadata columns inside a GRanges + # object like we can a dataframe: + # col = "gene_id" + # dt[[col]] + # ^^^ This doesnt work in a GRanges object. + # + query$nearest_gene = mcols(nearestGenes)[[gene_name_key]] + query$nearest_gene_type = mcols(nearestGenes)[[gene_type_key]] + + # annotate on the distance as well + query$nearest_distance = .directionalDistanceToNearest(query, annotations[nearestIds]) + + # dump a query to a data table and return + dt = grToDt(query) + + return(dt) +} + +#' Given a query and reference assembly, this function will calculate +#' the nearest TSS to each region in the region set, as well +#' as the nearest gene type and the distance to the nearest gene. +#' +#' It is a wrapper around \code{calcFeatureDist} that uses built-in TSS +#' features for a reference assembly. +#' +#' @param query A GenomicRanges or GenomicRangesList object with query regions +#' @param refAssembly A character vector specifying the reference genome +#' assembly (*e.g.* 'hg19'). +#' +#' @return A data table that contains observations for each genomic region +#' and the associated aforementioned TSSs. +#' @export +#' @examples +#' queryFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") +#' query = rtracklayer::import(queryFile) +#' +#' queryAnnotated = calcNearestGenesRef(query, "hg19") +calcNearestGenestRef = function(query, refAssembly) { + features = getTSSs(refAssembly) + return(calcNearestGenes(query, features)) +} \ No newline at end of file diff --git a/data/TSS_hg19.rda b/data/TSS_hg19.rda index 077fa48e..9e7521be 100644 Binary files a/data/TSS_hg19.rda and b/data/TSS_hg19.rda differ diff --git a/data/geneModels_hg19.rda b/data/geneModels_hg19.rda index cd4d5ede..fa9c4661 100644 Binary files a/data/geneModels_hg19.rda and b/data/geneModels_hg19.rda differ diff --git a/data/setB_100.rda b/data/setB_100.rda index 45c93891..dc283d36 100644 Binary files a/data/setB_100.rda and b/data/setB_100.rda differ diff --git a/data/vistaEnhancers.rda b/data/vistaEnhancers.rda index e428c23d..33fc0328 100644 Binary files a/data/vistaEnhancers.rda and b/data/vistaEnhancers.rda differ diff --git a/man/calcNearestGenes.Rd b/man/calcNearestGenes.Rd new file mode 100644 index 00000000..82dcef03 --- /dev/null +++ b/man/calcNearestGenes.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/nearest-genes.R +\name{calcNearestGenes} +\alias{calcNearestGenes} +\title{Given a query and set of annotations, this function will calculate +the nearest annotation to each region in the region set, as well +as the nearest gene type and the distance to the nearest gene.} +\usage{ +calcNearestGenes( + query, + annotations, + gene_name_key = "gene_id", + gene_type_key = "gene_biotype" +) +} +\arguments{ +\item{query}{A GRanges or GRangesList object with query sets} + +\item{annotations}{A GRanges or GRangesList object with annotation sets} +} +\value{ +A data table that contains observations for each genomic region + and the associated aforementioned annotations. +} +\description{ +Given a query and set of annotations, this function will calculate +the nearest annotation to each region in the region set, as well +as the nearest gene type and the distance to the nearest gene. +} +\examples{ +queryFile = system.file("extdata", "vistaEnhancers.bed.gz", package="GenomicDistributions") +query = rtracklayer::import(queryFile) +data(TSS_hg19) + +queryAnnotated = calcNearestGenes(query, TSS_hg19) +}