Skip to content

Commit ad889d0

Browse files
committed
Trying to clean names
1 parent 8bc08ab commit ad889d0

13 files changed

+374
-52
lines changed

R/analyze/create_tox_file.R

+14-3
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,23 @@ create_tox_object <- function(all_data, chem_info, sites, exclude){
55
!(chnm == "Chlorpyrifos" & generic_class == "WW"),
66
!(chnm == "Caffeine" & generic_class == "WW"),
77
!(chnm == "Cotinine" & generic_class == "WW")) %>%
8-
select(SiteID, `Sample Date`, CAS, Value, comment) %>%
9-
filter(SiteID %in% sites$SiteID)
8+
select(SiteID, `Sample Date`, CAS, Value, comment)
9+
10+
sites_ordered <- sites %>%
11+
filter(SiteID %in% chem_data$SiteID)
1012

13+
sites_ordered$map_nm <- substr(gsub("Lake ", "", sites_ordered$site_grouping),1,1)
14+
15+
sites_ordered$map_nm <- paste0(sites_ordered$map_nm,
16+
c(1:sum(sites_ordered$map_nm == "S"),
17+
1:sum(sites_ordered$map_nm == "M"),
18+
1:sum(sites_ordered$map_nm == "H"),
19+
1:sum(sites_ordered$map_nm == "E"),
20+
1:sum(sites_ordered$map_nm == "O")))
21+
1122
tox_list <- list("Data" = chem_data,
1223
"Chemicals" = chem_info,
13-
"Sites" = sites,
24+
"Sites" = sites_ordered,
1425
"Exclude" = exclude)
1526
return(tox_list)
1627
}

R/analyze/data_reader.R

+47-18
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ generic_file_opener <- function(file_name, cas_df, n_max, sheet, site_sheet,
6161
sheet <- "pharms"
6262
}
6363
}
64-
data_long <- na.omit(data_long)
64+
data_long <- dplyr::filter(data_long, !is.na(Value))
65+
data_long <- dplyr::filter(data_long, !is.na(chnm))
6566
data_long$comment <- ""
6667
data_long$comment[grep("<",data_long$Value)] <- "<"
6768
data_long$comment[grep("DNQ",data_long$Value)] <- "DNQ"
@@ -74,18 +75,22 @@ generic_file_opener <- function(file_name, cas_df, n_max, sheet, site_sheet,
7475
data_long <- data_long[data_long$Value != "lostinfield",]
7576
data_long <- data_long[data_long$Value != "-----",]
7677
data_long <- data_long[data_long$Value != "'-----",]
78+
data_long <- data_long[data_long$Value != "nosmple",]
7779
data_long$comment[which(data_long$Value == "ND")] <- "<"
7880
data_long$Value[which(data_long$Value == "ND")] <- data_long$MDL[which(data_long$Value == "ND")]
79-
data_long <- data_long[data_long$Value != "NA",]
81+
data_long <- filter(data_long, Value != "NA")
8082

8183
data_long$Value <- as.numeric(data_long$Value)
8284
data_long$Value <- data_long$Value/convert
8385
data_long$generic_class <- sheet
8486
data_long$`Sample Date` <- year
8587
data_long$SiteID <- gsub("site ","",data_long$SiteID, ignore.case = TRUE)
8688

87-
data_long <- filter(data_long,
89+
# Premature taking out censored values?
90+
data_long <- filter(data_long,
8891
!(is.na(Value) & comment == ""))
92+
# data_long <- filter(data_long,
93+
# !(is.na(Value)))
8994

9095
data_long <- data_long %>%
9196
mutate(chnm = tolower(chnm)) %>%
@@ -110,10 +115,18 @@ generic_file_opener <- function(file_name, cas_df, n_max, sheet, site_sheet,
110115
data_long$CAS[data_long$chnm == "Nadolol"] <- "42200-33-9"
111116
data_long$chnm[data_long$chnm == "Tris(1,3-Dichloro-2-Propyl)Phosphate (t"] <- "Tris(1,3-dichloro-2-propyl)phosphate (TDCPP)"
112117
data_long$CAS[data_long$chnm == "Tris(1,3-dichloro-2-propyl)phosphate (TDCPP)"] <- "13674-87-8"
118+
data_long$CAS[data_long$CAS == "26248-87-3"] <- "13674-84-5" #2 versions of TDCPP
119+
data_long$CAS[data_long$chnm == "TCEP"] <- "115-96-8"
120+
data_long$CAS[data_long$chnm == "Tri(2-chloroethyl) phosphate (TCEP)"] <- "115-96-8"
121+
# data_long$CAS[data_long$CAS == "51805-45-9"] <- "115-96-8"
122+
data_long$chnm[data_long$CAS == "101-20-2"] <- "3,4,4'-Trichlorocarbanilide"
123+
data_long$chnm[data_long$CAS == "115-96-8"] <- "Tri(2-chloroethyl) phosphate (TCEP)"
113124

114125
data_long <- data_long[!(data_long$chnm %in% c("Tcpp_isomer","Tcpp Isomer")),]
115126

116127
data_long$CAS[data_long$chnm == "Omeprazole + Esomprazole"] <- "73590-58-6"
128+
data_long$chnm[data_long$CAS == "73590-58-6"] <- "Omeprazole + Esomprazole"
129+
117130

118131
if(any(is.na(data_long$CAS))){
119132
message("Some CAS didn't match up")
@@ -157,28 +170,44 @@ clean_cas <- function(cas_df){
157170
filter(!duplicated(CAS)) %>%
158171
mutate(chnm = tools::toTitleCase(chnm))
159172

160-
cas_final$chnm[cas_final$chnm == "Deet"] <- "DEET"
161-
cas_final$chnm[cas_final$chnm == "Tcep"] <- "TCEP"
162-
cas_final$chnm[cas_final$chnm == "Tcpp"] <- "TCPP"
163-
cas_final$chnm[cas_final$chnm == "Tbep"] <- "TBEP"
164-
cas_final$chnm[cas_final$chnm == "Tdcpp"] <- "TDCPP"
165-
cas_final$chnm[cas_final$chnm == "Total Pcbs"] <- "Total PCBS"
173+
cas_final$chnm[cas_final$chnm == "Deet"] <- "N,N-diethyltoluamide (DEET)"
174+
cas_final$chnm[cas_final$chnm == "Tcep"] <- "Tri(2-chloroethyl) phosphate (TCEP)"
175+
# cas_final$CAS[cas_final$chnm == "Tri(2-chloroethyl) phosphate (TCEP)"] <- "115-96-8"
176+
cas_final$chnm[cas_final$chnm == "Tcpp"] <- "Tris(1-chloro-2-propyl)phosphate (TCPP)"
177+
cas_final$chnm[cas_final$chnm == "Tbep"] <- "Tri(2-chloroethyl) phosphate (TCEP)"
178+
cas_final$chnm[cas_final$chnm == "Tdcpp"] <- "Tris(1,3-dichloro-2-propyl) phosphate (TDCPP)"
179+
cas_final$chnm[cas_final$chnm == "Total Pcbs"] <- "Total PCBs"
166180
cas_final$chnm[cas_final$chnm == "O,p'-Ddd"] <- "o,p'-DDD"
167181
cas_final$chnm[cas_final$chnm == "P,p'-Ddd"] <- "p,p'-DDD"
168-
cas_final$chnm[cas_final$chnm == "Pentachloroanisole (Pca)"] <- "PCA"
169-
cas_final$chnm[cas_final$chnm == "Tributyl Phosphate (Tbp)"] <- "TBP"
170-
cas_final$chnm[cas_final$chnm == "Hydrochlorothiazide (Hctz)"] <- "HCTZ"
171-
cas_final$chnm[cas_final$chnm == "Tris(2−Chloroethyl)Phosphate (Tcep)"] <- "TCEP"
182+
cas_final$chnm[cas_final$chnm == "Pentachloroanisole (Pca)"] <- "Pentachloroanisole"
183+
cas_final$chnm[cas_final$chnm == "Tributyl Phosphate (Tbp)"] <- "Tributyl phosphate (TBP)"
184+
cas_final$chnm[cas_final$chnm == "Hydrochlorothiazide (Hctz)"] <- "Hydrochlorothiazide"
172185
cas_final$chnm[cas_final$chnm == "O,p'-Ddt"] <- "o,p'-DDT"
173186
cas_final$chnm[cas_final$chnm == "O,p'-Ddt"] <- "o,p'-DDT"
174187
cas_final$chnm[cas_final$chnm == "P,p'-Dde"] <- "p,p'-DDE"
175188
cas_final$chnm[cas_final$chnm == "P,p'-Ddt"] <- "p,p'-DDT"
176189
cas_final$chnm[cas_final$chnm == "O,p'-Dde"] <- "o,p'-DDE"
190+
cas_final$chnm[cas_final$chnm == "Indeno[1,2,3-Cd]pyrene"] <- "Indeno[1,2,3-cd]pyrene"
191+
cas_final$chnm[cas_final$chnm == "Benzo(a)Pyrene"] <- "Benzo(a)pyrene"
192+
cas_final$chnm[cas_final$chnm == "beta-Bhc"] <- "beta-Hexachlorocyclohexane"
193+
cas_final$chnm[cas_final$chnm == "P,p'-Methoxychlor"] <- "p,p'-Methoxychlor"
194+
cas_final$chnm[cas_final$chnm == "alpha-Bhc"] <- "alpha-Hexachlorocyclohexane"
195+
cas_final$chnm[cas_final$chnm == "Benzo[b]naphtho[2,1-D]thiophene"] <- "Benzo[b]naphtho[2,1-d]thiophene"
196+
cas_final$chnm[cas_final$chnm == "Dibenzo[a,h]anthracene"] <- "Dibenz[a,h]anthracene"
197+
cas_final$chnm[cas_final$chnm == "p-Tert-Octylphenol"] <- "p-tert-octylphenol"
177198
cas_final$chnm[cas_final$CAS =="26248-87-3"] <- "Tri(chloropropyl) phosphate"
178-
# cas_final$chnm[cas_final$chnm == "Tris(1-Chloro-2-Propyl)Phosphate (Tcpp)"] <- "TCPP"
179-
cas_final$chnm[cas_final$chnm == "Hexachlorobenzene (Hcb)"] <- "HCB"
199+
cas_final$chnm[cas_final$chnm == "Hexachlorobenzene (Hcb)"] <- "Hexachlorobenzene"
180200
cas_final$chnm[cas_final$CAS == "77-93-0"] <- "Triethyl Citrate "
181201
cas_final$chnm[cas_final$CAS == "30306-93-5"] <- "Ethyl Citrate"
202+
cas_final$chnm[cas_final$CAS == "101-20-2"] <- "3,4,4'-Trichlorocarbanilide"
203+
cas_final$chnm[cas_final$CAS == "73590-58-6"] <- "Omeprazole + Esomprazole"
204+
cas_final$chnm[grep("Cis-", cas_final$chnm)] <- gsub(pattern = "Cis-",
205+
replacement = "cis-",
206+
cas_final$chnm[grep("Cis-", cas_final$chnm)])
207+
cas_final$chnm[grep("Trans-", cas_final$chnm)] <- gsub(pattern = "Trans-",
208+
replacement = "trans-",
209+
cas_final$chnm[grep("Trans-", cas_final$chnm)])
210+
182211
cas_final$chnm[grep("Pbde-", cas_final$chnm)] <- gsub(pattern = "Pbde-",
183212
replacement = "PBDE-",
184213
cas_final$chnm[grep("Pbde-", cas_final$chnm)])
@@ -187,9 +216,9 @@ clean_cas <- function(cas_df){
187216
stringsAsFactors = FALSE))
188217
cas_final$chnm[cas_final$CAS == "34911-55-2"] <- "Bupropion hydrochloride"
189218

190-
cas_final$chnm[grep(pattern = "Delta-Benzenehexachloride",cas_final$chnm)] <- "delta-Bhc"
191-
cas_final$chnm[grep(pattern = "Beta-Benzenehexachloride",cas_final$chnm)] <- "beta-Bhc"
192-
cas_final$chnm[grep(pattern = "Alpha-Benzenehexachloride", cas_final$chnm)] <- "alpha-Bhc"
219+
cas_final$chnm[grep(pattern = "Delta-Benzenehexachloride",cas_final$chnm)] <- "Delta-Benzenehexachloride"
220+
cas_final$chnm[grep(pattern = "Beta-Benzenehexachloride",cas_final$chnm)] <- "Beta-Benzenehexachloride"
221+
cas_final$chnm[grep(pattern = "Alpha-Benzenehexachloride", cas_final$chnm)] <- "Alpha-Benzenehexachloride"
193222

194223
return(cas_final)
195224
}

R/analyze/get_chem_info.R

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ get_chem_info <- function(all_data, chem_info_old){
99
chem_data <- all_data %>%
1010
select(SiteID, Date=`Sample Date`, CAS, Value, comment)
1111

12-
chem_info <- select(all_data, CAS, generic_class) %>%
12+
chem_info <- select(all_data, CAS, generic_class, chnm) %>%
1313
distinct() %>%
14-
left_join(select(chem_info_old, CAS, Class, chnm), by="CAS") %>%
14+
left_join(distinct(select(chem_info_old, CAS, Class)), by="CAS") %>%
1515
filter(!is.na(CAS)) %>%
1616
distinct(CAS, .keep_all = TRUE)
1717

R/analyze/get_sites_ready.R

+2-9
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,14 @@ get_sites_ready <- function(file_2014_download, file_2010_download, sites_OWC){
5353
sites_orig_2014$SiteID[sites_orig_2014$SiteID == "40851385"] <- "040851385"
5454

5555
sites_orig <- bind_rows(sites_orig_2014, sites_OWC)
56+
5657
sites_orig <- sites_orig[sites_orig$SiteID != "000-----",]
5758

5859
sites_orig_2010 <- readxl::read_excel(file_2010_download,
5960
sheet = "site info",
6061
skip = 2) %>%
6162
select(SiteID = `USGS Station ID`) %>%
63+
mutate(SiteID = dataRetrieval::zeroPad(SiteID, 8)) %>%
6264
filter(!(SiteID %in% sites_orig$SiteID),
6365
!is.na(SiteID))
6466

@@ -81,15 +83,6 @@ get_sites_ready <- function(file_2014_download, file_2010_download, sites_OWC){
8183
sites_ordered <- sites_ordered %>%
8284
arrange(site_grouping, `Short Name`)
8385

84-
sites_ordered$map_nm <- substr(gsub("Lake ", "", sites_ordered$site_grouping),1,1)
85-
86-
sites_ordered$map_nm <- paste0(sites_ordered$map_nm,
87-
c(1:sum(sites_ordered$map_nm == "S"),
88-
1:sum(sites_ordered$map_nm == "M"),
89-
1:sum(sites_ordered$map_nm == "H"),
90-
1:sum(sites_ordered$map_nm == "E"),
91-
1:sum(sites_ordered$map_nm == "O")))
92-
9386
return(sites_ordered)
9487

9588
}

R/report/stack_plots.R

+101
Original file line numberDiff line numberDiff line change
@@ -187,3 +187,104 @@ whole_stack <- function(chemicalSummary,
187187
return(list(chem_count=chem_count_graph,no_axis=no_axis_plot_back))
188188
}
189189

190+
191+
#' Stacked EAR bars per site, annotated with map names and chemical counts
#'
#' Draws one horizontal stacked bar per site (`Short Name`), filled by
#' `category` and faceted by `site_grouping`. Each site's chemical count and
#' map name are written as text at fixed negative y positions next to the bars.
#'
#' @param chemical_summary Data frame forwarded to `graph_chem_data()`,
#'   `tox_boxplot_data()` and `chem_counts()`.
#' @param chem_site Data frame of sites. Must contain `SiteID`, `Short Name`
#'   and `map_nm`; a `site_grouping` column of "" is added when absent.
#' @param cbValues Fill colours handed to `scale_fill_manual(values = )`.
#' @param category One of "Biological", "Chemical Class" or "Chemical".
#' @param mean_logic,sum_logic,manual_remove Forwarded unchanged to the data
#'   helpers.
#' @param include_legend Logical; `FALSE` hides the legend.
#' @param font_size Numeric text size used for axis text, strip text and
#'   (scaled down) the in-plot annotations. `NA` falls back to 11.
#' @param title Optional string stored in the `count_title` column of the bar
#'   data; `NA` stores "".
#' @return A ggplot object.
plot_tox_stacks_manuscript2 <- function(chemical_summary,
                                        chem_site, cbValues,
                                        category = "Biological",
                                        mean_logic = FALSE,
                                        sum_logic = TRUE,
                                        manual_remove = NULL,
                                        include_legend = TRUE,
                                        font_size = NA,
                                        title = NA){

  category <- match.arg(category, c("Biological","Chemical Class","Chemical"))

  # Fail early, with a readable message, on columns this function indexes below.
  missing_cols <- setdiff(c("SiteID", "Short Name", "map_nm"), names(chem_site))
  if(length(missing_cols) > 0){
    stop("chem_site is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # Placeholders for column names used in non-standard evaluation below.
  meanEAR <- site_grouping <- `Short Name` <- count <- x <- y <- label <- ".dplyr"

  if(!("site_grouping" %in% names(chem_site))){
    chem_site$site_grouping <- ""
  }

  # A font_size of NA would otherwise propagate NA into every text size below.
  text_size <- if(is.na(font_size)) 11 else font_size

  if(category == "Chemical"){
    graphData <- graph_chem_data(chemical_summary = chemical_summary,
                                 manual_remove = manual_remove,
                                 mean_logic = mean_logic,
                                 sum_logic = sum_logic)
    # Align column names with what the non-chemical branch provides.
    names(graphData)[names(graphData) == "maxEAR"] <- "meanEAR"
    names(graphData)[names(graphData) == "chnm"] <- "category"
  } else {
    graphData <- tox_boxplot_data(chemical_summary = chemical_summary,
                                  category = category,
                                  manual_remove = manual_remove,
                                  mean_logic = mean_logic,
                                  sum_logic = sum_logic)
  }

  # full_join keeps sites that have no rows in graphData so they still get a bar slot.
  graphData <- graphData %>%
    dplyr::full_join(chem_site[, c("SiteID", "site_grouping", "Short Name")],
                     by=c("site"="SiteID"))

  # Reverse the site order so the first level ends up on top after coord_flip().
  # A character column has no levels; convert it first so names do not become NA.
  short_names <- graphData$`Short Name`
  if(!is.factor(short_names)){
    short_names <- factor(short_names)
  }
  graphData$`Short Name` <- factor(short_names, levels = rev(levels(short_names)))

  if(is.na(title)){
    graphData$count_title <- ""
  } else {
    graphData$count_title <- title
  }

  # One row per site: chemical count plus the short map label.
  counts_df <- chem_counts(chemical_summary, chem_site) %>%
    dplyr::right_join(dplyr::select(chem_site, `Short Name`, map_nm),
                      by="Short Name")

  counts_df <- counts_df[!duplicated(counts_df$`Short Name`),]

  # Column headers for the two text columns.
  # NOTE(review): these are pinned to the "Lake Superior" facet; if chem_site has
  # no such grouping this presumably adds an extra facet - confirm with callers.
  labels_df <- data.frame(y = c(-0.05,-0.01),
                          x = c(Inf,Inf),
                          label = c("Map Name","Chemicals"),
                          site_grouping = c("Lake Superior","Lake Superior"))

  upperPlot <- ggplot() +
    geom_col(data = graphData,
             aes(x=`Short Name`, y=meanEAR, fill = category)) +
    theme_minimal() +
    ylab("Sum of Maximum EAR Values") +
    geom_text(data = counts_df,
              aes(x=`Short Name`, label = count, y=-0.01),
              hjust = 0.5, vjust = 0.35, size = text_size/2.5) +
    geom_text(data = counts_df,
              aes(x=`Short Name`, label = map_nm, y=-0.05),
              hjust = 0.5, vjust = 0.35, size = text_size/2.5) +
    geom_text(data = labels_df,
              aes(y = y, x = x, label = label),
              vjust = -0.5, size = text_size/3) +
    facet_grid(site_grouping ~ ., scales="free", space="free") +
    # clip = "off" lets the annotation text draw outside the panel area.
    coord_flip(clip = "off") +
    scale_y_continuous(breaks = scales::pretty_breaks(n = 2)) +
    scale_fill_manual(name = category,
                      values = cbValues, drop=TRUE) +
    theme(strip.background = element_blank(),
          strip.text.x = element_text(size = 5),
          strip.text.y = element_text(size = text_size),
          axis.title.y = element_blank(),
          legend.position="bottom",
          panel.grid.minor = element_blank(),
          panel.grid.major = element_line(size = 0.1),
          legend.justification = "left",
          legend.background = element_rect(fill = "transparent", colour = "transparent"),
          legend.title=element_blank(),
          legend.text = element_text(size=5),
          legend.key.height = unit(0.5,"line"),
          legend.key.width = unit(0.5, "line"),
          axis.text = element_text(size = text_size, vjust = 0.35),
          axis.title = element_text(size= text_size))

  # Honour include_legend, which was previously accepted but ignored.
  if(isFALSE(include_legend)){
    upperPlot <- upperPlot + theme(legend.position = "none")
  }

  return(upperPlot)
}
289+
290+

R/setup/file_config.R

+3
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,7 @@ last_modified_exclude_id <- drive_get_datetime_modified(exclude_id)
2828
cas_change_id <- as_id("1NcsZ3sfB8SEGLO0dS3oCTRDPRUbHtWXJ")
2929
last_modified_cas_change_id <- drive_get_datetime_modified(cas_change_id)
3030

31+
chem_name_id <- as_id("15Ph4jPBuLndJbUInLDu_FO8SrhnTpB0d7UDvOHouliY")
32+
last_modified_chem_name_id <- drive_get_datetime_modified(chem_name_id)
33+
3134

create_triple_fig.R

+21-4
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ chemicalSummary_conc_no_match = chemicalSummary_conc %>%
1919

2020
graphData_conc_no_match = graph_chem_data_CAS(chemicalSummary_conc_no_match) %>%
2121
mutate(guide_side = "Concentration [\U003BCg/L]") %>%
22-
left_join(cas_final, by="CAS")
22+
left_join(select(cas_final, CAS, chnm), by="CAS")
2323

2424
full_classes <- c(levels(graphData_tox_det$Class),
2525
levels(graphData_conc_no_match$Class)[!(levels(graphData_conc_no_match$Class) %in% levels(graphData_tox_det$Class))])
@@ -56,7 +56,7 @@ site_counts_df_no_match <- site_counts(tox_list$chem_data, no_axis_no_match$data
5656
site_graph_no_match <- site_count_plot(site_counts_df_no_match,
5757
axis_size = axis_num)
5858

59-
pdf("plots/triple_graph_full_page.pdf", width = 9, height = 11, onefile=FALSE)
59+
pdf("plots/triple_graph_full_page_v3.pdf", width = 9, height = 11, onefile=FALSE)
6060
ggarrange(
6161

6262
matches$site_graph,
@@ -79,7 +79,7 @@ library(cowplot)
7979

8080
l2 <- get_legend(toxPlot_no_match)
8181

82-
pdf("plots/triple_graph_cow.pdf", width = 9, height = 11, onefile=FALSE)
82+
pdf("plots/triple_graph_v3_new_names.pdf", width = 9, height = 11, onefile=FALSE)
8383
plot_grid(
8484
matches$site_graph,
8585
matches$no_axis,
@@ -94,7 +94,24 @@ plot_grid(
9494
nrow = 2, ncol = 1,
9595
rel_heights = c(n_chems_no_match,n_chems_matches-n_chems_no_match)
9696
),
97-
rel_widths = c(2,4,4),
97+
rel_widths = c(2.5,4,4),
9898
nrow=1,ncol=3
9999
)
100100
dev.off()
101+
102+
loadd(chemicalSummary)

# Endpoint plots for the chemicals at the end of the chnm factor ordering.
# head() caps the selection at the number of available levels; indexing with
# [1:10] would produce NA entries when there are fewer than ten chemicals.
top_chems <- head(rev(levels(chemicalSummary$chnm)), 10)

pdf("plots/top_eps.pdf")
# Close the PDF device even if one of the plot calls fails.
invisible(tryCatch({
  for(i in top_chems){
    # add threshold!!!
    ep_plot <- plot_tox_endpoints(chemicalSummary,
                                  category = 'Chemical',
                                  mean_logic = FALSE,
                                  hit_threshold = NA,
                                  title = i,
                                  top_num = 10,
                                  filterBy = i)
    print(ep_plot)
  }
}, finally = dev.off()))

explore_detection_limits.R

+2
Original file line numberDiff line numberDiff line change
@@ -185,3 +185,5 @@ plot_DL <- plot_chemical_boxplots_mod(cs_det_level,
185185
ggsave(plot_DL, filename = "plots/detection_level_EARs.pdf",
186186
width = 9, height = 11)
187187

188+
# Chemical-level graph data for the detection-level summary,
# sorted so the largest meanEAR comes first.
graphDataDL <- arrange(graph_chem_data(cs_det_level), desc(meanEAR))

0 commit comments

Comments
 (0)