diff --git a/R/read-html.R b/R/read-html.R index e08b11a..6e6d97c 100644 --- a/R/read-html.R +++ b/R/read-html.R @@ -336,13 +336,13 @@ ragnar_read_document <- function(x, ..., #' depth = 1 #' ) ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TRUE, ..., - url_filter = identity) { + url_filter = identity, modify_request = identity) { rlang::check_dots_empty() if (!inherits(x, "xml_node")) { check_string(x) - x <- read_html2(x) + x <- read_html2(x, modify_request = modify_request) } depth <- as.integer(depth) @@ -384,7 +384,7 @@ ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TR visited$add(item$url) links <- tryCatch( - html_find_links(item$url), + html_find_links(item$url, modify_request = modify_request), error = function(e) { # if there's an issue finding child links we log it into the `problems` table # which is included in the output as an attribute. @@ -428,10 +428,10 @@ ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TR # E.g., # for same site only: prefix = url_host(xml_url(x)) # for child links only: prefix = url_normalize_stem(xml_url(x)) -html_find_links <- function(x, absolute = TRUE) { +html_find_links <- function(x, absolute = TRUE, modify_request) { if (!inherits(x, "xml_node")) { - x <- read_html2(x) + x <- read_html2(x, modify_request = modify_request) } links <- x |> @@ -486,29 +486,32 @@ stri_subset_startswith_fixed <- function(str, pattern, ...) { } # workaround for https://github.com/r-lib/xml2/issues/453 -read_html2 <- function(url, ...) { - # For some reason curl is both erroring and warning when the URL is invalid or - # returns 404. We don't really want the warnings, so we discard them. - suppressWarnings({ - handle <- curl::new_handle(followlocation = TRUE) - # We first try the original URL, if some error occurs we retry with the - # URL encoded version. (If it's different from the original URL.) - conn <- tryCatch( - curl::curl(url, "rb", handle = handle), - error = function(err) { - encoded_url <- utils::URLencode(url) - if (url != encoded_url) { - handle <<- curl::new_handle(followlocation = TRUE) - curl::curl(encoded_url, "rb", handle = handle) - } else { - stop(err) - } - } - ) +read_html2 <- function(url, ..., modify_request) { + + perform_request <- function(url) { + url |> + httr2::request() |> + modify_request() |> + httr2::req_perform() + } + + resp <- tryCatch({ + perform_request(url) + }, error = function(err) { + encoded_url <- utils::URLencode(url) + if (url != encoded_url) { + perform_request(url) + } else { + stop(err) + } }) + + raw <- httr2::resp_body_raw(resp) + conn <- rawConnection(raw, "rb") on.exit(tryCatch(close(conn), error = function(e) NULL)) + out <- xml2::read_html(conn, ...) - attr(out, "resolved_url") <- curl::handle_data(handle)$url + attr(out, "resolved_url") <- httr2::resp_url(resp) out } diff --git a/R/read-markdown.R b/R/read-markdown.R index 7ca7077..9a7b3b4 100644 --- a/R/read-markdown.R +++ b/R/read-markdown.R @@ -51,7 +51,7 @@ init_markitdown <- function(...) { # ' chat <- ellmer::chat_openai(echo = TRUE) # ' chat$chat("Describe this image", content_image_file(jpg)) # ' } -read_as_markdown <- function(x, ..., canonical = FALSE) { +read_as_markdown <- function(x, ..., canonical = FALSE, modify_request = identity) { check_string(x) @@ -69,9 +69,33 @@ read_as_markdown <- function(x, ..., canonical = FALSE) { if (getOption("ragnar.markitdown.use_reticulate", TRUE)) { # use the Python API, faster, more powerful convert <- .globals$markitdown$convert %||% init_markitdown()$convert - md <- convert(x, ...) + # if `x` is a url, we use httr to retrieve it + x <- tryCatch({ + curl::curl_parse_url(x) + httr2::request(x) + }, error = function(err) { + x + }) + + if (inherits(x, "httr2_request")) { + x <- x |> + modify_request() |> + httr2::req_perform() |> + r_to_py.httr2_response() + } + + md <- convert(x, ...) } else { + + if (!identical(modify_request, identity)) { + cli::cli_warn( + "The {.arg modify_request} argument is not supported when using the CLI interface.", + .frequency = "once", + .frequency_id = "read_as_markdown" + ) + } + # use the markitdown cli API, (much) slower, but easier to isolate check_dots_empty() outfile <- withr::local_tempfile(fileext = ".md") @@ -103,6 +127,17 @@ read_as_markdown <- function(x, ..., canonical = FALSE) { glue::as_glue(md) } +r_to_py.httr2_response <- function(x, convert = FALSE) { + requests <- reticulate::import("requests") + io <- reticulate::import("io") + response <- requests$Response() + response$status_code <- httr2::resp_status(x) + response$url <- httr2::resp_url(x) + response$raw <- io$BytesIO(httr2::resp_body_raw(x)) + response$headers <- httr2::resp_headers(x) + response$encoding <- httr2::resp_encoding(x) + response +} markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) { lines <- text |> stri_split_lines() |> unlist() @@ -286,20 +321,20 @@ markdown_segment_text <- function(text, split_by = c("h1", "h2", "h3", "pre", "p #' @param frame_by_tags character vector of html tag names used to create a #' dataframe of the returned content #' -#' @returns +#' @returns #' Always returns a data frame with the columns: #' - `origin`: the file path or url #' - `hash`: a hash of the text content #' - `text`: the markdown content -#' +#' #' If `split_by_tags` is not `NULL`, then a `tag` column is also included containing #' the corresponding tag for each text chunk. `""` is used for text chunks that #' are not associated with a tag. -#' +#' #' If `frame_by_tags` is not `NULL`, then additional columns are included for each #' tag in `frame_by_tags`. The text chunks are associated with the tags in the #' order they appear in the markdown content. -#' +#' #' @export #' #' @examples @@ -364,9 +399,9 @@ markdown_segment_text <- function(text, split_by = c("h1", "h2", "h3", "pre", "p #' )--") |> #' # inspect #' _[9:10] |> cat(sep = "\n~~~~~~~~~~~\n") -ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) { +ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL, modify_request = identity) { - text <- read_as_markdown(x, ...) + text <- read_as_markdown(x, ..., modify_request = identity) hash <- rlang::hash(text) if (is.null(frame_by_tags) && is.null(split_by_tags)) { @@ -377,18 +412,18 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) { ) return(out) } - + segmented <- markdown_segment( - text, - tags = unique(c(split_by_tags, frame_by_tags)), - trim = TRUE, + text, + tags = unique(c(split_by_tags, frame_by_tags)), + trim = TRUE, omit_empty = TRUE ) frame <- vec_frame_flattened_tree( - segmented, - frame_by_tags %||% character(), - names = "tag", + segmented, + frame_by_tags %||% character(), + names = "tag", leaves = "text" ) @@ -397,8 +432,8 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) { frame[["tag"]] <- NULL } - frame <- frame |> - dplyr::mutate(origin = x, hash = hash) |> + frame <- frame |> + dplyr::mutate(origin = x, hash = hash) |> dplyr::select(origin, hash, text, dplyr::everything()) as_tibble(frame)