tidyverse · dfalbel · Apr 9, 2025
diff --git a/R/read-html.R b/R/read-html.R
@@ -336,13 +336,13 @@ ragnar_read_document <- function(x, ...,
 #'   depth = 1
 #' )
 ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TRUE, ...,
-                              url_filter = identity) {
+                              url_filter = identity, modify_request = identity) {
 
   rlang::check_dots_empty()
 
   if (!inherits(x, "xml_node")) {
     check_string(x)
-    x <- read_html2(x)
+    x <- read_html2(x, modify_request = modify_request)
   }
 
   depth <- as.integer(depth)
@@ -384,7 +384,7 @@ ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TR
       visited$add(item$url)
 
       links <- tryCatch(
-        html_find_links(item$url),
+        html_find_links(item$url, modify_request = modify_request),
         error = function(e) {
           # if there's an issue finding child links we log it into the `problems` table
           # which is included in the output as an attribute.
@@ -428,10 +428,10 @@ ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TR
 # E.g.,
 # for same site only: prefix = url_host(xml_url(x))
 # for child links only: prefix = url_normalize_stem(xml_url(x))
-html_find_links <- function(x, absolute = TRUE) {
+html_find_links <- function(x, absolute = TRUE, modify_request) {
 
   if (!inherits(x, "xml_node")) {
-    x <- read_html2(x)
+    x <- read_html2(x, modify_request = modify_request)
   }
 
   links <- x |>
@@ -486,29 +486,32 @@ stri_subset_startswith_fixed <- function(str, pattern, ...) {
 }
 
 # workaround for https://github.com/r-lib/xml2/issues/453
-read_html2 <- function(url, ...) {
-  # For some reason curl is both erroring and warning when the URL is invalid or
-  # returns 404. We don't really want the warnings, so we discard them.
-  suppressWarnings({
-    handle <- curl::new_handle(followlocation = TRUE)
-    # We first try the original URL, if some error occurs we retry with the
-    # URL encoded version. (If it's different from the original URL.)
-    conn <- tryCatch(
-      curl::curl(url, "rb", handle = handle),
-      error = function(err) {
-        encoded_url <- utils::URLencode(url)
-        if (url != encoded_url) {
-          handle <<- curl::new_handle(followlocation = TRUE)
-          curl::curl(encoded_url, "rb", handle = handle)
-        } else {
-          stop(err)
-        }
-      }
-    )
+read_html2 <- function(url, ..., modify_request) {
+
+  perform_request <- function(url) {
+    url |>
+      httr2::request() |>
+      modify_request() |>
+      httr2::req_perform()
+  }
+
+  resp <- tryCatch({
+    perform_request(url)
+  }, error = function(err) {
+    encoded_url <- utils::URLencode(url)
+    if (url != encoded_url) {
+      perform_request(url)
+    } else {
+      stop(err)
+    }
   })
+
+  raw <- httr2::resp_body_raw(resp)
+  conn <- rawConnection(raw, "rb")
   on.exit(tryCatch(close(conn), error = function(e) NULL))
+
   out <- xml2::read_html(conn, ...)
-  attr(out, "resolved_url") <- curl::handle_data(handle)$url
+  attr(out, "resolved_url") <- httr2::resp_url(resp)
   out
 }
 

diff --git a/R/read-markdown.R b/R/read-markdown.R
@@ -51,7 +51,7 @@ init_markitdown <- function(...) {
 # '   chat <- ellmer::chat_openai(echo = TRUE)
 # '   chat$chat("Describe this image", content_image_file(jpg))
 # ' }
-read_as_markdown <- function(x, ..., canonical = FALSE) {
+read_as_markdown <- function(x, ..., canonical = FALSE, modify_request = identity) {
 
   check_string(x)
 
@@ -69,9 +69,33 @@ read_as_markdown <- function(x, ..., canonical = FALSE) {
   if (getOption("ragnar.markitdown.use_reticulate", TRUE)) {
     # use the Python API, faster, more powerful
     convert <- .globals$markitdown$convert %||% init_markitdown()$convert
-    md <- convert(x, ...)
 
+    # if `x` is a url, we use httr to retrieve it
+    x <- tryCatch({
+      curl::curl_parse_url(x)
+      httr2::request(x)
+    }, error = function(err) {
+      x
+    })
+
+    if (inherits(x, "httr2_request")) {
+      x <- x |>
+        modify_request() |>
+        httr2::req_perform() |>
+        r_to_py.httr2_response()
+    }
+
+    md <- convert(x, ...)
   } else {
+
+    if (!identical(modify_request, identity)) {
+      cli::cli_warn(
+        "The {.arg modify_request} argument is not supported when using the CLI interface.",
+        .frequency = "once",
+        .frequency_id = "read_as_markdown"
+      )
+    }
+
     # use the markitdown cli API, (much) slower, but easier to isolate
     check_dots_empty()
     outfile <- withr::local_tempfile(fileext = ".md")
@@ -103,6 +127,17 @@ read_as_markdown <- function(x, ..., canonical = FALSE) {
   glue::as_glue(md)
 }
 
+r_to_py.httr2_response <- function(x, convert = FALSE) {
+  requests <- reticulate::import("requests")
+  io <- reticulate::import("io")
+  response <- requests$Response()
+  response$status_code <- httr2::resp_status(x)
+  response$url <- httr2::resp_url(x)
+  response$raw <- io$BytesIO(httr2::resp_body_raw(x))
+  response$headers <- httr2::resp_headers(x)
+  response$encoding <- httr2::resp_encoding(x)
+  response
+}
 
 markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
   lines <- text |> stri_split_lines() |> unlist()
@@ -286,20 +321,20 @@ markdown_segment_text <- function(text, split_by = c("h1", "h2", "h3", "pre", "p
 #' @param frame_by_tags character vector of html tag names used to create a
 #'   dataframe of the returned content
 #'
-#' @returns 
+#' @returns
 #' Always returns a data frame with the columns:
 #'   - `origin`: the file path or url
 #'   - `hash`: a hash of the text content
 #'   - `text`: the markdown content
-#' 
+#'
 #' If `split_by_tags` is not `NULL`, then a `tag` column is also included containing
 #' the corresponding tag for each text chunk. `""` is used for text chunks that
 #' are not associated with a tag.
-#' 
+#'
 #' If `frame_by_tags` is not `NULL`, then additional columns are included for each
 #' tag in `frame_by_tags`. The text chunks are associated with the tags in the
 #' order they appear in the markdown content.
-#' 
+#'
 #' @export
 #'
 #' @examples
@@ -364,9 +399,9 @@ markdown_segment_text <- function(text, split_by = c("h1", "h2", "h3", "pre", "p
 #'     )--") |>
 #'   # inspect
 #'   _[9:10] |> cat(sep = "\n~~~~~~~~~~~\n")
-ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
+ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL, modify_request = identity) {
 
-  text <- read_as_markdown(x, ...)
+  text <- read_as_markdown(x, ..., modify_request = identity)
   hash <- rlang::hash(text)
 
   if (is.null(frame_by_tags) && is.null(split_by_tags)) {
@@ -377,18 +412,18 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
     )
     return(out)
   }
-  
+
   segmented <- markdown_segment(
-    text, 
-    tags = unique(c(split_by_tags, frame_by_tags)), 
-    trim = TRUE, 
+    text,
+    tags = unique(c(split_by_tags, frame_by_tags)),
+    trim = TRUE,
     omit_empty = TRUE
   )
 
   frame <- vec_frame_flattened_tree(
-    segmented, 
-    frame_by_tags %||% character(), 
-    names = "tag", 
+    segmented,
+    frame_by_tags %||% character(),
+    names = "tag",
     leaves = "text"
   )
 
@@ -397,8 +432,8 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
     frame[["tag"]] <- NULL
   }
 
-  frame <- frame |> 
-    dplyr::mutate(origin = x, hash = hash) |> 
+  frame <- frame |>
+    dplyr::mutate(origin = x, hash = hash) |>
     dplyr::select(origin, hash, text, dplyr::everything())
 
   as_tibble(frame)