diff --git a/R/read-html.R b/R/read-html.R
index e08b11a..6e6d97c 100644
--- a/R/read-html.R
+++ b/R/read-html.R
@@ -336,13 +336,13 @@ ragnar_read_document <- function(x, ...,
#' depth = 1
#' )
ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TRUE, ...,
- url_filter = identity) {
+ url_filter = identity, modify_request = identity) {
rlang::check_dots_empty()
if (!inherits(x, "xml_node")) {
check_string(x)
- x <- read_html2(x)
+ x <- read_html2(x, modify_request = modify_request)
}
depth <- as.integer(depth)
@@ -384,7 +384,7 @@ ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TR
visited$add(item$url)
links <- tryCatch(
- html_find_links(item$url),
+ html_find_links(item$url, modify_request = modify_request),
error = function(e) {
# if there's an issue finding child links we log it into the `problems` table
# which is included in the output as an attribute.
@@ -428,10 +428,10 @@ ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TR
# E.g.,
# for same site only: prefix = url_host(xml_url(x))
# for child links only: prefix = url_normalize_stem(xml_url(x))
-html_find_links <- function(x, absolute = TRUE) {
+html_find_links <- function(x, absolute = TRUE, modify_request) {
if (!inherits(x, "xml_node")) {
- x <- read_html2(x)
+ x <- read_html2(x, modify_request = modify_request)
}
links <- x |>
@@ -486,29 +486,32 @@ stri_subset_startswith_fixed <- function(str, pattern, ...) {
}
# workaround for https://github.com/r-lib/xml2/issues/453
-read_html2 <- function(url, ...) {
- # For some reason curl is both erroring and warning when the URL is invalid or
- # returns 404. We don't really want the warnings, so we discard them.
- suppressWarnings({
- handle <- curl::new_handle(followlocation = TRUE)
- # We first try the original URL, if some error occurs we retry with the
- # URL encoded version. (If it's different from the original URL.)
- conn <- tryCatch(
- curl::curl(url, "rb", handle = handle),
- error = function(err) {
- encoded_url <- utils::URLencode(url)
- if (url != encoded_url) {
- handle <<- curl::new_handle(followlocation = TRUE)
- curl::curl(encoded_url, "rb", handle = handle)
- } else {
- stop(err)
- }
- }
- )
+read_html2 <- function(url, ..., modify_request) {
+
+ perform_request <- function(url) {
+ url |>
+ httr2::request() |>
+ modify_request() |>
+ httr2::req_perform()
+ }
+
+ resp <- tryCatch({
+ perform_request(url)
+ }, error = function(err) {
+ encoded_url <- utils::URLencode(url)
+ if (url != encoded_url) {
+ perform_request(url)
+ } else {
+ stop(err)
+ }
})
+
+ raw <- httr2::resp_body_raw(resp)
+ conn <- rawConnection(raw, "rb")
on.exit(tryCatch(close(conn), error = function(e) NULL))
+
out <- xml2::read_html(conn, ...)
- attr(out, "resolved_url") <- curl::handle_data(handle)$url
+ attr(out, "resolved_url") <- httr2::resp_url(resp)
out
}
diff --git a/R/read-markdown.R b/R/read-markdown.R
index 7ca7077..9a7b3b4 100644
--- a/R/read-markdown.R
+++ b/R/read-markdown.R
@@ -51,7 +51,7 @@ init_markitdown <- function(...) {
# ' chat <- ellmer::chat_openai(echo = TRUE)
# ' chat$chat("Describe this image", content_image_file(jpg))
# ' }
-read_as_markdown <- function(x, ..., canonical = FALSE) {
+read_as_markdown <- function(x, ..., canonical = FALSE, modify_request = identity) {
check_string(x)
@@ -69,9 +69,33 @@ read_as_markdown <- function(x, ..., canonical = FALSE) {
if (getOption("ragnar.markitdown.use_reticulate", TRUE)) {
# use the Python API, faster, more powerful
convert <- .globals$markitdown$convert %||% init_markitdown()$convert
- md <- convert(x, ...)
+ # if `x` is a url, we use httr to retrieve it
+ x <- tryCatch({
+ curl::curl_parse_url(x)
+ httr2::request(x)
+ }, error = function(err) {
+ x
+ })
+
+ if (inherits(x, "httr2_request")) {
+ x <- x |>
+ modify_request() |>
+ httr2::req_perform() |>
+ r_to_py.httr2_response()
+ }
+
+ md <- convert(x, ...)
} else {
+
+ if (!identical(modify_request, identity)) {
+ cli::cli_warn(
+ "The {.arg modify_request} argument is not supported when using the CLI interface.",
+ .frequency = "once",
+ .frequency_id = "read_as_markdown"
+ )
+ }
+
# use the markitdown cli API, (much) slower, but easier to isolate
check_dots_empty()
outfile <- withr::local_tempfile(fileext = ".md")
@@ -103,6 +127,17 @@ read_as_markdown <- function(x, ..., canonical = FALSE) {
glue::as_glue(md)
}
+r_to_py.httr2_response <- function(x, convert = FALSE) {
+ requests <- reticulate::import("requests")
+ io <- reticulate::import("io")
+ response <- requests$Response()
+ response$status_code <- httr2::resp_status(x)
+ response$url <- httr2::resp_url(x)
+ response$raw <- io$BytesIO(httr2::resp_body_raw(x))
+ response$headers <- httr2::resp_headers(x)
+ response$encoding <- httr2::resp_encoding(x)
+ response
+}
markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
lines <- text |> stri_split_lines() |> unlist()
@@ -286,20 +321,20 @@ markdown_segment_text <- function(text, split_by = c("h1", "h2", "h3", "pre", "p
#' @param frame_by_tags character vector of html tag names used to create a
#' dataframe of the returned content
#'
-#' @returns
+#' @returns
#' Always returns a data frame with the columns:
#' - `origin`: the file path or url
#' - `hash`: a hash of the text content
#' - `text`: the markdown content
-#'
+#'
#' If `split_by_tags` is not `NULL`, then a `tag` column is also included containing
#' the corresponding tag for each text chunk. `""` is used for text chunks that
#' are not associated with a tag.
-#'
+#'
#' If `frame_by_tags` is not `NULL`, then additional columns are included for each
#' tag in `frame_by_tags`. The text chunks are associated with the tags in the
#' order they appear in the markdown content.
-#'
+#'
#' @export
#'
#' @examples
@@ -364,9 +399,9 @@ markdown_segment_text <- function(text, split_by = c("h1", "h2", "h3", "pre", "p
#' )--") |>
#' # inspect
#' _[9:10] |> cat(sep = "\n~~~~~~~~~~~\n")
-ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
+ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL, modify_request = identity) {
- text <- read_as_markdown(x, ...)
+ text <- read_as_markdown(x, ..., modify_request = identity)
hash <- rlang::hash(text)
if (is.null(frame_by_tags) && is.null(split_by_tags)) {
@@ -377,18 +412,18 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
)
return(out)
}
-
+
segmented <- markdown_segment(
- text,
- tags = unique(c(split_by_tags, frame_by_tags)),
- trim = TRUE,
+ text,
+ tags = unique(c(split_by_tags, frame_by_tags)),
+ trim = TRUE,
omit_empty = TRUE
)
frame <- vec_frame_flattened_tree(
- segmented,
- frame_by_tags %||% character(),
- names = "tag",
+ segmented,
+ frame_by_tags %||% character(),
+ names = "tag",
leaves = "text"
)
@@ -397,8 +432,8 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
frame[["tag"]] <- NULL
}
- frame <- frame |>
- dplyr::mutate(origin = x, hash = hash) |>
+ frame <- frame |>
+ dplyr::mutate(origin = x, hash = hash) |>
dplyr::select(origin, hash, text, dplyr::everything())
as_tibble(frame)