Skip to content

Delegate requests to httr2 #20

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 28 additions & 25 deletions R/read-html.R
Original file line number Diff line number Diff line change
Expand Up @@ -336,13 +336,13 @@ ragnar_read_document <- function(x, ...,
#' depth = 1
#' )
ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TRUE, ...,
url_filter = identity) {
url_filter = identity, modify_request = identity) {

rlang::check_dots_empty()

if (!inherits(x, "xml_node")) {
check_string(x)
x <- read_html2(x)
x <- read_html2(x, modify_request = modify_request)
}

depth <- as.integer(depth)
Expand Down Expand Up @@ -384,7 +384,7 @@ ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TR
visited$add(item$url)

links <- tryCatch(
html_find_links(item$url),
html_find_links(item$url, modify_request = modify_request),
error = function(e) {
# if there's an issue finding child links we log it into the `problems` table
# which is included in the output as an attribute.
Expand Down Expand Up @@ -428,10 +428,10 @@ ragnar_find_links <- function(x, depth = 0L, children_only = TRUE, progress = TR
# E.g.,
# for same site only: prefix = url_host(xml_url(x))
# for child links only: prefix = url_normalize_stem(xml_url(x))
html_find_links <- function(x, absolute = TRUE) {
html_find_links <- function(x, absolute = TRUE, modify_request) {

if (!inherits(x, "xml_node")) {
x <- read_html2(x)
x <- read_html2(x, modify_request = modify_request)
}

links <- x |>
Expand Down Expand Up @@ -486,29 +486,32 @@ stri_subset_startswith_fixed <- function(str, pattern, ...) {
}

# workaround for https://github.com/r-lib/xml2/issues/453
read_html2 <- function(url, ...) {
# For some reason curl is both erroring and warning when the URL is invalid or
# returns 404. We don't really want the warnings, so we discard them.
suppressWarnings({
handle <- curl::new_handle(followlocation = TRUE)
# We first try the original URL, if some error occurs we retry with the
# URL encoded version. (If it's different from the original URL.)
conn <- tryCatch(
curl::curl(url, "rb", handle = handle),
error = function(err) {
encoded_url <- utils::URLencode(url)
if (url != encoded_url) {
handle <<- curl::new_handle(followlocation = TRUE)
curl::curl(encoded_url, "rb", handle = handle)
} else {
stop(err)
}
}
)
read_html2 <- function(url, ..., modify_request) {

perform_request <- function(url) {
url |>
httr2::request() |>
modify_request() |>
httr2::req_perform()
}

resp <- tryCatch({
perform_request(url)
}, error = function(err) {
encoded_url <- utils::URLencode(url)
if (url != encoded_url) {
perform_request(url)
} else {
stop(err)
}
})

raw <- httr2::resp_body_raw(resp)
conn <- rawConnection(raw, "rb")
on.exit(tryCatch(close(conn), error = function(e) NULL))

out <- xml2::read_html(conn, ...)
attr(out, "resolved_url") <- curl::handle_data(handle)$url
attr(out, "resolved_url") <- httr2::resp_url(resp)
out
}

Expand Down
69 changes: 52 additions & 17 deletions R/read-markdown.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ init_markitdown <- function(...) {
# ' chat <- ellmer::chat_openai(echo = TRUE)
# ' chat$chat("Describe this image", content_image_file(jpg))
# ' }
read_as_markdown <- function(x, ..., canonical = FALSE) {
read_as_markdown <- function(x, ..., canonical = FALSE, modify_request = identity) {

check_string(x)

Expand All @@ -69,9 +69,33 @@ read_as_markdown <- function(x, ..., canonical = FALSE) {
if (getOption("ragnar.markitdown.use_reticulate", TRUE)) {
# use the Python API, faster, more powerful
convert <- .globals$markitdown$convert %||% init_markitdown()$convert
md <- convert(x, ...)

# if `x` is a url, we use httr to retrieve it
x <- tryCatch({
curl::curl_parse_url(x)
httr2::request(x)
}, error = function(err) {
x
})

if (inherits(x, "httr2_request")) {
x <- x |>
modify_request() |>
httr2::req_perform() |>
r_to_py.httr2_response()
}

md <- convert(x, ...)
} else {

if (!identical(modify_request, identity)) {
cli::cli_warn(
"The {.arg modify_request} argument is not supported when using the CLI interface.",
.frequency = "once",
.frequency_id = "read_as_markdown"
)
}

# use the markitdown cli API, (much) slower, but easier to isolate
check_dots_empty()
outfile <- withr::local_tempfile(fileext = ".md")
Expand Down Expand Up @@ -103,6 +127,17 @@ read_as_markdown <- function(x, ..., canonical = FALSE) {
glue::as_glue(md)
}

r_to_py.httr2_response <- function(x, convert = FALSE) {
requests <- reticulate::import("requests")
io <- reticulate::import("io")
response <- requests$Response()
response$status_code <- httr2::resp_status(x)
response$url <- httr2::resp_url(x)
response$raw <- io$BytesIO(httr2::resp_body_raw(x))
response$headers <- httr2::resp_headers(x)
response$encoding <- httr2::resp_encoding(x)
response
}

markdown_locate_boundaries_bytes_index <- function(text, tags = NULL) {
lines <- text |> stri_split_lines() |> unlist()
Expand Down Expand Up @@ -286,20 +321,20 @@ markdown_segment_text <- function(text, split_by = c("h1", "h2", "h3", "pre", "p
#' @param frame_by_tags character vector of html tag names used to create a
#' dataframe of the returned content
#'
#' @returns
#' @returns
#' Always returns a data frame with the columns:
#' - `origin`: the file path or url
#' - `hash`: a hash of the text content
#' - `text`: the markdown content
#'
#'
#' If `split_by_tags` is not `NULL`, then a `tag` column is also included containing
#' the corresponding tag for each text chunk. `""` is used for text chunks that
#' are not associated with a tag.
#'
#'
#' If `frame_by_tags` is not `NULL`, then additional columns are included for each
#' tag in `frame_by_tags`. The text chunks are associated with the tags in the
#' order they appear in the markdown content.
#'
#'
#' @export
#'
#' @examples
Expand Down Expand Up @@ -364,9 +399,9 @@ markdown_segment_text <- function(text, split_by = c("h1", "h2", "h3", "pre", "p
#' )--") |>
#' # inspect
#' _[9:10] |> cat(sep = "\n~~~~~~~~~~~\n")
ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL, modify_request = identity) {

text <- read_as_markdown(x, ...)
text <- read_as_markdown(x, ..., modify_request = identity)
hash <- rlang::hash(text)

if (is.null(frame_by_tags) && is.null(split_by_tags)) {
Expand All @@ -377,18 +412,18 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
)
return(out)
}

segmented <- markdown_segment(
text,
tags = unique(c(split_by_tags, frame_by_tags)),
trim = TRUE,
text,
tags = unique(c(split_by_tags, frame_by_tags)),
trim = TRUE,
omit_empty = TRUE
)

frame <- vec_frame_flattened_tree(
segmented,
frame_by_tags %||% character(),
names = "tag",
segmented,
frame_by_tags %||% character(),
names = "tag",
leaves = "text"
)

Expand All @@ -397,8 +432,8 @@ ragnar_read <- function(x, ..., split_by_tags = NULL, frame_by_tags = NULL) {
frame[["tag"]] <- NULL
}

frame <- frame |>
dplyr::mutate(origin = x, hash = hash) |>
frame <- frame |>
dplyr::mutate(origin = x, hash = hash) |>
dplyr::select(origin, hash, text, dplyr::everything())

as_tibble(frame)
Expand Down
Loading