% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read-markdown.R
\name{read_as_markdown}
\alias{read_as_markdown}
\title{Convert files to Markdown}
\usage{
read_as_markdown(
  path,
  ...,
  origin = path,
  html_extract_selectors = c("main"),
  html_zap_selectors = c("nav"),
  youtube_transcript_formatter = NULL
)
}
\arguments{
\item{path}{[string] A filepath or URL. Accepts a wide variety of file
types, including plain text (markdown), PDF, PowerPoint, Word, Excel,
images (EXIF metadata and OCR), audio (EXIF metadata and speech
transcription), HTML, text-based formats (CSV, JSON, XML), ZIP files
(iterates over contents), YouTube URLs, and EPUBs.}

\item{...}{Passed on to \code{MarkItDown.convert()}.}

\item{origin}{The value to use for the \verb{@origin} property of the returned
\code{MarkdownDocument}.}

\item{html_extract_selectors}{Character vector of CSS selectors. If a match
for a selector is found in the document, only the matched node's contents
are converted. Unmatched extract selectors have no effect.}

\item{html_zap_selectors}{Character vector of CSS selectors. Elements
matching these selectors will be excluded ("zapped") from the HTML document
before conversion to markdown. This is useful for removing navigation bars,
sidebars, headers, footers, or other unwanted elements. By default,
navigation elements (\code{nav}) are excluded.}

\item{youtube_transcript_formatter}{A function used to customize how YouTube
transcript data is converted to markdown. It receives a tibble/data.frame
with columns \code{text} (chr), \code{start} (dbl, seconds), and \code{duration} (dbl,
seconds), along with a \code{"youtube_metadata"} attribute, a named list
containing elements \code{language}, \code{language_code}, \code{video_id}, and
\code{is_generated}. The formatter must return a single string; by default it
behaves like \verb{\\(transcript) paste0(transcript$text, collapse = " ")}.
Provide a custom formatter to include timestamps or links (see examples).}
}
\value{
A \code{\link{MarkdownDocument}} object, which is a single string of Markdown
with an \verb{@origin} property.
}
\description{
Convert a file or URL to Markdown, returning the result as a
\code{MarkdownDocument}.
}
\details{
\subsection{Converting HTML}{

When converting HTML, you might want to omit certain elements, like sidebars,
headers, footers, etc. You can pass CSS selector strings to either extract
nodes or exclude nodes during conversion.

The easiest way to make selectors is to use SelectorGadget:
\url{https://rvest.tidyverse.org/articles/selectorgadget.html}

You can also right-click on a page and select "Inspect Element" in a browser
to better understand an HTML page's structure.

For comprehensive or advanced usage of CSS selectors, consult
\url{https://www.crummy.com/software/BeautifulSoup/bs4/doc/#css-selectors-through-the-css-property}
and \url{https://facelessuser.github.io/soupsieve/selectors/}
}
}
\examples{
\dontrun{
# Convert HTML
md <- read_as_markdown("https://r4ds.hadley.nz/base-R.html")
md

# Print only the first n lines of a Markdown string.
cat_head <- function(md, n = 10) {
  md_lines <- strsplit(md, "\n")[[1L]]
  writeLines(head(md_lines, n))
}
cat_head(md)

## Using selector strings

# By default, this output includes the sidebar and other navigational elements
url <- "https://duckdb.org/code_of_conduct"
read_as_markdown(url) |> cat_head(15)

# To extract just the main content, use a selector
read_as_markdown(url, html_extract_selectors = "#main_content_wrap") |>
  cat_head()

# Alternative approach: zap unwanted nodes
read_as_markdown(
  url,
  html_zap_selectors = c(
    "header",          # name
    ".sidenavigation", # class
    ".searchoverlay",  # class
    "#sidebar"         # ID
  )
) |> cat_head()

# Quarto example
read_as_markdown(
  "https://quarto.org/docs/computations/python.html",
  html_extract_selectors = "main",
  html_zap_selectors = c(
    "#quarto-sidebar",
    "#quarto-margin-sidebar",
    "header",
    "footer",
    "nav"
  )
) |> cat_head()

## Convert PDF
pdf <- file.path(R.home("doc"), "NEWS.pdf")
read_as_markdown(pdf) |> cat_head(15)
## Alternative:
# pdftools::pdf_text(pdf) |> cat_head()

# Convert images to markdown descriptions using OpenAI
jpg <- file.path(R.home("doc"), "html", "logo.jpg")
if (Sys.getenv("OPENAI_API_KEY") != "") {
  # if (xfun::is_macos()) system("brew install ffmpeg")
  reticulate::py_require("openai")
  llm_client <- reticulate::import("openai")$OpenAI()
  read_as_markdown(jpg, llm_client = llm_client, llm_model = "gpt-4.1-mini") |>
    writeLines()
  # # Description:
  # The image displays the logo of the R programming language. It features a
  # large, stylized capital letter "R" in blue, positioned prominently in the
  # center. Surrounding the "R" is a gray oval shape that is open on the right
  # side, creating a dynamic and modern appearance. The R logo is commonly
  # associated with statistical computing, data analysis, and graphical
  # representation in various scientific and professional fields.
}

# Alternative approach to image conversion:
if (
  Sys.getenv("OPENAI_API_KEY") != "" &&
    rlang::is_installed("ellmer") &&
    rlang::is_installed("magick")
) {
  chat <- ellmer::chat_openai(echo = TRUE)
  chat$chat("Describe this image", ellmer::content_image_file(jpg))
}

# YouTube transcripts
## read_as_markdown() fetches transcripts for YouTube links
cat_head(read_as_markdown("https://youtu.be/GELhdezYmP0"))

## The default transcript omits timestamps. Supply a custom
## `youtube_transcript_formatter` to control the output. This example formats
## the transcript with timestamped YouTube links.

# Turn a number of seconds into a compact stamp such as 1h2m3s.
# Vectorized over time; fractional seconds are truncated.
format_youtube_timestamp <- function(time) {
  hours <- time \%/\% 3600
  remainder <- time \%\% 3600
  minutes <- remainder \%/\% 60
  seconds <- floor(remainder \%\% 60)

  stamp <- paste0(hours, "h", minutes, "m", seconds, "s")
  # Drop a leading zero-hour part first; only then can a leading zero-minute
  # part be at the start of the string and get dropped as well.
  sub("^0m", "", sub("^0h", "", stamp))
}

# Format a YouTube transcript as a single string with periodic timestamps.
#   data: transcript data frame with start (seconds) and text columns, plus a
#     youtube_metadata attribute that holds the video_id.
#   min_timestamp_stride_seconds: a timestamp is shown only for the first
#     segment that starts in each window of this many seconds; NULL stamps
#     every segment.
#   links: if TRUE, emit timestamped youtu.be links instead of bracketed
#     timestamps.
format_transcript_with_timestamps <-
  function(data, min_timestamp_stride_seconds = 30, links = FALSE) {
    ts <- format_youtube_timestamp(data$start)
    if (links) {
      video_id <- attr(data, "youtube_metadata")$video_id
      ts <- sprintf("\n<https://youtu.be/\%s?t=\%s>\n", video_id, ts)
    } else {
      ts <- sprintf("\n[\%s] ", ts)
    }

    if (!is.null(min_timestamp_stride_seconds)) {
      # Window index of each segment start; blank out the timestamp of every
      # segment that falls in the same window as the one before it.
      window <- data$start \%/\% min_timestamp_stride_seconds
      show <- c(TRUE, diff(window) != 0)
      ts[!show] <- ""
    }

    # Return one string, as a youtube_transcript_formatter must.
    paste0(ts, data$text, collapse = "\n")
  }


read_as_markdown(
  "https://www.youtube.com/watch?v=GELhdezYmP0",
  youtube_transcript_formatter = \(data) {
    format_transcript_with_timestamps(data, links = TRUE)
  }
) |>
  cat_head(n = 60)
}
}
