get_latest_revision()
returns metadata about the latest
revision of each
page.
get_page_html()
returns the rendered html for each
page.
get_page_summary()
returns metadata about the latest revision, along
with the page description and a summary extracted from the opening
paragraph.
get_page_related()
returns summaries of 20 related pages for each
page passed.
get_page_talk()
returns structured talk page content for each
title. Be sure to pass the title of the Talk page itself, e.g.
"Talk:Earth" rather than "Earth".
get_page_langlinks()
returns interwiki links for each
title.
Usage
get_latest_revision(title, language = "en")
get_page_html(title, language = "en")
get_page_summary(title, language = "en")
get_page_related(title, language = "en")
get_page_talk(title, language = "en")
get_page_langlinks(title, language = "en")
Examples
# Get language links for a known page on English Wikipedia
get_page_langlinks("Charles Harpur")
#> # A tibble: 2 × 4
#> code name key title
#> <chr> <chr> <chr> <chr>
#> 1 de Deutsch Charles_Harpur Charles Harpur
#> 2 fr français Charles_Harpur Charles Harpur
# Many of these functions return a list of data frames. Tidyr can be useful.
# Get 20 related pages for each German city
cities <- tibble::tribble(
~city,
"Berlin",
"Darmstadt",
) %>%
dplyr::mutate(related = get_page_related(city))
cities
#> # A tibble: 2 × 2
#> city related
#> <chr> <list>
#> 1 Berlin <tibble [20 × 32]>
#> 2 Darmstadt <tibble [20 × 32]>
# Unnest to get one row per related page:
tidyr::unnest(cities, "related")
#> # A tibble: 40 × 33
#> city pageid ns index type title displaytitle namespace_id namespace_text
#> <chr> <int> <int> <int> <chr> <chr> <chr> <int> <chr>
#> 1 Berl… 3.72e3 0 19 stan… Berl… "<span clas… 0 ""
#> 2 Berl… 3.76e3 0 12 stan… Bran… "<span clas… 0 ""
#> 3 Berl… 1.19e4 0 13 stan… Germ… "<span clas… 0 ""
#> 4 Berl… 4.91e4 0 17 stan… Pots… "<span clas… 0 ""
#> 5 Berl… 1.48e5 0 14 stan… Berl… "<span clas… 0 ""
#> 6 Berl… 1.57e5 0 5 stan… Bran… "<span clas… 0 ""
#> 7 Berl… 2.72e5 0 3 stan… Berl… "<span clas… 0 ""
#> 8 Berl… 3.88e5 0 10 stan… Mitte "<span clas… 0 ""
#> 9 Berl… 4.33e5 0 7 stan… Berl… "<span clas… 0 ""
#> 10 Berl… 1.14e6 0 8 stan… Hert… "<span clas… 0 ""
#> # ℹ 30 more rows
#> # ℹ 24 more variables: wikibase_item <chr>, titles_canonical <chr>,
#> # titles_normalized <chr>, titles_display <chr>, thumbnail_source <chr>,
#> # thumbnail_width <int>, thumbnail_height <int>, originalimage_source <chr>,
#> # originalimage_width <int>, originalimage_height <int>, lang <chr>,
#> # dir <chr>, revision <chr>, tid <chr>, timestamp <chr>, description <chr>,
#> # description_source <chr>, coordinates_lat <dbl>, coordinates_lon <dbl>, …
# The functions are vectorised over title and language
# Find all articles about Joanna Baillie, and retrieve summary data for
# the first two.
baillie <- get_page_langlinks("Joanna Baillie") %>%
dplyr::slice(1:2) %>%
dplyr::mutate(get_page_summary(title = title, language = code))
baillie
#> # A tibble: 2 × 30
#> code name key title type displaytitle namespace_id namespace_text
#> <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr>
#> 1 ar العربية جوانا_بيلي جوانا… stan… "<span clas… 0 ""
#> 2 arz مصرى جوانا_بيلى جوانا… stan… "<span clas… 0 ""
#> # ℹ 22 more variables: wikibase_item <chr>, titles_canonical <chr>,
#> # titles_normalized <chr>, titles_display <chr>, pageid <int>,
#> # thumbnail_source <chr>, thumbnail_width <int>, thumbnail_height <int>,
#> # originalimage_source <chr>, originalimage_width <int>,
#> # originalimage_height <int>, lang <chr>, dir <chr>, revision <chr>,
#> # tid <chr>, timestamp <chr>, description <chr>, description_source <chr>,
#> # content_urls_desktop <list>, content_urls_mobile <list>, extract <chr>, …