# Libraries ----
library(tidyverse) ## for data manipulation
library(rvest) ## for scraping
library(textclean) ## some cleaning functions
library(progress) ## for tracking progress in console
# Some settings ----
pages <- 1 ## how many pages of the site I want to do
save_location <- "path/to/your/directory/myfile.csv"

# Function 1: Clean an element ----
clean_element <- function(page_html, selector) {
  output <- page_html |>
    html_nodes(selector) |>
    html_text() |>
    replace_html() |> ## strip html markup (if any)
    str_squish() ## remove leading and trailing whitespace
  ifelse(length(output) == 0, NA, output) ## return NA if an element is missing
}

# Function 2: Clean an ad ----
clean_ad <- function(link) {
  safely_read_html <- purrr::safely(read_html) ## safely read HTML
  result <- safely_read_html(link)

  if (!is.null(result$error)) {
    return(NA) ## return NA if there's an error
  }

  page_html <- result$result

  ## the list of items I want in each post
  title <- clean_element(page_html, "h1") ## parse HTML
  description <- clean_element(page_html, ".description-website-container")
  jobtype <- clean_element(page_html, ".attribute:nth-child(4) .value")
  employer <- clean_element(page_html, ".attribute:nth-child(3) .value")
  location <- clean_element(page_html, ".attribute:nth-child(1) .value")
  time <- Sys.time() ## current time

  ## I put the selected post info in a tibble
  tibble(title, description, jobtype, employer, location, time)
}

# Loop 1: Get my links ----
job_list <- list()
link <- "https://www.gumtree.co.za/s-jobs/page-" ## url fragment

for (i in 1:pages) {
  jobs <- read_html(paste0(link, i, "/v1c8p", i)) ## using paste0
  links <- jobs |>
    html_nodes(".related-ad-title") |>
    html_attr("href") ## get links
  job_list[[i]] <- links ## add to list
  Sys.sleep(2) ## rest
}

links <- unlist(job_list) ## make a single list
links <- paste0("https://www.gumtree.co.za", links)
head(links) ## looks good!

# Let's go! ----
total <- length(links) ## track progress
pb <- progress_bar$new(format = "[:bar] :current/:total (:percent)", total = total)

output <- list() ## list to store the results

for (i in 1:total) {
  pb$tick()
  deets <- clean_ad(links[i])
  output[[i]] <- deets ## add to list
  Sys.sleep(2) ## resting
}

## combining all tibbles in the list into a single big tibble
all <- output[!is.na(output)] ## remove empty tibbles, if any
all <- bind_rows(all)

## Fab!
glimpse(all)
write_csv(all, save_location)
Goal
- You want to grab structured information from a website and put it into a data-set for analysis
- For example, job attributes from a job-posting site, or product prices over time, or whatever
Context
- A few years ago, when I was a Research Associate on a labor market project1, we needed to compile large lists of entry-level jobs across a number of websites. This needed to be done each day, and was rather tedious, so we developed some basic scripts to help us out. In the end it was quite an involved process with webscraping scripts + a lot of manual additions, filtering, and curation
- Afterwards, I wrote a post on webscraping for “Coder’s Corner”, a set of resources maintained by the Centre for the Study of African Economies (CSAE) at Oxford
- Revisiting it, I was happy to see it still works!
- The script is an example of how one might go about automating the process of visiting a job-posting website, extracting key information from each job ad, and compiling all the information into a data-set
Stuff to know
- This just illustrates the kind of thing you need to think through when trying to get stuff off of a website: look at the page, work out its structure and the elements you’re interested in, and code up some logic to extract them and get the data into the form you want. Each site is different, and even the same site may change things over time, so be ready for that
- If you want to do something more sophisticated, like clicking on things or interacting with dynamic content, you need a web driver. I see there’s a helpful post on the CSAE site about it. Often this is what you’d want to do: click on things to reveal more detail, for instance. A rough sketch is included after this list
- How do you find the right CSS selectors to target the right elements in the HTML? You can use your browser’s inspect tool or something like https://selectorgadget.com/ (“point and click CSS selectors”). A quick way to sanity-check a selector is shown below
- Check the site’s robots.txt file to see if it’s OK to scrape it: https://github.com/ropensci/robotstxt. Don’t overwhelm the site with requests; pace your script. A quick robots.txt check is sketched below too
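
For the web-driver route, one option is RSelenium. This is a minimal sketch, assuming Firefox and the RSelenium package are installed; the URL and the `.load-more` selector are placeholders you would replace with whatever the site you are scraping actually uses:

```r
library(RSelenium)
library(rvest)

## start a Selenium server and a browser session
driver <- rsDriver(browser = "firefox", port = 4545L)
remote <- driver$client

remote$navigate("https://www.example.com/jobs") ## placeholder URL

## click something to reveal more content (placeholder selector)
button <- remote$findElement(using = "css selector", ".load-more")
button$clickElement()
Sys.sleep(2) ## give the page a moment to update

## hand the rendered HTML back to rvest and carry on as before
page_html <- read_html(remote$getPageSource()[[1]])

remote$close()
driver$server$stop()
```

Once you have `page_html`, something like the `clean_element()` function above works on it unchanged.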
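
To sanity-check a selector before looping over everything, point rvest at a single page and see what comes back. This uses the same listing URL and selector as the script above:

```r
library(rvest)

page <- read_html("https://www.gumtree.co.za/s-jobs/page-1/v1c8p1")

## if the selector is right, this prints a handful of relative links
page |>
  html_nodes(".related-ad-title") |>
  html_attr("href") |>
  head()
```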
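
And a quick robots.txt check with the robotstxt package linked above; a sketch, assuming the package is installed:

```r
library(robotstxt)

## TRUE/FALSE: are we allowed to fetch the jobs listing?
paths_allowed("https://www.gumtree.co.za/s-jobs/")

## or inspect the raw robots.txt file yourself
get_robotstxt("www.gumtree.co.za")
```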