[R Course] R nanocourse Perspective: Mapping UK Tweets

R Courses · NLP · Data Visualization

Mapping a sentiment analysis of UK tweets, alongside population and average income, for each UK district.

Loading packages
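The post does not show its setup chunk; the calls below are a sketch of the library() calls implied by the functions used throughout (the package names are inferred, not taken from the original).

library(gsheet)          # gsheet2tbl()
library(dplyr)           # bind_rows(), filter(), left_join(), tibble(), ...
library(tidyr)           # gather()
library(readr)           # read_csv()
library(tools)           # toTitleCase()
library(tidytext)        # unnest_tokens(), stop_words, get_sentiments()
library(rgdal)           # readOGR()
library(sf)              # read_sf(), st_as_sf(), st_intersects(), ...
library(leaflet)         # leaflet(), addPolygons(), colorBin(), ...
library(leaflet.extras)  # addFullscreenControl()
# get_sentiments("afinn") may prompt to download the AFINN lexicon via the textdata package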

Creating income by UK district

## INCOME BY DISTRICT
income_2012 <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=2078260586")
income_2013 <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=1695992236")
income_2014 <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=0")
income_2015 <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=725888174")

income <- bind_rows(income_2012, income_2013, income_2014, income_2015)

names(income)[names(income) == "districts"] <- "lad17nm"

income$lad17nm <- gsub(" UA", "", income$lad17nm)
income$lad17nm <- gsub(" Towns", "", income$lad17nm)
income$lad17nm <- gsub("-", " ", income$lad17nm)
income$lad17nm <- gsub("The ", "", income$lad17nm)
income$lad17nm <- gsub(" City of$", "", income$lad17nm)
income$lad17nm <- gsub("Edinburgh", "City of Edinburgh", income$lad17nm)
income$lad17nm <- gsub("Down", "Newry, Mourne and Down", income$lad17nm)
income$lad17nm <- gsub("Newry and Mourne", "Newry, Mourne and Down", income$lad17nm)
income$lad17nm <- gsub("Rhondda Cynon Taff", "Rhondda Cynon Taf", income$lad17nm)
income$lad17nm <- gsub("South Buckinghamshire", "South Bucks", income$lad17nm)
income$lad17nm <- gsub("Comhairle nan Eilean Siar", "Na h Eileanan Siar", income$lad17nm)

income[which(income[,1]=="Antrim"),1] <- "Antrim and Newtownabbey"
income[which(income[,1]=="Newtownabbey"),1] <- "Antrim and Newtownabbey"
income[which(income[,1]=="Armagh"),1] <- "Armagh City, Banbridge and Craigavon"
income[which(income[,1]=="Banbridge"),1] <- "Armagh City, Banbridge and Craigavon"
income[which(income[,1]=="Craigavon"),1] <- "Armagh City, Banbridge and Craigavon"
income[which(income[,1]=="Derry"),1] <- "Derry City and Strabane"
income[which(income[,1]=="Strabane"),1] <- "Derry City and Strabane"
income[which(income[,1]=="Fermanagh"),1] <- "Fermanagh and Omagh"
income[which(income[,1]=="Omagh"),1] <- "Fermanagh and Omagh"
income[which(income[,1]=="Lisburn"),1] <- "Lisburn and Castlereagh"
income[which(income[,1]=="Castlereagh"),1] <- "Lisburn and Castlereagh"
income[which(income[,1]=="Ards"),1] <- "Ards and North Down"
income[which(income[,1]=="North Down"),1] <- "Ards and North Down"

income <- aggregate(income_tot_mean ~ lad17nm + year, data = income, mean)
income$income_tot_mean <- round(income$income_tot_mean, digits = 0)

income$lad17nm <- toTitleCase(income$lad17nm)

Creating population by UK district

## POPULATION BY DISTRICT

population <- gsheet2tbl("https://docs.google.com/spreadsheets/d/1HosE_dBedt3Idho0f1jr-MQBt7PS_mpge7c1GIip0Yg/edit#gid=1762880462")

population <- population[, -c(14:22)]

population$`Area2(sqkm)` <- NULL
population$Geography <- NULL
population$Code <- NULL

population <- gather(population, "year", "population", 2:10)

names(population)[names(population) == "Name"] <- "lad17nm"

population$year <- as.numeric(population$year)

population <- filter(population, year == 2012 | year == 2013 | year == 2014 | year == 2015)

population$lad17nm <- gsub(", City of", "", population$lad17nm)
population$lad17nm <- gsub(", County of", "", population$lad17nm)
population$lad17nm <- gsub("-", " ", population$lad17nm)
population$lad17nm <- gsub("'", "", population$lad17nm)
population$lad17nm <- gsub("St ", "St. ", population$lad17nm)
population$lad17nm <- gsub("Folkstone and Hythe", "Shepway", population$lad17nm)

population$lad17nm <- toTitleCase(population$lad17nm)

Merging income and population by UK district

## INCOME AND POPULATION 

district_incpop <- left_join(population, income, by = c("lad17nm", "year"))
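A quick sanity check, not in the original post, to list district names that still fail to match between the two tables (any rows returned here point to names that need further cleaning):

# Districts present in one table but not the other (sketch)
anti_join(population, income, by = c("lad17nm", "year")) %>% distinct(lad17nm)
anti_join(income, population, by = c("lad17nm", "year")) %>% distinct(lad17nm)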

Mining UK Tweets

## LOADING TWEETS
tweets.overall <- read_csv("./data/tweets.overall.csv")

## KEEPING TWEETS OF UK
tweets.overall.LatLong <- filter(tweets.overall, latitude >= 49.771686 & latitude <= 60.862568)
tweets.overall.LatLong <- filter(tweets.overall.LatLong, longitude >= -12.524414 & longitude <= 1.785278)

## TWEETS MINING
tweets <- tweets.overall.LatLong

tweets.overall.LatLong$year <- substr(tweets.overall.LatLong$date, 1, 4)

tweets.LatLong <- tibble(line = 1:nrow(tweets.overall.LatLong), 
                         year = tweets.overall.LatLong$year, 
                         latitude = tweets.overall.LatLong$latitude, 
                         longitude = tweets.overall.LatLong$longitude)

# Cleaning
text <- tweets$content

# remove retweet entities
text <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", text)
# remove at people
text <- gsub("@\\w+", "", text)
# remove punctuation
# text <- gsub("[[:punct:]]", "", text)
# remove numbers
text <- gsub("[[:digit:]]", "", text)
# remove links (URLs)
text <- gsub("http\\S+", "", text)
# remove all pictwitter
text <- gsub("pictwitter\\w+ *", "", text)
# remove non-ASCII characters (emoji, non-Latin scripts, etc.)
text <- iconv(text, "latin1", "ASCII", sub="")

# Tibble format
text_df <- tibble(line = 1:length(text), text = text)

# Tokenization 
tidy_tweets <- text_df %>% 
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Join tweets to longitude and latitude by line number
tidy_tweets_LatLong <- left_join(tidy_tweets, tweets.LatLong, by = "line")

# Words that contribute to positive and negative sentiment
AFINN <- get_sentiments("afinn")

afinn_word_LatLong <- tidy_tweets_LatLong %>%
  inner_join(AFINN, by = "word")

afinn_word_LatLong_Tot <- aggregate(value ~ line + year + latitude + longitude, afinn_word_LatLong, sum)

afinn_word_LatLong_Tot$sentiment <- ifelse(afinn_word_LatLong_Tot$value > 0, "positive", 
                                           ifelse(afinn_word_LatLong_Tot$value == 0, "neutral", "negative"))

afinn_word_LatLong_Tot_PN <- filter(afinn_word_LatLong_Tot, sentiment != "neutral")
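As a toy illustration of the scoring step (the tweet text below is made up, not from the dataset), a single tweet is tokenized, matched against AFINN, and its word scores summed into one value per tweet:

# Toy example: score one made-up tweet with the AFINN lexicon
toy <- tibble(line = 1, text = "What a lovely day, shame the train was so bad")
toy %>%
  unnest_tokens(word, text) %>%
  inner_join(AFINN, by = "word") %>%
  summarise(value = sum(value))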

Mapping UK Tweets by UK district

## SHAPEFILE MAP DISTRICT UK

district <- readOGR(dsn = "./shapefiles/ladUK", 
                    layer = 'Local_Authority_Districts_December_2017_Full_Clipped_Boundaries_in_United_Kingdom_WGS84')

district@data$lad17nm <- gsub(", City of", "", district@data$lad17nm)
district@data$lad17nm <- gsub(", County of", "", district@data$lad17nm)
district@data$lad17nm <- gsub("-", " ", district@data$lad17nm)
district@data$lad17nm <- gsub("'", "", district@data$lad17nm)
district@data$lad17nm <- gsub("St ", "St. ", district@data$lad17nm)

district@data$lad17nm <- toTitleCase(district@data$lad17nm)
map <- read_sf("./shapefiles/ladUK/Local_Authority_Districts_December_2017_Full_Clipped_Boundaries_in_United_Kingdom_WGS84.shp")

map$lad17nm <- gsub(", City of", "", map$lad17nm)
map$lad17nm <- gsub(", County of", "", map$lad17nm)
map$lad17nm <- gsub("-", " ", map$lad17nm)
map$lad17nm <- gsub("'", "", map$lad17nm)
map$lad17nm <- gsub("St ", "St. ", map$lad17nm)

map$lad17nm <- toTitleCase(map$lad17nm)

pnts <- afinn_word_LatLong_Tot_PN

pnts_sf <- st_as_sf(pnts, coords = c('longitude', 'latitude'), crs = st_crs(map))

pnts <- pnts_sf %>% mutate(
  intersection = as.integer(st_intersects(geometry, map)), 
  lad17nm = if_else(is.na(intersection), '', map$lad17nm[intersection])
) 

pnts <- na.omit(pnts)

lll <- select(afinn_word_LatLong_Tot_PN, line, year, longitude, latitude)

pnts <- left_join(pnts, lll, by = c("line", "year"))

pnts$year <- as.numeric(pnts$year)

tweets_sentiment_income_pop_latlong <- left_join(pnts, district_incpop, by = c("lad17nm", "year"))

tweets_sentiment_income_pop_latlong$intersection <- NULL
# drop the sf geometry column so the object becomes a plain data frame
tweets_sentiment_income_pop_latlong <- st_drop_geometry(tweets_sentiment_income_pop_latlong)

districtID <- select(district@data, objectid, lad17cd, lad17nm)

tweets_sentiment_income_pop_latlong <- left_join(tweets_sentiment_income_pop_latlong, districtID, by = "lad17nm")

tweets_sentiment_income_pop_latlong_final <- select(tweets_sentiment_income_pop_latlong, 
                                                    line, sentiment, value, longitude, 
                                                    latitude, year, lad17nm, lad17cd, 
                                                    objectid, population, income_tot_mean)

names(tweets_sentiment_income_pop_latlong_final)[names(tweets_sentiment_income_pop_latlong_final) == "objectid"] <- "lad17id"

# write.csv(tweets_sentiment_income_pop_latlong_final, "tweets_sentiment_income_pop_latlong_final.csv")
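The final table also lends itself to district-level summaries; the chunk below is an assumed extension (not in the original post) that computes, for each district and year, the share of positive tweets next to average income and population:

## SENTIMENT SHARE PER DISTRICT (sketch, not in the original post)
tweets_sentiment_income_pop_latlong_final %>%
  group_by(lad17nm, year) %>%
  summarise(n_tweets   = n(),
            share_pos  = mean(sentiment == "positive"),
            income     = first(income_tot_mean),
            population = first(population),
            .groups    = "drop")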
## MAP DISTRICT UK

# keep a single year (2015 here) so each polygon keeps exactly one attribute row
district@data <- left_join(district@data, filter(district_incpop, year == 2015), by = "lad17nm")

bins <- c(20000, 30000, 40000, 50000, 60000, 70000, 100000, 150000, 200000)
pal <- colorBin("YlOrRd", domain = district@data$income_tot_mean, bins = bins)

labels <- sprintf(
  "<strong>%s</strong><br/>%g £",
  district@data$lad17nm, district@data$income_tot_mean
) %>% lapply(htmltools::HTML)

palTweets <- colorFactor(c("limegreen", "red"), domain = c("positive", "negative"))

leaflet() %>%
  setView(-0.118092, 51.509865, 4) %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addFullscreenControl() %>%
  addPolygons(data = district,
              fillColor = ~pal(district@data$income_tot_mean),
              weight = 2,
              opacity = 1,
              color = "white",
              dashArray = "2",
              fillOpacity = 0.7,
              highlight = highlightOptions(
                weight = 3,
                color = "#666",
                dashArray = "",
                fillOpacity = 0.7,
                bringToFront = FALSE),
              label = labels,
              labelOptions = labelOptions(
                style = list("font-weight" = "normal", padding = "3px 8px"),
                textsize = "15px",
                direction = "auto")) %>% 
  addLegend(pal = pal, 
            values = district@data$income_tot_mean, 
            opacity = 0.7, 
            title = "Average Total Income",
            position = "bottomright") %>%
  addCircleMarkers(data = afinn_word_LatLong_Tot_PN, lng = ~longitude, lat = ~latitude,
    radius = 1,
    color = ~palTweets(sentiment),
    stroke = FALSE, 
    fillOpacity = 1
  )

Citation

For attribution, please cite this work as

Warin (2020, April 15). Thierry Warin, PhD: [R Course] R nanocourse Perspective: Mapping UK Tweets. Retrieved from https://warin.ca/posts/rcourse-rnanocourseperspective-mappinguktweets/

BibTeX citation

@misc{warin2020mappinguktweets,
  author = {Warin, Thierry},
  title = {Thierry Warin, PhD: [R Course] R nanocourse Perspective: Mapping UK Tweets},
  url = {https://warin.ca/posts/rcourse-rnanocourseperspective-mappinguktweets/},
  year = {2020}
}