Data Wrangling Examples

#' Exercises
library("insuranceData")


library(babynames)
set.seed(7481)

#'
#' ## Exercise - Wide to long
#' 
n <- 3L
dw <- data.frame(
    cid = seq_len(n),
    name = sample(sample(babynames$name, 3), n),
    age = sample.int(100, n, TRUE),
    revenue.2016 = round(rnorm(n, 500, 200)),
    revenue.2017 = round(rnorm(n, 600, 200)),
    revenue.2018 = round(rnorm(n, 700, 200)), stringsAsFactors = FALSE )


#' ## Datasets
#' ### Third party dataset
#' source: http://www.businessandeconomics.mq.edu.au/our_departments/Applied_Finance_and_Actuarial_Studies/acst_docs/glms_for_insurance_data/data/third_party_claims.txt
#' Third party insurance is a compulsory insurance for vehicle owners
#' in Australia. It insures vehicle owners against injury  caused to
#' other drivers, passengers or pedestrians, as a result of an accident.
#' 
#' This data set records the number of third party claims in a
#' twelve-month period between 1984-1986 in each of 176 geographical
#' areas (local government areas) in New South Wales, Australia.
#' 
#' Variables:
#'  lga         local government area
#'  sd          statistical division (1, ..., 13)
#'  claims      number of third party claims
#'  accidents   number of accidents
#'  ki          number killed or injured
#'  population  population size
#'  pop_density population density (measure of the number of people per unit area
#'      e.g., percentage of a population living in an urban area)


#'
#' ## Exercise - `Thirdparty` data set - (base R)
#' 
#' In this first exercise we deal with dirty data.
#' In the `Thirdparty` data set most information is stored in the row names
#' delimited by ";".
data(Thirdparty)
head(Thirdparty)
##                                         lga.sd.claims.accidents.ki.population.pop_density
## ASHFIELD;1;1103;2304;920;124850;0                                                  499001
## AUBURN;1;1939;2660;1465;143500;0                                                   148379
## BANKSTOWN;1;4339;7381;3864;470700;0                                                205407
## BAULKHAMHILLS;1;1491;3217;1554;311300;0                                             25879
## BLACKTOWN;1;3801;6655;4175;584900;0                                                 81222
## BOTANY;1;387;2013;854;106350;0                                                     178143
#' ### (1) Transform the data into a format as commonly stored in a data.frame.

#' ### (2) What do you think went wrong when creating the Thirdparty data set?

#' ### (3) What are the regions with the highest population density (top 10)?

#' ### (4) Which region has the most claims per inhabitant?

#' ### (5) In which statistical division did occur the most claims / accidents.

#' ### (6) @FS Show that it can be useful to use aggregate with simplify = FALSE.


#'
#' ## Exercise - Customer data
#' 
#' In this exercise we generate some data which we will use for
#' the following examples.
#' 
#' ### Data generation
library(babynames)

set.seed(1)
n <- 100
n_seller <- 5L
year <- sample(2000:2019, n, TRUE)
month <- sample(1:12, n, TRUE)
day <- sample(1:28, n, TRUE)
hour <- sample(1:24, n, TRUE)
minute <- sample(1:60, n, TRUE)
sec <- sample(1:60, n, TRUE)
date <- sprintf("%i-%i-%i %i:%i:%i CET", year, month, day, hour, minute, sec)
name <- sample(babynames$name, round(n / 3))
car_id <- sample(seq_len(nrow(mtcars)), n, TRUE)
cid <- sample(seq_along(name))
birth_year <- sample(seq(2019 - 60, 1994 - 18), length(cid), TRUE)
birth_day <- sample(1:28, length(cid), TRUE)
birth_month <- sample(1:12, length(cid), TRUE)
birthday <- sprintf("%i-%i-%i", birth_year, birth_month, birth_day)


## cid ... customer id
## sid ... seller id
cars <- cbind(mtcars, type = rownames(mtcars), car_id = seq_len(nrow(mtcars)),
    catalog_price = round(runif(nrow(mtcars), 20000, 60000), 2))
rownames(cars) <- NULL
purchases <- data.frame(cid = sample(cid, n, replace = TRUE), car_id = car_id, 
    sid = sample.int(n_seller, n, TRUE), date = date, 
    price_paid = round(runif(n, 0.8, 1) * cars$catalog_price[car_id], 2), 
    stringsAsFactors = FALSE)
customer <- data.frame(cid = cid, name = name, birthday = birthday,
    stringsAsFactors = FALSE)
seller <- data.frame(sid = seq_len(n_seller), name = sample(babynames$name, n_seller), 
    site = sample(c("Vienna", "Berlin"), n_seller, TRUE), stringsAsFactors = FALSE)


#' The data.frames `purchases`, `customer` and `cars` contain information about
#' car purchases. In a way, data is often stored in databaes.
head(cars, 2)
##   mpg cyl disp  hp drat    wt  qsec vs am gear carb          type car_id
## 1  21   6  160 110  3.9 2.620 16.46  0  1    4    4     Mazda RX4      1
## 2  21   6  160 110  3.9 2.875 17.02  0  1    4    4 Mazda RX4 Wag      2
##   catalog_price
## 1      20394.32
## 2      47445.24
head(purchases, 2)
##   cid car_id sid                    date price_paid
## 1   6      2   5 2003-12-21 23:41:32 CET   43759.05
## 2   5     15   2    2006-4-5 4:57:47 CET   25040.39
head(customer, 2)
##   cid    name   birthday
## 1   2 Toribio 1966-11-22
## 2   9 Zackeri  1974-9-13
head(seller, 2)
##   sid      name   site
## 1   1   Malyssa Vienna
## 2   2 Khristine Vienna
#' ### (1) Which `site` sold the most cars?

#' ### (2) Is in name a unique identifier for the seller?

#' ### (3) Use table to create a cross-table year times seller.
#' (Hint: The variable year has to be first created)?

#' ### (4) For which `site` the 
#' `average discount rate` = (`catalog_price` - `price_paid`) / `catalog_price`
#' is the highest.