# Use pacman to load packages
pacman::p_load(tidyverse, stringr, pander)
# Set working directory to the external drive containing the IRI dataset
volume_dir <- "/Volumes/ADS_235/Academic Dataset External"
base_dir <- ''
data_dir <- paste0(base_dir, "data/")
viz_dir <- paste0(base_dir, "viz/")
dir.create(data_dir, showWarnings = FALSE)
dir.create(viz_dir, showWarnings = FALSE)
panderOptions('big.mark', ',')
Before we begin loading data, let’s start by assessing the data size of each of the summary files.
We use the shell command find . -name "myfile" | xargs wc - l
to recursively find a file with a particular regular expression construction and then perform the line count.
tissue_category <- "factiss"
if (!file.exists(paste0(data_dir, "tissue_wc.RDS"))) {
tissue_wc <- system(paste0("cd '", volume_dir,"';
find . -name '", tissue_category, "_drug*' -o -name '", tissue_category, "_groc*' | xargs wc -l"), intern = TRUE)
saveRDS(tissue_wc, paste0(data_dir, "tissue_wc.RDS"))
}
The table below lists the file paths within the /Volumes/ADS_235/Academic Dataset External
folder and the number of records.
tissue_wc <- readRDS(paste0(data_dir, "tissue_wc.RDS"))
tissue_weekly_files <-
tissue_wc %>%
stringr::str_trim(side = "left") %>%
stringr::str_split_fixed(" ", n = 2) %>%
tibble::as_tibble() %>%
rename(records = V1, file = V2) %>%
dplyr::mutate(records = as.integer(records)) %>%
dplyr::mutate(records_clean = prettyNum(records, big.mark = ",")) %>%
# Use the fact that the last couplet of 4 digits create the proper ordering
dplyr::arrange(as.integer(str_replace(str_sub(file, -9,-1), "_",""))) %>%
dplyr::select(file, records, records_clean)
tissue_weekly_files %>%
select(file, records = records_clean) %>%
pander(justify = c("left","right"))
file | records |
---|---|
./Year1/External/factiss/factiss_drug_1114_1165 | 276,046 |
./Year1/External/factiss/factiss_groc_1114_1165 | 1,861,943 |
./Year2/External/factiss/factiss_drug_1166_1217 | 262,577 |
./Year2/External/factiss/factiss_groc_1166_1217 | 1,816,241 |
./Year3/External/factiss/factiss_drug_1218_1269 | 260,883 |
./Year3/External/factiss/factiss_groc_1218_1269 | 1,919,860 |
./Year4/External/factiss/factiss_drug_1270_1321 | 267,729 |
./Year4/External/factiss/factiss_groc_1270_1321 | 1,871,891 |
./Year5/External/factiss/factiss_drug_1322_1373 | 313,035 |
./Year5/External/factiss/factiss_groc_1322_1373 | 1,941,955 |
./Year6/External/factiss/factiss_drug_1374_1426 | 326,100 |
./Year6/External/factiss/factiss_groc_1374_1426 | 1,959,222 |
./Year7/External/factiss/factiss_drug_1427_1478 | 353,290 |
./Year7/External/factiss/factiss_groc_1427_1478 | 1,853,608 |
./Year8/factiss/factiss_drug_1479_1530 | 395,891 |
./Year8/factiss/factiss_groc_1479_1530 | 1,888,008 |
./Year9/factiss/factiss_drug_1531_1582 | 371,024 |
./Year9/factiss/factiss_groc_1531_1582 | 1,823,431 |
./Year10/factiss/factiss_drug_1583_1634 | 356,381 |
./Year10/factiss/factiss_groc_1583_1634 | 1,702,116 |
./Year11/factiss/factiss_drug_1635_1686 | 359,005 |
./Year11/factiss/factiss_groc_1635_1686 | 1,666,630 |
total | 23,846,866 |
We then do the same process for the carbonated beverage category.
carbbev_category <- "carbbev"
if (!file.exists(paste0(data_dir, "carbbev_wc.RDS"))) {
carbbev_wc <- system(paste0("cd '", volume_dir,"';
find . -name '", carbbev_category, "_drug*' -o -name '", carbbev_category, "_groc*' | xargs wc -l"), intern = TRUE)
saveRDS(carbbev_wc, paste0(data_dir, "carbbev_wc.RDS"))
}
We note the size difference. However, there are different sub-categories within the carbonated beverage category.
carbbev_wc <- readRDS(paste0(data_dir, "carbbev_wc.RDS"))
carbbev_weekly_files <-
carbbev_wc %>%
stringr::str_trim(side = "left") %>%
stringr::str_split_fixed(" ", n = 2) %>%
tibble::as_tibble() %>%
rename(records = V1, file = V2) %>%
dplyr::mutate(records = as.integer(records)) %>%
dplyr::mutate(records_clean = prettyNum(records, big.mark = ",")) %>%
# Use the fact that the last couplet of 4 digits create the proper ordering
dplyr::arrange(as.integer(str_replace(str_sub(file, -9,-1), "_",""))) %>%
dplyr::select(file, records, records_clean)
carbbev_weekly_files %>%
select(file, records = records_clean) %>%
pander(justify = c("left","right"))
file | records |
---|---|
./Year1/External/carbbev/carbbev_drug_1114_1165 | 1,076,917 |
./Year1/External/carbbev/carbbev_groc_1114_1165 | 16,250,083 |
./Year2/External/carbbev/carbbev_drug_1166_1217 | 1,170,728 |
./Year2/External/carbbev/carbbev_groc_1166_1217 | 17,010,319 |
./Year3/External/carbbev/carbbev_drug_1218_1269 | 1,284,562 |
./Year3/External/carbbev/carbbev_groc_1218_1269 | 18,736,690 |
./Year4/External/carbbev/carbbev_drug_1270_1321 | 1,317,722 |
./Year4/External/carbbev/carbbev_groc_1270_1321 | 18,920,681 |
./Year5/External/carbbev/carbbev_drug_1322_1373 | 1,423,878 |
./Year5/External/carbbev/carbbev_groc_1322_1373 | 19,393,965 |
./Year6/External/carbbev/carbbev_drug_1374_1426 | 1,446,004 |
./Year6/External/carbbev/carbbev_groc_1374_1426 | 19,739,033 |
./Year7/External/carbbev/carbbev_drug_1427_1478 | 1,457,445 |
./Year7/External/carbbev/carbbev_groc_1427_1478 | 18,833,115 |
./Year8/carbbev/carbbev_drug_1479_1530 | 1,609,317 |
./Year8/carbbev/carbbev_groc_1479_1530 | 19,274,904 |
./Year9/carbbev/carbbev_drug_1531_1582 | 1,583,182 |
./Year9/carbbev/carbbev_groc_1531_1582 | 19,811,582 |
./Year10/carbbev/carbbev_drug_1583_1634 | 1,546,509 |
./Year10/carbbev/carbbev_groc_1583_1634 | 19,614,571 |
./Year11/carbbev/carbbev_drug_1635_1686 | 1,542,187 |
./Year11/carbbev/carbbev_groc_1635_1686 | 19,241,642 |
total | 222,285,036 |
Next we load the UPC lookup tables. From the documentation’s explanation, we need three as the UPC descriptions were changed between periods.
Each UPC lookup table has the same structure (defined in the documentation), so we load the three and combine and eliminate unnecessary columns.
tissue_upc_year_1_to_6 <-
readxl::read_excel(paste0(volume_dir, "/parsed stub files/prod_tissue.xls")) %>%
dplyr::mutate(iri_year = '1-6')
tissue_upc_year_7 <-
readxl::read_excel(paste0(volume_dir, "/parsed stub files 2007/prod_factiss.xlsx")) %>%
dplyr::mutate(iri_year = '7')
tissue_upc_year_8_to_11 <-
readxl::read_excel(paste0(volume_dir, "/parsed stub files 2008-2011/prod11_factiss.xlsx")) %>%
dplyr::mutate(iri_year = '8-11')
tissue_upc <-
bind_rows(
tissue_upc_year_1_to_6, tissue_upc_year_7, tissue_upc_year_8_to_11
) %>%
rename(
large_category = L1
, small_category = L2
, parent_company = L3
, vendor = L4
, brand = L5
, upc = UPC
) %>%
# These positions are outlined in the documentation
select(1,2,3,4,5,8, 15:22) %>%
setNames(tolower(make.names(names(.)))) %>%
select(iri_year, everything())
Below is a table of the first 20 records.
tissue_upc %>%
select(parent_company, vendor, brand) %>%
distinct() %>%
arrange(parent_company, vendor, brand) %>%
head(20)
carbbev_upc_year_1_to_6 <-
readxl::read_excel(paste0(volume_dir, "/parsed stub files/prod_carbbev.xls")) %>%
dplyr::mutate(iri_year = '1-6')
carbbev_upc_year_7 <-
readxl::read_excel(paste0(volume_dir, "/parsed stub files 2007/prod_carbbev.xlsx")) %>%
dplyr::mutate(iri_year = '7')
carbbev_upc_year_8_to_11 <-
readxl::read_excel(paste0(volume_dir, "/parsed stub files 2008-2011/prod11_carbbev.xlsx")) %>%
dplyr::mutate(iri_year = '8-11')
carbbev_upc <-
bind_rows(
carbbev_upc_year_1_to_6, carbbev_upc_year_7, carbbev_upc_year_8_to_11
) %>%
rename(
large_category = L1
, small_category = L2
, parent_company = L3
, vendor = L4
, brand = L5
, upc = UPC
) %>%
# These positions are outlined in the documentation
select(1,2,3,4,5,8, 15:22) %>%
setNames(tolower(make.names(names(.)))) %>%
select(iri_year, everything())
There are six sub-categories:
carbbev_upc %>%
group_by(small_category) %>%
summarise(`UPC's` = n()) %>%
arrange(desc(`UPC's`)) %>%
pandoc.table(justify = c("left","right"))
small_category | UPC’s |
---|---|
REGULAR SOFT DRINKS | 25,840 |
LOW CALORIE SOFT DRINKS | 7,949 |
SELTZER/TONIC WATER/CLUB SODA | 2,490 |
PLU SOFT DRINKS | 926 |
PLU - ALL BRANDS SODA | 61 |
SUGAR/CALORIE REDUCED SOFT DRINK | 56 |
Even excluding the categories, there are 1,019 company / vender / brand combinations. Below are the first 20:
carbbev_upc %>%
select(parent_company, vendor, brand) %>%
distinct() %>%
arrange(parent_company, vendor, brand) %>%
head(20)
To load the sales information we can write a function that will load each file using read_fwf
(with the column widths found by reviewing the files). We do not use read_table
as this can potentially be unreliable with large datasets (see Hadly’s comment). We also add the year, store type, and ID (row number for reference) as preceding columns.
fn_load_weekly_file <- function(filename) {
year_num <- as.integer(str_match(filename, "Year([:digit:]+)")[, 2])
drug_or_groc <- str_match(filename, "drug|groc")[, 1]
column_widths <- c(7, 5, 3, 3, 6, 6, 6, 9, 5, 2, 2)
column_names <- c('IRI_KEY','WEEK','SY','GE','VEND','ITEM','UNITS','DOLLARS','F','D','PR')
column_types <- cols(
IRI_KEY = col_character(),
WEEK = col_integer(),
SY = col_character(),
GE = col_character(),
VEND = col_character(),
ITEM = col_character(),
UNITS = col_integer(),
DOLLARS = col_double(),
`F` = col_character(),
D = col_integer(),
PR = col_integer()
)
read_fwf(paste0(volume_dir, str_replace(filename, ".", "")),
fwf_widths(column_widths, column_names), col_types = column_types,
skip = 1, progress = FALSE) %>%
mutate(iri_year = year_num, id = row_number(), store_type = drug_or_groc) %>%
select(iri_year, id, store_type, everything())
}
fn_integrity_check <- function(dt) {
records <- dt %>% nrow()
unique_records <-
dt %>%
select(IRI_KEY, WEEK, SY, GE, VEND, ITEM) %>%
distinct() %>%
nrow()
return(data_frame(records_raw = records, unique_records = unique_records))
}
We load each of the store datasets and perform data integrity checks. We show that the number of records loaded is one less than we found with the wc -l
shell command (due to the header) and that the number of records unique on store / week / upc (SY
, GE
, VEND
, ITEM
) is the total number of records loaded.
tissue_files <- tissue_weekly_files$file[-length(tissue_weekly_files$file)]
tissue_files_list <- lapply(tissue_files, fn_load_weekly_file)
tissue_weekly <- bind_rows(tissue_files_list)
lapply(tissue_files_list, fn_integrity_check) %>%
bind_rows() %>%
bind_rows(data_frame(records_raw = sum(.$records_raw), unique_records = sum(.$unique_records))) %>%
bind_cols(tissue_weekly_files) %>%
mutate(
n = n()
, `Pass?` = if_else(file == "total",
if_else(records_raw == (records - n + 1) & unique_records == records_raw, "Yes","No"),
if_else(records_raw == (records - 1) & unique_records == records_raw, "Yes","No"))
) %>%
select(File = file, Records = records_clean, `Records Loaded` = records_raw,
`Unique on Store/Week/UPC` = unique_records, `Pass?`) %>%
pander(split.table = Inf, justify = c('left', 'right','right','right','center'))
File | Records | Records Loaded | Unique on Store/Week/UPC | Pass? |
---|---|---|---|---|
./Year1/External/factiss/factiss_drug_1114_1165 | 276,046 | 276,045 | 276,045 | Yes |
./Year1/External/factiss/factiss_groc_1114_1165 | 1,861,943 | 1,861,942 | 1,861,942 | Yes |
./Year2/External/factiss/factiss_drug_1166_1217 | 262,577 | 262,576 | 262,576 | Yes |
./Year2/External/factiss/factiss_groc_1166_1217 | 1,816,241 | 1,816,240 | 1,816,240 | Yes |
./Year3/External/factiss/factiss_drug_1218_1269 | 260,883 | 260,882 | 260,882 | Yes |
./Year3/External/factiss/factiss_groc_1218_1269 | 1,919,860 | 1,919,859 | 1,919,859 | Yes |
./Year4/External/factiss/factiss_drug_1270_1321 | 267,729 | 267,728 | 267,728 | Yes |
./Year4/External/factiss/factiss_groc_1270_1321 | 1,871,891 | 1,871,890 | 1,871,890 | Yes |
./Year5/External/factiss/factiss_drug_1322_1373 | 313,035 | 313,034 | 313,034 | Yes |
./Year5/External/factiss/factiss_groc_1322_1373 | 1,941,955 | 1,941,954 | 1,941,954 | Yes |
./Year6/External/factiss/factiss_drug_1374_1426 | 326,100 | 326,099 | 326,099 | Yes |
./Year6/External/factiss/factiss_groc_1374_1426 | 1,959,222 | 1,959,221 | 1,959,221 | Yes |
./Year7/External/factiss/factiss_drug_1427_1478 | 353,290 | 353,289 | 353,289 | Yes |
./Year7/External/factiss/factiss_groc_1427_1478 | 1,853,608 | 1,853,607 | 1,853,607 | Yes |
./Year8/factiss/factiss_drug_1479_1530 | 395,891 | 395,890 | 395,890 | Yes |
./Year8/factiss/factiss_groc_1479_1530 | 1,888,008 | 1,888,007 | 1,888,007 | Yes |
./Year9/factiss/factiss_drug_1531_1582 | 371,024 | 371,023 | 371,023 | Yes |
./Year9/factiss/factiss_groc_1531_1582 | 1,823,431 | 1,823,430 | 1,823,430 | Yes |
./Year10/factiss/factiss_drug_1583_1634 | 356,381 | 356,380 | 356,380 | Yes |
./Year10/factiss/factiss_groc_1583_1634 | 1,702,116 | 1,702,115 | 1,702,115 | Yes |
./Year11/factiss/factiss_drug_1635_1686 | 359,005 | 359,004 | 359,004 | Yes |
./Year11/factiss/factiss_groc_1635_1686 | 1,666,630 | 1,666,629 | 1,666,629 | Yes |
total | 23,846,866 | 23,846,844 | 23,846,844 | Yes |
We are going to construct a UPC column from the columns SY, GE, VEND, and ITEM in the sales information table. We can show that all tissue UPC’s in the UPC Lookup Table are 3765 characters long by
table(sapply(tissue_upc$upc, nchar)) %>%
pander()
17 |
---|
3,765 |
We create a UPC from the components and properly name the columns.
tissue_weekly2 <-
tissue_weekly %>%
mutate(
upc = paste(str_pad(SY, width = 2, "left", "0"), str_pad(GE, width = 2, "left", "0"),
str_pad(VEND, width = 5, "left", "0"), str_pad(ITEM, width = 5, "left", "0"), sep = "-")
, avg_price = DOLLARS / UNITS
) %>%
rename(feature = `F`, display = D, price_reduction = PR) %>%
setNames(tolower(make.names(names(.)))) %>%
select(iri_year, iri_key, week, upc, units, dollars, avg_price, feature, display, price_reduction)
Now, let’s verify again that this dataset is indeed at the store (iri_key) - week - UPC level of detail, with the new UPC column.
data_frame(
`Record Count` = tissue_weekly2 %>% nrow()
, `Store - Week - UPC Combinations` = tissue_weekly2 %>% select(iri_key, week, upc) %>% distinct() %>% nrow()
) %>%
pander()
Record Count | Store - Week - UPC Combinations |
---|---|
23,846,844 | 23,846,844 |
With that assurance, let’s pull in the details about each UPC. We do this for each year window because the documentation indicated that UPC information (potentially including company) changed between the time windows. We then do a check that no records were dropped in the inner join
.
tissue_weekly_1_to_6 <-
tissue_weekly2 %>%
filter(iri_year <= 6) %>%
inner_join(
tissue_upc %>%
filter(iri_year == "1-6") %>%
select(-iri_year)
, by = c("upc" = 'upc')
)
tissue_weekly_7 <-
tissue_weekly2 %>%
filter(iri_year == 7) %>%
inner_join(
tissue_upc %>%
filter(iri_year == "7") %>%
select(-iri_year)
, by = c("upc" = 'upc')
)
tissue_weekly_8_to_11 <-
tissue_weekly2 %>%
filter(iri_year >= 8) %>%
inner_join(
tissue_upc %>%
filter(iri_year == "8-11") %>%
select(-iri_year)
, by = c("upc" = 'upc')
)
data_frame(
Years = c("1-6", "7", "8-11")
, `Record Count in Weekly Data` = sapply(list(filter(tissue_weekly2, iri_year <= 6),
filter(tissue_weekly2, iri_year == 7), filter(tissue_weekly2, iri_year >= 8)), nrow)
, `Record Count in Joined Data` = sapply(list(tissue_weekly_1_to_6, tissue_weekly_7, tissue_weekly_8_to_11), nrow)
) %>%
pander()
Years | Record Count in Weekly Data | Record Count in Joined Data |
---|---|---|
1-6 | 13,077,470 | 13,077,470 |
7 | 2,206,896 | 2,206,896 |
8-11 | 8,562,478 | 8,562,478 |
This passes our data integrity check, so we’ll combine the 3 time windows datasets.
tissue_final <- bind_rows(tissue_weekly_1_to_6, tissue_weekly_7, tissue_weekly_8_to_11)
tissue_final %>%
head(20)