Read Data

data = read_csv('https://raw.githubusercontent.com/ariannalangwang/Analyze-HR-Data/master/HRDataProject/HRDataset_v13.csv') %>% 
   clean_names()
data

## # A tibble: 310 × 35
##    employee_name      emp_id married_id marital_status_… gender_id emp_status_id
##    <chr>               <dbl>      <dbl>            <dbl>     <dbl>         <dbl>
##  1 Brown, Mia         1.10e9          1                1         0             1
##  2 LaRotonda, William 1.11e9          0                2         1             1
##  3 Steans, Tyrone     1.30e9          0                0         1             1
##  4 Howard, Estelle    1.21e9          1                1         0             1
##  5 Singh, Nan         1.31e9          0                0         0             1
##  6 Smith, Leigh Ann   7.11e8          1                1         0             5
##  7 Bunbury, Jessica   1.50e9          1                1         0             5
##  8 Carter, Michelle   1.40e9          0                0         0             1
##  9 Dietrich, Jenna    1.41e9          0                0         0             1
## 10 Digitale, Alfred   1.31e9          1                1         1             1
## # … with 300 more rows, and 29 more variables: dept_id <dbl>,
## #   perf_score_id <dbl>, from_diversity_job_fair_id <dbl>, pay_rate <dbl>,
## #   termd <dbl>, position_id <dbl>, position <chr>, state <chr>, zip <chr>,
## #   dob <chr>, sex <chr>, marital_desc <chr>, citizen_desc <chr>,
## #   hispanic_latino <chr>, race_desc <chr>, dateof_hire <chr>,
## #   dateof_termination <chr>, term_reason <chr>, employment_status <chr>,
## #   department <chr>, manager_name <chr>, manager_id <dbl>, …

Explore Data

skimr::skim(data) #summary

Table 1: Data summary
Name	data
Number of rows	310
Number of columns	35
_______________________
Column type frequency:
character	19
numeric	16
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
employee_name	0	1.00	8	23	310
position	0	1.00	3	28	31
state	0	1.00	2	2	28
zip	0	1.00	5	5	158
dob	0	1.00	8	8	306
sex	0	1.00	1	1	2
marital_desc	0	1.00	6	9	5
citizen_desc	0	1.00	10	19	3
hispanic_latino	0	1.00	2	3	4
race_desc	0	1.00	5	32	6
dateof_hire	0	1.00	8	10	99
dateof_termination	207	0.33	8	8	93
term_reason	1	1.00	5	32	17
employment_status	0	1.00	6	22	5
department	0	1.00	5	20	6
manager_name	0	1.00	8	18	21
recruitment_source	0	1.00	5	38	23
performance_score	0	1.00	3	17	4
last_performance_review_date	103	0.67	8	9	42

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
emp_id	0	1.00	1.199745e+09	1.8296e+08	602000312.00	1.101024e+09	1.203032e+09	1.378814e+09	1988299991	▁▇▇▁▁
married_id	0	1.00	4.000000e-01	4.9000e-01	0.00	0.000000e+00	0.000000e+00	1.000000e+00	1	▇▁▁▁▅
marital_status_id	0	1.00	8.100000e-01	9.4000e-01	0.00	0.000000e+00	1.000000e+00	1.000000e+00	4	▇▇▂▁▁
gender_id	0	1.00	4.300000e-01	5.0000e-01	0.00	0.000000e+00	0.000000e+00	1.000000e+00	1	▇▁▁▁▆
emp_status_id	0	1.00	2.400000e+00	1.8000e+00	1.00	1.000000e+00	1.000000e+00	5.000000e+00	5	▇▁▁▁▃
dept_id	0	1.00	4.610000e+00	1.0800e+00	1.00	5.000000e+00	5.000000e+00	5.000000e+00	6	▁▂▁▇▁
perf_score_id	0	1.00	2.980000e+00	5.8000e-01	1.00	3.000000e+00	3.000000e+00	3.000000e+00	4	▁▁▁▇▁
from_diversity_job_fair_id	0	1.00	9.000000e-02	2.9000e-01	0.00	0.000000e+00	0.000000e+00	0.000000e+00	1	▇▁▁▁▁
pay_rate	0	1.00	3.128000e+01	1.5380e+01	14.00	2.000000e+01	2.400000e+01	4.531000e+01	80	▇▂▂▂▁
termd	0	1.00	3.300000e-01	4.7000e-01	0.00	0.000000e+00	0.000000e+00	1.000000e+00	1	▇▁▁▁▃
position_id	0	1.00	1.684000e+01	6.2300e+00	1.00	1.800000e+01	1.900000e+01	2.000000e+01	30	▂▁▁▇▁
manager_id	8	0.97	1.458000e+01	8.0900e+00	1.00	1.000000e+01	1.500000e+01	1.900000e+01	39	▅▇▇▁▁
engagement_survey	0	1.00	3.330000e+00	1.2900e+00	1.03	2.080000e+00	3.470000e+00	4.520000e+00	5	▅▅▃▅▇
emp_satisfaction	0	1.00	3.890000e+00	9.1000e-01	1.00	3.000000e+00	4.000000e+00	5.000000e+00	5	▁▁▇▇▇
special_projects_count	0	1.00	1.210000e+00	2.3500e+00	0.00	0.000000e+00	0.000000e+00	0.000000e+00	8	▇▁▁▂▁
days_late_last30	103	0.67	0.000000e+00	0.0000e+00	0.00	0.000000e+00	0.000000e+00	0.000000e+00	0	▁▁▇▁▁

DataExplorer::plot_bar(data)

Pivot Table

rpivotTable::rpivotTable(data,
    height = 1500)

↕↔

employee_name ▾

emp_id ▾

married_id ▾

marital_status_id ▾

gender_id ▾

emp_status_id ▾

dept_id ▾

perf_score_id ▾

from_diversity_job_fair_id ▾

pay_rate ▾

termd ▾

position_id ▾

position ▾

state ▾

zip ▾

dob ▾

sex ▾

marital_desc ▾

citizen_desc ▾

hispanic_latino ▾

race_desc ▾

dateof_hire ▾

dateof_termination ▾

term_reason ▾

employment_status ▾

department ▾

manager_name ▾

manager_id ▾

recruitment_source ▾

performance_score ▾

engagement_survey ▾

emp_satisfaction ▾

special_projects_count ▾

last_performance_review_date ▾

days_late_last30 ▾

Totals	310

Check for duplicates

(nrow(get_dupes(data)) == 0) #duplicates?

## [1] TRUE

Check reduntant information

data %>% distinct(hispanic_latino)

## # A tibble: 4 × 1
##   hispanic_latino
##   <chr>          
## 1 No             
## 2 Yes            
## 3 yes            
## 4 no

data <- data %>% mutate(
    hispanic_latino = as_factor(hispanic_latino),
    hispanic_latino = recode(data$hispanic_latino,
    yes = 'Yes', no = 'No'))

Missing data

#missing data?
data %>% miss_var_summary()

## # A tibble: 35 × 3
##    variable                     n_miss pct_miss
##    <chr>                         <int>    <dbl>
##  1 dateof_termination              207   66.8  
##  2 last_performance_review_date    103   33.2  
##  3 days_late_last30                103   33.2  
##  4 manager_id                        8    2.58 
##  5 term_reason                       1    0.323
##  6 employee_name                     0    0    
##  7 emp_id                            0    0    
##  8 married_id                        0    0    
##  9 marital_status_id                 0    0    
## 10 gender_id                         0    0    
## # … with 25 more rows

data %>% gg_miss_upset()

# plot missing data (using raw data)
DataExplorer::plot_missing(
  title   = "% of Missing Data",
  data    = data,
  ggtheme = tidyquant::theme_tq(), 
  missing_only = TRUE)

#Term Reason column
data %>% select(dateof_termination, term_reason) %>% count(term_reason,sort=T)

## # A tibble: 18 × 2
##    term_reason                          n
##    <chr>                            <int>
##  1 N/A - still employed               196
##  2 Another position                    20
##  3 unhappy                             14
##  4 more money                          11
##  5 N/A - Has not started yet           11
##  6 career change                        9
##  7 hours                                9
##  8 attendance                           7
##  9 relocation out of area               5
## 10 return to school                     5
## 11 military                             4
## 12 performance                          4
## 13 retiring                             4
## 14 maternity leave - did not return     3
## 15 medical issues                       3
## 16 no-call, no-show                     3
## 17 gross misconduct                     1
## 18 <NA>                                 1

#it seems there is a missing value here, we drop the row
data <- data %>% drop_na(term_reason)

#Manager ID
#Count how many employees has each manager
data %>% select(manager_name, manager_id) %>% count(manager_name,sort=T)

## # A tibble: 21 × 2
##    manager_name       n
##    <chr>          <int>
##  1 Elijiah Gray      22
##  2 Kelley Spirea     22
##  3 Kissy Sullivan    22
##  4 Michael Albert    22
##  5 Amy Dunn          21
##  6 Brannon Miller    21
##  7 David Stanley     21
##  8 Ketsia Liebig     21
##  9 Webster Butler    21
## 10 Janet King        19
## # … with 11 more rows

#Which manager has not id values?
data %>% select(manager_name, manager_id) %>% filter(is.na(manager_id))

## # A tibble: 8 × 2
##   manager_name   manager_id
##   <chr>               <dbl>
## 1 Webster Butler         NA
## 2 Webster Butler         NA
## 3 Webster Butler         NA
## 4 Webster Butler         NA
## 5 Webster Butler         NA
## 6 Webster Butler         NA
## 7 Webster Butler         NA
## 8 Webster Butler         NA

# Webster Butler has not id, let'see if it applies for other rows.
data %>% select(manager_name, manager_id) %>% filter(manager_name == 'Webster Butler')

## # A tibble: 21 × 2
##    manager_name   manager_id
##    <chr>               <dbl>
##  1 Webster Butler         39
##  2 Webster Butler         NA
##  3 Webster Butler         NA
##  4 Webster Butler         39
##  5 Webster Butler         39
##  6 Webster Butler         39
##  7 Webster Butler         39
##  8 Webster Butler         39
##  9 Webster Butler         39
## 10 Webster Butler         39
## # … with 11 more rows

# As expected just few rows did not reported his id, so we impute with his reported id of 39
#fill with 39 where is.na == TRUE
data$manager_id[is.na(data$manager_id)] <- 39

data %>% select(dateof_termination,
                                last_performance_review_date,
                                days_late_last30)

## # A tibble: 309 × 3
##    dateof_termination last_performance_review_date days_late_last30
##    <chr>              <chr>                                   <dbl>
##  1 <NA>               1/15/2019                                   0
##  2 <NA>               1/17/2019                                   0
##  3 <NA>               1/18/2019                                   0
##  4 <NA>               1/15/2019                                   0
##  5 09/25/13           <NA>                                       NA
##  6 08/02/14           <NA>                                       NA
##  7 <NA>               1/21/2019                                   0
##  8 <NA>               1/29/2019                                   0
##  9 <NA>               1/30/2019                                   0
## 10 <NA>               1/17/2019                                   0
## # … with 299 more rows

unique(data$days_late_last30)

## [1]  0 NA

data %>% select(-days_late_last30)

## # A tibble: 309 × 34
##    employee_name      emp_id married_id marital_status_… gender_id emp_status_id
##    <chr>               <dbl>      <dbl>            <dbl>     <dbl>         <dbl>
##  1 Brown, Mia         1.10e9          1                1         0             1
##  2 LaRotonda, William 1.11e9          0                2         1             1
##  3 Steans, Tyrone     1.30e9          0                0         1             1
##  4 Singh, Nan         1.31e9          0                0         0             1
##  5 Smith, Leigh Ann   7.11e8          1                1         0             5
##  6 Bunbury, Jessica   1.50e9          1                1         0             5
##  7 Carter, Michelle   1.40e9          0                0         0             1
##  8 Dietrich, Jenna    1.41e9          0                0         0             1
##  9 Digitale, Alfred   1.31e9          1                1         1             1
## 10 Friedman, Gerry    1.20e9          0                0         1             1
## # … with 299 more rows, and 28 more variables: dept_id <dbl>,
## #   perf_score_id <dbl>, from_diversity_job_fair_id <dbl>, pay_rate <dbl>,
## #   termd <dbl>, position_id <dbl>, position <chr>, state <chr>, zip <chr>,
## #   dob <chr>, sex <chr>, marital_desc <chr>, citizen_desc <chr>,
## #   hispanic_latino <chr>, race_desc <chr>, dateof_hire <chr>,
## #   dateof_termination <chr>, term_reason <chr>, employment_status <chr>,
## #   department <chr>, manager_name <chr>, manager_id <dbl>, …

It seems that no employees came late in the last 30 days as the column days_late_last30 presents only 0 values and NA for the rows wherein the employee has already left the company. So I will drop the column.

unique(data[,c("dateof_termination","term_reason")])

## # A tibble: 103 × 2
##    dateof_termination term_reason              
##    <chr>              <chr>                    
##  1 <NA>               N/A - still employed     
##  2 09/25/13           career change            
##  3 08/02/14           Another position         
##  4 09/05/15           attendance               
##  5 10/31/14           relocation out of area   
##  6 <NA>               N/A - Has not started yet
##  7 09/12/15           performance              
##  8 03/15/15           no-call, no-show         
##  9 02/22/15           no-call, no-show         
## 10 05/01/16           performance              
## # … with 93 more rows

employees <- data %>% filter(
        is.na(dateof_termination)) %>% 
    select(-dateof_termination)

term_employees <- data %>% filter(
        is.na(dateof_termination) == F)

Data Visualization

Gender Gap

employees %>% select(sex, pay_rate, 
                    dateof_hire, position) %>% mutate(
        position = as.factor(position),
        position = fct_lump(position, 5),
        dateof_hire=anytime::anydate(dateof_hire),
        sex = as_factor(sex),
        year = year(dateof_hire)) %>% 
group_by(year,sex, position) %>% summarise(
    avg = mean(pay_rate)) %>% 
    ggplot(aes(x=year, y=avg, color =sex)) + geom_line() + facet_wrap(~position) + ggthemes::theme_gdocs() + theme(
        axis.text.x = element_text(angle = 35)
    )

Barplot

data %>%
  filter(term_reason=="N/A - still employed") %>%
  group_by(sex,position) %>%
  summarise(Total=n()) %>% 
  hchart(type="column",
        hcaes(x=position,y=Total,group=sex)) %>%
      hc_add_theme(hc_theme_economist())

data %>% group_by(position) %>% summarise(pay_rate = round(mean(pay_rate),2)) %>% 
    hchart("column", hcaes(x = position, y = pay_rate)) %>%
    hc_title(text = "Company Pay Rates") %>%
      hc_subtitle(text = "Average pay rate by job position") %>%
      hc_add_theme(hc_theme_economist())

Boxplot

highchart() %>%
  hc_xAxis(type ="category")%>%
  hc_add_series_list(data_to_boxplot(data,
                                pay_rate, race_desc,
                                group_var = sex,
                                add_outliers = F)) %>% 
  hc_legend(enabled= F) %>%
      hc_add_theme(hc_theme_economist())

Pie Charts

race_pie <- data %>% count(race_desc) %>% 
    hchart('pie', hcaes(race_desc,n)) %>% hc_add_theme(hc_theme_economist()) 

gender_pie <- data %>% count(sex) %>% 
    hchart('pie', hcaes(sex,n)) %>% 
    hc_add_theme(hc_theme_economist())

department_pie <- data %>% count(department) %>% 
    hchart('pie', hcaes(department, n)) %>% 
    hc_add_theme(hc_theme_economist())

managers_pie <- data %>% count(manager_name) %>% 
    hchart('pie', hcaes(manager_name, n)) %>% 
    hc_add_theme(hc_theme_economist())

hw_grid(race_pie, gender_pie, 
                department_pie, managers_pie,
                ncol = 2, rowheight = 350)

Waffle Plot

pacman::p_load(waffle)

View(employees %>% select(marital_desc,performance_score) %>% 
            group_by(marital_desc,performance_score) %>% 
    summarise(n= n()) %>% mutate(freq = round(n/sum(n),2)))
  
waffle(c('Exceeds = 15%' = 15,
                 'Fully Meets = 78%' =78,
                 'Needs Improvement = 4%' = 4,
                 'PIP = 3%' = 3), 
             rows = 8, size = 0.2,
             title = 'Performance Score For Single Employees') + 
    theme(title = element_text(
        family = 'Arial',
        size = 1, face = 'bold'
    ))

Ridgeline Plots

library(ggridges)
ggplot(data, aes(
    y=as_factor(race_desc),
    x=pay_rate,
    fill=sex
)) + geom_density_ridges(alpha=0.7) + theme_minimal()

Treemap

data$manager_name <- as_factor(data$manager_name)

data %>% mutate(
    manager_name = fct_lump(manager_name,15)) %>%
    group_by(manager_name) %>% count() %>% 
treemap::treemap(
    index = 'manager_name',
    vSize = 'n',
    title = "Treemap: Managers names by number of employees")

Chicklet chart

term_employees %>% head() %>% 
    miss_var_summary()

## # A tibble: 35 × 3
##    variable                     n_miss pct_miss
##    <chr>                         <int>    <dbl>
##  1 last_performance_review_date      6      100
##  2 days_late_last30                  6      100
##  3 employee_name                     0        0
##  4 emp_id                            0        0
##  5 married_id                        0        0
##  6 marital_status_id                 0        0
##  7 gender_id                         0        0
##  8 emp_status_id                     0        0
##  9 dept_id                           0        0
## 10 perf_score_id                     0        0
## # … with 25 more rows

term_employees %<>% select(-last_performance_review_date,
                                                     -days_late_last30)

term_employees %>% select(dateof_hire, dateof_termination)

## # A tibble: 102 × 2
##    dateof_hire dateof_termination
##    <chr>       <chr>             
##  1 9/26/2011   09/25/13          
##  2 8/15/2011   08/02/14          
##  3 7/7/2014    09/05/15          
##  4 3/7/2011    10/31/14          
##  5 7/7/2014    09/12/15          
##  6 2/16/2015   03/15/15          
##  7 2/16/2015   02/22/15          
##  8 12/1/2014   05/01/16          
##  9 1/5/2015    10/31/15          
## 10 1/9/2012    11/04/15          
## # … with 92 more rows

library(lubridate)
term_employees %<>% mutate(
    dateof_hire=anytime::anydate(dateof_hire),
    dateof_termination=as.Date(dateof_termination,
                                                         format("%m/%d/%y")),
    worktime = difftime(dateof_termination, dateof_hire)
)


term_employees %>% select(position, worktime) %>% 
    mutate(position=as.factor(term_employees$position),
                 worktime = as.integer(worktime)) %>% 
    group_by(position) %>% summarise(
        avg = round(mean(worktime),0)) %>% 
    arrange(desc(avg)) %>% 
    ggplot(aes(x=reorder(position,avg), y=avg)) +
    ggchicklet::geom_chicklet(aes(fill=avg),
                width = 0.8, radius = grid::unit(5, "pt")) + coord_flip() + scale_fill_gradient(low = '#4B3F72', 
                                             high = '#7C9EB2') +    
    coord_flip() + 
    scale_y_discrete(expand = c(0, 0)) +
    labs(title = "Number of days before quitting job",
             subtitle = "By position",
             caption = "Data: Kaggle",
             fill = "Days") +
    theme(
        aspect.ratio=4/3,
        plot.title = element_text(face = "bold",
                                                            size = 13,
                                                            family = "Times",
                                                            colour = "black"),
        plot.subtitle = element_text(size = 11.7,
                                                                 family = "Times",
                                                                 colour = "grey45"),
        plot.caption = element_text(size = 10,
                                                                family = "Times",
                                                                colour = "grey45"),
        axis.text.y = element_text(vjust = 0.5,
                                                             family  = "Times"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.background = element_blank(),
        panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.justification= c(1,0),
        legend.key.size = unit(15, "pt"),
        legend.text = element_text(size = 10,
                                                             family = "Times"),
        legend.position = c(1,0))

Time series of people quitting

term_employees %>% select(dateof_hire,
    dateof_termination, worktime, 
    employee_name,term_reason) %>% 
    arrange(worktime) %>% filter( 
    worktime < quantile(worktime, 0.05) |
    worktime > quantile(worktime, 0.95)) %>% 
    mutate(term_reason = str_to_title(term_reason))

## # A tibble: 12 × 5
##    dateof_hire dateof_termination worktime  employee_name      term_reason      
##    <date>      <date>             <drtn>    <chr>              <chr>            
##  1 2012-09-24  2012-09-26            2 days MacLennan, Samuel  Hours            
##  2 2011-01-10  2011-01-12            2 days Baczenski, Rachael Another Position 
##  3 2015-02-16  2015-02-22            6 days Hernandez, Daniff  No-Call, No-Show 
##  4 2014-02-17  2014-02-25            8 days Evensen, April     No-Call, No-Show 
##  5 2011-11-07  2011-11-15            8 days Gerke, Melisa      Hours            
##  6 2011-05-16  2011-06-04           19 days Power, Morissa     Another Position 
##  7 2011-01-10  2016-01-26         1842 days Robinson, Alain    Attendance       
##  8 2011-01-10  2016-04-01         1908 days Jung, Judy         Unhappy          
##  9 2011-01-10  2016-05-17         1954 days Robinson, Cherly   Attendance       
## 10 2009-10-26  2015-04-08         1990 days Sloan, Constance   Maternity Leave …
## 11 2010-10-25  2016-05-18         2032 days Peterson, Ebonee   Another Position 
## 12 2008-09-02  2015-09-29         2583 days Ybarra, Catherine  Another Position

Network Visualization

library(igraph)
library(networkD3)
employees %>% select(manager_name, employee_name) %>% head(30) %>% simpleNetwork(
    fontSize = 13, charge = -15,
    linkDistance = 65, zoom = T,
    nodeColour = 'red',
)

Read Data

Explore Data

Pivot Table

employee_name(310)

emp_id(310)

married_id(2)

marital_status_id(5)

gender_id(2)

emp_status_id(5)

dept_id(6)

perf_score_id(4)

from_diversity_job_fair_id(2)

pay_rate(94)

termd(2)

position_id(30)

position(31)

state(28)

zip(158)

dob(306)

sex(2)

marital_desc(5)

citizen_desc(3)

hispanic_latino(4)

race_desc(6)

dateof_hire(99)

dateof_termination(94)

term_reason(18)

employment_status(5)

department(6)

manager_name(21)

manager_id(24)

recruitment_source(23)

performance_score(4)

engagement_survey(188)

emp_satisfaction(5)

special_projects_count(8)

last_performance_review_date(43)

days_late_last30(2)