I'm trying to plot a series of demographics factors. Each plot show the frequency distributions of demographic variables by gender. It runs nicely, but some of the labels are ordered in alphabetic order and not in meaningful order eg. Education, Marital Status and SIC2007.
Data structure
structure(list(DMSex = c("Male", "Female", "Male", "Male"), Income = c(980,
-8, 3000, 120), IncCat = c("-1", "-8", "-1", "-1"), HrWkAc = c(-1,
-1, -1, -1), ShiftWk = c(-1, -1, -1, -1), ShiftPat = c(-1, -1,
-1, -1), SOC2010C = c("-1", "9.2.3.3", "-1", "-1"), XSOC2010 = c(-1,
9233, -1, -1), IndexNo = c(-1, 1398, -1, -1), ES2010 = c(-1,
7, -1, -1), nssec = c(-1, 13.4, -1, -1), SECFlag = c(-1, 0, -1,
-1), LSOC2000 = c("-1", "9.2.3.3", "-1", "-1"), XSOC2000 = c(-1,
9233, -1, -1), seg = c(-1, 11, -1, -1), sc = c(-1, 5, -1, -1),
SIC2007 = c(-1, 87, -1, -1), Educ = c(1, 1, -1, 2), EducCur = c(10,
1, -1, -1), FinFTEd = c(-1, -1, -1, 1), FinFTEdY = c(-1,
-1, -1, 21), HiQual = c(22, 10, -1, 1), sic20070 = c(-1,
87, -1, -1), dhhtype = c(6, 8, 7, 3), dagegrp = c(2, 3, 3,
3), dmarsta = c("Single, never married", "Single, never married",
"Interview not achieved", "Married/cohabitating"), dhiqual = c(" Secondary",
" A level or equivalent", "Item not applicable", "Degree or higher"
), dnssec8 = c(-1, 8, -1, -1), duresmc = c(14, 15, 11, 16
), dgorpaf = c(7, 8, 5, 10), dukcntr = c(1, 1, 1, 1), dnrkid04 = c(0,
0, 0, 0), dilodefr = c(3, 3, -1, 3), deconact = c(8, 8, -1,
11), dtenure = c(2, 3, 2, 3), dtotac = c(-1, -1, -1, -1),
dtotus = c(-1, -1, -1, -1), dsic = c("Item not applicable",
"Public admin, education and health", "Item not applicable",
"Item not applicable"), dsoc = c(-1, 9, -1, -1), DVAge_category = c("15 to 30",
"15 to 30", "15 to 30", "15 to 30"), Income_category = c("Less than 1000",
"Less than 1000", "1001 to 3000", "Less than 1000"), HoursWorked_category = c("Less than 20 hours",
"Less than 20 hours", "Less than 20 hours", "Less than 20 hours"
)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
#Age variable
demographics$dagegrp_category<-ifelse(demographics$dagegrp_01 > 2 & demographics$dagegrp < 6, age<-"15 to 30",
ifelse(demographics$dagegrp> 6 & demographics$dagegrp < 9, age<-"31 to 45",
ifelse(demographics$dagegrp > 9 & demographics$dagegrp < 12 , age<-"46 to 60",
ifelse(demographics$dagegrp > 12 & demographics$dagegrp < 15 , age<-"61 to 75",
ifelse(demographics$dagegrp > 15 & demographics$dagegrp < 18 , age<-"76+",
age<- "zombie")))))
demographics$DVAge_category<-c("15 to 30","31 to 45", "46 to 60","61 to 75", "76+", "zombie")[findInterval(demographics$dagegrp , c(-Inf, 6, 10, 12, 15,18, Inf))]
Age<-as.vector(demographics$DVAge_category)
#Gender variable
demographics$DMSex[demographics$DMSex==1]<-"Male"
demographics$DMSex[demographics$DMSex==2]<-"Female"
Gender<-as.vector(demographics$DMSex)
#Income variable
demographics$Income_category<-ifelse(demographics$Income < 1001, income<-"Less than 1000",
ifelse(demographics$Income > 999 & demographics$Income < 3001, income<-"1001 to 3000",
ifelse(demographics$Income > 3001 & demographics$Income < 6001, income <-"3001 to 6000",
ifelse(demographics$Income > 6001 & demographics$Income < 10001 , income<-"6001 to 10000",
income<- "zombie"))))
demographics$Income_category<-c("Less than 1000","1001 to 3000", "3001 to 6000", "6001 to 10000","zombie")[findInterval(demographics$Income , c(-Inf, 1001, 3001, 6001,10001, Inf) ) ]
Income<-as.vector(demographics$Income_category)
#Marital status variable
demographics$dmarsta[demographics$dmarsta==-1]<-"Interview not achieved"
demographics$dmarsta[demographics$dmarsta==1]<-"Single, never married"
demographics$dmarsta[demographics$dmarsta==2]<-"Married/cohabitating"
demographics$dmarsta[demographics$dmarsta==3]<-"Divorced/widowed"
MaritalStatus<-as.vector(demographics$dmarsta)
#Education
demographics$dhiqual[demographics$dhiqual==-8]<-"Don't know"
demographics$dhiqual[demographics$dhiqual==-1]<-"Item not applicable"
demographics$dhiqual[demographics$dhiqual==1]<-"Degree or higher"
demographics$dhiqual[demographics$dhiqual==2]<-"Higher education"
demographics$dhiqual[demographics$dhiqual==3]<-" A level or equivalent"
demographics$dhiqual[demographics$dhiqual==4]<-" Secondary"
demographics$dhiqual[demographics$dhiqual==5]<-" Other"
Education<-as.vector(demographics$dhiqual)
#Hours worked per week in main job variable
demographics$HoursWorked_category<-ifelse(demographics$dtotac < 21, workhours<-"Less than 20 hours",
ifelse(demographics$dtotac > 20 & demographics$dtotac< 41, workhours <-"Between 21 to 40 hours",
ifelse(demographics$dtotac > 40 & demographics$dtotac < 61, workhours <-"Between 41 to 60 hours",
ifelse(demographics$dtotac > 62, workhours<-"More than 61 hours",
workhours<- "Not Applicable"))))
demographics$HoursWorked_category<-c("Less than 20 hours", "Between 21 to 40 hours", "Between 41 to 60 hours","More than 61 hours","Not Applicable")[findInterval(demographics$dtotac, c(-Inf, 21, 41, 61, 62, Inf) ) ]
WorkHours<-as.vector(demographics$HoursWorked_category)
#DV: SIC 2007 industry divisions (grouped)
demographics$dsic[demographics$dsic==-8]<-"Don't know"
demographics$dsic[demographics$dsic==-1]<-"Item not applicable"
demographics$dsic[demographics$dsic==1]<-"Agriculture, forestry and fishing"
demographics$dsic[demographics$dsic==2]<-"Manufacturing"
demographics$dsic[demographics$dsic==3]<-"Energy and water supply"
demographics$dsic[demographics$dsic==4]<-"Construction"
demographics$dsic[demographics$dsic==5]<-"Distribution, hotels and restaurants"
demographics$dsic[demographics$dsic==6]<-"Transport and communication"
demographics$dsic[demographics$dsic==7]<-"Banking and finances"
demographics$dsic[demographics$dsic==8]<-"Public admin, education and health"
demographics$dsic[demographics$dsic==9]<-"Other services"
demographics$industry_category<-c("Don't know", "Item not applicable", "Agriculture, forestry and fishing","Manufacturing","Energy and water supply",
"Construction", "Distribution, hotels and restaurants", "Transport and communication", "Banking and finances",
"Public admin, education and health", "Other service")
SIC2007<-as.vector(demographics$dsic)
# creating df
df<-data.frame(Gender, Age, Education, MaritalStatus, Income, WorkHours, SIC2007)
df %>%
#tidy, not gender
gather(variable, value, -c(Gender))%>%
#group by value, variable, then gender
group_by(value, variable, Gender) %>%
#summarise to obtain table cell frequencies
summarise(freq=n()) %>%
#Plot
ggplot(aes(x=value, y=freq, group=Gender))+geom_bar(aes(fill=Gender), stat='identity', position='dodge')+ facet_wrap(~variable, scales='free_x') + theme(legend.position="right", axis.text.x = element_text(angle = 60, hjust = 1)) + labs(x="Characteristics", y="Frequencies")