This is my first time using R, so apologies for any basic mistakes. I need to build a Sankey chart that displays commuting flows between the cities of a metropolitan area, but I am not satisfied with how the graph looks. I would like the labels to sit outside the Sankey diagram, and I would like the cities to be ordered from the biggest to the smallest flow.
This is my code (sorry, it's messy; I am afraid of deleting steps):
library(lodown)
library(magrittr)
library(dplyr)
library(stringr)
library(fs)
library(SAScii)
library(readr)
library(purrr)
library(survey)
library(tidyr)
library(ggplot2)
library(scales)
library(ggtext)
library(questionr)
library(networkD3)
# Look up the "censo" catalog, keep the 2010 rows whose `state` contains "al",
# then download those entries into the "data" directory ----
censo_catalog <- lodown::get_catalog(data_name = "censo", output_dir = "data")
censo_selected <- dplyr::filter(
  censo_catalog,
  year == 2010,
  stringr::str_detect(state, "al")
)
catalog <- lodown::lodown(censo_selected, data_name = "censo")

# Show the files that are now present under the download directory
fs::dir_tree(path = "data")
# Census variables to keep when reading the person file ----
vars_censo <- c(
  "v0001", "v0002", "v1004", "v1006",
  "v6529", "v6530", "v6531", "v6532",
  "v0010", "v0606", "v0636", "v6362", "v6364",
  "v0660", "v6602", "v6604", "v0661", "v0662",
  "v0601"
)

# Parse the SAS import script into a layout table; lower-case variable names
sas_input <- SAScii::parse.SAScii(catalog$pes_sas)
sas_input <- dplyr::mutate(sas_input, varname = stringr::str_to_lower(varname))

# One readr type code per layout entry: "c" or "d" according to the layout's
# `char` flag, overwritten with "_" (skip) for variables we do not need.
col_codes <- ifelse(sas_input$char, "c", "d")
col_codes[!(sas_input$varname %in% vars_censo)] <- "_"

# Read the fixed-width text file using the widths from the SAS layout
raw_censo <- readr::read_fwf(
  file = catalog$pes_file,
  col_positions = readr::fwf_widths(
    widths = abs(sas_input$width),
    col_names = sas_input$varname
  ),
  col_types = paste0(col_codes, collapse = "")
)
# Origin municipalities ----
# Lookup from municipality code (v0002, kept as text because of the leading
# zeros) to the label shown in the chart. All labels use a comma as decimal
# separator so they are consistent with each other.
mun_labels <- c(
  "00409" = "Atalaia, 5,6%",
  "00508" = "Barra de Santo Antônio, 2,9%",
  "00607" = "Barra de São Miguel, 0,8%",
  "02207" = "Coqueiro Seco, 3,1%",
  "04302" = "Maceió, 22,3%",
  "04708" = "Marechal Deodoro, 10,5%",
  "05200" = "Messias, 5,4%",
  "05507" = "Murici, 3,2%",
  "06448" = "Paripueira, 3,6%",
  "06901" = "Pilar, 5,3%",
  "07701" = "Rio Largo, 26,3%",
  "07909" = "Santa Luzia do Norte, 3%",
  "08907" = "Satuba, 8,2%"
)

# Keep only the rows whose municipality is one of the 13 codes above
df.rmm <- data.frame(
  raw_censo %>%
    filter(v0002 %in% names(mun_labels))
)

# Give the census variables readable column names.
# UFesc now maps to v6362: the original mapped v6364 to both UFesc and
# mun.esc, leaving the imported v6362 without a name.
df.rmm <- rename(
  df.rmm,
  UF = v0001,
  municipio = v0002,
  RM = v1004,
  urbrur = v1006,
  rend.dom = v6529,
  rend.dom.sm = v6530,
  rend.pc = v6531,
  rend.pc.sm = v6532,
  peso = v0010,
  cor = v0606,
  est.no.mun = v0636,
  UFesc = v6362,
  mun.esc = v6364,
  trab.no.mun = v0660,
  uf.trab = v6602,
  mun.trab = v6604,
  commute = v0661,
  tempo.desl = v0662,
  sexo = v0601
)

# Replace each municipality code with its chart label in one vectorised lookup
df.rmm$municipio <- unname(mun_labels[as.character(df.rmm$municipio)])
# Destination municipalities ----
# Drop the rows that have no value in `commute`
has_commute <- !is.na(df.rmm$commute)
df.rmm2 <- df.rmm[has_commute, ]

# Lookup from work-municipality code (mun.trab) to the label shown in the chart
dest_labels <- c(
  "2700409" = "Atalaia, 3%",
  "2700508" = "Barra de Santo Antônio, 0,6%",
  "2700607" = "Barra de São Miguel, 1,5%",
  "2702207" = "Coqueiro Seco, 0,2%",
  "2704302" = "Maceió, 63,8%",
  "2704708" = "Marechal Deodoro, 7%",
  "2705200" = "Messias, 1,9%",
  "2705507" = "Murici, 1%",
  "2706448" = "Paripueira, 1,3%",
  "2706901" = "Pilar, 5%",
  "2707701" = "Rio Largo, 12,3%",
  "2707909" = "Santa Luzia do Norte, 0,7%",
  "2708907" = "Satuba, 1,8%"
)

# Keep only the rows whose work municipality is one of the 13 codes above
df.rmm3 <- data.frame(
  df.rmm2 %>%
    filter(mun.trab %in% names(dest_labels))
)

# Replace each destination code with its chart label
df.rmm3$mun.trab <- unname(dest_labels[as.character(df.rmm3$mun.trab)])
# Origin-destination (OD) table ----
# Unweighted version, kept for reference:
#od_table<-table(df.rmm3$municipio,df.rmm3$mun.trab)
#od_table

# Adjusted weight column: pesoajust = peso / 1e13
df.rmm3$pesoajust <- df.rmm3$peso / 1e13

# Weighted OD table (origins in rows, destinations in columns), rounded to
# whole numbers
tabelaod <- round(
  wtd.table(
    x = df.rmm3$municipio,
    y = df.rmm3$mun.trab,
    weights = df.rmm3$pesoajust
  )
)

# Display the table in wide (matrix-like) form
as.data.frame.matrix(tabelaod)

# Export the table to a space-separated text file with Windows line endings
write.table(
  tabelaod,
  file = "tabelaod",
  quote = TRUE,
  sep = " ",
  eol = "\r\n",
  na = "",
  row.names = TRUE
)
# Sankey diagram of the commuting flows ----
# Earlier attempt, kept for reference:
#dataForSankey <- df.rmm3%>%dplyr::select(mun.trab, municipio)
#hchart(data_to_sankey(dataForSankey), "sankey", name = "OD")

# Weighted OD table in long form: one row per (origin, destination) pair.
# stringsAsFactors = FALSE keeps the labels as text, so nothing below depends
# on how c() treats factors (it returned integer codes before R 4.1).
od <- wtd.table(
  x = df.rmm3$municipio,
  y = df.rmm3$mun.trab,
  weights = df.rmm3$pesoajust
) %>%
  round() %>%
  as.data.frame(stringsAsFactors = FALSE)

# A connection data frame lists every flow with its intensity. Columns are
# taken by position: the two table dimensions first, then the frequency.
links <- data.frame(
  source = od[[1]],
  target = od[[2]],
  value = od[[3]],
  stringsAsFactors = FALSE
)

# Turn a chart label such as "Rio Largo, 26,3%" into a colour-group key
# ("Rio_Largo"): drop everything from the first comma on and replace spaces.
# NOTE(review): networkD3 appears to cut group names at the first space when
# picking colours, which would merge e.g. the two "Barra de ..." cities;
# underscores avoid that. Confirm against the installed version.
city_group <- function(label) {
  gsub(" ", "_", sub(",.*$", "", label), fixed = TRUE)
}

# Colour every link by its origin city. The group is derived from the data
# instead of a hand-typed 13-element vector that relied on recycling and on
# the labels being in alphabetical order.
links$group <- as.factor(city_group(links$source))

# Node list: origins first, then destinations, each ordered from the largest
# to the smallest total flow. unique() keeps a label that occurs on both
# sides as a single node, as before.
flow_out <- tapply(links$value, links$source, sum)
flow_in <- tapply(links$value, links$target, sum)
nodes <- data.frame(
  name = unique(c(
    names(flow_out)[order(flow_out, decreasing = TRUE)],
    names(flow_in)[order(flow_in, decreasing = TRUE)]
  )),
  stringsAsFactors = FALSE
)

# Same colour key for a city whether it appears as origin or as destination
nodes$group <- as.factor(city_group(nodes$name))

# networkD3 needs zero-based node indices instead of names in the links
links$IDsource <- match(links$source, nodes$name) - 1
links$IDtarget <- match(links$target, nodes$name) - 1

# Build the diagram. iterations = 0 switches off the automatic layout
# optimisation so the nodes keep the flow-sorted order of `nodes`.
# NOTE(review): `dragY` and `showNodeValues` do not appear to be arguments of
# the CRAN networkD3::sankeyNetwork(); they are kept as in the original call.
# Remove them if this call fails with "unused argument".
# Moving the labels outside the diagram is not a sankeyNetwork() option; it
# needs custom JavaScript (e.g. htmlwidgets::onRender) and is not done here.
p <- sankeyNetwork(
  Links = links, Nodes = nodes,
  Source = "IDsource", Target = "IDtarget",
  Value = "value", NodeID = "name",
  LinkGroup = "group", NodeGroup = "group",
  fontSize = 10, iterations = 0,
  dragY = TRUE, showNodeValues = FALSE
)
p
This is the order I would like.
I would also like to move the labels outside the diagram so that they are easier to read.