Alright, I hope this is what you were looking for Aquila:
# libraries
library(tidyverse)
# collect data
df <-
structure(
list(
ID = c(
"SS/CR/BIA/ABEYOONG/1/0001/05",
"SS/CR/BIA/ABEYOONG/1/0001/03",
"SS/CR/BIA/ABEYOONG/1/0001/04",
"SS/CR/BIA/ABEYOONG/1/0001/02",
"SS/CR/BIA/ABEYOONG/1/0001/01",
"SS/CR/BIA/ABEYOONG/1/0002/01",
"SS/CR/BIA/ABEYOONG/1/0002/04",
"SS/CR/BIA/ABEYOONG/1/0002/03",
"SS/CR/BIA/ABEYOONG/1/0002/05",
"SS/CR/BIA/ABEYOONG/1/0002/02",
"SS/CR/BIA/ABEYOONG/1/0003/01",
"SS/CR/BIA/ABEYOONG/1/0003/03",
"SS/CR/BIA/ABEYOONG/1/0003/05",
"SS/CR/BIA/ABEYOONG/1/0003/04",
"SS/CR/BIA/ABEYOONG/1/0003/02",
"SS/CR/BIA/ABEYOONG/1/0004/02",
"SS/CR/BIA/ABEYOONG/1/0004/07",
"SS/CR/BIA/ABEYOONG/1/0004/06",
"SS/CR/BIA/ABEYOONG/1/0004/05",
"SS/CR/BIA/ABEYOONG/1/0004/04",
"SS/CR/BIA/ABEYOONG/1/0004/03",
"SS/CR/BIA/ABEYOONG/1/0004/01",
"SS/CR/BIA/ABEYOONG/1/0005/01"
),
relationship = c(3, 3, 3, 2,
1, 1, 10, 3, 11, 2, 1, 3, 3, 3, 3, 3, 11, 3, 3, 3, 3, 1, 1)
),
row.names = c(NA,-23L),
class = c("tbl_df", "tbl", "data.frame")
)
#--------- by sort ID : egen numberhead = total(relationship == 1) ----------
# using data object named df
df %>%
group_by(ID) %>% # bysort
filter(relationship == 1) %>% # to only see these fields
summarise(numberhead = n()) # create a new variable
# # A tibble: 5 × 2
# ID numberhead
# <chr> <int>
# 1 SS/CR/BIA/ABEYOONG/1/0001/01 1
# 2 SS/CR/BIA/ABEYOONG/1/0002/01 1
# 3 SS/CR/BIA/ABEYOONG/1/0003/01 1
# 4 SS/CR/BIA/ABEYOONG/1/0004/01 1
# 5 SS/CR/BIA/ABEYOONG/1/0005/01 1
# of the individual IDs, that have relationship == 1
# there is one observation of each
# I don't think there is an equivalent to assert
# you could validate that there are 5 observations for relationship == 1
# to validate this result, though
df %>%
filter(relationship == 1) %>%
nrow() # number of rows
# [1] 5
#--------- List ID relationship if numberhead >= 2 ----------
# this one is simpler
df %>%
filter(relationship >=2)
# # A tibble: 18 × 2
# ID relationship
# <chr> <dbl>
# 1 SS/CR/BIA/ABEYOONG/1/0001/05 3
# 2 SS/CR/BIA/ABEYOONG/1/0001/03 3
# 3 SS/CR/BIA/ABEYOONG/1/0001/04 3
# 4 SS/CR/BIA/ABEYOONG/1/0001/02 2
# 5 SS/CR/BIA/ABEYOONG/1/0002/04 10
# 6 SS/CR/BIA/ABEYOONG/1/0002/03 3
# 7 SS/CR/BIA/ABEYOONG/1/0002/05 11
# 8 SS/CR/BIA/ABEYOONG/1/0002/02 2
# 9 SS/CR/BIA/ABEYOONG/1/0003/03 3
# 10 SS/CR/BIA/ABEYOONG/1/0003/05 3
# 11 SS/CR/BIA/ABEYOONG/1/0003/04 3
# 12 SS/CR/BIA/ABEYOONG/1/0003/02 3
# 13 SS/CR/BIA/ABEYOONG/1/0004/02 3
# 14 SS/CR/BIA/ABEYOONG/1/0004/07 11
# 15 SS/CR/BIA/ABEYOONG/1/0004/06 3
# 16 SS/CR/BIA/ABEYOONG/1/0004/05 3
# 17 SS/CR/BIA/ABEYOONG/1/0004/04 3
# 18 SS/CR/BIA/ABEYOONG/1/0004/03 3
# If you want to see only the unique IDs
df %>% filter(relationship >= 2) %>%
select(ID) %>%
distinct()
# however every ID is distinct in this data,
# so the results won't look different
#--------- List ID relationship if numberhead < 1 ----------
df %>%
filter(relationship < 1)
# # A tibble: 0 × 2
# # … with 2 variables: ID <chr>, relationship <dbl>
# no results
#--------- see it all at one time? ----------
df %>%
mutate(relates = cut(relationship,
c(0, 1, max(relationship)))) %>%
group_by(relates,ID) %>%
summarise(n()) %>%
print(n = nrow(df)) # when you have a tbl_df,
# you get pretty print in the console,
# this call will let you see it all
# # A tibble: 23 × 3
# # Groups: relates [2]
# relates ID `n()`
# <fct> <chr> <int>
# 1 (0,1] SS/CR/BIA/ABEYOONG/1/0001/01 1
# 2 (0,1] SS/CR/BIA/ABEYOONG/1/0002/01 1
# 3 (0,1] SS/CR/BIA/ABEYOONG/1/0003/01 1
# 4 (0,1] SS/CR/BIA/ABEYOONG/1/0004/01 1
# 5 (0,1] SS/CR/BIA/ABEYOONG/1/0005/01 1
# 6 (1,11] SS/CR/BIA/ABEYOONG/1/0001/02 1
# 7 (1,11] SS/CR/BIA/ABEYOONG/1/0001/03 1
# 8 (1,11] SS/CR/BIA/ABEYOONG/1/0001/04 1
# 9 (1,11] SS/CR/BIA/ABEYOONG/1/0001/05 1
# 10 (1,11] SS/CR/BIA/ABEYOONG/1/0002/02 1
# 11 (1,11] SS/CR/BIA/ABEYOONG/1/0002/03 1
# 12 (1,11] SS/CR/BIA/ABEYOONG/1/0002/04 1
# 13 (1,11] SS/CR/BIA/ABEYOONG/1/0002/05 1
# 14 (1,11] SS/CR/BIA/ABEYOONG/1/0003/02 1
# 15 (1,11] SS/CR/BIA/ABEYOONG/1/0003/03 1
# 16 (1,11] SS/CR/BIA/ABEYOONG/1/0003/04 1
# 17 (1,11] SS/CR/BIA/ABEYOONG/1/0003/05 1
# 18 (1,11] SS/CR/BIA/ABEYOONG/1/0004/02 1
# 19 (1,11] SS/CR/BIA/ABEYOONG/1/0004/03 1
# 20 (1,11] SS/CR/BIA/ABEYOONG/1/0004/04 1
# 21 (1,11] SS/CR/BIA/ABEYOONG/1/0004/05 1
# 22 (1,11] SS/CR/BIA/ABEYOONG/1/0004/06 1
# 23 (1,11] SS/CR/BIA/ABEYOONG/1/0004/07 1
Using the data you have provided and adding how to pull that data directly into R. Note that I assume ID
is what is column 1 and that what you are calling relationship
is the column hhsize
.
For collecting the data, you can pull it directly from your personal computer drive or directly from the web.
library(openxlsx)
# from your computer
df2 <- read.xlsx("/path/in/you/computer/file.xlsx")
# if there was more than one sheet, you would designate which sheet
# from the web
# for dropbox, look in the path for "d1=0"
# you have to change that to "d1=1" for a direct download
df3 <- read.xlsx("https://www.dropbox.com/scl/fi/73dw92bpcjio3m1k0w5vv/Round-11th-19-08-2020.xlsx?dl=1&rlkey=2xxtyge3rppi0aikkl8nlt6oc")
If you really wanted to rename the columns you can do that this way:
names(df2)[1] <- "ID"
Is this what you are looking for?
#----- perhaps looking for this ------
df3[,c(1,17)] %>% # only look at IDs and household size
distinct() %>% # ignore duplicates, when both fields match
mutate(relates = cut(hhsize, # add factor for ranges
c(0, 1, 2,
max(hhsize)),
include.lowest = T)) %>%
group_by(relates) %>% # only group by household size ranges
summarise(count = n()) # show the count per case
# # A tibble: 3 × 2
# relates count
# <fct> <int>
# 1 [0,1] 505
# 2 (1,2] 1736
# 3 (2,25] 15771