I'm cleaning up and normalising imported data. I need to grep for many patterns in many columns and then update a column with the "clean" grepped pattern alongside the initial columns.
Target text is df (this is just a subset of actual columns and rows):
name street_1 street_2 city
217916 Australia 49 Carbine Street Donvale, Melbourne
197423 C/Guatemala 3, Local 11 Balearic Island
205195 In Haiti: #30 Carrefour Digicel Port Salut
Grep patterns are 247 countries. Final result should look like (columns above omitted for the sake of presentation):
country
217916 Australia
197423 Guatemala
205195 Haiti
If a row returns several hits, they should be listed for later processing, e.g. "Haiti, Australia"
or should it be c("Haiti", "Australia")
?
country
217916 c("Haiti", "Australia")
Also: country names are too long when concatenated together for grep (with pipe delimiter), so I've had to resort to processing by block of countries, but there must be a better way.
The code I've come up with for grepping multiple columns is:
hits <- sapply(
colnames(df),
function(x) grep(
paste0(
"(?<![a-z])(?:",
paste(head(countries$name, 120), collapse = "|"),
")(?![a-z])"),
df[, x],
ignore.case = T,
perl = T))
results <- df[unique(unlist(hits)), sapply(hits, function(x) any(length(unlist(x)) > 0))]
This subsets df into rows and columns where hits are found, but is still far off the mark. I don't use the word boundary pattern \\b
because I want to include digits and underscore in the boundaries.
Country names:
structure(list(name = c("Afghanistan", "Aland Islands", "Albania",
"Algeria", "American Samoa", "Andorra", "Angola", "Anguilla",
"Antarctica", "Antigua and Barbuda", "Argentina", "Armenia",
"Aruba", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain",
"Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin",
"Bermuda", "Bhutan", "Bolivia", "Bosnia and Herzegovina", "Botswana",
"Bouvet Island", "Brazil", "British Virgin Islands", "British Indian Ocean Territory",
"Brunei Darussalam", "Bulgaria", "Burkina Faso", "Burundi", "Cambodia",
"Cameroon", "Canada", "Cape Verde", "Cayman Islands", "Central African Republic",
"Chad", "Chile", "China", "Hong Kong, Special Administrative Region of China",
"Macao, Special Administrative Region of China", "Christmas Island",
"Cocos (Keeling) Islands", "Colombia", "Comoros", "Congo?(Brazzaville)",
"Congo, Democratic Republic of the", "Cook Islands", "Costa Rica",
"C?te d'Ivoire", "Croatia", "Cuba", "Cyprus", "Czech Republic",
"Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador",
"Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia",
"Ethiopia", "Falkland Islands (Malvinas)", "Faroe Islands", "Fiji",
"Finland", "France", "French Guiana", "French Polynesia", "French Southern Territories",
"Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Gibraltar",
"Greece", "Greenland", "Grenada", "Guadeloupe", "Guam", "Guatemala",
"Guernsey", "Guinea", "Guinea-Bissau", "Guyana", "Haiti", "Heard Island and Mcdonald Islands",
"Holy See?(Vatican City State)", "Honduras", "Hungary", "Iceland",
"India", "Indonesia", "Iran, Islamic Republic of", "Iraq", "Ireland",
"Isle of Man", "Israel", "Italy", "Jamaica", "Japan", "Jersey",
"Jordan", "Kazakhstan", "Kenya", "Kiribati", "Korea, Democratic People's Republic of",
"Korea, Republic of", "Kuwait", "Kyrgyzstan", "Lao PDR", "Latvia",
"Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania",
"Luxembourg", "Macedonia, Republic of", "Madagascar", "Malawi",
"Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands",
"Martinique", "Mauritania", "Mauritius", "Mayotte", "Mexico",
"Micronesia, Federated States of", "Moldova", "Monaco", "Mongolia",
"Montenegro", "Montserrat", "Morocco", "Mozambique", "Myanmar",
"Namibia", "Nauru", "Nepal", "Netherlands", "Netherlands Antilles",
"New Caledonia", "New Zealand", "Nicaragua", "Niger", "Nigeria",
"Niue", "Norfolk Island", "Northern Mariana Islands", "Norway",
"Oman", "Pakistan", "Palau", "Palestinian Territory, Occupied",
"Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
"Pitcairn", "Poland", "Portugal", "Puerto Rico", "Qatar", "R?union",
"Romania", "Russian Federation", "Rwanda", "Saint-Barth?lemy",
"Saint Helena", "Saint Kitts and Nevis", "Saint Lucia", "Saint-Martin (French part)",
"Saint Pierre and Miquelon", "Saint Vincent and Grenadines",
"Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia",
"Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore",
"Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa",
"South Georgia and the South Sandwich Islands", "South Sudan",
"Spain", "Sri Lanka", "Sudan", "Suriname?*", "Svalbard and Jan Mayen Islands",
"Swaziland", "Sweden", "Switzerland", "Syrian Arab Republic?(Syria)",
"Taiwan, Republic of China", "Tajikistan", "Tanzania?*, United Republic of",
"Thailand", "Timor-Leste", "Togo", "Tokelau", "Tonga", "Trinidad and Tobago",
"Tunisia", "Turkey", "Turkmenistan", "Turks and Caicos Islands",
"Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom",
"United States of America", "United States Minor Outlying Islands",
"Uruguay", "Uzbekistan", "Vanuatu", "Venezuela?(Bolivarian Republic of)",
"Viet Nam", "Virgin Islands, US", "Wallis and Futuna Islands",
"Western Sahara", "Yemen", "Zambia", "Zimbabwe")), .Names = "name", class = "data.frame", row.names = c(NA,
-247L))
Target text sample data (50 rows/358626):
structure(list(org_order = c(12444L, 33914L, 44556L, 44592L,
101963L, 109988L, 116027L, 116369L, 119774L, 129643L, 138317L,
144409L, 144409L, 145106L, 162134L, 163578L, 163782L, 165580L,
168297L, 169073L, 174144L, 182441L, 186903L, 187474L, 200151L,
202672L, 203554L, 203696L, 204983L, 205534L, 205669L, 206412L,
206998L, 209374L, 212087L, 214424L, 214549L, 214784L, 215966L,
217497L, 219141L, 219235L, 219611L, 220974L, 221249L, 221704L,
222197L, 226097L, 226987L, 228977L), organizations.api_path = c("organizations/tricked-dk",
"organizations/xdirect", "organizations/lincoln-dental-supply",
"organizations/scil-animal-care", "organizations/sportymob",
"organizations/fanflipped", "organizations/afribaba-holdings",
"organizations/rentokil", "organizations/counter-logic-gaming",
"organizations/predictive-analytics", "organizations/curatern",
"organizations/gotv-entertaining-africa", "organizations/gotv-entertaining-africa",
"organizations/fst-biometrics", "organizations/picons", "organizations/eye",
"organizations/inspiring-fifty", "organizations/fraudcracker",
"organizations/saev", "organizations/take-eat-easy", "organizations/cgiar-consortium",
"organizations/live-tile", "organizations/soko-insight", "organizations/leeds-beckett-university",
"organizations/guialocal", "organizations/wisboo", "organizations/edukwest-europe",
"organizations/demo-africa", "organizations/allafrica-global-media",
"organizations/code-week", "organizations/spotcast-inc", "organizations/synergy-social-ventures",
"organizations/lenoir-rhyne-college", "organizations/ceed-tech",
"organizations/netocratic", "organizations/balkan-venture-forum",
"organizations/adn-contents", "organizations/sarv-crm", "organizations/gooutmap",
"organizations/fi-ware", "organizations/dla-venture-pipeline",
"organizations/angels-on-time", "organizations/tangram-solutions",
"organizations/start-with-seven", "organizations/games-in-asia",
"organizations/openaxel", "organizations/z-safety-systems", "organizations/telkom-sa",
"organizations/cadigo", "organizations/filepreviews"), type = c("Address",
"Address", "Address", "Address", "Address", "Address", "Address",
"Address", "Address", "Address", "Address", "Address", "Address",
"Address", "Address", "Address", "Address", "Address", "Address",
"Address", "Address", "Address", "Address", "Address", "Address",
"Address", "Address", "Address", "Address", "Address", "Address",
"Address", "Address", "Address", "Address", "Address", "Address",
"Address", "Address", "Address", "Address", "Address", "Address",
"Address", "Address", "Address", "Address", "Address", "Address",
"Address"), uuid = c("f5d931f41c976de95e8e19d98c9d1e71", "e4cdc0eab5370addf2cfcc2968b352ed",
"84bd97b962366ba892e9c9e21134c636", "f03a05df323aa842406e8a4652951876",
"52bb2fcc51b56d2a6f124d1084b46c31", "fb3e017f18fdd8e25d350c0acb787d7a",
"775a805b428c129b7e4a4cd724461868", "bee1141933033ac79da8eace50acafdc",
"44e55ce5044b6c1c8bc3d3aef7df87bf", "00e163d78e1809063778e4766a18bdfb",
"2a65aca53c8e39792b113464c681ea52", "41b8b2fd215158d9dcb49ba366b7a036",
"923e0fbdea2e3c1da9f67b721b65d8e4", "d999ff11eae271f9876ec4e3075e4125",
"e51603d3020a327edbfeec14f21cccc3", "9b9e58951d59ed2aec3ca0ab766116ed",
"15e5998b1e8c3d4345c85e76f3dac240", "a22b40a3305de557315758a94080aa76",
"b6332226dec612644868f7491a901f4a", "4a3f4401cc908be0a77185e7d3c9a206",
"73dbe3fda355a357dee29caf5aa4dc99", "e85678b7bf1085432c63600319012a10",
"7f5e79d588492491c012fbdb90122073", "f4e47d7927be2eacddfb960cee55dc6b",
"5c59be51096e8c0a94f87b1de3689d42", "38548540aa739607dec960129cf12ba9",
"952ab75de5ea8d85a44a8dcd432607bd", "e68ee31280619ce2f77b932f8daec571",
"4bb54989ab4497e319d036c89427d8cf", "ae09b50b190db4a183bda9eccbe4cfce",
"862eec31ebb44e73d0ece854ab314ce5", "6380fdab108b1fee616d847d7f392f6a",
"85d6bd776b725d934d0bff98b2c83cad", "f7e3911becbbeb4dfec0418ed5f2b39a",
"b5ef4f3d9660e23808ca0a0ed0e338bb", "da9861eded1a9a0e9bf6ba6a48aedf1e",
"76e1d4c1a1c56c5fe2b1c21c656a3a69", "f3f5becd3a1693efca41bcc1af1703f6",
"18ac74abae87d07ce3b11eb0dbd820a9", "e428d523943b2cc3aec47b72129b5fe6",
"36e696fed157afe1602ca8fda46fa8cc", "329fb82b86a66a9e656f3c552430ceca",
"b7d59665f2a8154e7359b9e290112460", "aa4ef592efedfd60816af1b8f46705dc",
"56cc4caadab46e6badfc8d6dee7d42e1", "17c659034260a6b393f566f0499f4178",
"e05c45df62761524de921aca8c489b44", "26e0ec01fc14f58419a5553bc2e0503a",
"3d3b5834d32f858b65ee07fae02252cf", "49e224e43d352755a51e543c9d036a96"
), name = c("Headquarter", "xDirect", "Headquarters", "Headquarters",
"Headquarters", "Main office", "Headquarters", "Headquarter",
"Main office", "Main office", "Headquaters", "Headquarters",
"Headquarters", "Headquarters", "HQ", "Headquarters", "HQ", "Headquarters",
"Aberdeen (UK)", "Branch Office", "Headquarters", "HQ", "Africa",
"Main Campus", "", "HQ", "HQ", "HQ", "Main office", "HQ", "Headquarters",
"HQ", "Main Campus", "Headquarters", "HQ", "Main office", "Headquarters",
"Headquarters", "Headquarters", "HQ", "", "HQ", "Main office",
"HQ", "HQ", "HQ", "HQ", "Headquarters", "", "HQ"), street_1 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "33 Lazarov St.",
"", "", "", "Building 3, Sandgate Office Park", "Abercrombie Court",
"", "1000, Avenue Agropolis Montpellier,", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "2 passage of Mulhouse City Hall",
"GOLGASHT STREET, NO. 8", "", "", "", "Russia, Europe", "", "Three Seasons Office Park, Block A, 7 Spring Street",
"", "", "Industrial Area Rotem Heulentakstraat 8", "178 Madiba Street Pretoria",
"", ""), street_2 = c("", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "16 Desmond Street, Kramerville Sandton",
"Arnhall Business Park", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "KDPSTY17489", "", "", "",
"", "", "Rivonia, Sandton, South Africa", "", "", "Dilsen-Stokkem",
"", "", ""), postal_code = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "AB32 6FE", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "68100",
"19157", "", "", "", "", "", "", "", "", "", "1", "", ""), city = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "Rishon Lezion",
"", "", "", "", "Aberdeen, UK and Oslo, Norway", "", "France",
"France", "", "", "Mex", "", "", "", "", "", "", "", "", "",
"", "", "France", "", "", "", "Usa", "", "", "", "", "", "",
"", "Usa", ""), city_web_path = c("location/europe/6106f5dc823e5da840d751612c0b2c4e",
"location/asia/b41404d28ecccaf512688fdd3afe73ee", "location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1",
"location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1", "location/europe/6106f5dc823e5da840d751612c0b2c4e",
"location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1", "location/africa/5b194c3b3018b47df611ab142cdfd74c",
"location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1", "location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1",
"location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1", "location/europe/6106f5dc823e5da840d751612c0b2c4e",
"location/africa/5b194c3b3018b47df611ab142cdfd74c", "location/africa/5b194c3b3018b47df611ab142cdfd74c",
"", "location/europe/6106f5dc823e5da840d751612c0b2c4e", "location/europe/6106f5dc823e5da840d751612c0b2c4e",
"location/europe/6106f5dc823e5da840d751612c0b2c4e", "location/africa/5b194c3b3018b47df611ab142cdfd74c",
"", "location/europe/6106f5dc823e5da840d751612c0b2c4e", "", "",
"location/africa/5b194c3b3018b47df611ab142cdfd74c", "", "", "location/europe/6106f5dc823e5da840d751612c0b2c4e",
"location/europe/6106f5dc823e5da840d751612c0b2c4e", "location/africa/5b194c3b3018b47df611ab142cdfd74c",
"location/africa/5b194c3b3018b47df611ab142cdfd74c", "location/europe/6106f5dc823e5da840d751612c0b2c4e",
"location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1", "location/asia/b41404d28ecccaf512688fdd3afe73ee",
"", "location/europe/6106f5dc823e5da840d751612c0b2c4e", "location/europe/6106f5dc823e5da840d751612c0b2c4e",
"location/europe/6106f5dc823e5da840d751612c0b2c4e", "", "location/africa/5b194c3b3018b47df611ab142cdfd74c",
"location/europe/6106f5dc823e5da840d751612c0b2c4e", "location/europe/6106f5dc823e5da840d751612c0b2c4e",
"", "location/europe/6106f5dc823e5da840d751612c0b2c4e", "location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1",
"location/africa/5b194c3b3018b47df611ab142cdfd74c", "location/asia/b41404d28ecccaf512688fdd3afe73ee",
"location/europe/6106f5dc823e5da840d751612c0b2c4e", "location/europe/6106f5dc823e5da840d751612c0b2c4e",
"location/africa/5b194c3b3018b47df611ab142cdfd74c", "", "location/north-america/b25caef9a1b83a5d623293b2dfb6a1d1"
), region = c("", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", ""), region_code2 = c("", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", ""), region_web_path = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
""), country = c("", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", ""), country_code2 = c("", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", ""), country_code3 = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
""), country_web_path = c("", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", ""), latitude = c("", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", ""),
longitude = c("", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", "", "", "", "", "", ""), created_at = c(1472017733L,
1465290491L, 1462452917L, 1462449522L, 1453173099L, 1442981619L,
1441282477L, 1441188785L, 1440063839L, 1436866575L, 1449038192L,
1431511632L, 1431511671L, 1385115787L, 1426216594L, 1451463126L,
1425892972L, 1425395355L, 1366007686L, 1429092875L, 1422609471L,
1419929819L, 1425486709L, 1397988672L, 1397646249L, 1412939086L,
1412685691L, 1412669552L, 1412165155L, 1412059260L, 1427182640L,
1411738828L, 1397988677L, 1426923750L, 1410082123L, 1412660190L,
1422885253L, 1409737973L, 1453226669L, 1408033946L, 1390890580L,
1407478355L, 1407390601L, 1407497667L, 1406899810L, 1408455996L,
1406606186L, 1405405821L, 1396892107L, 1406895938L), updated_at = c(1472017733L,
1465290491L, 1462452917L, 1462449523L, 1453173099L, 1442981619L,
1441282599L, 1441188785L, 1440063840L, 1436866576L, 1449038328L,
1431511633L, 1431511671L, 1431452774L, 1426216595L, 1451463126L,
1425892973L, 1425395356L, 1424818704L, 1429092914L, 1422646398L,
1419967132L, 1425486709L, 1423708863L, 1423516571L, 1412939087L,
1412685692L, 1412669553L, 1412165258L, 1412059261L, 1427182641L,
1411738828L, 1415300534L, 1426923751L, 1410082123L, 1412660191L,
1422902286L, 1409737975L, 1453226669L, 1408033946L, 1416861739L,
1407478355L, 1407390602L, 1407497668L, 1406899810L, 1408455996L,
1406606187L, 1409264446L, 1416858194L, 1406895938L)), .Names = c("org_order",
"organizations.api_path", "type", "uuid", "name", "street_1",
"street_2", "postal_code", "city", "city_web_path", "region",
"region_code2", "region_web_path", "country", "country_code2",
"country_code3", "country_web_path", "latitude", "longitude",
"created_at", "updated_at"), row.names = c(6712L, 23888L, 30537L,
30568L, 66291L, 70998L, 74766L, 74967L, 77009L, 83681L, 89607L,
93685L, 93686L, 94208L, 106079L, 107094L, 107242L, 108555L, 110439L,
110980L, 114553L, 120627L, 123869L, 124297L, 133799L, 135863L,
136578L, 136699L, 137802L, 138269L, 138377L, 138976L, 139449L,
141300L, 143473L, 145303L, 145391L, 145574L, 146477L, 147807L,
149247L, 149327L, 149650L, 150870L, 151106L, 151466L, 151892L,
155304L, 156151L, 157890L), class = "data.frame")