Finding letters mixed with numbers?

Question

So I'm working with a dataset that has values of 16M instead of 16,000,000. How do I find the other currency denotations? This dataset is massive, so it's not feasible to manually check. Thank you!

Below are the first 250 samples. The structure of the dataset is as follows: country, year, GDP($), and classification of the data(GDP).

structure(list(country = c("Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", "Aruba", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", 
"Afghanistan"), year = c("1799", "1800", "1801", "1802", "1803", 
"1804", "1805", "1806", "1807", "1808", "1809", "1810", "1811", 
"1812", "1813", "1814", "1815", "1816", "1817", "1818", "1819", 
"1820", "1821", "1822", "1823", "1824", "1825", "1826", "1827", 
"1828", "1829", "1830", "1831", "1832", "1833", "1834", "1835", 
"1836", "1837", "1838", "1839", "1840", "1841", "1842", "1843", 
"1844", "1845", "1846", "1847", "1848", "1849", "1850", "1851", 
"1852", "1853", "1854", "1855", "1856", "1857", "1858", "1859", 
"1860", "1861", "1862", "1863", "1864", "1865", "1866", "1867", 
"1868", "1869", "1870", "1871", "1872", "1873", "1874", "1875", 
"1876", "1877", "1878", "1879", "1880", "1881", "1882", "1883", 
"1884", "1885", "1886", "1887", "1888", "1889", "1890", "1891", 
"1892", "1893", "1894", "1895", "1896", "1897", "1898", "1899", 
"1900", "1901", "1902", "1903", "1904", "1905", "1906", "1907", 
"1908", "1909", "1910", "1911", "1912", "1913", "1914", "1915", 
"1916", "1917", "1918", "1919", "1920", "1921", "1922", "1923", 
"1924", "1925", "1926", "1927", "1928", "1929", "1930", "1931", 
"1932", "1933", "1934", "1935", "1936", "1937", "1938", "1939", 
"1940", "1941", "1942", "1943", "1944", "1945", "1946", "1947", 
"1948", "1949", "1950", "1951", "1952", "1953", "1954", "1955", 
"1956", "1957", "1958", "1959", "1960", "1961", "1962", "1963", 
"1964", "1965", "1966", "1967", "1968", "1969", "1970", "1971", 
"1972", "1973", "1974", "1975", "1976", "1977", "1978", "1979", 
"1980", "1981", "1982", "1983", "1984", "1985", "1986", "1987", 
"1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995", 
"1996", "1997", "1998", "1999", "2000", "2001", "2002", "2003", 
"2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011", 
"2012", "1799", "1800", "1801", "1802", "1803", "1804", "1805", 
"1806", "1807", "1808", "1809", "1810", "1811", "1812", "1813", 
"1814", "1815", "1816", "1817", "1818", "1819", "1820", "1821", 
"1822", "1823", "1824", "1825", "1826", "1827", "1828", "1829", 
"1830", "1831", "1832", "1833", "1834"), value = c("16.9M", "16.9M", 
"16.9M", "16.9M", "16.9M", "16.9M", "16.9M", "16.9M", "16.9M", 
"16.9M", "16.9M", "16.9M", "16.9M", "16.9M", "16.9M", "16.9M", 
"16.9M", "16.9M", "16.9M", "16.9M", "16.9M", "17.1M", "17.3M", 
"17.5M", "17.7M", "17.9M", "18.1M", "18.4M", "18.6M", "18.8M", 
"19M", "19.3M", "19.5M", "19.7M", "19.9M", "20.2M", "20.4M", 
"20.7M", "20.9M", "21.2M", "21.4M", "21.7M", "21.9M", "22.2M", 
"22.4M", "22.7M", "23M", "23.3M", "23.5M", "23.8M", "24.1M", 
"24.4M", "24.7M", "25M", "25.3M", "25.6M", "25.9M", "26.2M", 
"26.5M", "26.8M", "27.1M", "27.4M", "27.8M", "28.1M", "28.4M", 
"28.8M", "29.1M", "29.5M", "29.8M", "30.2M", "30.5M", "30.9M", 
"31.3M", "31.6M", "32M", "32.4M", "32.8M", "33.1M", "33.5M", 
"34M", "34.4M", "34.8M", "35.2M", "35.6M", "36M", "36.4M", "36.9M", 
"37.3M", "37.8M", "38.2M", "38.7M", "39.1M", "39.6M", "40M", 
"40.5M", "41M", "41.5M", "42M", "42.5M", "43M", "43.5M", "44M", 
"44.5M", "45.1M", "45.6M", "46.2M", "46.7M", "47.3M", "47.8M", 
"48.4M", "49M", "49.5M", "50.1M", "50.7M", "52.3M", "53.8M", 
"55.4M", "57.1M", "58.8M", "60.5M", "62.3M", "64.2M", "66.1M", 
"68.1M", "70.1M", "72.2M", "74.4M", "76.6M", "78.9M", "81.3M", 
"83.7M", "86.2M", "88.8M", "91.4M", "94.2M", "97M", "99.9M", 
"103M", "106M", "109M", "112M", "116M", "119M", "123M", "126M", 
"130M", "134M", "138M", "142M", "146M", "151M", "155M", "162M", 
"171M", "182M", "196M", "210M", "226M", "243M", "259M", "274M", 
"287M", "298M", "308M", "317M", "327M", "337M", "347M", "358M", 
"369M", "380M", "413M", "450M", "490M", "533M", "579M", "628M", 
"679M", "734M", "795M", "861M", "934M", "1.02B", "1.1B", "1.2B", 
"1.31B", "1.43B", "1.66B", "1.98B", "2.21B", "2.3B", "2.48B", 
"2.61B", "2.79B", "3.02B", "3.1B", "3.16B", "3.43B", "3.7B", 
"3.76B", "3.94B", "3.92B", "3.81B", "3.8B", "4.07B", "4.11B", 
"4.13B", "4.23B", "4.07B", "3.79B", "3.58B", "3.9B", NA, NA, 
"2.08B", "2.08B", "2.08B", "2.08B", "2.08B", "2.08B", "2.08B", 
"2.08B", "2.08B", "2.08B", "2.08B", "2.08B", "2.08B", "2.08B", 
"2.08B", "2.08B", "2.08B", "2.08B", "2.08B", "2.08B", "2.08B", 
"2.1B", "2.12B", "2.13B", "2.15B", "2.17B", "2.19B", "2.2B", 
"2.22B", "2.24B", "2.26B", "2.28B", "2.3B", "2.32B", "2.33B", 
"2.35B"), dataset = c("GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", "GDP", 
"GDP")), row.names = c(NA, -250L), class = c("tbl_df", "tbl", 
"data.frame"))

`grep("^[[:digit:]]+[[:alpha:]]+", x, ignore.case = TRUE)` will give all strings that start with one or more digits followed by at least one letter. — Rui Barradas, Jul 01 '21 at 21:22
What other currency denotations? Try to make a [reproducible example](https://stackoverflow.com/q/5963269/5325862) that would better show what you're trying to do — camille, Jul 01 '21 at 21:35
I've added a screen shot of the dataset - I'm still learning how to create a reproducible example for this website. — hopefullyphdsoon, Jul 01 '21 at 21:48
If you already have the dataset imported into R, then `dput(head(my_data, n = 50))` will reproduce the top 50 rows (or whatever is sufficient for `n`). — Greg, Jul 01 '21 at 22:01
To be clear, it is the **console output** _from_ `dput(head(GDPLong_df, n = 50))` that allows others reproduce the data structure, by copying that output into their environment as code. — Greg, Jul 01 '21 at 22:09
My pleasure! This here should give you a vector of all "currency denotation suffixes" that follow a whole or decimal number: `unique(gsub(pattern = "^\\d+(.\\d+)*", replacement = "", x = GDPLong_df$value))`; here `NA` appears in the results if an original `value` was `NA` to begin with. — Greg, Jul 01 '21 at 22:19
Happy to help! Shall I post it as an answer? Mind you, it would need to be adjusted if there are any commas as a thousands separator (`"1,234"`) or as the decimal "point" (`"16,9M"`). — Greg, Jul 01 '21 at 22:22
Also for conversion purposes, some countries do use the ["long scale"](https://en.wikipedia.org/wiki/Billion), in which "billion" refers to 10^12 rather than 10^9. — Greg, Jul 01 '21 at 22:28
There are some options available https://stackoverflow.com/questions/45972571/changing-million-billion-abbreviations-into-actual-numbers-ie-5-12m-5-120-0 and https://stackoverflow.com/questions/38013217/convert-from-billion-to-million-and-vice-versa — Ronak Shah, Jul 02 '21 at 04:11
@Greg, yes you can post it as an answer - it worked beautifully. Thanks again! — hopefullyphdsoon, Jul 03 '21 at 20:59

Greg · Answer 1 · 2021-07-03T21:59:45.717

By the request of OP, I am posting my informal solution in answer to the specific question

How do I find the other currency denotations?

From the value column in GDPLong_df, I simply remove the numeric (possibly decimal) "prefix", to leave the remaining text as the "suffix".

# Extract the denotation suffixes.
currency_denotations <- unique(gsub(pattern = "^\\d+(.\\d+)*", replacement = "",
                                    x = GDPLong_df$value))
# OPTIONALLY: Remove NAs if desired.
currency_denotations <- currency_denotations[!is.na(currency_denotations)]

Applied to OP's sample data, the output for currency_denotations gives us the denotations

"M" "B"

which excludes any NAs resulting from values that were NA to begin with.

Note

Further regex operations could be used to

control for whitespace
ensure the "suffix" is strictly alphabetical
control for commas as thousands separators ("1,234")
control for commas as decimal "points" ("16,9M")

but OP has already confirmed the existing solution is sufficient for their needs.

Warning

Caution should be used when converting from terms like "billion" (and "trillion", etc.), as their magnitude differs according to the conventions used by various countries: namely the long and short scales. For example, the former uses "billion" to describe 10^12, whereas the latter uses it to describe 10^9. However, I imagine that OP's dataset uses a standardized convention across the value column.

Jon Spring · Answer 2 · 2021-07-02T00:41:14.397

Here's an approach to extract the alphanumeric suffixes and multiply their meaning by the number.

library(tidyverse)
df1 %>%
  mutate(mult_txt = value %>% str_extract("[a-z]|[A-Z]"),
         multiplier = case_when(
           mult_txt == "B" ~ 1E9,
           mult_txt == "M" ~ 1E6,
           TRUE ~ 1),
         value_adj = parse_number(value) * multiplier)

Result

# A tibble: 250 x 7
   country year  value dataset mult_txt multiplier value_adj
   <chr>   <chr> <chr> <chr>   <chr>         <dbl>     <dbl>
 1 Aruba   1799  16.9M GDP     M           1000000  16900000
 2 Aruba   1800  16.9M GDP     M           1000000  16900000
 3 Aruba   1801  16.9M GDP     M           1000000  16900000
 4 Aruba   1802  16.9M GDP     M           1000000  16900000
 5 Aruba   1803  16.9M GDP     M           1000000  16900000
 6 Aruba   1804  16.9M GDP     M           1000000  16900000
 7 Aruba   1805  16.9M GDP     M           1000000  16900000
 8 Aruba   1806  16.9M GDP     M           1000000  16900000
 9 Aruba   1807  16.9M GDP     M           1000000  16900000
10 Aruba   1808  16.9M GDP     M           1000000  16900000
# … with 240 more rows

...and a test with more values:

options(scipen = 999) # so the output isn't shown in scientific notation
data.frame(value = c("16.9M", "900000", "192.0Z", "3B")) %>%
  mutate(mult_txt = value %>% str_extract("[a-z]|[A-Z]"),
         multiplier = case_when(
           mult_txt == "B" ~ 1E9,
           mult_txt == "M" ~ 1E6,
           TRUE ~ 1),
         value_adj = parse_number(value) * multiplier)

   value mult_txt multiplier  value_adj
1  16.9M        M    1000000   16900000
2 900000     <NA>          1     900000
3 192.0Z        Z          1        192
4     3B        B 1000000000 3000000000

Finding letters mixed with numbers?

2 Answers2

Note

Warning