Extract complex ranges of numbers from string?

Question

My input is a dictionary with a key "RANGE" where the value can be:

marker_range = "10-20"
marker_range = "10    - 23"
marker_range = "10.433 - 33,99"
marker_range = "50,111, 44.00"

and so on.

Here is what I have tried to extract the ranges to be:

[10, 20]
[10, 23]
[10.433, 33.99]
[50.111, 44.0]

# Create pattern to search: 
# Find commas: [\d] + [.,\d]+
# Find floats: [\d]*[.][\d]+
# Find integers: [\d]+
pattern = '[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+'
# Search the pattern in a given range:
search_res = re.search(pattern, marker_range)

# Set default range:
final_marker_range = []

# Reformat marker range to list:
if search_res is not None:
    for catch in re.finditer(pattern, marker_range):
        final_marker_range.append(float(catch[0]))
        if "<" in marker_range:
            final_marker_range.insert(0, -float("inf"))
        if ">" in marker_range:
            final_marker_range.insert(len(final_marker_range), float("inf"))
# Update the dictionary with the new range format:
marker_dict["RANGE"] = final_marker_range 
else:
    # Update the dictionary with the new default range:
    marker_dict["RANGE"] = [float("-inf"), float("inf")]

Please advise how to solve this, maybe there is a library that "knows" how to parse such complex ranges?

The code you've posted has invalid indentation; in particular, the line before `else:`. — khelwood, Sep 10 '20 at 12:01

score 0 · Answer 1 · answered Sep 10 '20 at 12:04

0

You can try this:

(?P<first>\d+[.,]?\d*)(?:\s*[\-,]\s*)(?P<second>\d+[.,]?\d*)

The first capture group grabs a number with or without decimal, separated either by comma's or dots. The non-capture group is the range indicator. The second group captures a second number with or without decimals (similar to the first).

answered Sep 10 '20 at 12:04

Exelian

5,749
1
30
49

What is the difference of just taking the range field? I get the same value. I need it to be wisely splitted. – SteveS Sep 10 '20 at 12:11

score 0 · Accepted Answer · answered Sep 10 '20 at 12:29

Finally found a great solution, posting for others to benefit, this is the function that solved the parsing for my regex. So the line of code I have used:

final_marker_range.append(float(catch[0]))

Will be:

final_marker_range.append(parseNumber(catch[0]))

https://github.com/hayj/SystemTools/blob/master/systemtools/number.py

# coding: utf-8

import re

def parseNumber(text):
    """
        Return the first number in the given text for any locale.
        TODO we actually don't take into account spaces for only
        3-digited numbers (like "1 000") so, for now, "1 0" is 10.
        TODO parse cases like "125,000.1,0.2" (125000.1).
        :example:
        >>> parseNumber("a 125,00 €")
        125
        >>> parseNumber("100.000,000")
        100000
        >>> parseNumber("100 000,000")
        100000
        >>> parseNumber("100,000,000")
        100000000
        >>> parseNumber("100 000 000")
        100000000
        >>> parseNumber("100.001 001")
        100.001
        >>> parseNumber("$.3")
        0.3
        >>> parseNumber(".003")
        0.003
        >>> parseNumber(".003 55")
        0.003
        >>> parseNumber("3 005")
        3005
        >>> parseNumber("1.190,00 €")
        1190
        >>> parseNumber("1190,00 €")
        1190
        >>> parseNumber("1,190.00 €")
        1190
        >>> parseNumber("$1190.00")
        1190
        >>> parseNumber("$1 190.99")
        1190.99
        >>> parseNumber("$-1 190.99")
        -1190.99
        >>> parseNumber("1 000 000.3")
        1000000.3
        >>> parseNumber('-151.744122')
        -151.744122
        >>> parseNumber('-1')
        -1
        >>> parseNumber("1 0002,1.2")
        10002.1
        >>> parseNumber("")
        >>> parseNumber(None)
        >>> parseNumber(1)
        1
        >>> parseNumber(1.1)
        1.1
        >>> parseNumber("rrr1,.2o")
        1
        >>> parseNumber("rrr1rrr")
        1
        >>> parseNumber("rrr ,.o")
    """
    try:
        # First we return None if we don't have something in the text:
        if text is None:
            return None
        if isinstance(text, int) or isinstance(text, float):
            return text
        text = text.strip()
        if text == "":
            return None
        # Next we get the first "[0-9,. ]+":
        n = re.search("-?[0-9]*([,. ]?[0-9]+)+", text).group(0)
        n = n.strip()
        if not re.match(".*[0-9]+.*", text):
            return None
        # Then we cut to keep only 2 symbols:
        while " " in n and "," in n and "." in n:
            index = max(n.rfind(','), n.rfind(' '), n.rfind('.'))
            n = n[0:index]
        n = n.strip()
        # We count the number of symbols:
        symbolsCount = 0
        for current in [" ", ",", "."]:
            if current in n:
                symbolsCount += 1
        # If we don't have any symbol, we do nothing:
        if symbolsCount == 0:
            pass
        # With one symbol:
        elif symbolsCount == 1:
            # If this is a space, we just remove all:
            if " " in n:
                n = n.replace(" ", "")
            # Else we set it as a "." if one occurence, or remove it:
            else:
                theSymbol = "," if "," in n else "."
                if n.count(theSymbol) > 1:
                    n = n.replace(theSymbol, "")
                else:
                    n = n.replace(theSymbol, ".")
        else:
            # Now replace symbols so the right symbol is "." and all left are "":
            rightSymbolIndex = max(n.rfind(','), n.rfind(' '), n.rfind('.'))
            rightSymbol = n[rightSymbolIndex:rightSymbolIndex+1]
            if rightSymbol == " ":
                return parseNumber(n.replace(" ", "_"))
            n = n.replace(rightSymbol, "R")
            leftSymbolIndex = max(n.rfind(','), n.rfind(' '), n.rfind('.'))
            leftSymbol = n[leftSymbolIndex:leftSymbolIndex+1]
            n = n.replace(leftSymbol, "L")
            n = n.replace("L", "")
            n = n.replace("R", ".")
        # And we cast the text to float or int:
        n = float(n)
        if n.is_integer():
            return int(n)
        else:
            return n
    except: pass
    return None


def truncateFloat(f, n=2):
    '''Truncates/pads a float f to n decimal places without rounding'''
    s = '{}'.format(f)
    if 'e' in s or 'E' in s:
        return float('{0:.{1}f}'.format(f, n))
    i, p, d = s.partition('.')
    return float('.'.join([i, (d+'0'*n)[:n]]))



def removeCommasBetweenDigits(text):
    """
        :example:
        >>> removeCommasBetweenDigits("sfeyv dsf,54dsf ef 6, 6 zdgy 6,919 Photos and 3,3 videos6,")
        'sfeyv dsf,54dsf ef 6, 6 zdgy 6919 Photos and 33 videos6,'
    """
    if text is None:
        return None
    else:
        return re.sub(r"([0-9]),([0-9])", "\g<1>\g<2>", text)

def getAllNumbers(text, removeCommas=False):
    if text is None:
        return None
    if removeCommas:
        text = removeCommasBetweenDigits(text)
    allNumbers = []
    if len(text) > 0:
        # Remove space between digits :
        spaceNumberExists = True
        while spaceNumberExists:
            text = re.sub('(([^.,0-9]|^)[0-9]+) ([0-9])', '\\1\\3', text, flags=re.UNICODE)
            if re.search('([^.,0-9]|^)[0-9]+ [0-9]', text) is None:
                spaceNumberExists = False
        numberRegex = '[-+]?[0-9]+[.,][0-9]+|[0-9]+'
        allMatchIter = re.finditer(numberRegex, text)
        if allMatchIter is not None:
            for current in allMatchIter:
                currentFloat = current.group()
                currentFloat = re.sub("\s", "", currentFloat)
                currentFloat = re.sub(",", ".", currentFloat)
                currentFloat = float(currentFloat)
                if currentFloat.is_integer():
                    allNumbers.append(int(currentFloat))
                else:
                    allNumbers.append(currentFloat)
    return allNumbers

def removeAllNumbers(text):
    if text is None:
        return None
    if len(text) == 0:
        return ""
    # Remove space between digits :
    spaceNumberExists = True
    while spaceNumberExists:
        text = re.sub('([0-9]) ([0-9])', '\\1\\2', text, flags=re.UNICODE)
        if re.search('[0-9] [0-9]', text) is None:
            spaceNumberExists = False
    numberRegex = '[-+]?[0-9]+[.,][0-9]+|[0-9]+'
    numberExists = True
    while numberExists:
        text = re.sub(numberRegex, "", text)
        if re.search(numberRegex, text) is None:
            numberExists = False

    return text.strip()

def getFirstNumber(text, *args, **kwargs):
    result = getAllNumbers(text, *args, **kwargs)
    if result is not None and len(result) > 0:
        return result[0]
    return None

def representsFloat(text):
    """
        This function return True if the given param (string or float) represents a float
        :Example:
        >>> representsFloat("1.0")
        True
        >>> representsFloat("1")
        False
        >>> representsFloat("a")
        False
        >>> representsFloat(".0")
        False
        >>> representsFloat("0.")
        False
        >>> representsFloat("0.000001")
        True
        >>> representsFloat("00000.000001")
        True
        >>> representsFloat("0000a0.000001")
        False
    """
    if isinstance(text, float):
        return True
    elif text is None:
        return False
    elif isinstance(text, str):
        if len(text) < 3:
            return False
        text = text.strip()
        return re.search("^[0-9]{1,}\.[0-9]{1,}$", text) is not None
    else:
        return False
    


def representsInt(s, acceptRoundedFloats=False):
    """
        This function return True if the given param (string or float) represents a int
        :Example:
        >>> representsInt(1)
        True
        >>> representsInt("1")
        True
        >>> representsInt("a")
        False
        >>> representsInt("1.1")
        False
        >>> representsInt(1.1)
        False
        >>> representsInt(42.0, acceptRoundedFloats=True)
        True
        >>> representsInt("42.0", acceptRoundedFloats=True)
        True
    """

    if isinstance(s, float):
        if acceptRoundedFloats:
            return s.is_integer()
    else:
        if acceptRoundedFloats:
            try:
                s = float(s)
                return representsInt(s, acceptRoundedFloats=acceptRoundedFloats)
            except ValueError:
                return False
        else:
            try:
                int(s)
                return True
            except ValueError:
                return False
    return False


def floatAsReadable(f):
    """
        source https://stackoverflow.com/questions/8345795/force-python-to-not-output-a-float-in-standard-form-scientific-notation-expo
    """
    _ftod_r = re.compile(br'^(-?)([0-9]*)(?:\.([0-9]*))?(?:[eE]([+-][0-9]+))?$')
    """Print a floating-point number in the format expected by PDF:
    as short as possible, no exponential notation."""
    s = bytes(str(f), 'ascii')
    m = _ftod_r.match(s)
    if not m:
        raise RuntimeError("unexpected floating point number format: {!a}"
                           .format(s))
    sign = m.group(1)
    intpart = m.group(2)
    fractpart = m.group(3)
    exponent = m.group(4)
    if ((intpart is None or intpart == b'') and
        (fractpart is None or fractpart == b'')):
        raise RuntimeError("unexpected floating point number format: {!a}"
                           .format(s))

    # strip leading and trailing zeros
    if intpart is None: intpart = b''
    else: intpart = intpart.lstrip(b'0')
    if fractpart is None: fractpart = b''
    else: fractpart = fractpart.rstrip(b'0')

    result = None

    if intpart == b'' and fractpart == b'':
        # zero or negative zero; negative zero is not useful in PDF
        # we can ignore the exponent in this case
        result = b'0'

    # convert exponent to a decimal point shift
    elif exponent is not None:
        exponent = int(exponent)
        exponent += len(intpart)
        digits = intpart + fractpart
        if exponent <= 0:
            result = sign + b'.' + b'0'*(-exponent) + digits
        elif exponent >= len(digits):
            result = sign + digits + b'0'*(exponent - len(digits))
        else:
            result = sign + digits[:exponent] + b'.' + digits[exponent:]

    # no exponent, just reassemble the number
    elif fractpart == b'':
        result = sign + intpart # no need for trailing dot
    else:
        result = sign + intpart + b'.' + fractpart

    result = result.decode("utf-8")
    if result.startswith("."):
        result = "0" + result
    return result



def digitalizeIntegers(text, totalDigits=100):
    if text is None or not isinstance(text, str) or text == "":
        return text
    result = str(text)
    toEdit = []
    for current in re.finditer("[0-9]+", text):
        theInt = current.group(0)
        start = current.start(0)
        end = current.end(0)
        remainingDigits = totalDigits - len(theInt)
        digitalizedInt = "0" * remainingDigits + theInt
        toEdit.append((digitalizedInt, start, end))
    for digitalizedInt, start, end in reversed(toEdit):
        # print(digitalizedInt, start, end)
        result = result[:start] + digitalizedInt + result[end:]
    return result

def main():
    allTexts = \
    [
        "ttt1ttt3t",
        "zzz23.32zzz8",
        "3.0z",
        "aaaaa",
        "bb",
        None,
        "1111111111111111111111111111111111111111111",
        "5",
        "0",
    ]
    for current in allTexts:
        print(current)
        print(digitalizeIntegers(current))
        print()

if __name__ == '__main__':
    # print(list(range(1, -1, -1)))
    main()

Extract complex ranges of numbers from string?

2 Answers2