0

for the following code:

import Foundation

// Helpers for splitting a String around the matches of a regular expression.
// NOTE(review): `split(regex:)` below combines two different offset units; see the
// comments inside it.
extension String {
    /// An `NSRange` built from `startIndex ..< endIndex`, i.e. spanning the whole string.
    var fullRange: NSRange {
        return .init(self.startIndex ..< self.endIndex, in: self)
    }
    /// Returns the slice selected by integer offsets measured from `startIndex`.
    ///
    /// Both bounds are resolved with `index(_:offsetBy:)`, so every step advances by one
    /// `Character` (grapheme cluster), not by one UTF-16 code unit.
    public subscript(range: Range<Int>) -> Self.SubSequence {
        let st = self.index(self.startIndex, offsetBy: range.startIndex)
        let ed = self.index(self.startIndex, offsetBy: range.endIndex)
        let sub = self[st ..< ed]
        return sub
    }

    /// Splits the string around every match of `pattern` and returns the pieces in order.
    ///
    /// - Parameter pattern: Regular-expression source handed to `NSRegularExpression`.
    /// - Returns: The text before each match, plus the text after the last match when
    ///   `start` is still below `count`.
    /// - Throws: Whatever `NSRegularExpression.init(pattern:options:)` throws.
    func split(regex pattern: String) throws -> [String] {
        let regex = try NSRegularExpression.init(pattern: pattern, options: [])
        let fRange = self.fullRange
        let match = regex.matches(in: self, options: [], range: fRange)

        var list = [String]()
        var start = 0
        for m in match {
            // `m.range` is an NSRange: `location` and `length` are UTF-16 code-unit offsets.
            let r = m.range
            let end = r.location

            // NOTE(review): `start`/`end` are UTF-16 offsets, but the Int-range subscript
            // above counts Characters. The two disagree as soon as a Character is made of
            // more than one UTF-16 unit (e.g. a letter followed by a combining mark).
            list.append(String(self[start ..< end]))
            start = end + r.length
        }
        // `count` is the number of Characters, compared here against a UTF-16 based offset.
        if start < self.count {
            list.append(String(self[start ..< self.count]))
        }
        return list
    }
}

// Demo call: split a two-line sample string on the newline separator and print the pieces.
print(try! "مرتفع جداً\nVery High".split(regex: "\n"))

the output should be :

["مرتفع جداً", "Very High"]

but instead it is:

["مرتفع جداً\n", "ery High"]

That is because the regex (in this case) matched the \n at offset 10 instead of 9.

Is there anything wrong in my code, or is it a bug in Swift's regex handling?

Ala'a Al Hallaq
  • 455
  • 5
  • 11

2 Answers

2

It's not a bug. You are trying to use Int indexes, which is error-prone and strongly discouraged in a Unicode environment.

This is the equivalent of your code with the proper String.Index type and the dedicated API to convert NSRange to Range<String.Index> and vice versa. fullRange and subscript are obsolete.

I just left out the print line. startIndex and endIndex are properties of String

extension String {
    /// Splits the receiver around every match of a regular expression.
    ///
    /// The text preceding each match becomes one element of the result (it may be empty,
    /// e.g. for a leading separator or two adjacent separators). Text following the last
    /// match is appended only when it is non-empty.
    ///
    /// - Parameter pattern: The regular-expression source used as the separator.
    /// - Returns: The pieces of the string between matches, in order of appearance.
    /// - Throws: The error raised by `NSRegularExpression` when `pattern` is invalid.
    func split(regex pattern: String) throws -> [String] {
        let expression = try NSRegularExpression(pattern: pattern)
        let wholeString = NSRange(startIndex ..< endIndex, in: self)

        var pieces: [String] = []
        var cursor = startIndex
        for result in expression.matches(in: self, range: wholeString) {
            // Foundation reports UTF-16 based ranges; translate them into
            // `String.Index` bounds before slicing.
            let separator = Range(result.range, in: self)!
            pieces.append(String(self[cursor ..< separator.lowerBound]))
            cursor = separator.upperBound
        }

        // Keep whatever follows the final separator, unless nothing is left.
        if cursor < endIndex {
            pieces.append(String(self[cursor...]))
        }
        return pieces
    }
}

// Split the two-line sample on its newline and show the resulting pieces.
let samplePieces = try! "مرتفع جداً\nVery High".split(regex: "\n")
print(samplePieces)

The result is ["مرتفع جداً", "Very High"]

Ala'a Al Hallaq
  • 455
  • 5
  • 11
vadian
  • 274,689
  • 30
  • 353
  • 361
0

I found the cause of this behavior. Swift strings are quite different from strings in most other languages: a single `Character` can be made up of one or more Unicode scalars, each stored as a 4-byte value (which is what happened in my case). So the solution is to slice the `unicodeScalars` view of the Swift String instead of the string itself.

Ala'a Al Hallaq
  • 455
  • 5
  • 11