I have a file that contains:
user_name     order_id     M_Status
jOHN     1000     married to Emma
Each "column" is separated from the following one by 5 spaces, but the number of spaces can be different in another file. Because there is only a single space between the words under the M_Status column, splitting by (" +") didn't work: the M_Status value needs to stay one string. So what I'm trying to do is count the spaces between the columns in the first line, then split all the remaining lines by that number of spaces (5 here, but it could change in another file).
UPDATE:
/** Candidate delimiters, each written as a regex fragment (hence the escaped pipe and tab). */
val delimitersList = List(",", ";", ":", "\\|", "\\t", " ")

/**
 * Guesses which delimiter from `delimitersList` separates the fields of `line`
 * and splits the line on it.
 *
 * Every candidate is scored by the number of runs of it found in `line`
 * (regex `<delimiter>+`). The three best-scoring candidates, with ties broken
 * by their order in `delimitersList`, feed the heuristics below.
 *
 * NOTE: scoring counts runs of a delimiter, but the split uses a single
 * occurrence, so a column gap made of several spaces produces empty fields.
 *
 * @param line        text to analyse and split
 * @param sep         not used by the current implementation; kept for callers
 * @param typeToCheck "map" and "header" (case-insensitive) enable dedicated
 *                    heuristics; any other value uses the generic ones
 * @return the fields of `line` (trailing empty fields kept) and the delimiter
 *         regex used; `(List(line), "")` if a non-fatal exception occurs
 */
def findCommonDelimiter(line: String, sep: Option[String], typeToCheck: String): (List[String], String) = {
  val delimiterCounts = delimitersList.map(d => d -> (d + "+").r.findAllIn(line).length)

  try {
    // sortBy is stable, so equally scored delimiters keep their delimitersList order.
    val ranked = delimiterCounts.sortBy { case (_, runs) => -runs }.take(3)
    val (firstDelimiter, firstDelimiterCount) = ranked(0)
    val (secondDelimiter, secondDelimiterCount) = ranked(1)
    val thirdDelimiter = ranked(2)._1

    // Literal-first comparison keeps a null typeToCheck from throwing.
    val isMap = "map".equalsIgnoreCase(typeToCheck)
    val isHeader = "header".equalsIgnoreCase(typeToCheck)
    val spaceLed = firstDelimiter == " "
    val tabLed = firstDelimiter == "\\t"

    val splitChar =
      if (firstDelimiter != "," && secondDelimiter == "," && secondDelimiterCount > 0 && !isMap) {
        // Commas are present but outnumbered by another delimiter: commas win.
        ","
      } else if (spaceLed || tabLed) {
        val firstLine = line.split("\\r?\\n").headOption.getOrElse("")
        val secondIsListSeparator =
          (secondDelimiter == "," || secondDelimiter == ";") && secondDelimiterCount > 0

        if (isMap && secondIsListSeparator && firstLine.split(thirdDelimiter, 2).length == 2)
          thirdDelimiter
        else if (isMap && firstLine.split(secondDelimiter, 2).length == 2)
          secondDelimiter
        else if (isHeader && tabLed)
          "\\t"
        else if (spaceLed && secondDelimiterCount > 0 &&
                 firstDelimiterCount - secondDelimiterCount >= firstDelimiterCount / 2)
          secondDelimiter
        else
          // Previously a "header" line reaching this point could be left with an
          // empty delimiter, which split the line into single characters.
          firstDelimiter
      } else {
        firstDelimiter
      }

    // Every candidate is already a valid regex, so one split call covers them all.
    (line.split(splitChar, -1).toList, splitChar)
  } catch {
    case scala.util.control.NonFatal(e) =>
      e.printStackTrace()
      (List(line), "")
  }
}
Thanks