0

I have a situation wherein i need to parse the data of cell containing multiline text EXAMPLE SHEET with expected desired result. I believe we can create such parsing by using regx but I am so dumb that cant figure out what is wrong with my code.

Sub AddDetails(c As Range)
Dim x       As Variant
    Dim y       As Variant
    Dim a()     As Variant
    Dim r       As Long
    Dim i       As Long
    Dim j       As Long
ActiveSheet.Cells(1, col + 1).Value = "Student Name"
ActiveSheet.Cells(1, col + 2).Value = "ART"
ActiveSheet.Cells(1, col + 3).Value = "Non-Final Result"
ActiveSheet.Cells(1, col + 4).Value = "Final Result"
    For r = 2 To Cells(rowS.Count, 1).End(xlUp).Row
        y = "Student Name=" & SplitMe(Range(col & r).Value, "Student Name")(1)

        x = Split(y, vbLf)
        For i = LBound(x) To UBound(x)
            If InStr(x(i), "=") Then
                ReDim Preserve a(j)
                a(UBound(a)) = Split(x(i), "=")(1)
                j = j + 1
            End If
        Next i
        Range("C" & r).Resize(, UBound(a) + 1).Value = a
        Erase x: Erase a: j = 0
    Next r
End Sub

Function FindColumn(searchFor As String) As Integer
  Dim i As Integer
    'Search row 1 for searchFor
    FindColumn = 0
    For i = 1 To ActiveSheet.Cells.SpecialCells(xlCellTypeLastCell).Column
        If ActiveSheet.Cells(1, i).Value = searchFor Then
            FindColumn = i
            Exit For
        End If
    Next i
End Function

Function SplitMe(s As String, delimiter As String)
    Dim arr, i As Long

    If Len(s) = 0 Or Len(delimiter) = 0 Then
        SplitByLastOccurrence = CVErr(2001)
    Else
        i = InStrRev(s, delimiter)
        If i = 0 Then
            SplitByLastOccurrence = Array(s)
        Else
            ReDim arr(0 To 1)
            arr(0) = Trim(Left$(s, i - 1))
            arr(1) = Trim(Mid$(s, i + Len(delimiter) + 1))
            SplitByLastOccurrence = arr
        End If
    End If
End Function

What I am trying to achieve is parse data from cell followed by key phrase STUDENT NAME and ART

on Top of it I have some dates which may be only one or multiple present after the key line 'Non-Final Result' if event date is present after this phrase that need to be given in respective column and if its multiple then need to stack them in same column cell.

update problem with interpretation of date

Event date=2016-09-02 
Event code=UU/CZXCD 
Event type=Examination events 
Event type=AS 
Event type=ASED 
Non-Final Result
Event date=2017-08-05 
Event code=UU/CZXSA 
Event type=Examination events 
Event type=AS 
Event type=ASED 
Final Result
Event date=2017-09-08 
Event code=UU/CZXCD 
Event type=Examination events 
Event type=AS 
Event type=ASED 
Non-Final Result
Community
  • 1
  • 1
Monika
  • 74
  • 7

2 Answers2

2

Here is one way of doing this using Regular expressions to extract each section of data. It reproduces what you have in your sample sheet

  • All your data is assumed to be in Column A starting at A2
  • The patterns for the different segments to be extracted are assumed to be exactly as you show. If they are not, then the regex as written will not work.
  • I use a Dictionary object to store the results. Makes it a bit easier for me to put things together so as to write the results to the worksheet when done
  • I used early binding (set those references); but you could rewrite to use late binding if necessary

EDIT: Code has been edited to account for the different method of determining Final and non-Final result dates and also RCE.

EDIT2: Certain terms redacted per request of the poster

I left in the same logic for determining Name and ART (except for changing Student Name to xxxxx.

The logic I use for returning the dates is:

  • Look for a line that starts with Event Publication Date=
  • Extract the date at the end of that line
    • if and only if it is followed by the appropriate text string (Final Result, Non-Final Result, xxxxx) with NO dates in-between.

Option Explicit
'set reference to Microsoft Scripting Runtime
'                 Microsoft VBScript Regular Expressions 5.5

Private RE As RegExp
Private MC As MatchCollection

Sub StudentDetail()
    Dim dS As Dictionary
    Dim WS As Worksheet
    Dim vSrc As Variant, vRes As Variant, rRes As Range
    Dim V As Variant, I As Long, J As Long
    Dim S As String

'Read data into vSrc
Set WS = Worksheets("sheet1")
With WS
    vSrc = .Range(.Cells(1, 1), .Cells(.Rows.Count, 1).End(xlUp))
End With

'First cell of results array
Set rRes = WS.Cells(1, 2)

'Initialize Regex
Set RE = New RegExp
With RE
    .Global = True
    .MultiLine = True
    .IgnoreCase = True
End With

'Collect the data into a dictionary
Set dS = New Dictionary
    dS.CompareMode = TextCompare
For I = 2 To UBound(vSrc, 1)
    ReDim V(4)
    S = vSrc(I, 1)

    'Name
    V(0) = reExtract(S, "Primary xxxxx:\s+(.*)")

    'ART
    V(1) = reExtract(S, "ART=(.*)")

    'NonFinal
    V(2) = reExtract(S, "Event publication date=(\d{4}-\d{2}-\d{2})(?:(?!\d{4}-\d{2}-\d{2})[\s\S])*Non-Final")

    'Final
    V(3) = reExtract(S, "Event publication date=(\d{4}-\d{2}-\d{2})(?:(?!\d{4}-\d{2}-\d{2})[\s\S])*^Final Rejection")

    'RCE
    V(4) = reExtract(S, "Event publication date=(\d{4}-\d{2}-\d{2})(?:(?!\d{4}-\d{2}-\d{2})[\s\S])*xxxxxx")

    If Not dS.Exists(V(0)) Then
        dS.Add Key:=V(0), Item:=V
    Else
        MsgBox "duplicate name"
        Stop  'You need to decide what to do
    End If
Next I

'Output the results to array
ReDim vRes(0 To dS.Count, 1 To 5)
    vRes(0, 1) = "xxxxx"
    vRes(0, 2) = "ART"
    vRes(0, 3) = "Non-Final Result"
    vRes(0, 4) = "Final Result"
    vRes(0, 5) = "RCE"

For I = 0 To dS.Count - 1
    V = dS(dS.Keys(I))
    For J = 0 To 4
        If IsArray(V(J)) Then
            vRes(I + 1, J + 1) = Join(V(J), vbLf)
        Else
            vRes(I + 1, J + 1) = V(J)
        End If
    Next J
Next I

'write array to worksheet
Set rRes = rRes.Resize(UBound(vRes, 1) + 1, UBound(vRes, 2))
With rRes
    .EntireColumn.Clear
    .Value = vRes
    With .Rows(1)
        .Font.Bold = True
        .HorizontalAlignment = xlCenter
    End With
    .EntireColumn.AutoFit
End With

End Sub

Function reExtract(S As String, sPat As String)
    Dim V As Variant
    Dim I As Long
With RE
    .Pattern = sPat
    If .Test(S) = True Then
        Set MC = .Execute(S)
        With MC
            If .Count = 1 Then
                reExtract = MC(0).SubMatches(0)
            Else
                ReDim V(0 To .Count - 1)
                I = 0
                For I = 0 To .Count - 1
                    V(I) = MC(I).SubMatches(0)
                Next I
                reExtract = V
            End If
        End With
    End If
End With
End Function
Ron Rosenfeld
  • 53,870
  • 7
  • 28
  • 60
  • Comments are not for extended discussion; this conversation has been [moved to chat](http://chat.stackoverflow.com/rooms/157949/discussion-on-answer-by-ron-rosenfeld-parsing-excel-single-cell-data-based-on-co). – Andy Oct 31 '17 at 20:19
  • Thanks @ron I have exceeded question limit if you could help it will be highly appriciated i came up with V(0) = reExtract(S, "( +(^[A-Za-z]{2}.*\d)") to get the first key but stuck with If .Test(S) = True Then .. let me know if you can help – Monika Nov 01 '17 at 14:44
1

Well, a key part of your RegEx VBA not working could be that you haven't included any RegEx-related code. :) However I see that you tried to put together some code snippets into a working example, so you get points for effort!

Looking at your data sample I have to wonder what the big picture is, as in, where this data is coming from, and if there is a better way to parse it than manually with VBA. For example, if this is linked or imported code from a database or website, possibly in large quantities, perhaps Excel's built in "Get External Data" features would be better suited, or maybe you should be leaning towards Access instead.

Nonetheless I was curious what it would take to parse to your specification, and I put together a solution that works, at least on the single example you provided (without RegEx). An updated copy of your sample is uploaded here (although I'm not sure if it will download correctly, or if the VBA will be blocked).

Option Explicit

'These functions can be used two ways:
' 1. Call sub [populateStudentData] and then programmatically put the extracted data where it needs to go (like sub "sTest" does below)
' 2. Call function [studentData] in a worksheet cell to return the desired field
'      - Syntax:   =studentData ( rawData_In, fieldName_Out )
'      - example:  =studentData ( A2, "Final Result" )  : returns all the "Final Result" dates found in cell A2
'      * NOTE that WRAP TEXT in "Cell Format > Alignment" must be on for multi lines (via vbLf) to display properly with this method
'        WRAP TEXT can also be set programmatically with cell.WrapText (see: https://stackoverflow.com/a/9902628/8112776)
'      - less efficient since it parses all the data for each cell & each field, on every calculation,
'        but shouldn't be a problem unless the function is used in a LOT of cells (in which case it start getting slow to re-calculate)

'Slapped together by ashleedawg@outlook.com for 'SO' Question: https://stackoverflow.com/questions/46996095/parsing-excel-single-cell-data-based-on-condition
'This code contain *zero error checking* and limited documentation.  Google any terms with which you are unfamiliar.
'Created with limited information on the actual application, this is FAR-from the most efficient way to parse data! "Provided for educational purposes only!"
'Perhaps using Collections or Objects would be more efficient.  Please feel free to improve & re-post this code, comments, suggestions, etc.

'declare an array to temporarily store:
'  "sField" = text before the delimiter, in:  "arrStudentData(__,0)"
'  "sValue" = text after the delimiter (if there is one), in:  "arrStudentData(__,1)" (Multiple values will be concatenated, delimited with vbLf's)
'      "sDelimiter" = character that separates the sField from the SValue:  either a Colon or an Equal Sign (: or =)

Public arrStudentData(1 To 99, 0 To 1) As String

Public Function studentData(rawData_In As String, fieldName_Out As String) As String
'parse [rawData_In] and return concatenated string of "sValue" data for "sField" specified in [fieldName_out]
'the concatenated output of the function can be SPLIT (or values Text-To-Column'd) based on [newDelimiter] (vbLf by default]
'call this function on a workdheet or programmatically
    populateStudentData (rawData_In)
    studentData = getField(fieldName_Out)
End Function

Sub sTest()
    'as a test/debugging, let's parse data from cell A2
    populateStudentData (Range("$A$2").Value)

    Stop 'hit F5 or click "play" to print results to the Immediate Window (Hit CTRL+G here to view)
    Call print_Results

    Stop 'hit F5 or click "play" to print data for field "Final Result" to Immediate Window
    Debug.Print getField("Final Result")

    Stop 'hit F5 or click "play" to populate cell A3 with data for field "Final Result"
    Range("$A$3") = getField("Final Result")
End Sub



Sub populateStudentData(str_Input As String)
'populate array [arrStudentData] by parsing [str_Input] (the raw string we need to decode)

    Dim new_Delimiter
    new_Delimiter = Chr(10)
    Dim arr_Input() As String '[str_Input] split into an array & cleaned up
    Dim sFieldCount As Integer 'the number of "sField"'s found in [str_Input]
    Dim sLineNumber As Integer 'the input "line" we are processor (counter)
    Dim sFieldExists As Boolean 'TRUE if the "sField" has already been found at least once
    Dim x As Integer 'counter

    'start with an empty array
    Erase arrStudentData

    'remove "Event Date" to combine date with previous line for {"Final Result" or "Non-Final Result"}
    str_Input = Replace(str_Input, "Result" & vbLf & "Event Date", "Result", , , vbTextCompare) '(vbTextCompare makes the search non-case-sensitive)

    'split [str_Input] into array [arr_Input] with vbLf's separating each value
    arr_Input = Split(str_Input, vbLf)

    'enumerate [arr_Input] to create a list of "sField's" in arrStudentData(x, 0)
    sFieldCount = 0
    For sLineNumber = 0 To UBound(arr_Input)
        If extract_sValue(arr_Input(sLineNumber)) <> "" Then 'ignore lines that don't have an "sValue"
            'does this field already exist?
            sFieldExists = False

            'enumerate [arrStudentData(x, 0)] to see if this field already exists
            For x = 1 To sFieldCount
                    If arrStudentData(x, 0) = extract_sField(arr_Input(sLineNumber)) Then sFieldExists = True 'field already exists in list
            Next x

            If Not sFieldExists Then 'field doesn't exist, add it to list
                sFieldCount = sFieldCount + 1
                arrStudentData(sFieldCount, 0) = extract_sField(arr_Input(sLineNumber))
            End If

        End If
    Next sLineNumber

    'we now have an array of field names : arrStudentData(1 to [sFieldCount],0)
    'next, enumerate [arr_Input] again, this time putting the "sValue's" into arrStudentData(x, 1)
    For sLineNumber = 0 To UBound(arr_Input)
        For x = 1 To sFieldCount
            'add the VALUE to the arrStudentData(x, 1)
            If extract_sField(arr_Input(sLineNumber)) = arrStudentData(x, 0) Then
                'this field is arrStudentData(x, 0) so concatenate the value after the "sDelimiter" to arrStudentData(x, 1)
                If Len(arrStudentData(x, 1)) > 0 Then
                    'this isn't the first value so add [new_Delimiter] before "sValue" (default: vbLf)
                    arrStudentData(x, 1) = arrStudentData(x, 1) & new_Delimiter
                End If
                arrStudentData(x, 1) = arrStudentData(x, 1) & extract_sValue(arr_Input(sLineNumber))
            End If
        Next x
    Next sLineNumber
End Sub

Function getField(sField As String) As String
'return "sValue" for the specified "sField"
    Dim x As Integer 'counter
    'enumerate the array to find a match
    For x = LBound(arrStudentData) To UBound(arrStudentData)
        If LCase(arrStudentData(x, 0)) = LCase(sField) Then 'compare lowercase (so not case sensitive)
            'found a match
            getField = arrStudentData(x, 1)
            Exit Function
        End If
    Next x
End Function

Function extract_sField(str_In As String) As String
    'return text found BEFORE one of the "sDelimiter's"
    If str_In <> "" Then extract_sField = Split(Split(str_In, ":")(0), "=")(0)
End Function

Function extract_sValue(str_In As String) As String
'return text found AFTER one of the "sDelimiter's"
    If InStr(str_In, "=") > 0 Then
        extract_sValue = Trim(Split(str_In, "=")(1)) 'text after "sDelimiter" =
    Else
        If InStr(str_In, ":") > 0 Then
            extract_sValue = Trim(Split(str_In, ":")(1)) 'text after "sDelimiter" :
        Else
            extract_sValue = "" 'no "sDelimiter's" found so return no value
        End If
    End If
End Function

Sub print_Results()
'for testing/debugging purposes: print values of array [arrStudentData] in the Immediate Window (Hit CTRL+G here to view)
    Dim x As Integer 'counter
    Debug.Print "----------"
    For x = LBound(arrStudentData) To UBound(arrStudentData)
        If arrStudentData(x, 0) <> "" Then
            Debug.Print "arrStudentData(" & x & ",0) = """ & arrStudentData(x, 0) & """"
            Debug.Print "arrStudentData(" & x & ",1) = """ & arrStudentData(x, 1) & """"
            Debug.Print "----------"
        End If
    Next x
End Sub

As you probably know, SO isn't supposed to be a "code writing service" but I took it as a practice 'challenge' to see if I could slap something together (so don't give me crap, mods!) There is no error handling and limited commenting but take a look and perhaps you can adapt it as necessary. There are definitely more efficient ways to tackle it instead of this method of repeatedly enumerating the same arrays over and over for each cell, so it won't be suitable on a "huge" scale...

ashleedawg
  • 20,365
  • 9
  • 72
  • 105