You shouldn't use regexes on html. You should use BeautifulSoup or lxml. Here are some examples using BeautifulSoup:
Your td tags actually look like this:
<td>newline
<a>some link</a>newline
<br />newline
some data 1<br />newline
some data 2<br />newline
some data 3</td>
So td.text looks like this:
<newline>some link<newline><newline>some data 1<newline>some data 2<newline>some data 3
You can see that each string is separated by at least one newline, so that enables you to separate out each string.
from bs4 import BeautifulSoup as bs
import re
html = """<td>
<a href="http://www.linktosomewhere.net" title="title here">some link</a>
<br />
some data 1<br />
some data 2<br />
some data 3</td>"""
soup = bs(html)
tds = soup.find_all('td')
csv_data = []
for td in tds:
inner_text = td.text
strings = inner_text.split("\n")
csv_data.extend([string for string in strings if string])
print(",".join(csv_data))
--output:--
some link,some data 1,some data 2,some data 3
Or more concisely:
for td in tds:
print(re.sub("\n+", ",", td.text.lstrip() ) )
--output:--
some link,some data 1,some data 2,some data 3
But that solution is brittle because it won't work if your html looks like this:
<td>
<a href="http://www.linktosomewhere.net" title="title here">some link</a>
<br />some data 1<br />some data 2<br />some data 3</td>
Now td.text looks like this:
<newline>some link<newline>some data 1some data2some data3
And there isn't a way to figure out where some of the strings begin and end. But that just means you can't use td.text--there are still other ways to identify each string:
1)
from bs4 import BeautifulSoup as bs
import re
html = """<td>
<a href="http://www.linktosomewhere.net" title="title here">some link</a>
<br />some data 1<br />some data 2<br />some data 3</td>"""
soup = bs(html)
tds = soup.find_all('td')
csv_data = []
for td in tds:
a_tags = td.find_all('a')
for a_tag in a_tags:
csv_data.append(a_tag.text)
br_tags = a_tag.findNextSiblings('br')
for br in br_tags:
csv_data.append(br.next.strip()) #get the element after the <br> tag
csv_str = ",".join(csv_data)
print(csv_str)
--output:--
some link,some data 1,some data 2,some data 3
2)
for td in tds:
a_tag = td.find('a')
if a_tag: csv_data.append(a_tag.text)
for string in a_tag.findNextSiblings(text=True): #find only text nodes
string = string.strip()
if string: csv_data.append(string)
csv_str = ",".join(csv_data)
print(csv_str)
--output:--
some link,some data 1,some data 2,some data 3
3)
for td in tds:
a_tag = td.find('a')
if a_tag: csv_data.append(a_tag.text)
text_strings = a_tag.findNextSiblings( text=re.compile('\S+') ) #find only non-whitespace text nodes
csv_data.extend(text_strings)
csv_str = ",".join(csv_data)
print(csv_str)
--output:--
some link,some data 1,some data 2,some data 3