Parsing structured data, whether it is YAML, HTML, XML or CSV, with regular expressions alone only works in a tiny subset of possible cases. With YAML's multi-line scalars, flow style, block style, etc., dealing with it in a generic way is virtually impossible. If that were not the case, someone would already have written a full YAML parser in awk. (There is nothing wrong with awk, it is just not the right tool for processing YAML.)
That doesn't mean you cannot use regular expressions to find particular elements; you just need a bit of preparation:
import sys
import re
import ruamel.yaml
# Sample document: a top-level mapping whose 'sss' key holds a sequence of
# single-key mappings, each containing 'brr' and 'jdk' entries.  The nested
# keys must be indented deeper than the '-' of their sequence item, otherwise
# they are not part of that item and the document does not parse.
yaml_str = """\
sss:
- ccc:
    brr: 'mmm'
    jdk: 'openjdk8'
- bbb:
    brr: 'rel/bbb'
    jdk: 'openjdk8'
- aaa:
    brr: 'rel/aaa'
    jdk: 'openjdk7'
"""
class Paths:
    """Path-based access to a nested structure of dicts and lists.

    A path is the list of dict keys / list indices that leads from the root
    of the structure to a node.  Its string form is those elements converted
    with ``str()`` and joined with ``sep``.
    """

    def __init__(self, data, sep=':'):
        """Wrap ``data`` (kept by reference, not copied).

        :param data: root dict or list to operate on
        :param sep: separator used when rendering a path as a string
        """
        self._sep = sep
        self._data = data

    def walk(self, data=None, prefix=None):
        """Yield ``(path, path_list, idx, value)`` for every node, depth first.

        :param data: node to start from; ``None`` means the wrapped root
        :param prefix: path elements to put in front of every yielded path;
            ``None`` means an empty prefix
        :return: generator of 4-tuples: the joined path string, the path as
            a list, the position of the node within its parent (key order
            for dicts, index for lists) and the node's value
        """
        if data is None:
            data = self._data
        if prefix is None:
            prefix = []
        # The recursion lives in _walk so that a None *value* inside the
        # structure (e.g. a YAML null) is treated as a leaf instead of being
        # mistaken for "start again from the root", which recursed forever.
        for item in self._walk(data, prefix):
            yield item

    def _walk(self, data, prefix):
        """Recursive worker for walk(); ``data`` is always taken literally."""
        if isinstance(data, dict):
            for idx, k in enumerate(data):
                path_list = prefix + [k]
                yield self._join(path_list), path_list, idx, data[k]
                for x in self._walk(data[k], path_list):
                    yield x
        elif isinstance(data, list):
            for idx, k in enumerate(data):
                path_list = prefix + [idx]
                yield self._join(path_list), path_list, idx, k
                for x in self._walk(k, path_list):
                    yield x

    def _join(self, path_list):
        """Render a path list as a string using the configured separator."""
        return self._sep.join([str(q) for q in path_list])

    def _parent(self, pl):
        """Return the container reached by following all but the last element of ``pl``."""
        d = self._data
        for step in pl[:-1]:
            d = d[step]
        return d

    def set(self, pl, val):
        """Replace the value at path ``pl`` with ``val``.

        :param pl: non-empty path list; the caller's list is not modified
        :param val: new value stored under the last element of ``pl``
        :raises IndexError: if ``pl`` is empty
        """
        self._parent(pl)[pl[-1]] = val

    def insert_in_list(self, pl, idx, val):
        """Insert ``val`` at position ``idx`` of the list that contains ``pl``.

        Only the elements of ``pl`` before the last one are followed; the
        last element (the position of an item inside the list) is not used,
        the insert position is given by ``idx`` alone.

        :param pl: path list of an item in the target list; not modified
        :param idx: position passed to the container's ``insert``
        :param val: value to insert
        """
        self._parent(pl).insert(idx, val)
# round_trip_load()/round_trip_dump() are deprecated and no longer available
# in current ruamel.yaml releases; a YAML() instance (round-trip mode is its
# default) replaces both.
yaml = ruamel.yaml.YAML()
yaml.preserve_quotes = True  # keep the quoting style of the scalars that are loaded
data = yaml.load(yaml_str)
paths = Paths(data)
pattern = re.compile(r'sss:.*:c.*:brr$')
SingleQuoted = ruamel.yaml.scalarstring.SingleQuotedScalarString
# if you are going to insert/delete use list(paths.walk())
for p, pl, idx, val in list(paths.walk()):
    print('path', p)
    if not pattern.match(p):
        continue
    # replace the matched scalar, keeping it single-quoted in the output
    paths.set(pl, SingleQuoted('rel/ccc'))
    # pl[:-2] drops the last two path elements, leaving the path of the
    # sequence item that holds the match; the new item goes into that sequence.
    # NOTE(review): idx is the position of the matched key inside its mapping,
    # not the position of the item in the sequence; both are 0 for this input.
    paths.insert_in_list(pl[:-2], idx, {'new': {
        'brr': SingleQuoted('mmm'),
        'jdk': SingleQuoted('openjdk8'),
    }})
print('----------')
yaml.dump(data, sys.stdout)
The output for that is:
path sss
path sss:0
path sss:0:ccc
path sss:0:ccc:brr
path sss:0:ccc:jdk
path sss:1
path sss:1:bbb
path sss:1:bbb:brr
path sss:1:bbb:jdk
path sss:2
path sss:2:aaa
path sss:2:aaa:brr
path sss:2:aaa:jdk
----------
sss:
- new:
    brr: 'mmm'
    jdk: 'openjdk8'
- ccc:
    brr: 'rel/ccc'
    jdk: 'openjdk8'
- bbb:
    brr: 'rel/bbb'
    jdk: 'openjdk8'
- aaa:
    brr: 'rel/aaa'
    jdk: 'openjdk7'
Printing the "paths" is not necessary; it is only included here to give a better idea of what is going on.
The SingleQuotedScalarString is necessary to get the (superfluous) quotes around the string scalars in the YAML output.
The dict subclass into which YAML mappings are loaded by ruamel.yaml supports .insert(index, key, val) for Python 2.7 and Python 3.5 and later, so you can insert in specific positions of a mapping as well.