In the end I came up with this. A regex alone is not enough, because I want to keep track of the context of what is currently happening in the JSON so that I can be a bit more restrictive.
It could be improved by using the whole stack to know the current array/object nesting, but this is simple enough to work for my use case.
And yes, since this only modifies the string, I can drop it once the original source is fixed.
import json
import re
import unittest
from json import JSONDecodeError
from typing import Any
# Characters that may legitimately follow the closing quote of a string,
# keyed by the context the string was opened in:
#   "[" - the string is an element of an array
#   "{" - the string is a key of an object
#   ":" - the string is a value of an object
# The "]" and "}" entries are not consulted by fix_unescaped_quotes; they are
# kept only so the contents of this mapping stay unchanged.
expected_characters_by_prestring_value = {
    "[": (",", "]"),
    "]": ("[", ","),
    "{": (":",),
    "}": (",", "{", "]"),
    ":": (",", "}"),
}

# Compiled once; used to peek at the next significant character after a quote.
_NON_WHITESPACE = re.compile(r"\S")


def fix_unescaped_quotes(raw: str) -> str:
    """Return *raw* with stray double quotes inside JSON strings escaped.

    The text is scanned once while tracking the open ``{``/``[`` containers
    and whether the next string is an array element, an object key or an
    object value. Inside a string, an unescaped ``"`` is accepted as the
    closing quote only when the next non-whitespace character is one that
    may follow a string in that position (or when nothing follows at all);
    otherwise a backslash is inserted in front of it.

    This is a heuristic: a quote that is meant to be part of the string but
    happens to be followed by a valid terminator (for example ``",`` inside
    an array element) is still taken as the end of the string.

    Args:
        raw: JSON-like text that may contain unescaped quotes in strings.

    Returns:
        The text with backslashes inserted before the quotes judged to be
        part of a string. Text without such quotes is returned unchanged.
    """
    open_containers = []   # "{" / "[" currently open, innermost last
    context = None         # lookup key for the next string; None at top level
    in_string = False
    escape_pending = False  # previous character was an unconsumed backslash
    pieces = []

    for index, character in enumerate(raw):
        if in_string:
            if escape_pending:
                # This character is consumed by the preceding backslash.
                escape_pending = False
            elif character == "\\":
                escape_pending = True
            elif character == '"':
                following = _NON_WHITESPACE.search(raw, index + 1)
                terminators = expected_characters_by_prestring_value.get(
                    context, ()
                )
                if following is None or following.group() in terminators:
                    in_string = False
                else:
                    pieces.append("\\")
        elif character == '"':
            in_string = True
        elif character in ("{", "["):
            open_containers.append(character)
            context = character
        elif character in ("}", "]"):
            # Tolerate unbalanced closers; json.loads reports them later.
            if open_containers:
                open_containers.pop()
            context = open_containers[-1] if open_containers else None
        elif character == ",":
            # Back to "next key" in an object or "next element" in an array.
            context = open_containers[-1] if open_containers else None
        elif character == ":" and open_containers and open_containers[-1] == "{":
            context = ":"
        pieces.append(character)

    return "".join(pieces)
def parse_and_fix(raw: str) -> Any:
    """Parse *raw* as JSON, retrying once with unescaped quotes repaired.

    The text is first handed to ``json.loads`` untouched, so valid JSON is
    never modified. Only when that fails is it passed through
    ``fix_unescaped_quotes`` and parsed a second time.

    Args:
        raw: JSON text, possibly containing unescaped ``"`` inside strings.

    Returns:
        The decoded Python object.

    Raises:
        JSONDecodeError: If the text cannot be parsed even after the repair
            attempt. The error raised is the one for the original text, so
            its message and position describe what the caller passed in.
    """
    try:
        return json.loads(raw)
    except JSONDecodeError as original_error:
        try:
            return json.loads(fix_unescaped_quotes(raw=raw))
        except (JSONDecodeError, LookupError, AttributeError):
            # The repair is a best-effort heuristic: whether it trips over
            # the input itself or produces text that still does not parse,
            # report the failure in terms of the caller's original text.
            raise original_error from None
class JsonFixUnescapedQuotesTest(unittest.TestCase):
    """Exercises ``parse_and_fix`` on valid, hopeless and repairable input."""

    def _assert_parses_to(self, expected: Any, raw: str) -> None:
        """Assert that ``parse_and_fix(raw)`` decodes to *expected*."""
        self.assertEqual(expected, parse_and_fix(raw))

    def test_completely_invalid(self):
        # Text that is not JSON at all must still fail after the retry.
        self.assertRaises(JSONDecodeError, parse_and_fix, "invalid_json")

    def test_valid(self):
        # Already-valid JSON is parsed as-is.
        self._assert_parses_to({}, "{}")

    def test_invalid_single_array(self):
        # Two stray quotes inside the only element of an array.
        broken = """["he said "hello world" and left"]"""
        wanted = ['he said "hello world" and left']
        self._assert_parses_to(wanted, broken)

    def test_invalid_object(self):
        # One stray quote inside an object value.
        broken = """{"key": "value " with quote in the middle"}"""
        wanted = {"key": 'value " with quote in the middle'}
        self._assert_parses_to(wanted, broken)

    def test_invalid_2_item_array(self):
        # A stray quote in the first element; the second element is fine.
        broken = """["invalid " string", "valid string"]"""
        wanted = ['invalid " string', "valid string"]
        self._assert_parses_to(wanted, broken)

    def test_wont_get_fooled_by_colon(self):
        # Inside an array, a quote followed by ':' does not end the string.
        broken = """["invalid ": string", "valid string"]"""
        wanted = ['invalid ": string', "valid string"]
        self._assert_parses_to(wanted, broken)

    def test_wont_get_fooled_by_colon_after_object(self):
        # In an object value, a quote followed by ':' does not end the string.
        broken = """{"key": "value":"}"""
        wanted = {"key": "value\":"}
        self._assert_parses_to(wanted, broken)

    def test_wont_get_fooled_by_comma_in_key(self):
        # In an object key, a quote followed by ',' does not end the string.
        broken = """{"key",": "value"}"""
        wanted = {"key\",": "value"}
        self._assert_parses_to(wanted, broken)