Pygments in QScintilla

Question

Consider this mcve:

import math
import sys
import textwrap
import time
from pathlib import Path
from collections import defaultdict

from PyQt5.Qsci import QsciLexerCustom, QsciScintilla
from PyQt5.Qt import *

from pygments import lexers, styles, highlight, formatters
from pygments.lexer import Error, RegexLexer, Text, _TokenType
from pygments.style import Style


# Editor-level colours and options that are looked up by style name
# (``EXTRA_STYLES[style_name]``) in addition to the Pygments style itself.
# Values are hex colour strings or option names.
# NOTE(review): several colours have 8 hex digits, i.e. they carry an alpha
# channel. The channel order is presumably #RRGGBBAA (theme-file convention),
# whereas Qt's QColor reads 8-digit strings as #AARRGGBB -- confirm before
# passing these particular entries to QColor.
EXTRA_STYLES = {
    "monokai": {
        "background": "#272822",
        "caret": "#F8F8F0",
        "foreground": "#F8F8F2",
        "invisibles": "#F8F8F259",
        "lineHighlight": "#3E3D32",
        "selection": "#49483E",
        "findHighlight": "#FFE792",
        "findHighlightForeground": "#000000",
        "selectionBorder": "#222218",
        "activeGuide": "#9D550FB0",
        "misspelling": "#F92672",
        "bracketsForeground": "#F8F8F2A5",
        "bracketsOptions": "underline",
        "bracketContentsForeground": "#F8F8F2A5",
        "bracketContentsOptions": "underline",
        "tagsOptions": "stippled_underline",
    }
}


def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes (int or float).

    Returns:
        ``"0B"`` for zero, otherwise ``"<value> <unit>"`` where ``value`` is
        rounded to two decimals and ``unit`` is one of B, KB, ... YB. Values
        below one byte are reported in B and values beyond the largest unit
        are reported in YB.

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes!r}")

    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    last_unit = len(size_name) - 1

    # Dividing by 1024 is exact in binary floating point, so the unit is
    # chosen without the rounding error a floating-point logarithm can
    # introduce at exact powers of 1024. The index is clamped to the table so
    # sub-byte and huge inputs can neither wrap around nor overflow it.
    value = float(size_bytes)
    i = 0
    while value >= 1024 and i < last_unit:
        value /= 1024
        i += 1
    return f"{round(value, 2)} {size_name[i]}"


class ViewLexer(QsciLexerCustom):
    """Custom QScintilla lexer that styles text with a Pygments lexer and style.

    One QScintilla style number is allocated per token type of the chosen
    Pygments style; ``styleText`` then tokenizes the document with the
    Pygments lexer's rule table and applies those style numbers.
    """

    def __init__(self, lexer_name, style_name):
        """Look up the Pygments lexer and style by name and register the styles.

        ``style_name`` is also used as a key into ``EXTRA_STYLES``; a name
        missing from that dict raises ``KeyError``.
        """
        super().__init__()

        # Lexer + Style
        self.pyg_style = styles.get_style_by_name(style_name)
        self.pyg_lexer = lexers.get_lexer_by_name(lexer_name, stripnl=False)
        # Seeded with the initial ('root',) state stack under key 0; nothing
        # else in this class reads or updates it.
        self.cache = {
            0: ('root',)
        }
        self.extra_style = EXTRA_STYLES[style_name]

        # Generate QScintilla styles: every token type of the Pygments style
        # gets the next consecutive style number, all sharing one font.
        # NOTE(review): numbers are handed out from 0 with no gaps; Scintilla
        # reserves some style numbers (STYLE_DEFAULT and friends) for its own
        # use -- confirm these do not collide.
        self.font = QFont("Consolas", 8, weight=QFont.Bold)
        self.token_styles = {}
        index = 0
        for k, v in self.pyg_style:
            self.token_styles[k] = index
            # Colours are only set when the style definition provides one.
            if v.get("color", None):
                self.setColor(QColor(f"#{v['color']}"), index)
            if v.get("bgcolor", None):
                self.setPaper(QColor(f"#{v['bgcolor']}"), index)

            self.setFont(self.font, index)
            index += 1

    def defaultPaper(self, style):
        """Return the extra style's background colour, whatever ``style`` is."""
        return QColor(self.extra_style["background"])

    def language(self):
        """Return the name of the underlying Pygments lexer."""
        return self.pyg_lexer.name

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (index, tokentype, text) triples.

        ``stack`` is the initial stack (default: ``('root',)``); it is copied,
        so the caller's sequence is never modified.

        This is a generator driven by the rule table of ``self.pyg_lexer``
        (``_tokens``). Where no rule matches, a newline resets the state to
        'root' and is yielded as ``Text``; any other character is yielded as
        ``Error``. Iteration ends once ``pos`` runs past the end of ``text``.
        """
        lexer = self.pyg_lexer
        pos = 0
        tokendefs = lexer._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            # Try the rules of the current state in order; the first match wins.
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            # Plain token type: the whole match is one token.
                            yield pos, action, m.group()
                        else:
                            # Callable action: forward whatever it yields.
                            for item in action(lexer, m):
                                yield item
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, u'\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    # text[pos] is out of range: the whole input was consumed.
                    break

    def highlight_slow(self, start, end):
        """Re-tokenize and restyle the document from ``start`` to its very end.

        ``end`` is ignored, and tokenization always begins in the default
        ('root',) state regardless of the lexer state that applies at
        ``start``.
        """
        # `style` is not used below.
        style = self.pyg_style
        view = self.editor()
        # NOTE(review): `start` is used as a str index and len(value) as a
        # styling length; Scintilla presumably counts positions in bytes, so
        # non-ASCII text may be mis-styled -- confirm.
        code = view.text()[start:]
        tokensource = self.get_tokens_unprocessed(code)

        self.startStyling(start)
        for _, ttype, value in tokensource:
            # Raises KeyError if the lexer emits a token type that was not
            # registered in __init__.
            self.setStyling(len(value), self.token_styles[ttype])

    def styleText(self, start, end):
        """Restyle via ``highlight_slow`` and report the timing in the window title."""
        view = self.editor()
        t_start = time.time()
        self.highlight_slow(start, end)
        t_elapsed = time.time() - t_start
        # The reported size is the character count of the whole document.
        len_text = len(view.text())
        text_size = convert_size(len_text)
        view.setWindowTitle(f"Text size: {len_text} - {text_size} Elapsed: {t_elapsed}s")

    def description(self, style_nr):
        """Return the style number itself, as a string, as its description."""
        return str(style_nr)


class View(QsciScintilla):
    """QScintilla editor widget that highlights its text through ``ViewLexer``.

    The constructor installs the lexer, registers Ctrl+1 / Ctrl+2 shortcuts
    that halve / double the amount of generated sample text, enables multiple
    selections and applies the editor colours from
    ``EXTRA_STYLES[style_name]``.
    """

    def __init__(self, lexer_name, style_name):
        """Create the editor.

        Args:
            lexer_name: Pygments lexer name forwarded to ``ViewLexer``.
            style_name: Pygments style name; must also be a key of
                ``EXTRA_STYLES``.
        """
        super().__init__()

        # -------- Lexer --------
        self.setEolMode(QsciScintilla.EolUnix)
        # NOTE(review): this instance attribute hides the inherited
        # ``lexer()`` accessor; the name is kept because callers may already
        # read ``view.lexer``.
        self.lexer = ViewLexer(lexer_name, style_name)
        self.setLexer(self.lexer)

        # -------- Shortcuts --------
        self.text_size = 1
        self.s1 = QShortcut("ctrl+1", self, self.reduce_text_size)
        self.s2 = QShortcut("ctrl+2", self, self.increase_text_size)

        # -------- Multiselection --------
        self.SendScintilla(self.SCI_SETMULTIPLESELECTION, True)
        self.SendScintilla(self.SCI_SETMULTIPASTE, 1)
        self.SendScintilla(self.SCI_SETADDITIONALSELECTIONTYPING, True)

        # -------- Extra settings --------
        self.set_extra_settings(EXTRA_STYLES[style_name])

    def get_line_separator(self):
        """Return the line terminator for the current EOL mode ('' if unknown)."""
        mode = self.eolMode()
        if mode == QsciScintilla.EolWindows:
            return '\r\n'
        if mode == QsciScintilla.EolUnix:
            return '\n'
        if mode == QsciScintilla.EolMac:
            return '\r'
        return ''

    def set_extra_settings(self, dct):
        """Apply editor colours from ``dct``; keys that are absent are skipped.

        Each colour is accepted under its snake_case name and under the
        camelCase spelling used by ``EXTRA_STYLES`` (for example
        ``line_highlight`` / ``lineHighlight``). Previously only the
        snake_case names were looked up, so the ``EXTRA_STYLES`` entries for
        the caret line and the matched braces were silently ignored.
        """
        self.setIndentationGuidesBackgroundColor(QColor(0, 0, 255, 0))
        self.setIndentationGuidesForegroundColor(QColor(0, 255, 0, 0))

        # (accepted key spellings, setter) in the order they are applied.
        # NOTE(review): the caret-line and brace colours presumably only show
        # once the caret line / brace matching are enabled on the widget, and
        # 8-digit hex strings are read by QColor as #AARRGGBB -- confirm
        # against the values stored in the style dict.
        colour_setters = (
            (("caret",), self.setCaretForegroundColor),
            (("line_highlight", "lineHighlight"),
             self.setCaretLineBackgroundColor),
            (("brackets_background", "bracketsBackground"),
             self.setMatchedBraceBackgroundColor),
            (("brackets_foreground", "bracketsForeground"),
             self.setMatchedBraceForegroundColor),
            (("selection",), self.setSelectionBackgroundColor),
        )
        for keys, setter in colour_setters:
            key = next((k for k in keys if k in dct), None)
            if key is not None:
                setter(QColor(dct[key]))

        if "background" in dct:
            c = QColor(dct["background"])
            self.resetFoldMarginColors()
            self.setFoldMarginColors(c, c)

    def increase_text_size(self):
        """Double the target text size and regenerate the sample text."""
        self.text_size *= 2
        self.gen_text()

    def reduce_text_size(self):
        """Halve the target text size (never below 1) and regenerate the text."""
        if self.text_size == 1:
            return
        self.text_size //= 2
        self.gen_text()

    def gen_text(self):
        """Fill the editor with ``text_size`` characters of this script's source.

        The source is repeated until it is long enough and then truncated.
        """
        # Python source files are UTF-8 by default; do not depend on the
        # platform's locale encoding for this best-effort sample text.
        content = Path(__file__).read_text(encoding="utf-8", errors="replace")
        if not content:
            # Doubling an empty string never grows: avoid an endless loop.
            self.setText("")
            return
        while len(content) < self.text_size:
            content *= 2
        self.setText(content[:self.text_size])


if __name__ == '__main__':
    # Usage notes shown in the editor when the demo starts.
    intro = textwrap.dedent("""\
        '''
        Ctrl+1 = You'll decrease the size of existing text
        Ctrl+2 = You'll increase the size of existing text

        Warning: Check the window title to see how long it takes rehighlighting
        '''
    """)

    # The application object has to exist before any widget is created.
    app = QApplication(sys.argv)

    view = View("python", "monokai")
    view.setText(intro)
    view.resize(800, 600)
    view.show()

    # Hand control to the Qt event loop until the window is closed.
    app.exec_()

To run it you need to install:

QScintilla==2.10.8
Pygments==2.3.1
PyQt5==5.12

I'm trying to figure out how to use pygments on a QScintilla widget and right now the main problem I need to solve is the performance when dealing with non-tiny documents.

I'd like the editor to become responsive & usable when dealing with large documents (>=100kb) but I don't know very well what's the approach I should take here. In order to test performance you can use Ctrl+1 or Ctrl+2 and the widget text will be decreased/increased respectively.

When I say "responsive" I mean that the highlighting computation for the visible screen should take no longer than 1–2 frames per highlight, i.e. 17–34 ms per highlight (assuming 60 fps), so that you won't feel any slowdown when typing.

Note: As you can see in the above mcve, I've included the pygments tokenizer so you can play around with it... it feels like in order to achieve "real-time highlighting" I'd need to use memoization/caching in some smart way but I'm struggling to figure out what's the data I need to cache and what's the best way to cache it... :/

Demo:

In the above demo you can see that with this naive highlighting the editor becomes unusable very soon: on my laptop, rehighlighting text chunks of 32 KB still gives an interactive framerate, but with anything larger than that the editor becomes completely unusable.

CONSIDERATIONS:

The most typical case will happen when you're typing/coding on the visible screen with no selections
It may happen that you're editing multiple selections spread over the whole document, which means you won't know whether these selections are near the visible screen or not. For instance, in Sublime when you press Alt+F3 you select all occurrences of the word under the cursor
In the above snippet I've used a python lexer but the algorithm shouldn't focus too much on that one. Pygments support ~300 lexers afterall
The worst case scenario would happen if the visible screen is at the end of the file and one of the selections happens to live at the beginning of the screen... In case you need to rehighlight the whole document you'd need to find an alternative way even if that means the "highlighting" is not correct on the first pass
The most important is performance but also correctness... that is, if you give enough time the whole document should become highlighted correctly

REFERENCES:

The following documents are not specific to this particular problem but they talk about possible strategies of caching and syntax highlighting:

Related: https://code.visualstudio.com/blogs/2017/02/08/syntax-highlighting-optimizations tells how efficient syntax highlighting works. — ivan_pozdeev, Apr 27 '19 at 02:07
Just an FYI - [this question is being discussed on Meta](https://meta.stackoverflow.com/q/384321/1079354) so it may get more "attention" than normal. — Makoto, Apr 27 '19 at 19:32
The worst case is always going to suck. Either you have to live with that and work on the typical cases, (and/)or find a way to do your parsing asynchronously so you don't block the UI. For the latter, I'm not sure how much support Scintilla has, but I'd look into it if you haven't. For the former, @Nathan already addressed it: you need to stop ignoring the `end` parameter; it's provided precisely for reasons like this. — user541686, Apr 29 '19 at 08:53
@Mehrdad Reason why I've never considered to use the `end` parameter is because the nature of Pushdown automatons like pygments, textmate syntax-based engines. Scintilla can't suggest properly the `end` parameter because it doesn't know what's the end, as simply as that, @Nathan answer is bad/invalid and it'll provide wrong results, think about it... no matter if you're dealing with single or multiple selections, the whole document could potentially be rehighlighted and the end parameter won't inform about it. You say the worst case is always going to suck but SublimeText works wonderfully :/ — BPL, Apr 29 '19 at 09:04
@BPL: I didn't realize `end` isn't specified correctly -- but shouldn't `end` just be the rightmost character on the last visible row on the screen? You don't need to consider anything beyond that to highlight what's on the screen; you might as well consider nothing beyond that exits. The worst case I was imagining was when the beginning of the document is selected and then modified when you've scrolled all the way to the end, and delete something then. I don't know what Sublime does but there's simply no way to get around parsing the entire document at that point. But that should be rare. — user541686, Apr 29 '19 at 09:27
@BPL I've explained now. I think my solution solves the problems inherent in Nathan's, and the slightly fewer problems inherent in yours. However, it requires a bit more development effort, and I haven't got the tools available to do that, sadly. — wizzwizz4, Apr 29 '19 at 17:35
Since this clearly isn't a minimal example, I think it would be a better fit for codereview.stackexchange.com. — Greg Schmit, Apr 29 '19 at 18:06
@GregSchmit: That is not so clear to me. Also this isn't asking for a code review. — user541686, Apr 29 '19 at 21:13
@BPL I read it as optimizing this particular code (which is absolutely asking for code review), not a general algorithm question. If that is what the real question is, then the code should be significantly minimized. The fact that it's not is why it looks to some people that you're just asking them to write teh codez for you. The best answer you called "hacky" but that's only because you want too much from a single Q/A on SO. That's why it should be minimized and the question constrained in scope, or it should be on codereview. — Greg Schmit, Apr 29 '19 at 22:08
What is the actual question here? I don't find a single question mark in the question text. Perhaps simply add paragraph like "Question: ...what are you asking?" — hyde, Apr 30 '19 at 05:49

score 20 · Answer 1 · answered Apr 26 '19 at 18:35

20

In highlight_slow, you're receiving start and end values, but you're ignoring the end value. As a result, any time you type a single character, the code is rehighlighting the entire rest of the buffer. This is why, if you type at the end of a long buffer, the timing is very fast - around .1 - .2 ms - but if you type at the beginning, it's very slow.

Thinking just in terms of correct highlighting, in most cases (with Python, at least) when you introduce a new character only the current line needs to be re-styled. Sometimes, like if you start a function definition or open a bracket, multiple lines might need to be styled. Only when you open or close a multiline """ or ''' string - will the rest of the buffer need to be restyled.

If you include start and end in your logging, you'll see that most of the time when you type they span a very small range. If you change one line of your highlight_slow method from

code = view.text()[start:]

to

code = view.text()[start:end]

you'll see that the method almost always takes sub-millisecond time now, and it almost always gets the highlighting correct.

From what I've been able to tell, this only gets the styling wrong when multiline quotes are involved. However, your current code has the same problem: try opening a multiline string, typing enter, and continuing the string on the next line. The second line will be highlighted as code. Qscintilla is leading you astray a bit here, by giving a start that does not include the beginning of the multiline quote. It's not trying to be perfect, though - the docs say

In fact, QScintilla says: “Hey, I think you should restyle the text between the character at position start up to the character at position end“. You are completely free to ignore this suggestion.

Handling multiline quoting correctly will be a bit tricky! If it were me, and I wanted to get something working quickly, I'd probably implement a keystroke to refresh the highlighting for the entire buffer and use that when things look wrong.

answered Apr 26 '19 at 18:35

Nathan Vērzemnieks

5,495
1
11
23

22

You said the main problem you need to deal with is performance. The change I suggested makes your code usably fast without making it behave more incorrectly. The multiline problem isn't mentioned in your question, it's just something I noticed. If you want help figuring out how to do better-quality highlighting with of multiple languages, with features your editor doesn't have yet (like multiple selection), I suggest adding those factors to your question. – Nathan Vērzemnieks Apr 26 '19 at 20:10
1

It is an interesting problem! In retrospect it should have been obvious you weren't missing something so obvious but - in my defense, we often miss the obvious ;) I might look more into the resources you point at this weekend if I have time. – Nathan Vērzemnieks Apr 26 '19 at 22:27
2

I've been thinking about that very problem! I agree that my answer doesn't address what you really wanted, although I still think it was not a _bad_ answer to the original question. I have been tinkering more over the weekend, and I have some ideas, but I won't have time to put them into useful form before the bounty expires. I do plan to put some more time into it during the week, and I will update my answer with what I come up with, but I can't promise the end result will satisfy you :) – Nathan Vērzemnieks Apr 29 '19 at 15:18
So it seems I wasn't wrong :) . At the end, it seems this invalid answer was just intended to earn some repo as well as earning the bounties... well, I don't blame you, this proves SO is somehow broken in some cases. That said, if you ever got interested again on this topic and come up with a nice valid answer I'd gladly rewarded it with 500 bounties... that said, I'd first need to confirm such an answer would satisfy me. Anyway, this has been a nice experience to me, I shouldn't have given so much bounties on this hard topic in the first place, my bad ;D – BPL May 13 '19 at 10:48
I was a little taken aback when you deleted all your comments, and frankly it still bothers me - taking away the context from other people's comments makes them look weird, and the motivation was unclear to me. So I felt a bit less inclined to come back to it. I did in fact spend quite a bit of time on this in the ensuing week, but it is indeed a hard problem :) I can post another answer outlining some things I found, if you like. In particular, relying on the `styleText` method just isn't going to work. – Nathan Vērzemnieks May 13 '19 at 20:43
Oh, sorry about that... I didn't want to bother you by deleting all my comments... usually my policy is when I create a thread I try to clean it up as much as possible from offtopic comments and I just try to leave ontopic comments. Usually I warn users I chat with (through comments) to do the same... in this case I forgot to do the same here. Just for the record, I'm honest with the intention of giving away another 500 bounties but let me be clear, **I'd be quite strict** judging the answer and testing it before doing so... but thing is, I'm extremely interested on this hard topic-thread ;) – BPL May 13 '19 at 21:04

wizzwizz4 · Answer 2 · 2019-04-29T18:48:33.503

If you're happy to write your own syntax highlighter, here's a possible way of speeding it up dramatically. You can do this with Pygments with a little effort; see the bottom of the answer for one possible way of doing this.

The syntax highlighter is simple. It has a small internal data structure, representing the current context, which it updates as it goes along. So, for the following Python code:

import time

def sleep_ms(ms):
    """sleeps for a length of time
    given in milliseconds"""

    time.sleep(
        ms / 1000
    )

sleep_ms(1000)
syntax error

its context might change like this, as it goes through the tokens¹:

>>> [nothing]
>>> IMPORT
    IMPORT modulename
>>> [nothing]
>>> DEF
    DEF functionname
    DEF functionname, OPENPAREN
    DEF functionname, OPENPAREN
    DEF functionname ARGLIST
    DEF functionname ARGLIST COLON
>>> FUNCBODY 4s
    FUNCBODY 4s, DOUBLE_MLSTR
>>> FUNCBODY 4s, DOUBLE_MLSTR
    FUNCBODY 4s
>>> FUNCBODY 4s
>>> FUNCBODY 4s, varname
    FUNCBODY 4s, varname ATTR
    FUNCBODY 4s, varname ATTR attrname
    FUNCBODY 4s, varname ATTR attrname, OPENPAREN
>>> FUNCBODY 4s, varname ATTR attrname, OPENPAREN
>>> FUNCBODY 4s, varname ATTR attrname, OPENPAREN, varname
    FUNCBODY 4s, varname ATTR attrname, OPENPAREN, TRUEDIV varname
    FUNCBODY 4s, varname ATTR attrname, OPENPAREN, TRUEDIV varname intliteral
>>> FUNCBODY 4s, FUNCCALL
>>> FUNCBODY 4s
>>> [nothing]
    varname
    varname, OPENPAREN
    varname, OPENPAREN, intliteral
    FUNCCALL
>>> [nothing]
    varname
    ERROR

If you cache the final contexts of each line, then you can start the syntax highlighting at the line that changed and keep going until you get to a line where the context is the same as is cached; you don't have to recompute the whole file, but if you add something like """ then it'll recompute until the end. If you get to an ERROR then you can just stop there; there's no point recalculating the syntax highlighting past a syntax error, because you don't know what the context's meant to be. (For the initial version when you open the file, you could assume that there's no context after a syntax error; this heuristic seems to work well enough.)

This syntax highlighter has the potential to be ridiculously accurate, or just "good enough", with virtually no perceivable difference in speed between the two. Language-specific highlighters could even be dynamically linked plugins, and it'd still be reasonably fast! Additionally, if you add debouncing for highlighting of subsequent lines, typing """""" quickly enough will be just as fast as typing "" or 42, no matter how big the file is.

Note that this highlighter is single-pass – it doesn't highlight known variable names differently to unknown ones, for example. If you wish to do this, the problem becomes considerably harder.

¹: This example Python highlighter is a "ridiculously accurate" one; I probably wouldn't go with something like this if I had a time limit. Nevertheless, I've got it planned out in my head and – for now, at least – could explain it in detail if required.

Your code requires surprisingly few changes to work with this technique.

Change the beginning of your get_tokens_unprocessed to:

    def get_tokens_unprocessed(self, text, stack=('root',), mutate_stack=False):
        """
        Split ``text`` into (index, tokentype, text) triples.

        ``stack`` is the initial stack (default: ``('root',)``).

        When ``mutate_stack`` is true, ``stack`` must be a list and is used
        directly instead of being copied, so the caller can read the lexer
        state from it while consuming the tokens. Otherwise a private copy is
        made and ``stack`` is left untouched.
        """
        lexer = self.pyg_lexer
        pos = 0
        tokendefs = lexer._tokens
        # Bind ``statestack`` on both paths: leaving it unset when
        # ``mutate_stack`` is true raises UnboundLocalError on the next line.
        # NOTE(review): the caller only keeps seeing the state as long as the
        # rest of the function mutates this list in place; a rebinding such as
        # ``statestack = ['root']`` would have to become
        # ``statestack[:] = ['root']``.
        statestack = stack if mutate_stack else list(stack)
        statetokens = tokendefs[statestack[-1]]

Find some way of detecting the line number.

In highlight_slow's loop, do something like this (except better):

        # ``self.cache[n]`` holds the lexer state stack (as a tuple) that
        # applies at the START of line n, so the changed line can be
        # re-tokenized from its cached starting state.
        stack = list(self.cache[line_no_of(start)])
        tokensource = self.get_tokens_unprocessed(code, stack, True)

        self.startStyling(start)
        pos = start
        for _, ttype, value in tokensource:
            self.setStyling(len(value), self.token_styles[ttype])
            pos += len(value)
            if is_line_end(pos):
                # Assumes is_line_end(pos) is true when ``pos`` sits just past
                # a line terminator, i.e. line_no_of(pos) is the line that
                # starts here -- TODO confirm once the helpers exist.
                next_line = line_no_of(pos)
                # Cached states are tuples while ``stack`` is a list; compare
                # like with like, otherwise the test can never succeed.
                state = tuple(stack)
                # Past the requested range with an unchanged state: the
                # styling of everything that follows is still valid.
                if pos >= end and state == self.cache.get(next_line):
                    break
                # Record the state for the line that starts here, leaving the
                # seed state of the line we began on untouched.
                self.cache[next_line] = state

Obviously, the code would have to be better than this, and you'd have to find some efficient way of implementing is_line_end and line_no_of; there's probably some Pygments way of doing this.

This solution has at least one benefit over yours already: it supports multi-line comments.

@BPL The former – replacing Pygments. Well, I suppose _technically_ you could take any syntax highlighter you have the source code for and use that, dumping the state of variables and loading it as necessary. — wizzwizz4, Apr 28 '19 at 18:21
I'm not sure how to clarify. What parts _do_ you understand? — wizzwizz4, Apr 28 '19 at 18:22
@BPL You _could_ implement debouncing regardless (hence, bolded). I'll try to figure that out and explain better. — wizzwizz4, Apr 28 '19 at 18:28
@BPL `stack`. The `stack` kwarg is the "context" I'm talking about, and you can **pass it into the function**. You're more familiar with the library than I am, and my code almost certainly wouldn't work. — wizzwizz4, Apr 28 '19 at 18:38
Highlight line by line, and tweak `get_tokens_unprocessed` to output the `state` somehow at the end so you can store it and then feed it back in for the next line. Then, after a change, you only need to recalculate the highlighting for the changed line and all subsequent lines until `state` stops changing. — wizzwizz4, Apr 28 '19 at 18:49

Pygments in QScintilla

2 Answers