CoCalc -- markdown2Mathjax.py

Path: cocalc/src / smc_pyutil / smc_pyutil / markdown2Mathjax.py
Views: 39558
1
# Module metadata: the version tuple is the single source of truth and the
# dotted version string is derived from it.
__version_info__ = (0, 3, 9)
__version__ = '.'.join(str(part) for part in __version_info__)
__author__ = "Matthew Young"
4

5
import re
6
from markdown2 import markdown
7

8
def break_tie(inline,equation):
    """Pick the winner when an inline and an equation delimiter start together.

    If one of the delimiters is a substring of the other (e.g., $ and $$) the
    two can begin at the same location.  The longer of the two delimiters
    takes priority (for example, $$ over $).

    inline, equation -- match objects (anything exposing start() and end())
    for the inline and the equation opening delimiter respectively.

    Returns 1 when the inline match is strictly longer than the equation
    match, otherwise 2 (the equation block also wins when the lengths are
    equal).
    """
    inline_len = inline.end() - inline.start()
    equation_len = equation.end() - equation.start()
    return 1 if inline_len > equation_len else 2
12

13
def markdown_safe(placeholder):
    """Report whether markdown leaves the placeholder untouched.

    The placeholder is rendered with markdown() and the result is expected to
    be exactly the placeholder wrapped in a single <p>...</p> paragraph
    followed by a newline.  Anything else means markdown altered it, so it is
    not a valid placeholder.

    Returns True if the placeholder survives unchanged, False otherwise.
    """
    rendered = markdown(placeholder)
    wrapped = re.match("<p>(.*)</p>\n", rendered)
    return wrapped is not None and wrapped.group(1) == placeholder
21

22
def mathdown(text):
    """Convenience function which runs the basic markdown and mathjax processing sequentially.

    The math is stripped out with sanitizeInput, the remaining text is passed
    through markdown, and the math is put back with reconstructMath, all with
    the default delimiters and placeholder.
    """
    sanitized, codeblocks = sanitizeInput(text)
    return reconstructMath(markdown(sanitized), codeblocks)
26

27
def sanitizeInput(string,inline_delims=["$","$"],equation_delims=["$$","$$"],placeholder="$0$"):
    """Strip the math blocks out of text that is about to be passed to markdown.

    The content of each math block is removed and replaced by a placeholder
    which MUST be ignored by markdown.  Any pre-existing instance of the
    placeholder is "replaced" with itself and a corresponding dummy entry is
    recorded.  The sanitized string can then be passed safely through
    markdown and rebuilt with reconstructMath.

    There are potentially four delimiters that can be specified: the left and
    right delimiters for inline and for equation mode math.  These can be
    anything that isn't already used by markdown and is compatible with
    mathjax (see the documentation for both).  A delimiter or placeholder
    preceded by a backslash is treated as escaped and ignored.

    string          -- the raw text.
    inline_delims   -- [left, right] delimiters of inline math.
    equation_delims -- [left, right] delimiters of equation math.
    placeholder     -- text substituted for every stripped block.

    Returns a tuple (sanitized_string, codeblocks).  Every entry of
    codeblocks is a string whose first character is the block type ('0' for
    a literal placeholder, '1' for inline math, '2' for equation math)
    followed by the stripped content.  A block whose closing delimiter is
    never found is left in the text verbatim and not recorded.

    Raises ValueError if the placeholder is altered by markdown.
    """
    # Check placeholder is valid.
    if not markdown_safe(placeholder):
        raise ValueError("Placeholder %s altered by markdown processing." % placeholder)
    # Really what we want is a reverse markdown function, but as that's too
    # much work, this will do.  The lookbehind rejects backslash-escaped hits.
    not_escaped = "(?<!\\\\)"
    inline_left = re.compile(not_escaped + re.escape(inline_delims[0]))
    inline_right = re.compile(not_escaped + re.escape(inline_delims[1]))
    equation_left = re.compile(not_escaped + re.escape(equation_delims[0]))
    equation_right = re.compile(not_escaped + re.escape(equation_delims[1]))
    placeholder_re = re.compile(not_escaped + re.escape(placeholder))
    placeholder_scan = placeholder_re.scanner(string)
    ilscanner = [inline_left.scanner(string), inline_right.scanner(string)]
    eqscanner = [equation_left.scanner(string), equation_right.scanner(string)]
    # There are 3 types of blocks, indexed 0 for an occurrence of the
    # placeholder in the text, 1 for inline math and 2 for equation math.
    scanners = [placeholder_scan, ilscanner, eqscanner]
    start_scanners = [placeholder_scan, ilscanner[0], eqscanner[0]]
    stlen = len(string)
    # Next candidate opening of each block type; stlen stands for "none left".
    startmatches = [scanner.search() for scanner in start_scanners]
    startpoints = [m.start() if m else stlen for m in startmatches]
    # post is the index just past the text consumed so far (-1: nothing yet).
    post = -1
    # terminator is the start of the closing delimiter found most recently.
    terminator = -1
    pieces = []
    codeblocks = []
    while True:
        # Find the next point of interest: drop every candidate opening that
        # lies inside text which has already been consumed.
        for kind, scanner in enumerate(start_scanners):
            while startmatches[kind] and startmatches[kind].start() < post:
                startmatches[kind] = scanner.search()
                startpoints[kind] = startmatches[kind].start() if startmatches[kind] else stlen
        # Slices must never start at a negative index.
        resume = max(post, 0)
        # Placeholder type always takes precedence if it exists and is next...
        if startmatches[0] and min(startpoints) == startpoints[0]:
            # We can do it all in one!
            # First add the "stripped" code to the blocks
            codeblocks.append('0' + placeholder)
            # Work out where the placeholder ends
            tmp = startpoints[0] + len(placeholder)
            # Add the "sanitized" text up to and including the placeholder
            pieces.append(string[resume:tmp])
            # Set the new post and go back to the start
            post = tmp
            continue
        elif startmatches[1] is None and startmatches[2] is None:
            # No more blocks, add in the rest of string and be done with it...
            pieces.append(string[resume:])
            return ("".join(pieces), codeblocks)
        elif startmatches[1] is None:
            inBlock = 2
        elif startmatches[2] is None:
            inBlock = 1
        elif startpoints[1] < startpoints[2]:
            inBlock = 1
        elif startpoints[1] > startpoints[2]:
            inBlock = 2
        else:
            # Both kinds open at the same index.
            inBlock = break_tie(startmatches[1], startmatches[2])
        # Keep the plain text in front of the opening delimiter.
        pieces.append(string[resume:startpoints[inBlock]])
        post = startmatches[inBlock].end()
        # Now find the matching end: the first closing delimiter that does
        # not start before the end of the opening one.
        while terminator < post:
            endpoint = scanners[inBlock][1].search()
            # If we run out of terminators before ending this loop, we're done
            if endpoint is None:
                # Add the unterminated codeblock to the sanitized string
                pieces.append(string[startpoints[inBlock]:])
                return ("".join(pieces), codeblocks)
            terminator = endpoint.start()
        # We found a matching endpoint, record the content with its type...
        codeblocks.append(str(inBlock) + string[post:endpoint.start()])
        # Now add in the appropriate placeholder
        pieces.append(placeholder)
        # Fabulous.  Now we can start again once we update post...
        post = endpoint.end()
113

114
def reconstructMath(processedString,codeblocks,inline_delims=["$","$"],equation_delims=["$$","$$"],placeholder="$0$",htmlSafe=False):
    """Put the stripped math back into text that has been through markdown.

    The arguments are usually the output of sanitizeInput, after its string
    has been passed through markdown.  The delimiters and placeholder given
    here should match those used to build the string to begin with.

    processedString -- text containing placeholders.
    codeblocks      -- strings whose first character is the block type
                       ('0' placeholder, '1' inline, '2' equation) followed
                       by the content to restore.
    htmlSafe        -- "<", ">" and "&" in math can confuse the html
                       interpreter.  When True they are replaced by
                       "&lt;", "&gt;" and "&amp;" inside the math blocks;
                       an "&" that already starts one of those three
                       entities is left alone.

    Each codeblock replaces the next unescaped occurrence of the
    placeholder, wrapped in the delimiters of its type.  If there are more
    codeblocks than placeholders a warning is printed and the surplus blocks
    are dropped; surplus placeholders are left in the text.  The caller's
    codeblocks sequence is not modified.

    Returns a string containing html suitable to use with mathjax.
    """
    delims = [['', ''], inline_delims, equation_delims]
    placeholder_re = re.compile("(?<!\\\\)" + re.escape(placeholder))
    # Make html substitutions on a copy so the argument is left untouched.
    if htmlSafe:
        safeAmp = re.compile("&(?!(?:amp;|lt;|gt;))")
        codeblocks = [
            safeAmp.sub("&amp;", block).replace("<", "&lt;").replace(">", "&gt;")
            for block in codeblocks
        ]
    # Step through the codeblocks one at a time and replace the next
    # occurrence of the placeholder.
    pieces = []
    scan = placeholder_re.scanner(processedString)
    post = 0
    for block in codeblocks:
        inBlock = int(block[0])
        match = scan.search()
        if not match:
            # We make this error non-fatal: see https://github.com/sagemathinc/cocalc/issues/506
            print("WARNING: More codeblocks given than valid placeholders in text.")
            continue
        pieces.append(processedString[post:match.start()])
        pieces.append(delims[inBlock][0] + block[1:] + delims[inBlock][1])
        post = match.end()
    # Add the rest of the string (empty when the last placeholder ends it).
    pieces.append(processedString[post:])
    return "".join(pieces)
147

148
def findBoundaries(string):
    """A deprecated function.  Finds the location of string boundaries in a stupid way.

    Walks the string one character at a time looking for "$" characters that
    are not preceded by a backslash.

    Returns a tuple (oned, twod) of index lists: oned holds the position of
    every "$" that opens or closes a single-dollar block, twod holds the
    position of the second "$" of every "$$" that opens or closes a
    double-dollar block.
    """
    last = ''
    twod = []
    oned = []
    # boundary: the previous character was a usable "$" not yet classified.
    boundary = False
    inoned = False
    intwod = False
    for count, char in enumerate(string):
        if char == "$" and last != '\\':
            # We just hit a valid $ character!
            if inoned:
                oned.append(count)
                inoned = False
            elif intwod:
                if boundary:
                    # Second $ of the closing $$.
                    twod.append(count)
                    intwod = False
                    boundary = False
                else:
                    boundary = True
            elif boundary:
                # This means the last character was also a valid $
                twod.append(count)
                intwod = True
                boundary = False
            else:
                # This means the last character was NOT a usable $
                boundary = True
        elif boundary:
            # The last character was a valid $, but this one isn't...
            if inoned:
                print("THIS SHOULD NEVER HAPPEN!")
            elif intwod:
                # A lone $ inside a $$ block: ignore it...
                pass
            else:
                # The previous $ opened a single-dollar block.
                oned.append(count - 1)
                inoned = True
            boundary = False
        last = char
    # What if we finished on a boundary character?  Actually doesn't matter,
    # but let's include it for completeness.
    if boundary:
        if not (inoned or intwod):
            oned.append(count)
            inoned = True
    return (oned, twod)
196

197
Product

Resources

Company