The following code shows that FMc 's code doesn't work.
The line
from name_of_file import olding_the_new,finding
refers to my code in my personal answer in this thread to this question.
* Give the name name_of_file
to the file containing the script of my code (lying in my other answer in this thread) and it will run.
* Or if you are annoyed to copy-paste my code, just comment this line of import, and the following code will run because I put a try-except instruction that will react correctly to the absence of olding_the_new
and finding
I used two ways to verify the results of FMc 's code:
-1/ comparing the span returned by his code with index of 'f' and index of 'r', as we search for phrase 'foobar' and I managed that there are no f and r other than the ones in foobar
-2/ comparing with the first span returned by my code, hence the need of the above import from name_of_file
Nota Bene
If disp = None
is changed to disp == True
, the execution displays intermediary results that help to understand the algorithm.
.
import re
from name_of_file import olding_the_new,finding
def main():
# Two versions of the text: the original,
# and one without any of the "#" markers.
for text_orig in ('BLAH ##BLAH fo####o##ba###r## BL##AH',
'jkh##jh#f',
'#f#oo##ba###r##',
'a##xf#oo##ba###r##',
'ax##f#oo##ba###r##',
'ab###xyf#oo##ba###r##',
'abx###yf#oo##ba###r##',
'abxy###f#oo##ba###r##',
'iji#hkh#f#oo##ba###r##',
'mn##pps#f#oo##ba###r##',
'mn##pab###xyf#oo##ba###r##',
'lmn#pab###xyf#oo##ba###r##',
'fo##o##ba###r## aaaaaBLfoob##arAH',
'fo#o##ba####r## aaaaaBLfoob##ar#AH',
'f##oo##ba###r## aaaaaBLfoob##ar',
'f#oo##ba####r## aaaaBL#foob##arAH',
'f#oo##ba####r## aaaaBL#foob##ar#AH',
'foo##ba#####r## aaaaBL#foob##ar',
'#f#oo##ba###r## aaaBL##foob##arAH',
'#foo##ba####r## aaaBL##foob##ar#AH',
'#af#oo##ba##r## aaaBL##foob##ar',
'##afoo##ba###r## aaaaaBLfoob##arAH',
'BLAHHfo##o##ba###r aaBLfoob##ar#AH',
'BLAH#fo##o##ba###r aaBLfoob##ar',
'BLA#Hfo##o##ba###r###BLfoob##ar',
'BLA#Hfo##o##ba###r#BL##foob##ar',
):
text_clean = text_orig.replace('#', '')
# Collect data on the positions and widths
# of the markers in the original text.
rgx = re.compile(r'#+')
markers = [(m.start(), len(m.group()))
for m in rgx.finditer(text_orig)]
# Find the location of the search phrase in the cleaned text.
# At that point you'll have all the data you need to compute
# the span of the phrase in the original text.
search = 'foobar'
try:
i = text_clean.index(search)
print ('text_clean == %s\n'
"text_clean.index('%s')==%d len('%s') == %d\n"
'text_orig == %s\n'
'markers == %s'
% (text_clean,
search,i,search,len(search),
text_orig,
markers))
S,E = compute_span(i, len(search), markers)
print "span = (%d,%d) %s %s %s"\
% (S,E,
text_orig.index('f')==S,
text_orig.index('r')+1==E,
list(finding(search,text_orig,'#+')))
except ValueError:
print ('text_clean == %s\n'
"text_clean.index('%s') ***Not found***\n"
'text_orig == %s\n'
'markers == %s'
% (text_clean,
search,
text_orig,
markers))
print '--------------------------------'
.
def compute_span(start, width, markers):
# start and width are in expurgated text
# markers are in original text
disp = None # if disp==True => displaying of intermediary results
span_start = start
if disp:
print ('\nAt beginning in compute_span():\n'
' span_start==start==%d width==%d'
% (start,width))
for s, w in markers: # s and w are in original text
if disp:
print ('\ns,w==%d,%d'
' s+w-1(%d)<start(%d) %s'
' s(%d)==start(%d) %s'
% (s,w,s+w-1,start,s+w-1<start,s,start,s==start))
if s + w - 1 < start:
#mwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmmwmwmwmwmwm
# the following if-else section is justified to be used
# only after correction of the above line to this one:
# if s+w-1 <= start or s==start:
#mwmwmwmwmwmwmwmwmwmwmwmwmwmwmwm
if s + w - 1 <= start and disp:
print ' 1a) s + w - 1 (%d) <= start (%d) marker at left'\
% (s+w-1, start)
elif disp:
print ' 1b) s(%d) == start(%d)' % (s,start)
#mwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmwmmwmwmwmwmwm
# Situation: marker fully to left of our text.
# Adjust our start points rightward.
start += w
span_start += w
if disp:
print ' span_start == %d start, width == %d, %d' % (span_start, start, width)
elif start + width - 1 < s:
if disp:
print (' 2) start + width - 1 (%d) < s (%d) marker at right\n'
' break' % (start+width-1, s))
# Situation: marker fully to the right of our text.
break
else:
# Situation: marker interrupts our text.
# Advance the start point for the remaining text
# rightward, and reduce the remaining width.
if disp:
print " 3) In 'else': s - start == %d marker interrupts" % (s - start)
start += w
width = width - (s - start)
if disp:
print ' span_start == %d start, width == %d, %d' % (span_start, start, width)
return (span_start, start + width)
.
main()
Result
>>>
text_clean == BLAH BLAH foobar BLAH
text_clean.index('foobar')==10 len('foobar') == 6
text_orig == BLAH ##BLAH fo####o##ba###r## BL##AH
markers == [(5, 2), (14, 4), (19, 2), (23, 3), (27, 2), (32, 2)]
span = (12,26) True False [(12, 27)]
--------------------------------
text_clean == jkhjhf
text_clean.index('foobar') ***Not found***
text_orig == jkh##jh#f
markers == [(3, 2), (7, 1)]
--------------------------------
text_clean == foobar
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == #f#oo##ba###r##
markers == [(0, 1), (2, 1), (5, 2), (9, 3), (13, 2)]
span = (0,11) False False [(1, 13)]
--------------------------------
text_clean == axfoobar
text_clean.index('foobar')==2 len('foobar') == 6
text_orig == a##xf#oo##ba###r##
markers == [(1, 2), (5, 1), (8, 2), (12, 3), (16, 2)]
span = (2,16) False True [(4, 16)]
--------------------------------
text_clean == axfoobar
text_clean.index('foobar')==2 len('foobar') == 6
text_orig == ax##f#oo##ba###r##
markers == [(2, 2), (5, 1), (8, 2), (12, 3), (16, 2)]
span = (2,15) False False [(4, 16)]
--------------------------------
text_clean == abxyfoobar
text_clean.index('foobar')==4 len('foobar') == 6
text_orig == ab###xyf#oo##ba###r##
markers == [(2, 3), (8, 1), (11, 2), (15, 3), (19, 2)]
span = (4,19) False True [(7, 19)]
--------------------------------
text_clean == abxyfoobar
text_clean.index('foobar')==4 len('foobar') == 6
text_orig == abx###yf#oo##ba###r##
markers == [(3, 3), (8, 1), (11, 2), (15, 3), (19, 2)]
span = (4,18) False False [(7, 19)]
--------------------------------
text_clean == abxyfoobar
text_clean.index('foobar')==4 len('foobar') == 6
text_orig == abxy###f#oo##ba###r##
markers == [(4, 3), (8, 1), (11, 2), (15, 3), (19, 2)]
span = (4,19) False True [(7, 19)]
--------------------------------
text_clean == ijihkhfoobar
text_clean.index('foobar')==6 len('foobar') == 6
text_orig == iji#hkh#f#oo##ba###r##
markers == [(3, 1), (7, 1), (9, 1), (12, 2), (16, 3), (20, 2)]
span = (7,18) False False [(8, 20)]
--------------------------------
text_clean == mnppsfoobar
text_clean.index('foobar')==5 len('foobar') == 6
text_orig == mn##pps#f#oo##ba###r##
markers == [(2, 2), (7, 1), (9, 1), (12, 2), (16, 3), (20, 2)]
span = (7,18) False False [(8, 20)]
--------------------------------
text_clean == mnpabxyfoobar
text_clean.index('foobar')==7 len('foobar') == 6
text_orig == mn##pab###xyf#oo##ba###r##
markers == [(2, 2), (7, 3), (13, 1), (16, 2), (20, 3), (24, 2)]
span = (9,24) False True [(12, 24)]
--------------------------------
text_clean == lmnpabxyfoobar
text_clean.index('foobar')==8 len('foobar') == 6
text_orig == lmn#pab###xyf#oo##ba###r##
markers == [(3, 1), (7, 3), (13, 1), (16, 2), (20, 3), (24, 2)]
span = (9,24) False True [(12, 24)]
--------------------------------
text_clean == foobar aaaaaBLfoobarAH
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == fo##o##ba###r## aaaaaBLfoob##arAH
markers == [(2, 2), (5, 2), (9, 3), (13, 2), (27, 2)]
span = (0,9) True False [(0, 13), (23, 31)]
--------------------------------
text_clean == foobar aaaaaBLfoobarAH
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == fo#o##ba####r## aaaaaBLfoob##ar#AH
markers == [(2, 1), (4, 2), (8, 4), (13, 2), (27, 2), (31, 1)]
span = (0,7) True False [(0, 13), (23, 31)]
--------------------------------
text_clean == foobar aaaaaBLfoobar
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == f##oo##ba###r## aaaaaBLfoob##ar
markers == [(1, 2), (5, 2), (9, 3), (13, 2), (27, 2)]
span = (0,11) True False [(0, 13), (23, 31)]
--------------------------------
text_clean == foobar aaaaBLfoobarAH
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == f#oo##ba####r## aaaaBL#foob##arAH
markers == [(1, 1), (4, 2), (8, 4), (13, 2), (22, 1), (27, 2)]
span = (0,8) True False [(0, 13), (23, 31)]
--------------------------------
text_clean == foobar aaaaBLfoobarAH
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == f#oo##ba####r## aaaaBL#foob##ar#AH
markers == [(1, 1), (4, 2), (8, 4), (13, 2), (22, 1), (27, 2), (31, 1)]
span = (0,8) True False [(0, 13), (23, 31)]
--------------------------------
text_clean == foobar aaaaBLfoobar
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == foo##ba#####r## aaaaBL#foob##ar
markers == [(3, 2), (7, 5), (13, 2), (22, 1), (27, 2)]
span = (0,7) True False [(0, 13), (23, 31)]
--------------------------------
text_clean == foobar aaaBLfoobarAH
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == #f#oo##ba###r## aaaBL##foob##arAH
markers == [(0, 1), (2, 1), (5, 2), (9, 3), (13, 2), (21, 2), (27, 2)]
span = (0,11) False False [(1, 13), (23, 31)]
--------------------------------
text_clean == foobar aaaBLfoobarAH
text_clean.index('foobar')==0 len('foobar') == 6
text_orig == #foo##ba####r## aaaBL##foob##ar#AH
markers == [(0, 1), (4, 2), (8, 4), (13, 2), (21, 2), (27, 2), (31, 1)]
span = (0,12) False False [(1, 13), (23, 31)]
--------------------------------
text_clean == afoobar aaaBLfoobar
text_clean.index('foobar')==1 len('foobar') == 6
text_orig == #af#oo##ba##r## aaaBL##foob##ar
markers == [(0, 1), (3, 1), (6, 2), (10, 2), (13, 2), (21, 2), (27, 2)]
span = (2,10) True False [(2, 13), (23, 31)]
--------------------------------
text_clean == afoobar aaaaaBLfoobarAH
text_clean.index('foobar')==1 len('foobar') == 6
text_orig == ##afoo##ba###r## aaaaaBLfoob##arAH
markers == [(0, 2), (6, 2), (10, 3), (14, 2), (28, 2)]
span = (1,14) False True [(3, 14), (24, 32)]
--------------------------------
text_clean == BLAHHfoobar aaBLfoobarAH
text_clean.index('foobar')==5 len('foobar') == 6
text_orig == BLAHHfo##o##ba###r aaBLfoob##ar#AH
markers == [(7, 2), (10, 2), (14, 3), (27, 2), (31, 1)]
span = (5,14) True False [(5, 18), (23, 31)]
--------------------------------
text_clean == BLAHfoobar aaBLfoobar
text_clean.index('foobar')==4 len('foobar') == 6
text_orig == BLAH#fo##o##ba###r aaBLfoob##ar
markers == [(4, 1), (7, 2), (10, 2), (14, 3), (27, 2)]
span = (4,16) False False [(5, 18), (23, 31)]
--------------------------------
text_clean == BLAHfoobarBLfoobar
text_clean.index('foobar')==4 len('foobar') == 6
text_orig == BLA#Hfo##o##ba###r###BLfoob##ar
markers == [(3, 1), (7, 2), (10, 2), (14, 3), (18, 3), (27, 2)]
span = (5,14) True False [(5, 18), (23, 31)]
--------------------------------
text_clean == BLAHfoobarBLfoobar
text_clean.index('foobar')==4 len('foobar') == 6
text_orig == BLA#Hfo##o##ba###r#BL##foob##ar
markers == [(3, 1), (7, 2), (10, 2), (14, 3), (18, 1), (21, 2), (27, 2)]
span = (5,14) True False [(5, 18), (23, 31)]
--------------------------------
>>>
.
---------------------------------------------
The code of FMc is very subtle, I had a long hard time to understand its principle and then to be able to correct it.
I will let anybody the task to understand the algorithm. I only say the corrections required to make the code of FMc to work correctly:
.
First correction:
if s + w - 1 < start:
# must be changed to
if s + w - 1 <= start or (s==start):
EDIT
In my initial present answer,
I had written ... or (s<=start)
.
That was an error of me, in fact I had the intention to write
.. or (s==start)
NOTA BENE about this EDIT:
This error had no consequence in the code corrected with the two corrections I describe here to correct the initial code of FMc (the very first one, because at present he has changed it two times).
Indeed, if you correct the code with the two corrections, you'll obtain correct results with all the 25 examples taken for text_orig
, as well with ... or (s <= start)
as with ... or (s==start)
.
So I thought that the case in which s < start
is True could never happen when the first condition s+w-1 <= start
is False, presumely based on the fact that w
is always greater than 0 and some other reason due to the configuration of the markers and non-marker sequences.....
So I tried to find the demonstration of this impression... and I failed.
Moreover, I reached a state of mind in which I even no more understand the algorithm of FMc (the very first one before any edit he did) !!
Despite this, I let this answer as it is, and I post, at the end of this answer, the explanations trying to explain why these corrections are needed.
But I warn: the very first algorithm of FMc is very outlandish and difficult to comprehend because it does comparison of indices belonging to two different strings, one which is text_orig with markers #### and another one cleaned of all these markers.... and now I am no more convinced that may have a sense....
.
Second correction::
start += w
width = width - (s - start)
# must be changed to
width -= (s-start) # this line MUST BE before the following one
start = s + w # because start += (s-start) + w
-------------------
I am stunned that 2 people upvoted the answer of FMc though it gives a wrong code. It means that they upvoted an answer without having tested the given code.
----------------------------------------
.
EDIT
Why must the condition if s + w - 1 < start:
must be changed to this one:
if s + w - 1 <= start or (s==start):
?
Because it may happen that s + w - 1 < start
should be False and s
equals start
together.
In this case, the execution goes to the else
section and executes (in corrected code):
width -= (s - start)
start = s + w
Consequently, width
doesn't change while it evidently should change when we see the sequence concerned.
This case may occur at the moment the first marker is examined, as with the following sequences:
'#f#oo##ba###r##' : s,w==0,1 , 0==s==start==0
'ax##f#oo##ba###r##' : s,w==2,2 , 2==s==start==2
'abxy###f#oo##ba###r##' : s,w==4,3 , 4==s==start==4
'#f#oo##ba###r## aaaBL##foob##arAH' : s,w==0,1 , 0==s==start==0
'BLAH#fo##o##ba###r aaBLfoob##ar' : s,w==4,1 4==s==start==4
With the following ones, it occurs for the examination of the second marker:
'iji#hkh#f#oo##ba###r##' : s,w==7,1 , 7==s==start==7
'mn##pps#f#oo##ba###r##' : s,w==7,1 , 7==s==start==7
It can be understood better by executing my code with disp = True
setted.
When s + w - 1 <= start
is verified, the fact that s
may equal start
isn't troublesome because the execution doesn't go to the else
section, it goes to the first section in which there's only the addition of w
to s
and to start
.
But when s + w - 1 <= start
is False while s
equals start
, the execution goes to the else
section where the execution of instruction width -= (s-start)
doesn't change anything to the value of width and that's troublesome.
So the condition or (s==start)
must be added to prevent this else
destination, and it is needed to put it after an or
to prevent this destination even when s+w-1 <= start
is False, which can happen as some examples show it.
.
Concerning the fact that the instruction s+w-1 < start
must be changed to s+w-1 <= start
(with =),
it's because of the case w==1
corresponding to 1 character # only ,
as for the cases
mn##pps#f#oo##ba###r##
(second marker)
and BLAH#fo##o##ba###r
(first marker).