Skip to content

Commit f0ae1bc

Browse files
committed
Remove changeState from tokenizer.py and assign self.state directly instead; this saves us over 200,000 method invocations when tokenizing web-apps.htm and makes us at least 0.3 CPU seconds faster. (Yeah, I know I introduced the damn method in the first place :-)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%40332
1 parent 2660421 commit f0ae1bc

1 file changed

Lines changed: 48 additions & 52 deletions

File tree

src/tokenizer.py

Lines changed: 48 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -139,8 +139,8 @@ def __init__(self, stream):
139139
}
140140

141141
# Setup the initial tokenizer state
142-
self.contentModelFlag = contentModelFlags['PCDATA']
143-
self.state = self.states['data']
142+
self.contentModelFlag = contentModelFlags["PCDATA"]
143+
self.state = self.states["data"]
144144

145145
# The current token being created
146146
self.currentToken = None
@@ -163,10 +163,6 @@ def __iter__(self):
163163
while self.tokenQueue:
164164
yield self.tokenQueue.pop(0)
165165

166-
def changeState(self, state):
167-
self.state = self.states[state]
168-
169-
170166
# Below are various helper functions, factored out for use by the tokenizer states.
171167
def processSolidusInTag(self):
172168
"""When a solidus (/) is encountered within a tag name what happens
@@ -344,7 +340,7 @@ def emitCurrentToken(self):
344340
# Add token to the queue to be yielded
345341
self.tokenQueue.append(token)
346342

347-
self.changeState("data")
343+
self.state = self.states["data"]
348344

349345
def emitCurrentTokenWithParseError(self, data=None):
350346
"""This method is equivalent to emitCurrentToken (well, it invokes it)
@@ -358,7 +354,7 @@ def emitCurrentTokenWithParseError(self, data=None):
358354
def attributeValueQuotedStateHandler(self, quoteType):
359355
data = self.stream.char()
360356
if data == quoteType:
361-
self.changeState("beforeAttributeName")
357+
self.state = self.states["beforeAttributeName"]
362358
elif data == u"&":
363359
self.processEntityInAttribute()
364360
elif data == EOF:
@@ -378,10 +374,10 @@ def dataState(self):
378374
if (data == u"&" and
379375
(self.contentModelFlag in
380376
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]))):
381-
self.changeState("entityData")
377+
self.state = self.states["entityData"]
382378
elif (data == u"<" and
383379
self.contentModelFlag != contentModelFlags["PLAINTEXT"]):
384-
self.changeState("tagOpen")
380+
self.state = self.states["tagOpen"]
385381
elif data == EOF:
386382
# Tokenization ends.
387383
return False
@@ -401,40 +397,40 @@ def entityDataState(self):
401397
self.tokenQueue.append(Characters(entity))
402398
else:
403399
self.tokenQueue.append(Characters(u"&"))
404-
self.changeState("data")
400+
self.state = self.states["data"]
405401
return True
406402

407403
def tagOpenState(self):
408404
data = self.stream.char()
409405
if (self.contentModelFlag in
410406
(contentModelFlags["RCDATA"], contentModelFlags["CDATA"])):
411407
if data == u"/":
412-
self.changeState("closeTagOpen")
408+
self.state = self.states["closeTagOpen"]
413409
else:
414410
self.tokenQueue.append(Characters(u"<"))
415411
self.stream.queue.append(data)
416-
self.changeState("data")
412+
self.state = self.states["data"]
417413
elif self.contentModelFlag == contentModelFlags['PCDATA']:
418414
if data == u"!":
419-
self.changeState("markupDeclarationOpen")
415+
self.state = self.states["markupDeclarationOpen"]
420416
elif data == u"/":
421-
self.changeState("closeTagOpen")
417+
self.state = self.states["closeTagOpen"]
422418
elif data in asciiLetters:
423419
self.currentToken = StartTag(data.lower())
424-
self.changeState("tagName")
420+
self.state = self.states["tagName"]
425421
elif data == u">":
426422
self.tokenQueue.append(ParseError())
427423
self.tokenQueue.append(Characters(u"<>"))
428-
self.changeState("data")
424+
self.state = self.states["data"]
429425
elif data == u"?":
430426
self.tokenQueue.append(ParseError())
431427
self.stream.queue.append(data)
432-
self.changeState("bogusComment")
428+
self.state = self.states["bogusComment"]
433429
else:
434430
self.tokenQueue.append(ParseError())
435431
self.tokenQueue.append(Characters(u"<"))
436432
self.stream.queue.append(data)
437-
self.changeState("data")
433+
self.state = self.states["data"]
438434
else:
439435
assert False
440436
return True
@@ -469,7 +465,7 @@ def closeTagOpenState(self):
469465
else:
470466
self.tokenQueue.append(ParseError())
471467
self.tokenQueue.append(Characters(u"</"))
472-
self.changeState("data")
468+
self.state = self.states["data"]
473469

474470
# Need to return here since we don't want the rest of the
475471
# method to be walked through.
@@ -479,25 +475,25 @@ def closeTagOpenState(self):
479475
data = self.stream.char()
480476
if data in asciiLetters:
481477
self.currentToken = EndTag(data.lower())
482-
self.changeState("tagName")
478+
self.state = self.states["tagName"]
483479
elif data == u">":
484480
self.tokenQueue.append(ParseError())
485-
self.changeState("data")
481+
self.state = self.states["data"]
486482
elif data == EOF:
487483
self.tokenQueue.append(ParseError())
488484
self.tokenQueue.append(Characters(u"</"))
489485
self.stream.queue.append(data)
490-
self.changeState("data")
486+
self.state = self.states["data"]
491487
else:
492488
self.tokenQueue.append(ParseError())
493489
self.stream.queue.append(data)
494-
self.changeState("bogusComment")
490+
self.state = self.states["bogusComment"]
495491
return True
496492

497493
def tagNameState(self):
498494
data = self.stream.char()
499495
if data in spaceCharacters:
500-
self.changeState("beforeAttributeName")
496+
self.state = self.states["beforeAttributeName"]
501497
elif data in asciiLowercase:
502498
data += self.stream.charsUntil(asciiLowercase, True)
503499
self.currentToken.name += data
@@ -510,7 +506,7 @@ def tagNameState(self):
510506
self.emitCurrentTokenWithParseError(data)
511507
elif data == u"/":
512508
self.processSolidusInTag()
513-
self.changeState("beforeAttributeName")
509+
self.state = self.states["beforeAttributeName"]
514510
else:
515511
self.currentToken.name += data
516512
return True
@@ -521,7 +517,7 @@ def beforeAttributeNameState(self):
521517
pass
522518
elif data in asciiUppercase:
523519
self.currentToken.data.append([data.lower(), ""])
524-
self.changeState("attributeName")
520+
self.state = self.states["attributeName"]
525521
elif data == u">":
526522
self.emitCurrentToken()
527523
elif data == u"/":
@@ -530,14 +526,14 @@ def beforeAttributeNameState(self):
530526
self.emitCurrentTokenWithParseError(data)
531527
else:
532528
self.currentToken.data.append([data, ""])
533-
self.changeState("attributeName")
529+
self.state = self.states["attributeName"]
534530
return True
535531

536532
def attributeNameState(self):
537533
data = self.stream.char()
538534
leavingThisState = True
539535
if data == u"=":
540-
self.changeState("beforeAttributeValue")
536+
self.state = self.states["beforeAttributeValue"]
541537
elif data in asciiLowercase:
542538
self.currentToken.data[-1][0] += data + self.stream.charsUntil(
543539
asciiLowercase, True)
@@ -552,10 +548,10 @@ def attributeNameState(self):
552548
# because data is a dict not a list
553549
pass
554550
elif data in spaceCharacters:
555-
self.changeState("afterAttributeName")
551+
self.state = self.states["afterAttributeName"]
556552
elif data == u"/":
557553
self.processSolidusInTag()
558-
self.changeState("beforeAttributeName")
554+
self.state = self.states["beforeAttributeName"]
559555
elif data == u"<" or data == EOF:
560556
self.emitCurrentTokenWithParseError(data)
561557
leavingThisState = False
@@ -580,40 +576,40 @@ def afterAttributeNameState(self):
580576
if data in spaceCharacters:
581577
pass
582578
elif data == u"=":
583-
self.changeState("beforeAttributeValue")
579+
self.state = self.states["beforeAttributeValue"]
584580
elif data == u">":
585581
self.emitCurrentToken()
586582
elif data in asciiUppercase:
587583
self.currentToken.data.append([data.lower(), ""])
588-
self.changeState("attributeName")
584+
self.state = self.states["attributeName"]
589585
elif data == u"/":
590586
self.processSolidusInTag()
591-
self.changeState("beforeAttributeName")
587+
self.state = self.states["beforeAttributeName"]
592588
elif data == u"<" or data == EOF:
593589
self.emitCurrentTokenWithParseError(data)
594590
else:
595591
self.currentToken.data.append([data, ""])
596-
self.changeState("attributeName")
592+
self.state = self.states["attributeName"]
597593
return True
598594

599595
def beforeAttributeValueState(self):
600596
data = self.stream.char()
601597
if data in spaceCharacters:
602598
pass
603599
elif data == u"\"":
604-
self.changeState("attributeValueDoubleQuoted")
600+
self.state = self.states["attributeValueDoubleQuoted"]
605601
elif data == u"&":
606-
self.changeState("attributeValueUnQuoted")
602+
self.state = self.states["attributeValueUnQuoted"]
607603
self.stream.queue.append(data);
608604
elif data == u"'":
609-
self.changeState("attributeValueSingleQuoted")
605+
self.state = self.states["attributeValueSingleQuoted"]
610606
elif data == u">":
611607
self.emitCurrentToken()
612608
elif data == u"<" or data == EOF:
613609
self.emitCurrentTokenWithParseError(data)
614610
else:
615611
self.currentToken.data[-1][1] += data
616-
self.changeState("attributeValueUnQuoted")
612+
self.state = self.states["attributeValueUnQuoted"]
617613
return True
618614

619615
def attributeValueDoubleQuotedState(self):
@@ -630,7 +626,7 @@ def attributeValueSingleQuotedState(self):
630626
def attributeValueUnQuotedState(self):
631627
data = self.stream.char()
632628
if data in spaceCharacters:
633-
self.changeState("beforeAttributeName")
629+
self.state = self.states["beforeAttributeName"]
634630
elif data == u"&":
635631
self.processEntityInAttribute()
636632
elif data == u">":
@@ -661,24 +657,24 @@ def markupDeclarationOpenState(self):
661657
charStack = [self.stream.char(), self.stream.char()]
662658
if charStack == [u"-", u"-"]:
663659
self.currentToken = Comment()
664-
self.changeState("comment")
660+
self.state = self.states["comment"]
665661
else:
666662
for x in xrange(5):
667663
charStack.append(self.stream.char())
668664
# Put in explicit EOF check
669665
if (not EOF in charStack and
670666
"".join(charStack).upper() == u"DOCTYPE"):
671-
self.changeState("doctype")
667+
self.state = self.states["doctype"]
672668
else:
673669
self.tokenQueue.append(ParseError())
674670
self.stream.queue.extend(charStack)
675-
self.changeState("bogusComment")
671+
self.state = self.states["bogusComment"]
676672
return True
677673

678674
def commentState(self):
679675
data = self.stream.char()
680676
if data == u"-":
681-
self.changeState("commentDash")
677+
self.state = self.states["commentDash"]
682678
elif data == EOF:
683679
self.emitCurrentTokenWithParseError(data)
684680
else:
@@ -688,7 +684,7 @@ def commentState(self):
688684
def commentDashState(self):
689685
data = self.stream.char()
690686
if data == u"-":
691-
self.changeState("commentEnd")
687+
self.state = self.states["commentEnd"]
692688
elif data == EOF:
693689
self.emitCurrentTokenWithParseError(data)
694690
else:
@@ -712,17 +708,17 @@ def commentEndState(self):
712708
else:
713709
self.tokenQueue.append(ParseError())
714710
self.currentToken.data += u"--" + data
715-
self.changeState("comment")
711+
self.state = self.states["comment"]
716712
return True
717713

718714
def doctypeState(self):
719715
data = self.stream.char()
720716
if data in spaceCharacters:
721-
self.changeState("beforeDoctypeName")
717+
self.state = self.states["beforeDoctypeName"]
722718
else:
723719
self.tokenQueue.append(ParseError())
724720
self.stream.queue.append(data)
725-
self.changeState("beforeDoctypeName")
721+
self.state = self.states["beforeDoctypeName"]
726722
return True
727723

728724
def beforeDoctypeNameState(self):
@@ -731,7 +727,7 @@ def beforeDoctypeNameState(self):
731727
pass
732728
elif data in asciiLowercase:
733729
self.currentToken = Doctype(data.upper())
734-
self.changeState("doctypeName")
730+
self.state = self.states["doctypeName"]
735731
elif data == u">":
736732
# Character needs to be consumed per the specification so don't
737733
# invoke emitCurrentTokenWithParseError with "data" as argument.
@@ -740,14 +736,14 @@ def beforeDoctypeNameState(self):
740736
self.emitCurrentTokenWithParseError(data)
741737
else:
742738
self.currentToken = Doctype(data)
743-
self.changeState("doctypeName")
739+
self.state = self.states["doctypeName"]
744740
return True
745741

746742
def doctypeNameState(self):
747743
data = self.stream.char()
748744
needsDoctypeCheck = False
749745
if data in spaceCharacters:
750-
self.changeState("afterDoctypeName")
746+
self.state = self.states["afterDoctypeName"]
751747
needsDoctypeCheck = True
752748
elif data == u">":
753749
self.emitCurrentToken()
@@ -779,7 +775,7 @@ def afterDoctypeNameState(self):
779775
else:
780776
self.tokenQueue.append(ParseError())
781777
self.currentToken.data = True
782-
self.changeState("bogusDoctype")
778+
self.state = self.states["bogusDoctype"]
783779
return True
784780

785781
def bogusDoctypeState(self):

0 commit comments

Comments
 (0)