Skip to content

Commit b09af03

Browse files
committed
Port error handlers from Py_UNICODE indexing to code point indexing.
1 parent 495dcbd commit b09af03

File tree

2 files changed

+51
-77
lines changed

2 files changed

+51
-77
lines changed

Objects/exceptions.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,6 +1513,11 @@ UnicodeEncodeError_init(PyObject *self, PyObject *args, PyObject *kwds)
15131513
return -1;
15141514
}
15151515

1516+
if (PyUnicode_READY(err->object) < -1) {
1517+
err->encoding = NULL;
1518+
return -1;
1519+
}
1520+
15161521
Py_INCREF(err->encoding);
15171522
Py_INCREF(err->object);
15181523
Py_INCREF(err->reason);

Python/codecs.c

Lines changed: 46 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -573,93 +573,82 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
573573
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
574574
PyObject *restuple;
575575
PyObject *object;
576+
Py_ssize_t i, o;
576577
Py_ssize_t start;
577578
Py_ssize_t end;
578579
PyObject *res;
579-
Py_UNICODE *p;
580-
Py_UNICODE *startp;
581-
Py_UNICODE *outp;
580+
unsigned char *outp;
582581
int ressize;
582+
Py_UCS4 ch;
583583
if (PyUnicodeEncodeError_GetStart(exc, &start))
584584
return NULL;
585585
if (PyUnicodeEncodeError_GetEnd(exc, &end))
586586
return NULL;
587587
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
588588
return NULL;
589-
startp = PyUnicode_AS_UNICODE(object);
590-
for (p = startp+start, ressize = 0; p < startp+end; ++p) {
591-
if (*p<10)
589+
for (i = start, ressize = 0; i < end; ++i) {
590+
/* object is guaranteed to be "ready" */
591+
ch = PyUnicode_READ_CHAR(object, i);
592+
if (ch<10)
592593
ressize += 2+1+1;
593-
else if (*p<100)
594+
else if (ch<100)
594595
ressize += 2+2+1;
595-
else if (*p<1000)
596+
else if (ch<1000)
596597
ressize += 2+3+1;
597-
else if (*p<10000)
598+
else if (ch<10000)
598599
ressize += 2+4+1;
599-
#ifndef Py_UNICODE_WIDE
600-
else
601-
ressize += 2+5+1;
602-
#else
603-
else if (*p<100000)
600+
else if (ch<100000)
604601
ressize += 2+5+1;
605-
else if (*p<1000000)
602+
else if (ch<1000000)
606603
ressize += 2+6+1;
607604
else
608605
ressize += 2+7+1;
609-
#endif
610606
}
611607
/* allocate replacement */
612-
res = PyUnicode_FromUnicode(NULL, ressize);
608+
res = PyUnicode_New(ressize, 127);
613609
if (res == NULL) {
614610
Py_DECREF(object);
615611
return NULL;
616612
}
613+
outp = PyUnicode_1BYTE_DATA(res);
617614
/* generate replacement */
618-
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
619-
p < startp+end; ++p) {
620-
Py_UNICODE c = *p;
615+
for (i = start, o = 0; i < end; ++i) {
616+
ch = PyUnicode_READ_CHAR(object, i);
621617
int digits;
622618
int base;
623619
*outp++ = '&';
624620
*outp++ = '#';
625-
if (*p<10) {
621+
if (ch<10) {
626622
digits = 1;
627623
base = 1;
628624
}
629-
else if (*p<100) {
625+
else if (ch<100) {
630626
digits = 2;
631627
base = 10;
632628
}
633-
else if (*p<1000) {
629+
else if (ch<1000) {
634630
digits = 3;
635631
base = 100;
636632
}
637-
else if (*p<10000) {
633+
else if (ch<10000) {
638634
digits = 4;
639635
base = 1000;
640636
}
641-
#ifndef Py_UNICODE_WIDE
642-
else {
643-
digits = 5;
644-
base = 10000;
645-
}
646-
#else
647-
else if (*p<100000) {
637+
else if (ch<100000) {
648638
digits = 5;
649639
base = 10000;
650640
}
651-
else if (*p<1000000) {
641+
else if (ch<1000000) {
652642
digits = 6;
653643
base = 100000;
654644
}
655645
else {
656646
digits = 7;
657647
base = 1000000;
658648
}
659-
#endif
660649
while (digits-->0) {
661-
*outp++ = '0' + c/base;
662-
c %= base;
650+
*outp++ = '0' + ch/base;
651+
ch %= base;
663652
base /= 10;
664653
}
665654
*outp++ = ';';
@@ -677,58 +666,41 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
677666

678667
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
679668
{
680-
#ifndef Py_UNICODE_WIDE
681-
#define IS_SURROGATE_PAIR(p, end) \
682-
(*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
683-
*(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
684-
#else
685-
#define IS_SURROGATE_PAIR(p, end) 0
686-
#endif
687669
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
688670
PyObject *restuple;
689671
PyObject *object;
672+
Py_ssize_t i;
690673
Py_ssize_t start;
691674
Py_ssize_t end;
692675
PyObject *res;
693-
Py_UNICODE *p;
694-
Py_UNICODE *startp;
695-
Py_UNICODE *outp;
676+
unsigned char *outp;
696677
int ressize;
678+
Py_UCS4 c;
697679
if (PyUnicodeEncodeError_GetStart(exc, &start))
698680
return NULL;
699681
if (PyUnicodeEncodeError_GetEnd(exc, &end))
700682
return NULL;
701683
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
702684
return NULL;
703-
startp = PyUnicode_AS_UNICODE(object);
704-
for (p = startp+start, ressize = 0; p < startp+end; ++p) {
705-
#ifdef Py_UNICODE_WIDE
706-
if (*p >= 0x00010000)
685+
for (i = start, ressize = 0; i < end; ++i) {
686+
/* object is guaranteed to be "ready" */
687+
c = PyUnicode_READ_CHAR(object, i);
688+
if (c >= 0x10000) {
707689
ressize += 1+1+8;
708-
else
709-
#endif
710-
if (*p >= 0x100) {
711-
if (IS_SURROGATE_PAIR(p, startp+end)) {
712-
ressize += 1+1+8;
713-
++p;
714-
}
715-
else
716-
ressize += 1+1+4;
690+
}
691+
else if (c >= 0x100) {
692+
ressize += 1+1+4;
717693
}
718694
else
719695
ressize += 1+1+2;
720696
}
721-
res = PyUnicode_FromUnicode(NULL, ressize);
697+
res = PyUnicode_New(ressize, 127);
722698
if (res==NULL)
723699
return NULL;
724-
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
725-
p < startp+end; ++p) {
726-
Py_UCS4 c = (Py_UCS4) *p;
700+
for (i = start, outp = PyUnicode_1BYTE_DATA(res);
701+
i < end; ++i) {
702+
c = PyUnicode_READ_CHAR(object, i);
727703
*outp++ = '\\';
728-
if (IS_SURROGATE_PAIR(p, startp+end)) {
729-
c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
730-
++p;
731-
}
732704
if (c >= 0x00010000) {
733705
*outp++ = 'U';
734706
*outp++ = Py_hexdigits[(c>>28)&0xf];
@@ -758,7 +730,6 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
758730
wrong_exception_type(exc);
759731
return NULL;
760732
}
761-
#undef IS_SURROGATE_PAIR
762733
}
763734

764735
/* This handler is declared static until someone demonstrates
@@ -768,28 +739,27 @@ PyCodec_SurrogatePassErrors(PyObject *exc)
768739
{
769740
PyObject *restuple;
770741
PyObject *object;
742+
Py_ssize_t i;
771743
Py_ssize_t start;
772744
Py_ssize_t end;
773745
PyObject *res;
774746
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
775-
Py_UNICODE *p;
776-
Py_UNICODE *startp;
777747
char *outp;
778748
if (PyUnicodeEncodeError_GetStart(exc, &start))
779749
return NULL;
780750
if (PyUnicodeEncodeError_GetEnd(exc, &end))
781751
return NULL;
782752
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
783753
return NULL;
784-
startp = PyUnicode_AS_UNICODE(object);
785754
res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
786755
if (!res) {
787756
Py_DECREF(object);
788757
return NULL;
789758
}
790759
outp = PyBytes_AsString(res);
791-
for (p = startp+start; p < startp+end; p++) {
792-
Py_UNICODE ch = *p;
760+
for (i = start; i < end; i++) {
761+
/* object is guaranteed to be "ready" */
762+
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
793763
if (ch < 0xd800 || ch > 0xdfff) {
794764
/* Not a surrogate, fail with original exception */
795765
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
@@ -847,28 +817,27 @@ PyCodec_SurrogateEscapeErrors(PyObject *exc)
847817
{
848818
PyObject *restuple;
849819
PyObject *object;
820+
Py_ssize_t i;
850821
Py_ssize_t start;
851822
Py_ssize_t end;
852823
PyObject *res;
853824
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
854-
Py_UNICODE *p;
855-
Py_UNICODE *startp;
856825
char *outp;
857826
if (PyUnicodeEncodeError_GetStart(exc, &start))
858827
return NULL;
859828
if (PyUnicodeEncodeError_GetEnd(exc, &end))
860829
return NULL;
861830
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
862831
return NULL;
863-
startp = PyUnicode_AS_UNICODE(object);
864832
res = PyBytes_FromStringAndSize(NULL, end-start);
865833
if (!res) {
866834
Py_DECREF(object);
867835
return NULL;
868836
}
869837
outp = PyBytes_AsString(res);
870-
for (p = startp+start; p < startp+end; p++) {
871-
Py_UNICODE ch = *p;
838+
for (i = start; i < end; i++) {
839+
/* object is guaranteed to be "ready" */
840+
Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
872841
if (ch < 0xdc80 || ch > 0xdcff) {
873842
/* Not a UTF-8b surrogate, fail with original exception */
874843
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);

0 commit comments

Comments
 (0)