@@ -392,397 +392,3 @@ stringlib_splitlines(PyObject* str_obj,
392392}
393393
394394#endif
395- /* stringlib: split implementation */
396-
397- #ifndef STRINGLIB_SPLIT_H
398- #define STRINGLIB_SPLIT_H
399-
400- #ifndef STRINGLIB_FASTSEARCH_H
401- #error must include "stringlib/fastsearch.h" before including this module
402- #endif
403-
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  E.g., "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things. */

#define MAX_PREALLOC 12

/* Number of list slots to preallocate for a split limited to `maxsplit`
   splits: one more than the number of splits (5 splits gives 6 elements),
   capped at MAX_PREALLOC.  The argument is parenthesized so that an
   expression argument is evaluated as a unit (it is still evaluated twice). */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)

/* Create the substring data[left:right] and append it to `list`.
   Relies on the enclosing function providing the locals `sub` and `list`
   and an `onError` label, which is jumped to if either the substring
   creation or the append fails.  The reference held in `sub` is released
   after the append attempt in both the success and the failure case.
   Wrapped in do { } while (0) so it behaves as a single statement. */
#define SPLIT_APPEND(data, left, right)             \
    do {                                            \
        sub = STRINGLIB_NEW((data) + (left),        \
                            (right) - (left));      \
        if (sub == NULL)                            \
            goto onError;                           \
        if (PyList_Append(list, sub)) {             \
            Py_DECREF(sub);                         \
            goto onError;                           \
        }                                           \
        Py_DECREF(sub);                             \
    } while (0)

/* Create the substring data[left:right] and store it as item number `count`
   of `list`, then increment `count`.  While `count` is below MAX_PREALLOC
   the item is written directly with PyList_SET_ITEM (the reference in `sub`
   is handed to that call); afterwards PyList_Append is used and the
   reference in `sub` is released.  Relies on the enclosing function
   providing the locals `sub`, `list` and `count` and an `onError` label.
   Wrapped in do { } while (0) so it behaves as a single statement. */
#define SPLIT_ADD(data, left, right)                \
    do {                                            \
        sub = STRINGLIB_NEW((data) + (left),        \
                            (right) - (left));      \
        if (sub == NULL)                            \
            goto onError;                           \
        if (count < MAX_PREALLOC) {                 \
            PyList_SET_ITEM(list, count, sub);      \
        }                                           \
        else {                                      \
            if (PyList_Append(list, sub)) {         \
                Py_DECREF(sub);                     \
                goto onError;                       \
            }                                       \
            Py_DECREF(sub);                         \
        }                                           \
        count++;                                    \
    } while (0)


/* Always force the list to the expected size: sets the list's size field
   to the local `count` of the enclosing function. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
449-
450- Py_LOCAL_INLINE (PyObject * )
451- stringlib_split_whitespace (PyObject * str_obj ,
452- const STRINGLIB_CHAR * str , Py_ssize_t str_len ,
453- Py_ssize_t maxcount )
454- {
455- Py_ssize_t i , j , count = 0 ;
456- PyObject * list = PyList_New (PREALLOC_SIZE (maxcount ));
457- PyObject * sub ;
458-
459- if (list == NULL )
460- return NULL ;
461-
462- i = j = 0 ;
463- while (maxcount -- > 0 ) {
464- while (i < str_len && STRINGLIB_ISSPACE (str [i ]))
465- i ++ ;
466- if (i == str_len ) break ;
467- j = i ; i ++ ;
468- while (i < str_len && !STRINGLIB_ISSPACE (str [i ]))
469- i ++ ;
470- #ifndef STRINGLIB_MUTABLE
471- if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT (str_obj )) {
472- /* No whitespace in str_obj, so just use it as list[0] */
473- Py_INCREF (str_obj );
474- PyList_SET_ITEM (list , 0 , (PyObject * )str_obj );
475- count ++ ;
476- break ;
477- }
478- #endif
479- SPLIT_ADD (str , j , i );
480- }
481-
482- if (i < str_len ) {
483- /* Only occurs when maxcount was reached */
484- /* Skip any remaining whitespace and copy to end of string */
485- while (i < str_len && STRINGLIB_ISSPACE (str [i ]))
486- i ++ ;
487- if (i != str_len )
488- SPLIT_ADD (str , i , str_len );
489- }
490- FIX_PREALLOC_SIZE (list );
491- return list ;
492-
493- onError :
494- Py_DECREF (list );
495- return NULL ;
496- }
497-
498- Py_LOCAL_INLINE (PyObject * )
499- stringlib_split_char (PyObject * str_obj ,
500- const STRINGLIB_CHAR * str , Py_ssize_t str_len ,
501- const STRINGLIB_CHAR ch ,
502- Py_ssize_t maxcount )
503- {
504- Py_ssize_t i , j , count = 0 ;
505- PyObject * list = PyList_New (PREALLOC_SIZE (maxcount ));
506- PyObject * sub ;
507-
508- if (list == NULL )
509- return NULL ;
510-
511- i = j = 0 ;
512- while ((j < str_len ) && (maxcount -- > 0 )) {
513- for (; j < str_len ; j ++ ) {
514- /* I found that using memchr makes no difference */
515- if (str [j ] == ch ) {
516- SPLIT_ADD (str , i , j );
517- i = j = j + 1 ;
518- break ;
519- }
520- }
521- }
522- #ifndef STRINGLIB_MUTABLE
523- if (count == 0 && STRINGLIB_CHECK_EXACT (str_obj )) {
524- /* ch not in str_obj, so just use str_obj as list[0] */
525- Py_INCREF (str_obj );
526- PyList_SET_ITEM (list , 0 , (PyObject * )str_obj );
527- count ++ ;
528- } else
529- #endif
530- if (i <= str_len ) {
531- SPLIT_ADD (str , i , str_len );
532- }
533- FIX_PREALLOC_SIZE (list );
534- return list ;
535-
536- onError :
537- Py_DECREF (list );
538- return NULL ;
539- }
540-
541- Py_LOCAL_INLINE (PyObject * )
542- stringlib_split (PyObject * str_obj ,
543- const STRINGLIB_CHAR * str , Py_ssize_t str_len ,
544- const STRINGLIB_CHAR * sep , Py_ssize_t sep_len ,
545- Py_ssize_t maxcount )
546- {
547- Py_ssize_t i , j , pos , count = 0 ;
548- PyObject * list , * sub ;
549-
550- if (sep_len == 0 ) {
551- PyErr_SetString (PyExc_ValueError , "empty separator" );
552- return NULL ;
553- }
554- else if (sep_len == 1 )
555- return stringlib_split_char (str_obj , str , str_len , sep [0 ], maxcount );
556-
557- list = PyList_New (PREALLOC_SIZE (maxcount ));
558- if (list == NULL )
559- return NULL ;
560-
561- i = j = 0 ;
562- while (maxcount -- > 0 ) {
563- pos = fastsearch (str + i , str_len - i , sep , sep_len , -1 , FAST_SEARCH );
564- if (pos < 0 )
565- break ;
566- j = i + pos ;
567- SPLIT_ADD (str , i , j );
568- i = j + sep_len ;
569- }
570- #ifndef STRINGLIB_MUTABLE
571- if (count == 0 && STRINGLIB_CHECK_EXACT (str_obj )) {
572- /* No match in str_obj, so just use it as list[0] */
573- Py_INCREF (str_obj );
574- PyList_SET_ITEM (list , 0 , (PyObject * )str_obj );
575- count ++ ;
576- } else
577- #endif
578- {
579- SPLIT_ADD (str , i , str_len );
580- }
581- FIX_PREALLOC_SIZE (list );
582- return list ;
583-
584- onError :
585- Py_DECREF (list );
586- return NULL ;
587- }
588-
589- Py_LOCAL_INLINE (PyObject * )
590- stringlib_rsplit_whitespace (PyObject * str_obj ,
591- const STRINGLIB_CHAR * str , Py_ssize_t str_len ,
592- Py_ssize_t maxcount )
593- {
594- Py_ssize_t i , j , count = 0 ;
595- PyObject * list = PyList_New (PREALLOC_SIZE (maxcount ));
596- PyObject * sub ;
597-
598- if (list == NULL )
599- return NULL ;
600-
601- i = j = str_len - 1 ;
602- while (maxcount -- > 0 ) {
603- while (i >= 0 && STRINGLIB_ISSPACE (str [i ]))
604- i -- ;
605- if (i < 0 ) break ;
606- j = i ; i -- ;
607- while (i >= 0 && !STRINGLIB_ISSPACE (str [i ]))
608- i -- ;
609- #ifndef STRINGLIB_MUTABLE
610- if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT (str_obj )) {
611- /* No whitespace in str_obj, so just use it as list[0] */
612- Py_INCREF (str_obj );
613- PyList_SET_ITEM (list , 0 , (PyObject * )str_obj );
614- count ++ ;
615- break ;
616- }
617- #endif
618- SPLIT_ADD (str , i + 1 , j + 1 );
619- }
620-
621- if (i >= 0 ) {
622- /* Only occurs when maxcount was reached */
623- /* Skip any remaining whitespace and copy to beginning of string */
624- while (i >= 0 && STRINGLIB_ISSPACE (str [i ]))
625- i -- ;
626- if (i >= 0 )
627- SPLIT_ADD (str , 0 , i + 1 );
628- }
629- FIX_PREALLOC_SIZE (list );
630- if (PyList_Reverse (list ) < 0 )
631- goto onError ;
632- return list ;
633-
634- onError :
635- Py_DECREF (list );
636- return NULL ;
637- }
638-
639- Py_LOCAL_INLINE (PyObject * )
640- stringlib_rsplit_char (PyObject * str_obj ,
641- const STRINGLIB_CHAR * str , Py_ssize_t str_len ,
642- const STRINGLIB_CHAR ch ,
643- Py_ssize_t maxcount )
644- {
645- Py_ssize_t i , j , count = 0 ;
646- PyObject * list = PyList_New (PREALLOC_SIZE (maxcount ));
647- PyObject * sub ;
648-
649- if (list == NULL )
650- return NULL ;
651-
652- i = j = str_len - 1 ;
653- while ((i >= 0 ) && (maxcount -- > 0 )) {
654- for (; i >= 0 ; i -- ) {
655- if (str [i ] == ch ) {
656- SPLIT_ADD (str , i + 1 , j + 1 );
657- j = i = i - 1 ;
658- break ;
659- }
660- }
661- }
662- #ifndef STRINGLIB_MUTABLE
663- if (count == 0 && STRINGLIB_CHECK_EXACT (str_obj )) {
664- /* ch not in str_obj, so just use str_obj as list[0] */
665- Py_INCREF (str_obj );
666- PyList_SET_ITEM (list , 0 , (PyObject * )str_obj );
667- count ++ ;
668- } else
669- #endif
670- if (j >= -1 ) {
671- SPLIT_ADD (str , 0 , j + 1 );
672- }
673- FIX_PREALLOC_SIZE (list );
674- if (PyList_Reverse (list ) < 0 )
675- goto onError ;
676- return list ;
677-
678- onError :
679- Py_DECREF (list );
680- return NULL ;
681- }
682-
683- Py_LOCAL_INLINE (PyObject * )
684- stringlib_rsplit (PyObject * str_obj ,
685- const STRINGLIB_CHAR * str , Py_ssize_t str_len ,
686- const STRINGLIB_CHAR * sep , Py_ssize_t sep_len ,
687- Py_ssize_t maxcount )
688- {
689- Py_ssize_t j , pos , count = 0 ;
690- PyObject * list , * sub ;
691-
692- if (sep_len == 0 ) {
693- PyErr_SetString (PyExc_ValueError , "empty separator" );
694- return NULL ;
695- }
696- else if (sep_len == 1 )
697- return stringlib_rsplit_char (str_obj , str , str_len , sep [0 ], maxcount );
698-
699- list = PyList_New (PREALLOC_SIZE (maxcount ));
700- if (list == NULL )
701- return NULL ;
702-
703- j = str_len ;
704- while (maxcount -- > 0 ) {
705- pos = fastsearch (str , j , sep , sep_len , -1 , FAST_RSEARCH );
706- if (pos < 0 )
707- break ;
708- SPLIT_ADD (str , pos + sep_len , j );
709- j = pos ;
710- }
711- #ifndef STRINGLIB_MUTABLE
712- if (count == 0 && STRINGLIB_CHECK_EXACT (str_obj )) {
713- /* No match in str_obj, so just use it as list[0] */
714- Py_INCREF (str_obj );
715- PyList_SET_ITEM (list , 0 , (PyObject * )str_obj );
716- count ++ ;
717- } else
718- #endif
719- {
720- SPLIT_ADD (str , 0 , j );
721- }
722- FIX_PREALLOC_SIZE (list );
723- if (PyList_Reverse (list ) < 0 )
724- goto onError ;
725- return list ;
726-
727- onError :
728- Py_DECREF (list );
729- return NULL ;
730- }
731-
732- Py_LOCAL_INLINE (PyObject * )
733- stringlib_splitlines (PyObject * str_obj ,
734- const STRINGLIB_CHAR * str , Py_ssize_t str_len ,
735- int keepends )
736- {
737- /* This does not use the preallocated list because splitlines is
738- usually run with hundreds of newlines. The overhead of
739- switching between PyList_SET_ITEM and append causes about a
740- 2-3% slowdown for that common case. A smarter implementation
741- could move the if check out, so the SET_ITEMs are done first
742- and the appends only done when the prealloc buffer is full.
743- That's too much work for little gain.*/
744-
745- register Py_ssize_t i ;
746- register Py_ssize_t j ;
747- PyObject * list = PyList_New (0 );
748- PyObject * sub ;
749-
750- if (list == NULL )
751- return NULL ;
752-
753- for (i = j = 0 ; i < str_len ; ) {
754- Py_ssize_t eol ;
755-
756- /* Find a line and append it */
757- while (i < str_len && !STRINGLIB_ISLINEBREAK (str [i ]))
758- i ++ ;
759-
760- /* Skip the line break reading CRLF as one line break */
761- eol = i ;
762- if (i < str_len ) {
763- if (str [i ] == '\r' && i + 1 < str_len && str [i + 1 ] == '\n' )
764- i += 2 ;
765- else
766- i ++ ;
767- if (keepends )
768- eol = i ;
769- }
770- #ifndef STRINGLIB_MUTABLE
771- if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT (str_obj )) {
772- /* No linebreak in str_obj, so just use it as list[0] */
773- if (PyList_Append (list , str_obj ))
774- goto onError ;
775- break ;
776- }
777- #endif
778- SPLIT_APPEND (str , j , eol );
779- j = i ;
780- }
781- return list ;
782-
783- onError :
784- Py_DECREF (list );
785- return NULL ;
786- }
787-
788- #endif
0 commit comments