Added string manipulations

dhondta · dhondta · commit 202d95f856fc · 2022-01-11T00:00:53.000+01:00
diff --git a/README.md b/README.md
@@ -235,7 +235,7 @@ o
 
 - [X] `a1z26`: keeps words whitespace-separated and uses a custom character separator
 - [X] `cases`: set of case-related encodings (including camel-, kebab-, lower-, pascal-, upper-, snake- and swap-case, slugify, capitalize, title)
-- [X] `dummy`: set of simple encodings (including reverse and word-reverse)
+- [X] `dummy`: set of simple encodings (including replace, reverse, word-reverse, substitute and strip-spaces)
 - [X] `octal`: dummy octal conversion (converts to 3-digits groups)
 - [X] `octal-spaced`: variant of `octal` ; dummy octal conversion, handling whitespace separators
 - [X] `ordinal`: dummy character ordinals conversion (converts to 3-digits groups)
diff --git a/codext/__common__.py b/codext/__common__.py
@@ -968,7 +968,9 @@ def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False)
         if code in ["assert_not", "at"]:
             continue
         elif code == "any":
-            tokens.append(printable.replace("\n", ""))  # should be ord(x) with x belongs to [0, 256[
+            charset = list(printable.replace("\n", ""))
+            random.shuffle(charset)
+            tokens.append(charset)  # should be ord(x) with x belongs to [0, 256[
         elif code == "assert":
             tokens.append(list(__gen_str_from_re(value[1], star_plus_max, repeat_max, yield_max, True)))
         elif code == "branch":
@@ -977,10 +979,11 @@ def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False)
                 result += list(__gen_str_from_re(r, star_plus_max, repeat_max, yield_max, True)) or [""]
             tokens.append(result)
         elif code == "category":
-            charset = CATEGORIES[value[9:]]
+            charset = list(CATEGORIES[value[9:]])
             if negate:
                 negate = False
                 charset = list(set(printable).difference(charset))
+            random.shuffle(charset)
             tokens.append(charset)
         elif code == "groupref":
             tokens.extend(__groups[value])
@@ -1015,7 +1018,9 @@ def __gen_str_from_re(regex, star_plus_max, repeat_max, yield_max, parsed=False)
         elif code == "negate":
             negate = True
         elif code == "not_literal":
-            tokens.append(printable.replace(chr(value), ""))
+            charset = list(printable.replace(chr(value), ""))
+            random.shuffle(charset)
+            tokens.append(charset)
         elif code == "range":
             tokens.append("".join(chr(i) for i in range(value[0], value[1] + 1)))
         elif code == "subpattern":
diff --git a/codext/common/dummy.py b/codext/common/dummy.py
@@ -14,9 +14,30 @@
 from ..__common__ import add
 
 
+def replace(pair, *args):
+    def code(input, error="strict"):
+        return input.replace(pair[0], pair[1]), len(input)
+    return code
+add("replace", replace, replace, r"^replace[-_]?((?!.*(.).*\2)..)$", guess=None)
+# important note:                                              ^
+#                                           using "{2}" here instead will break the codec
+#  this is due to the fact that codext.__common__.generate_string_from_regex DOES NOT handle ASSERT_NOT (?!) and will
+#   fail to generate a valid instance in lookup(...) when an encoding name is to be generated to get the CodecInfo
+
+
+def substitute(token, replacement):
+    def code(input, error="strict"):
+        return input.replace(token, replacement), len(input)
+    return code
+add("substitute", substitute, substitute, r"^substitute[-_]?(.*?)/(.*?)$", guess=None)
+
+
 reverse = lambda i, e="strict": (i[::-1], len(i))
 add("reverse", reverse, reverse)
 
 word_reverse = lambda i, e="strict": (" ".join(w[::-1] for w in i.split()), len(i))
 add("reverse-words", word_reverse, word_reverse, r"^reverse[-_]words$")
 
+strip_spaces = lambda i, e="strict": (i.replace(" ", ""), len(i))
+add("strip-spaces", strip_spaces, strip_spaces, guess=None)
+
diff --git a/docs/manipulations.md b/docs/manipulations.md
@@ -20,7 +20,7 @@ These transformation functions are simple string transformations, including `str
 `title` | text <-> titled text |  | decoding "untitles" the text
 `uppercase` | text <-> uppercase text | `upper` | decoding is `lowercase`
 
-Of course, these "encodings" have no interest while using them in Python as the `str` methods can be called. It can be useful while using `codext` from the terminal (see [*CLI tool*](cli.html)).
+Of course, these transformations are of little interest when used from Python, as the `str` methods can be called directly. They can be useful when using `codext` from the terminal (see [*CLI tool*](cli.html)).
 
 Some simple examples:
 
@@ -43,22 +43,25 @@ These transformation functions are simple string transformations.
 
 **Codec** | **Conversions** | **Aliases** | **Comment**
 :---: | :---: | --- | ---
+`replace` | text <-> text with single-char replaced |  | 
 `reverse` | text <-> reversed text |  | 
 `reverse-words` | text <-> reversed words |  | same as `reverse` but not on the whole text, only on the words (text split by whitespace)
+`strip-spaces` | text <-> text with all spaces stripped |  | only the space character is removed (tabs and newlines are kept)
+`substitute` | text <-> text with token substituted |  | 
 
-As in the previous section, these "encodings" have no interest while using them in Python but well while using `codext` from the terminal (see [*CLI tool*](cli.html)).
+As in the previous section, these transformations are of little interest when used from Python but are useful when using `codext` from the terminal (see [*CLI tool*](cli.html)).
 
 A simple example:
 
 ```sh
-$ echo -en "test string" | codext encode reverse-words | codext encode reverse
-string test
+$ echo -en "test string" | codext encode reverse-words | codext encode reverse replace-\ _
+string_test
 ```
 
 Or using encodings chaining:
 
 ```sh
-$ echo -en "test string" | codext encode reverse-words reverse
-string test
+$ echo -en "test string" | codext encode reverse-words reverse substitute-string/phrase
+phrase test
 ```
 
diff --git a/tests/test_manual.py b/tests/test_manual.py
@@ -96,6 +96,10 @@ def test_codec_dummy_str_manips(self):
         self.assertEqual(codecs.decode(STR, "reverse"), "tset a si siht")
         self.assertEqual(codecs.decode(STR, "reverse_words"), "siht si a tset")
         self.assertEqual(codecs.decode(STR.split()[0], "reverse"), codecs.decode(STR.split()[0], "reverse-words"))
+        self.assertEqual(codecs.encode(STR, "replace-i1"), STR.replace("i", "1"))
+        self.assertEqual(codecs.decode(STR.replace("i", "1"), "replace-1i"), STR)
+        self.assertEqual(codecs.encode(STR, "substitute-this/that"), STR.replace("this", "that"))
+        self.assertEqual(codecs.decode(STR.replace("this", "that"), "substitute-that/this"), STR)
     
     def test_codec_hash_functions(self):
         STR = b"This is a test string!"