@@ -34,5 +34,150 @@ class Heap {
3434 return ( < usize > current_memory ( ) << 16 ) - HEAP_START ;
3535 }
3636
// Copies `n` bytes from address `src` to address `dest` and returns `dest`.
// Asserts that `dest` is not below HEAP_START; `src` is not checked.
// Bytes are written from the lowest address upwards and overlapping ranges
// receive no special treatment.
37+ static copy ( dest : usize , src : usize , n : usize ) : usize {
38+ assert ( dest >= HEAP_START ) ;
39+
40+ // the following is based on musl's implementation of memcpy
// `dst` is the moving write cursor; `dest` is left untouched so it can be returned.
41+ let dst : usize = dest ;
// `w` and `x` alternately hold the previous / next source word in the shift loops below.
42+ let w : u32 , x : u32 ;
43+
44+ // copy 1 byte each until src is aligned to 4 bytes
45+ while ( n != 0 && src % 4 != 0 ) {
46+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
47+ n -- ;
48+ }
49+
50+ // if dst is aligned to 4 bytes as well, copy 4 bytes each
51+ if ( dst % 4 == 0 ) {
// main loop: 16 bytes (four 32-bit words) per iteration
52+ while ( n >= 16 ) {
53+ store < u32 > ( dst , load < u32 > ( src ) ) ;
54+ store < u32 > ( dst + 4 , load < u32 > ( src + 4 ) ) ;
55+ store < u32 > ( dst + 8 , load < u32 > ( src + 8 ) ) ;
56+ store < u32 > ( dst + 12 , load < u32 > ( src + 12 ) ) ;
57+ src += 16 ; dst += 16 ; n -= 16 ;
58+ }
// n < 16 from here on; n is no longer updated, its bits 8/4/2/1 directly
// select which remaining pieces are copied
59+ if ( n & 8 ) {
60+ store < u32 > ( dst , load < u32 > ( src ) ) ;
61+ store < u32 > ( dst + 4 , load < u32 > ( src + 4 ) ) ;
62+ dst += 8 ; src += 8 ;
63+ }
64+ if ( n & 4 ) {
65+ store < u32 > ( dst , load < u32 > ( src ) ) ;
66+ dst += 4 ; src += 4 ;
67+ }
68+ if ( n & 2 ) { // drop to 2 bytes each
69+ store < u16 > ( dst , load < u16 > ( src ) ) ;
70+ dst += 2 ; src += 2 ;
71+ }
72+ if ( n & 1 ) { // drop to 1 byte
73+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
74+ }
75+ return dest ;
76+ }
77+
78+ // if dst is not aligned to 4 bytes, use alternating shifts to copy 4 bytes each
79+ // doing shifts is faster when copying enough bytes (here: 32 or more)
// In each case below: `w` is loaded from the (aligned) source first, then single
// bytes are copied until dst is 4-byte aligned. The loop then loads the next source
// word and stores the not-yet-written high bytes of the previous word combined with
// the low bytes of the next one.
// NOTE(review): the shift directions assume little-endian loads/stores, as in
// WebAssembly linear memory - confirm if this is ever targeted elsewhere.
80+ if ( n >= 32 ) {
81+ switch ( dst % 4 ) {
82+ // known to be != 0
// dst % 4 == 1: three single bytes align dst; one byte of `w` is carried over
83+ case 1 :
84+ w = load < u32 > ( src ) ;
85+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
86+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
87+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
88+ n -= 3 ;
// the last load of an iteration reads bytes src+13 .. src+16, hence n >= 17
89+ while ( n >= 17 ) {
90+ x = load < u32 > ( src + 1 ) ;
91+ store < u32 > ( dst , w >> 24 | x << 8 ) ;
92+ w = load < u32 > ( src + 5 ) ;
93+ store < u32 > ( dst + 4 , x >> 24 | w << 8 ) ;
94+ x = load < u32 > ( src + 9 ) ;
95+ store < u32 > ( dst + 8 , w >> 24 | x << 8 ) ;
96+ w = load < u32 > ( src + 13 ) ;
97+ store < u32 > ( dst + 12 , x >> 24 | w << 8 ) ;
98+ src += 16 ; dst += 16 ; n -= 16 ;
99+ }
100+ break ;
// dst % 4 == 2: two single bytes align dst; two bytes of `w` are carried over
101+ case 2 :
102+ w = load < u32 > ( src ) ;
103+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
104+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
105+ n -= 2 ;
// the last load of an iteration reads bytes src+14 .. src+17, hence n >= 18
106+ while ( n >= 18 ) {
107+ x = load < u32 > ( src + 2 ) ;
108+ store < u32 > ( dst , w >> 16 | x << 16 ) ;
109+ w = load < u32 > ( src + 6 ) ;
110+ store < u32 > ( dst + 4 , x >> 16 | w << 16 ) ;
111+ x = load < u32 > ( src + 10 ) ;
112+ store < u32 > ( dst + 8 , w >> 16 | x << 16 ) ;
113+ w = load < u32 > ( src + 14 ) ;
114+ store < u32 > ( dst + 12 , x >> 16 | w << 16 ) ;
115+ src += 16 ; dst += 16 ; n -= 16 ;
116+ }
117+ break ;
// dst % 4 == 3: one single byte aligns dst; three bytes of `w` are carried over
118+ case 3 :
119+ w = load < u32 > ( src ) ;
120+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
121+ n -= 1 ;
// the last load of an iteration reads bytes src+15 .. src+18, hence n >= 19
122+ while ( n >= 19 ) {
123+ x = load < u32 > ( src + 3 ) ;
124+ store < u32 > ( dst , w >> 8 | x << 24 ) ;
125+ w = load < u32 > ( src + 7 ) ;
126+ store < u32 > ( dst + 4 , x >> 8 | w << 24 ) ;
127+ x = load < u32 > ( src + 11 ) ;
128+ store < u32 > ( dst + 8 , w >> 8 | x << 24 ) ;
129+ w = load < u32 > ( src + 15 ) ;
130+ store < u32 > ( dst + 12 , x >> 8 | w << 24 ) ;
131+ src += 16 ; dst += 16 ; n -= 16 ;
132+ }
133+ break ;
134+ }
135+ }
136+
137+ // copy remaining bytes one by one
// n is below 32 here (either it started that small or a shift loop above ran it
// down), so testing bits 16/8/4/2/1 covers every remaining byte; n is not updated.
138+ if ( n & 16 ) {
139+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
140+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
141+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
142+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
143+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
144+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
145+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
146+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
147+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
148+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
149+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
150+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
151+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
152+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
153+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
154+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
155+ }
156+ if ( n & 8 ) {
157+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
158+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
159+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
160+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
161+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
162+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
163+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
164+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
165+ }
166+ if ( n & 4 ) {
167+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
168+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
169+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
170+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
171+ }
172+ if ( n & 2 ) {
173+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
174+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
175+ }
176+ if ( n & 1 ) {
177+ store < u8 > ( dst ++ , load < u8 > ( src ++ ) ) ;
178+ }
// return the original destination address, not the advanced cursor
179+ return dest ;
180+ }
181+
// Private, empty constructor: code outside this class cannot create instances with `new`.
37182 private constructor ( ) { }
38183}
0 commit comments