/* XS.xs */
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "ppport.h"
/*
 * Replacement table for input bytes 0x80-0x9F.
 *
 * 32 entries of exactly 4 bytes each; every entry is a NUL-padded byte
 * string. consume_latin_byte() selects an entry with (byte & 0x7F) * 4.
 *
 * Most entries hold the UTF-8 encoding of the character that Windows-1252
 * (CP1252) assigns to that byte. The five bytes CP1252 leaves unassigned
 * (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to the ASCII text "%XX" instead.
 *
 * A single extra 0x00 follows the last entry.
 */
U8 _encoding_fix_latin_ms_map[] = {
0xE2, 0x82, 0xAC, 0x00, /* 0x80 -> U+20AC EURO SIGN */
0x25, 0x38, 0x31, 0x00, /* 0x81 -> "%81" */
0xE2, 0x80, 0x9A, 0x00, /* 0x82 -> U+201A SINGLE LOW-9 QUOTATION MARK */
0xC6, 0x92, 0x00, 0x00, /* 0x83 -> U+0192 LATIN SMALL LETTER F WITH HOOK */
0xE2, 0x80, 0x9E, 0x00, /* 0x84 -> U+201E DOUBLE LOW-9 QUOTATION MARK */
0xE2, 0x80, 0xA6, 0x00, /* 0x85 -> U+2026 HORIZONTAL ELLIPSIS */
0xE2, 0x80, 0xA0, 0x00, /* 0x86 -> U+2020 DAGGER */
0xE2, 0x80, 0xA1, 0x00, /* 0x87 -> U+2021 DOUBLE DAGGER */
0xCB, 0x86, 0x00, 0x00, /* 0x88 -> U+02C6 MODIFIER LETTER CIRCUMFLEX ACCENT */
0xE2, 0x80, 0xB0, 0x00, /* 0x89 -> U+2030 PER MILLE SIGN */
0xC5, 0xA0, 0x00, 0x00, /* 0x8A -> U+0160 LATIN CAPITAL LETTER S WITH CARON */
0xE2, 0x80, 0xB9, 0x00, /* 0x8B -> U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
0xC5, 0x92, 0x00, 0x00, /* 0x8C -> U+0152 LATIN CAPITAL LIGATURE OE */
0x25, 0x38, 0x44, 0x00, /* 0x8D -> "%8D" */
0xC5, 0xBD, 0x00, 0x00, /* 0x8E -> U+017D LATIN CAPITAL LETTER Z WITH CARON */
0x25, 0x38, 0x46, 0x00, /* 0x8F -> "%8F" */
0x25, 0x39, 0x30, 0x00, /* 0x90 -> "%90" */
0xE2, 0x80, 0x98, 0x00, /* 0x91 -> U+2018 LEFT SINGLE QUOTATION MARK */
0xE2, 0x80, 0x99, 0x00, /* 0x92 -> U+2019 RIGHT SINGLE QUOTATION MARK */
0xE2, 0x80, 0x9C, 0x00, /* 0x93 -> U+201C LEFT DOUBLE QUOTATION MARK */
0xE2, 0x80, 0x9D, 0x00, /* 0x94 -> U+201D RIGHT DOUBLE QUOTATION MARK */
0xE2, 0x80, 0xA2, 0x00, /* 0x95 -> U+2022 BULLET */
0xE2, 0x80, 0x93, 0x00, /* 0x96 -> U+2013 EN DASH */
0xE2, 0x80, 0x94, 0x00, /* 0x97 -> U+2014 EM DASH */
0xCB, 0x9C, 0x00, 0x00, /* 0x98 -> U+02DC SMALL TILDE */
0xE2, 0x84, 0xA2, 0x00, /* 0x99 -> U+2122 TRADE MARK SIGN */
0xC5, 0xA1, 0x00, 0x00, /* 0x9A -> U+0161 LATIN SMALL LETTER S WITH CARON */
0xE2, 0x80, 0xBA, 0x00, /* 0x9B -> U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
0xC5, 0x93, 0x00, 0x00, /* 0x9C -> U+0153 LATIN SMALL LIGATURE OE */
0x25, 0x39, 0x44, 0x00, /* 0x9D -> "%9D" */
0xC5, 0xBE, 0x00, 0x00, /* 0x9E -> U+017E LATIN SMALL LETTER Z WITH CARON */
0xC5, 0xB8, 0x00, 0x00, /* 0x9F -> U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS */
0x00
};
/* Forward declarations for the file-local helpers defined below. */
static SV* _encoding_fix_latin_xs(SV*);
static int consume_utf8_bytes(U8*, U8*);
static int consume_latin_byte(U8*, U8*);
/*
 * Produce a UTF-8 version of the string value of source.
 *
 * ASCII bytes (< 0x80) are copied through unchanged. At every byte >= 0x80
 * the input is first offered to consume_utf8_bytes(); if that declines, the
 * single byte is converted by consume_latin_byte().
 *
 * Returns:
 *   - source itself (same SV, no copy) when it contains no byte >= 0x80;
 *   - otherwise a new mortal SV with the UTF-8 flag set.
 */
static SV* _encoding_fix_latin_xs(SV* source) {
    SV *out = NULL;
    U8 *ph, *pt, *end;
    U8 seq[8];   /* zero-padded copy of the bytes at the current position */
    U8 ubuf[8];  /* NUL-terminated replacement written by the helpers */
    STRLEN l, remaining, window, out_len;
    int consumed;

    /* Use the length reported by SvPV: it describes the buffer actually
     * returned, which SvCUR(source) does not for values that are
     * stringified on demand (numbers, references, overloaded objects). */
    ph = pt = (U8 *)SvPV(source, l);
    end = ph + l;

    while (ph < end) {
        if ((*ph & 0x80) == 0) {
            ph++;
            continue;
        }

        /* First high byte seen: allocate the output lazily, with some
         * headroom for the expansion that conversion causes. */
        if (out == NULL) {
            out = newSV(l * 12 / 10);
            SvPOK_on(out);
        }

        /* Flush the run of untouched bytes that precedes this one. */
        if (ph > pt) {
            sv_catpvn(out, (const char *)pt, (STRLEN)(ph - pt));
        }

        /* Hand the helpers a bounded, zero-padded window so that they can
         * never read past the end of the input buffer. A zero byte is not
         * a continuation byte, so a sequence truncated by the end of input
         * is rejected exactly as it would be by a trailing NUL. */
        remaining = (STRLEN)(end - ph);
        window = remaining < sizeof(seq) - 1 ? remaining : sizeof(seq) - 1;
        memset(seq, 0, sizeof(seq));
        memcpy(seq, ph, window);

        consumed = consume_utf8_bytes(seq, ubuf);
        if (!consumed) {
            consumed = consume_latin_byte(seq, ubuf);
        }

        /* An empty result can only come from a sequence that decoded to
         * U+0000; emit that NUL (ubuf[0]) instead of dropping the input. */
        out_len = strnlen((const char *)ubuf, sizeof(ubuf));
        if (out_len == 0) {
            out_len = 1;
        }
        sv_catpvn(out, (const char *)ubuf, out_len);

        ph += consumed;
        pt = ph;
    }

    /* Nothing but ASCII: hand the caller's SV straight back. */
    if (out == NULL) {
        return source;
    }

    /* Flush the trailing run of untouched bytes. */
    if (end > pt) {
        sv_catpvn(out, (const char *)pt, (STRLEN)(end - pt));
    }

    SvUTF8_on(out);
    return sv_2mortal(out);
}
/*
 * Try to decode one multi-byte UTF-8 sequence starting at in[0].
 *
 * in  - points at a candidate lead byte. Following bytes are read one at a
 *       time and only while each looks like a continuation byte
 *       (10xxxxxx), so the scan stops at a NUL or any other
 *       non-continuation byte.
 * out - receives the re-encoded character followed by a NUL terminator;
 *       the caller must supply a buffer large enough for both.
 *
 * Returns the number of input bytes consumed (2 to 5), or 0 when in[0] is
 * not a 2- to 5-byte lead byte or a following byte is not a continuation
 * byte. Nothing is written to out when 0 is returned.
 *
 * The code point is rebuilt from the payload bits and re-encoded with
 * uvchr_to_utf8(); no check for overlong forms is made here.
 */
static int consume_utf8_bytes(U8* in, U8* out) {
    UV cp, bytes, i;
    U8 *d;

    /* Classify the lead byte: 110xxxxx, 1110xxxx, 11110xxx or 111110xx. */
    if ((in[0] & 0xE0) == 0xC0) {
        cp = in[0] & 0x1F;
        bytes = 2;
    }
    else if ((in[0] & 0xF0) == 0xE0) {
        cp = in[0] & 0x0F;
        bytes = 3;
    }
    else if ((in[0] & 0xF8) == 0xF0) {
        cp = in[0] & 0x07;
        bytes = 4;
    }
    else if ((in[0] & 0xFC) == 0xF8) {
        cp = in[0] & 0x03;
        bytes = 5;
    }
    else {
        return 0;
    }

    /* Each continuation byte (10xxxxxx) contributes six payload bits. */
    for (i = 1; i < bytes; i++) {
        if ((in[i] & 0xC0) != 0x80) {
            return 0;
        }
        cp <<= 6;
        cp += in[i] & 0x3F;
    }

    d = uvchr_to_utf8(out, cp);
    *d = '\0';
    return (int)bytes;
}
/*
 * Convert the single byte in[0] to a NUL-terminated UTF-8 string in out.
 *
 * in  - the byte to convert; expected to be >= 0x80, since the lookup
 *       table only has entries for 0x80-0x9F.
 * out - receives the replacement; must have room for at least 4 bytes.
 *
 * Bytes above 0x9F are encoded as the code point of the same value. Bytes
 * up to 0x9F are replaced by their 4-byte entry in
 * _encoding_fix_latin_ms_map, which is already NUL-padded.
 *
 * Always returns 1, the number of input bytes consumed.
 */
static int consume_latin_byte(U8* in, U8* out) {
    const U8 byte = in[0];
    U8 *d;

    if (byte <= 0x9F) {
        /* Entries are fixed 4-byte records, so copy them as raw bytes. */
        memcpy(out, _encoding_fix_latin_ms_map + (byte & 0x7F) * 4, 4);
        return 1;
    }

    d = uvchr_to_utf8(out, (UV)byte);
    *d = '\0';
    return 1;
}
MODULE = Encoding::FixLatin::XS PACKAGE = Encoding::FixLatin::XS
SV *
encoding_fixlatin_xs(source)
SV * source
PPCODE:
/* Put the helper's result directly in the first return slot and report
 * one return value. The helper returns either source itself or an SV it
 * has already passed through sv_2mortal(). */
ST(0) = _encoding_fix_latin_xs(source);
XSRETURN(1);