samedi 28 mars 2015

Reading lines and counting words as fast as possible in C++


I'm trying to read a file, push each line (char*) into a vector, and then return the number of lines in the file.


My current implementation is based on the answer here.


My current attempt is



static uintmax_t wc(char const *fname, vector<char*> &v)
{
static const size_t BUFFER_SIZE = 16*1024;
int fd = open(fname, O_RDONLY);
if(fd == -1)
exit(1);

char buf[BUFFER_SIZE + 1];
uintmax_t lines = 0;

while(size_t bytes_read = read(fd, buf, BUFFER_SIZE))
{
if(bytes_read == (size_t)-1)
exit(1);

for(char *p = buf; (p = (char*) memchr(p, '\n', (buf + bytes_read) - p)); ++p) {
cout << p+107 << endl; // this will be a v.push_back eventually
++lines;
}

}

return lines;
}


What's happening is I'm getting all of buf (all of the contents of the file) except for the first line for the first iteration, then all of buf except for the first two lines, etc.


EXAMPLE


I have these 10 strings in my file



aatYHGCry9587izvZhRxlCfLezW9lcPEPA9mAfNya9vyuPgS8I1nUsKicn2HDCymkCQeAzvn8h9gcTpAsNtfE7xLrAuuZfMm7I5E9DSNJv6
ef5fu1v0zfKyBoCuj4JCtRICYEo3s5LQnQBAv49YoAAAHekpNiANefmqjkRhv9vnka0X0jWgU6S8Ap3WDwlCAHz6n9xXaIT9MfJ31NETK6X
OWXHJqNMFQnNgfFcLV6B6LxkVlGip30Qz1sDAUAmvVDHoyhMWf0CbhBkuHlVDJqcuheB02eVXwckPgP4yXBmoTh0GNLKs69JQaK0YxEFPFx
8CbyfXtcN4MUYgBiNf4JHavxtqpuPJHYmhaZ6g8N75Ht6tZAVK6bf3HgYGOgMwAEhkKXNk3H2N7w99bSsx9ei846yWslNIz8rVQyiSoN2W4
iR3BE1RmfhXcQ1aI1QmCqV5AEf1x4g9GQ9fsTYT2XYRsQCcm2XkVFawMHzJKJqzi8zmSF57qB4zgu1KS8S2TZesL7zEKBcqckafYnf4vxEa
PEQhLyLR2aS1LtwGoT0oeSS6JWDii3T0REDVPRCJTpAwymoaMur0Kmg9fIPOh3perHLnwSoEQ8MG45C3M1EXrisCqsgrnt3kZeKBv2kfFc1
ExyQNSAaaTgOAUjMNtVpZ9uTZUNIAmnzp7FWIBV8mZvGAMpU9Qe3nch8bRR8j5UDxTA2yobGx0qSgUuGrTTaQwRITh4vQPI36f4FkhLbQqy
UgjWxwYlEGQQafbPEFHLUMYLxc819WB8avKiAoVwKqZKaroOVcxE4OeGbgDvLshmqNQfibMRfIvXsoIJ2C0LZb9eDqAmLjWYB1rkbZbGIZu
neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86


My output is:



p is X
OWXHJqNMFQnNgfFcLV6B6LxkVlGip30Qz1sDAUAmvVDHoyhMWf0CbhBkuHlVDJqcuheB02eVXwckPgP4yXBmoTh0GNLKs69JQaK0YxEFPFx
8CbyfXtcN4MUYgBiNf4JHavxtqpuPJHYmhaZ6g8N75Ht6tZAVK6bf3HgYGOgMwAEhkKXNk3H2N7w99bSsx9ei846yWslNIz8rVQyiSoN2W4
iR3BE1RmfhXcQ1aI1QmCqV5AEf1x4g9GQ9fsTYT2XYRsQCcm2XkVFawMHzJKJqzi8zmSF57qB4zgu1KS8S2TZesL7zEKBcqckafYnf4vxEa
PEQhLyLR2aS1LtwGoT0oeSS6JWDii3T0REDVPRCJTpAwymoaMur0Kmg9fIPOh3perHLnwSoEQ8MG45C3M1EXrisCqsgrnt3kZeKBv2kfFc1
ExyQNSAaaTgOAUjMNtVpZ9uTZUNIAmnzp7FWIBV8mZvGAMpU9Qe3nch8bRR8j5UDxTA2yobGx0qSgUuGrTTaQwRITh4vQPI36f4FkhLbQqy
UgjWxwYlEGQQafbPEFHLUMYLxc819WB8avKiAoVwKqZKaroOVcxE4OeGbgDvLshmqNQfibMRfIvXsoIJ2C0LZb9eDqAmLjWYB1rkbZbGIZu
neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86

p is x
8CbyfXtcN4MUYgBiNf4JHavxtqpuPJHYmhaZ6g8N75Ht6tZAVK6bf3HgYGOgMwAEhkKXNk3H2N7w99bSsx9ei846yWslNIz8rVQyiSoN2W4
iR3BE1RmfhXcQ1aI1QmCqV5AEf1x4g9GQ9fsTYT2XYRsQCcm2XkVFawMHzJKJqzi8zmSF57qB4zgu1KS8S2TZesL7zEKBcqckafYnf4vxEa
PEQhLyLR2aS1LtwGoT0oeSS6JWDii3T0REDVPRCJTpAwymoaMur0Kmg9fIPOh3perHLnwSoEQ8MG45C3M1EXrisCqsgrnt3kZeKBv2kfFc1
ExyQNSAaaTgOAUjMNtVpZ9uTZUNIAmnzp7FWIBV8mZvGAMpU9Qe3nch8bRR8j5UDxTA2yobGx0qSgUuGrTTaQwRITh4vQPI36f4FkhLbQqy
UgjWxwYlEGQQafbPEFHLUMYLxc819WB8avKiAoVwKqZKaroOVcxE4OeGbgDvLshmqNQfibMRfIvXsoIJ2C0LZb9eDqAmLjWYB1rkbZbGIZu
neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86

p is 4
iR3BE1RmfhXcQ1aI1QmCqV5AEf1x4g9GQ9fsTYT2XYRsQCcm2XkVFawMHzJKJqzi8zmSF57qB4zgu1KS8S2TZesL7zEKBcqckafYnf4vxEa
PEQhLyLR2aS1LtwGoT0oeSS6JWDii3T0REDVPRCJTpAwymoaMur0Kmg9fIPOh3perHLnwSoEQ8MG45C3M1EXrisCqsgrnt3kZeKBv2kfFc1
ExyQNSAaaTgOAUjMNtVpZ9uTZUNIAmnzp7FWIBV8mZvGAMpU9Qe3nch8bRR8j5UDxTA2yobGx0qSgUuGrTTaQwRITh4vQPI36f4FkhLbQqy
UgjWxwYlEGQQafbPEFHLUMYLxc819WB8avKiAoVwKqZKaroOVcxE4OeGbgDvLshmqNQfibMRfIvXsoIJ2C0LZb9eDqAmLjWYB1rkbZbGIZu
neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86

p is a
PEQhLyLR2aS1LtwGoT0oeSS6JWDii3T0REDVPRCJTpAwymoaMur0Kmg9fIPOh3perHLnwSoEQ8MG45C3M1EXrisCqsgrnt3kZeKBv2kfFc1
ExyQNSAaaTgOAUjMNtVpZ9uTZUNIAmnzp7FWIBV8mZvGAMpU9Qe3nch8bRR8j5UDxTA2yobGx0qSgUuGrTTaQwRITh4vQPI36f4FkhLbQqy
UgjWxwYlEGQQafbPEFHLUMYLxc819WB8avKiAoVwKqZKaroOVcxE4OeGbgDvLshmqNQfibMRfIvXsoIJ2C0LZb9eDqAmLjWYB1rkbZbGIZu
neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86

p is 1
ExyQNSAaaTgOAUjMNtVpZ9uTZUNIAmnzp7FWIBV8mZvGAMpU9Qe3nch8bRR8j5UDxTA2yobGx0qSgUuGrTTaQwRITh4vQPI36f4FkhLbQqy
UgjWxwYlEGQQafbPEFHLUMYLxc819WB8avKiAoVwKqZKaroOVcxE4OeGbgDvLshmqNQfibMRfIvXsoIJ2C0LZb9eDqAmLjWYB1rkbZbGIZu
neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86

p is y
UgjWxwYlEGQQafbPEFHLUMYLxc819WB8avKiAoVwKqZKaroOVcxE4OeGbgDvLshmqNQfibMRfIvXsoIJ2C0LZb9eDqAmLjWYB1rkbZbGIZu
neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86

p is u
neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86

p is t
sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86

p is 6

p is


and my expected output is:



p is aatYHGCry9587izvZhRxlCfLezW9lcPEPA9mAfNya9vyuPgS8I1nUsKicn2HDCymkCQeAzvn8h9gcTpAsNtfE7xLrAuuZfMm7I5E9DSNJv6

p is ef5fu1v0zfKyBoCuj4JCtRICYEo3s5LQnQBAv49YoAAAHekpNiANefmqjkRhv9vnka0X0jWgU6S8Ap3WDwlCAHz6n9xXaIT9MfJ31NETK6X

p is OWXHJqNMFQnNgfFcLV6B6LxkVlGip30Qz1sDAUAmvVDHoyhMWf0CbhBkuHlVDJqcuheB02eVXwckPgP4yXBmoTh0GNLKs69JQaK0YxEFPFx

p is 8CbyfXtcN4MUYgBiNf4JHavxtqpuPJHYmhaZ6g8N75Ht6tZAVK6bf3HgYGOgMwAEhkKXNk3H2N7w99bSsx9ei846yWslNIz8rVQyiSoN2W4

p is iR3BE1RmfhXcQ1aI1QmCqV5AEf1x4g9GQ9fsTYT2XYRsQCcm2XkVFawMHzJKJqzi8zmSF57qB4zgu1KS8S2TZesL7zEKBcqckafYnf4vxEa

p is PEQhLyLR2aS1LtwGoT0oeSS6JWDii3T0REDVPRCJTpAwymoaMur0Kmg9fIPOh3perHLnwSoEQ8MG45C3M1EXrisCqsgrnt3kZeKBv2kfFc1

p is ExyQNSAaaTgOAUjMNtVpZ9uTZUNIAmnzp7FWIBV8mZvGAMpU9Qe3nch8bRR8j5UDxTA2yobGx0qSgUuGrTTaQwRITh4vQPI36f4FkhLbQqy

p is UgjWxwYlEGQQafbPEFHLUMYLxc819WB8avKiAoVwKqZKaroOVcxE4OeGbgDvLshmqNQfibMRfIvXsoIJ2C0LZb9eDqAmLjWYB1rkbZbGIZu

p is neCtcb4o90XIqRloktifWEuR99lj08kh75RVPvoh1Er1rhfwBZBQJ9I1nG7aJe0We6gy5GimMxEUSQ2JQDJNhSe55momPQzF9AEVpPKGMZt

p is sUUjmpnOyAyE1rSYBrztnjnkVaSSviJDs7WesfShDJUeYR35p72X6hePaMKC4764qsgA4bM7O0QqvuhCyuYQ9lRgUAe4GJXo4zx4D14Si86


Also, breaking at newline rather than the hardcoded 107 characters would be nice, but not necessary.


If you need any more information, I'd be happy to provide it. Hope this isn't too much to ask.




Aucun commentaire:

Enregistrer un commentaire