It processes 100 MB of text content in 2.33 seconds, but I feel like it should be able to go faster.
- It's too much of a hassle to use `Memory<char>` elsewhere in the code instead of returning strings.
- I've tried string pooling instead of constantly allocating new strings, and it actually ends up being slower somehow.
Code:
// Largest token (in chars) that is lower-cased in a stack buffer; longer
// tokens use a pooled array so an oversized token cannot overflow the stack.
const int MaxStackallocChars = 256;

/// <summary>
/// Returns a new string equal to <paramref name="span"/> with the ASCII
/// letters 'A'..'Z' mapped to 'a'..'z'. Every other character, including
/// non-ASCII letters, is copied unchanged.
/// </summary>
/// <param name="span">Characters to convert; may be empty.</param>
/// <returns>The lower-cased copy as a freshly allocated string.</returns>
static string MemoryToLower(ReadOnlySpan<char> span)
{
    if (span.Length <= MaxStackallocChars)
    {
        // Fixed-size stackalloc keeps the stack usage bounded and constant.
        Span<char> stackBuffer = stackalloc char[MaxStackallocChars];
        Span<char> lowered = stackBuffer.Slice(0, span.Length);
        CopyAsciiLower(span, lowered);
        return new string(lowered);
    }

    // Token too large for the stack: borrow a scratch array instead.
    char[] rented = ArrayPool<char>.Shared.Rent(span.Length);
    try
    {
        Span<char> lowered = rented.AsSpan(0, span.Length);
        CopyAsciiLower(span, lowered);
        return new string(lowered);
    }
    finally
    {
        ArrayPool<char>.Shared.Return(rented);
    }
}

// Copies source into destination (same length), lower-casing only 'A'..'Z'.
static void CopyAsciiLower(ReadOnlySpan<char> source, Span<char> destination)
{
    for (int i = 0; i < source.Length; i++)
    {
        char c = source[i];
        // 32 is the distance between an ASCII upper-case letter and its lower-case form.
        destination[i] = (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
    }
}

// Characters that separate tokens; they never appear in the output.
static readonly SearchValues<char> searchValues = SearchValues.Create(new[]
{
    ' ', '\r', '\n', ',', '.', '?', '"', '\'', ';', '!', '\t',
    '(', ')', '[', ']', '<', '>', '+', '-', '*'
});

/// <summary>
/// Splits <paramref name="input"/> on the characters in <c>searchValues</c>
/// and returns the non-empty pieces, ASCII-lower-cased, in input order.
/// </summary>
/// <param name="input">Text to tokenize. A null or empty string yields an empty list.</param>
/// <returns>A new list with one lower-cased string per token.</returns>
static List<string> Tokenizer(string input)
{
    var list = new List<string>();
    ReadOnlySpan<char> remaining = input.AsSpan();

    while (true)
    {
        // Skip the run of delimiters in front of the next token.
        int tokenStart = remaining.IndexOfAnyExcept(searchValues);
        if (tokenStart < 0)
        {
            break; // only delimiters (or nothing) left
        }

        remaining = remaining.Slice(tokenStart);

        // The token ends at the next delimiter, or at the end of the input.
        int tokenLength = remaining.IndexOfAny(searchValues);
        if (tokenLength < 0)
        {
            list.Add(MemoryToLower(remaining));
            break;
        }

        list.Add(MemoryToLower(remaining.Slice(0, tokenLength)));
        remaining = remaining.Slice(tokenLength + 1);
    }

    return list;
}
// Largest token (in chars) that is lower-cased in a stack buffer; longer
// tokens use a pooled array so an oversized token cannot overflow the stack.
const int MaxStackallocChars = 256;

/// <summary>
/// Returns a new string equal to <paramref name="span"/> with the ASCII
/// letters 'A'..'Z' mapped to 'a'..'z'. Every other character, including
/// non-ASCII letters, is copied unchanged.
/// </summary>
/// <param name="span">Characters to convert; may be empty.</param>
/// <returns>The lower-cased copy as a freshly allocated string.</returns>
static string MemoryToLower(ReadOnlySpan<char> span)
{
    if (span.Length <= MaxStackallocChars)
    {
        // Fixed-size stackalloc keeps the stack usage bounded and constant.
        Span<char> stackBuffer = stackalloc char[MaxStackallocChars];
        Span<char> lowered = stackBuffer.Slice(0, span.Length);
        CopyAsciiLower(span, lowered);
        return new string(lowered);
    }

    // Token too large for the stack: borrow a scratch array instead.
    char[] rented = ArrayPool<char>.Shared.Rent(span.Length);
    try
    {
        Span<char> lowered = rented.AsSpan(0, span.Length);
        CopyAsciiLower(span, lowered);
        return new string(lowered);
    }
    finally
    {
        ArrayPool<char>.Shared.Return(rented);
    }
}

// Copies source into destination (same length), lower-casing only 'A'..'Z'.
static void CopyAsciiLower(ReadOnlySpan<char> source, Span<char> destination)
{
    for (int i = 0; i < source.Length; i++)
    {
        char c = source[i];
        // 32 is the distance between an ASCII upper-case letter and its lower-case form.
        destination[i] = (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
    }
}

// Characters that separate tokens; they never appear in the output.
static readonly SearchValues<char> searchValues = SearchValues.Create(new[]
{
    ' ', '\r', '\n', ',', '.', '?', '"', '\'', ';', '!', '\t',
    '(', ')', '[', ']', '<', '>', '+', '-', '*'
});

/// <summary>
/// Splits <paramref name="input"/> on the characters in <c>searchValues</c>
/// and returns the non-empty pieces, ASCII-lower-cased, in input order.
/// </summary>
/// <param name="input">Text to tokenize. A null or empty string yields an empty list.</param>
/// <returns>A new list with one lower-cased string per token.</returns>
static List<string> Tokenizer(string input)
{
    var list = new List<string>();
    ReadOnlySpan<char> remaining = input.AsSpan();

    while (true)
    {
        // Skip the run of delimiters in front of the next token.
        int tokenStart = remaining.IndexOfAnyExcept(searchValues);
        if (tokenStart < 0)
        {
            break; // only delimiters (or nothing) left
        }

        remaining = remaining.Slice(tokenStart);

        // The token ends at the next delimiter, or at the end of the input.
        int tokenLength = remaining.IndexOfAny(searchValues);
        if (tokenLength < 0)
        {
            list.Add(MemoryToLower(remaining));
            break;
        }

        list.Add(MemoryToLower(remaining.Slice(0, tokenLength)));
        remaining = remaining.Slice(tokenLength + 1);
    }

    return list;
}