C#C
C#4y ago
ero

Optimizing some string manipulation

I want to take the part of an input string after the last occurrence of '/' and normalize it so that it contains only alphanumeric characters (a-z, A-Z, 0-9). Characters with diacritics should be turned into their non-diacritic versions (ä -> a), and any character that cannot be normalized this way should be turned into _.

Here's what I've got so far:
// Returns the part of `input` after its last '/', reduced to ASCII letters and
// digits: combining marks produced by canonical decomposition are dropped
// (so "ä" becomes "a"), every other non-alphanumeric char becomes '_', and a
// result that would start with a digit is prefixed with '_'.
if (input.Length == 0)
{
  return "";
}

// Capacity of the scratch buffer that receives the decomposed text.
const int normCapacity = 128;

// One slot larger than the scratch buffer so the '_' prefix added for a
// leading digit always fits, even when the segment uses all 128 chars.
Span<char> outBuf = stackalloc char[normCapacity + 1];
char* pNorm = stackalloc char[normCapacity];

fixed (char* pIn = input, pOut = outBuf)
{
  // Form 2 is NormalizationD (canonical decomposition): a base letter followed
  // by its combining marks, which the loop below can then skip.
  int dLength = NormalizeString(2, (ushort*)pIn, input.Length, (ushort*)pNorm, normCapacity);

  // NormalizeString reports failure (e.g. the decomposed text does not fit in
  // the scratch buffer) with a zero or negative result; make that case explicit
  // instead of letting it fall through the loop.
  if (dLength <= 0)
  {
    return "";
  }

  // The output is filled from the back. `start` always indexes the first char
  // written so far, so the final slice needs no off-by-one correction.
  int start = outBuf.Length;
  char first = default;

  for (int i = dLength - 1; i >= 0; i--)
  {
    char c = pNorm[i];

    // Drop the combining marks that decomposition split off the base letters.
    if (CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark)
    {
      continue;
    }

    // Scanning backwards, the first '/' seen is the last one in the input.
    if (c is '/')
    {
      break;
    }

    // `first` ends up holding the leftmost emitted char once the loop is done.
    pOut[--start] = first = c switch
    {
      (>= '0' and <= '9') or (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') => c,
      _ => '_'
    };
  }

  // Guard a leading digit with '_' (room is guaranteed by the extra slot).
  if (first is >= '0' and <= '9')
  {
    pOut[--start] = '_';
  }

  return outBuf.Slice(start).ToString();
}

/// <summary>
/// P/Invoke declaration for the <c>NormalizeString</c> export of the native
/// "normaliz" library.
/// </summary>
/// <param name="normForm">Selects the normalization form; the calling code passes 2.</param>
/// <param name="source">Pointer to the UTF-16 code units to normalize.</param>
/// <param name="sourceLength">Number of code units to read from <paramref name="source"/>.</param>
/// <param name="destination">Pointer to the buffer that receives the normalized text.</param>
/// <param name="destinationLength">Capacity of <paramref name="destination"/>, in code units.</param>
/// <returns>
/// The calling code treats this as the number of code units written to
/// <paramref name="destination"/>. NOTE(review): the Win32 documentation
/// describes a zero or negative result as failure; confirm callers check for it.
/// </returns>
[DllImport("normaliz")]
static extern int NormalizeString(
  int normForm,
  ushort* source,
  int sourceLength,
  ushort* destination,
  int destinationLength);

However, this is hardly faster than using Substring and Normalize (with some custom code involving CharUnicodeInfo.GetUnicodeCategory).

Any ideas?
Was this page helpful?