C#•4y ago•

17 replies

Optimizing some string manipulation

I want to both take the substring of an input string after the last occurrence of `'/'` and normalize it into only alphanumeric (`a-z`, `A-Z`, `0-9`) characters: characters with diacritics (e.g. `ä`) are turned into their non-diacritic versions, and any character that cannot be normalized is turned into `_`.

Here's what I've got so far;

// Builds an identifier-like string from the part of `input` that follows its last '/':
// the text is canonically decomposed, combining marks are dropped, every char outside
// [0-9A-Za-z] becomes '_', and a leading digit is prefixed with '_'.
// NOTE(review): `input` is declared outside this fragment; presumably a string — confirm.
if (input.Length == 0)
{
  return "";
}

const int BufferSize = 128;

// One slot more than the normalization buffer, so the '_' prefix for a leading digit
// always fits even when the segment fills all BufferSize normalized chars.
Span<char> outBuf = stackalloc char[BufferSize + 1];
char* pNorm = stackalloc char[BufferSize];

fixed (char* pIn = input, pOut = outBuf)
{
  // 2 is NormalizationD (canonical decomposition): base letters and their diacritics
  // come back as separate UTF-16 units, so the marks can be skipped below.
  int dLength = NormalizeString(2, (ushort*)pIn, input.Length, (ushort*)pNorm, BufferSize);

  // NormalizeString reports failure (e.g. the decomposed text does not fit the buffer)
  // with a value <= 0. The result stays empty, as before, but the choice is now explicit.
  if (dLength <= 0)
  {
    return "";
  }

  // The output is filled right-to-left; `start` is always the index of the first
  // (leftmost) char written so far, so the final slice needs no adjustment.
  int start = outBuf.Length;
  char first = default;

  for (int i = dLength - 1; i >= 0; i--)
  {
    char c = pNorm[i];

    // Drop the combining marks produced by the decomposition.
    if (CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark)
    {
      continue;
    }

    // Scanning backwards, the first '/' met is the last one in the text: stop there.
    if (c is '/')
    {
      break;
    }

    first = c switch
    {
      (>= '0' and <= '9') or (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') => c,
      _ => '_'
    };

    pOut[--start] = first;
  }

  // After the loop `first` holds the leftmost output char; a leading digit gets a '_' prefix.
  if (first is >= '0' and <= '9')
  {
    pOut[--start] = '_';
  }

  return outBuf.Slice(start).ToString();
}

// Win32 NormalizeString from normaliz.dll. Returns the number of UTF-16 units written
// to `destination`, or a value <= 0 on failure.
[DllImport("normaliz")]
static extern int NormalizeString(
  int normForm,
  ushort* source,
  int sourceLength,
  ushort* destination,
  int destinationLength);

// Builds an identifier-like string from the part of `input` that follows its last '/':
// the text is canonically decomposed, combining marks are dropped, every char outside
// [0-9A-Za-z] becomes '_', and a leading digit is prefixed with '_'.
// NOTE(review): `input` is declared outside this fragment; presumably a string — confirm.
if (input.Length == 0)
{
  return "";
}

const int BufferSize = 128;

// One slot more than the normalization buffer, so the '_' prefix for a leading digit
// always fits even when the segment fills all BufferSize normalized chars.
Span<char> outBuf = stackalloc char[BufferSize + 1];
char* pNorm = stackalloc char[BufferSize];

fixed (char* pIn = input, pOut = outBuf)
{
  // 2 is NormalizationD (canonical decomposition): base letters and their diacritics
  // come back as separate UTF-16 units, so the marks can be skipped below.
  int dLength = NormalizeString(2, (ushort*)pIn, input.Length, (ushort*)pNorm, BufferSize);

  // NormalizeString reports failure (e.g. the decomposed text does not fit the buffer)
  // with a value <= 0. The result stays empty, as before, but the choice is now explicit.
  if (dLength <= 0)
  {
    return "";
  }

  // The output is filled right-to-left; `start` is always the index of the first
  // (leftmost) char written so far, so the final slice needs no adjustment.
  int start = outBuf.Length;
  char first = default;

  for (int i = dLength - 1; i >= 0; i--)
  {
    char c = pNorm[i];

    // Drop the combining marks produced by the decomposition.
    if (CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark)
    {
      continue;
    }

    // Scanning backwards, the first '/' met is the last one in the text: stop there.
    if (c is '/')
    {
      break;
    }

    first = c switch
    {
      (>= '0' and <= '9') or (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') => c,
      _ => '_'
    };

    pOut[--start] = first;
  }

  // After the loop `first` holds the leftmost output char; a leading digit gets a '_' prefix.
  if (first is >= '0' and <= '9')
  {
    pOut[--start] = '_';
  }

  return outBuf.Slice(start).ToString();
}

// Win32 NormalizeString from normaliz.dll. Returns the number of UTF-16 units written
// to `destination`, or a value <= 0 on failure.
[DllImport("normaliz")]
static extern int NormalizeString(
  int normForm,
  ushort* source,
  int sourceLength,
  ushort* destination,
  int destinationLength);

However, this is hardly faster than using `Substring` and `Normalize` (with some custom code involving `CharUnicodeInfo.GetUnicodeCategory`).

Any ideas?

Optimizing some string manipulation

I want to both take the substring of an input string after the last occurrence of `'/'` and normalize it into only alphanumeric (`a-z`, `A-Z`, `0-9`) characters: characters with diacritics (e.g. `ä`) are turned into their non-diacritic versions, and any character that cannot be normalized is turned into `_`.

Here's what I've got so far;

// Builds an identifier-like string from the part of `input` that follows its last '/':
// the text is canonically decomposed, combining marks are dropped, every char outside
// [0-9A-Za-z] becomes '_', and a leading digit is prefixed with '_'.
// NOTE(review): `input` is declared outside this fragment; presumably a string — confirm.
if (input.Length == 0)
{
  return "";
}

const int BufferSize = 128;

// One slot more than the normalization buffer, so the '_' prefix for a leading digit
// always fits even when the segment fills all BufferSize normalized chars.
Span<char> outBuf = stackalloc char[BufferSize + 1];
char* pNorm = stackalloc char[BufferSize];

fixed (char* pIn = input, pOut = outBuf)
{
  // 2 is NormalizationD (canonical decomposition): base letters and their diacritics
  // come back as separate UTF-16 units, so the marks can be skipped below.
  int dLength = NormalizeString(2, (ushort*)pIn, input.Length, (ushort*)pNorm, BufferSize);

  // NormalizeString reports failure (e.g. the decomposed text does not fit the buffer)
  // with a value <= 0. The result stays empty, as before, but the choice is now explicit.
  if (dLength <= 0)
  {
    return "";
  }

  // The output is filled right-to-left; `start` is always the index of the first
  // (leftmost) char written so far, so the final slice needs no adjustment.
  int start = outBuf.Length;
  char first = default;

  for (int i = dLength - 1; i >= 0; i--)
  {
    char c = pNorm[i];

    // Drop the combining marks produced by the decomposition.
    if (CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark)
    {
      continue;
    }

    // Scanning backwards, the first '/' met is the last one in the text: stop there.
    if (c is '/')
    {
      break;
    }

    first = c switch
    {
      (>= '0' and <= '9') or (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') => c,
      _ => '_'
    };

    pOut[--start] = first;
  }

  // After the loop `first` holds the leftmost output char; a leading digit gets a '_' prefix.
  if (first is >= '0' and <= '9')
  {
    pOut[--start] = '_';
  }

  return outBuf.Slice(start).ToString();
}

// Win32 NormalizeString from normaliz.dll. Returns the number of UTF-16 units written
// to `destination`, or a value <= 0 on failure.
[DllImport("normaliz")]
static extern int NormalizeString(
  int normForm,
  ushort* source,
  int sourceLength,
  ushort* destination,
  int destinationLength);

// Builds an identifier-like string from the part of `input` that follows its last '/':
// the text is canonically decomposed, combining marks are dropped, every char outside
// [0-9A-Za-z] becomes '_', and a leading digit is prefixed with '_'.
// NOTE(review): `input` is declared outside this fragment; presumably a string — confirm.
if (input.Length == 0)
{
  return "";
}

const int BufferSize = 128;

// One slot more than the normalization buffer, so the '_' prefix for a leading digit
// always fits even when the segment fills all BufferSize normalized chars.
Span<char> outBuf = stackalloc char[BufferSize + 1];
char* pNorm = stackalloc char[BufferSize];

fixed (char* pIn = input, pOut = outBuf)
{
  // 2 is NormalizationD (canonical decomposition): base letters and their diacritics
  // come back as separate UTF-16 units, so the marks can be skipped below.
  int dLength = NormalizeString(2, (ushort*)pIn, input.Length, (ushort*)pNorm, BufferSize);

  // NormalizeString reports failure (e.g. the decomposed text does not fit the buffer)
  // with a value <= 0. The result stays empty, as before, but the choice is now explicit.
  if (dLength <= 0)
  {
    return "";
  }

  // The output is filled right-to-left; `start` is always the index of the first
  // (leftmost) char written so far, so the final slice needs no adjustment.
  int start = outBuf.Length;
  char first = default;

  for (int i = dLength - 1; i >= 0; i--)
  {
    char c = pNorm[i];

    // Drop the combining marks produced by the decomposition.
    if (CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.NonSpacingMark)
    {
      continue;
    }

    // Scanning backwards, the first '/' met is the last one in the text: stop there.
    if (c is '/')
    {
      break;
    }

    first = c switch
    {
      (>= '0' and <= '9') or (>= 'A' and <= 'Z') or (>= 'a' and <= 'z') => c,
      _ => '_'
    };

    pOut[--start] = first;
  }

  // After the loop `first` holds the leftmost output char; a leading digit gets a '_' prefix.
  if (first is >= '0' and <= '9')
  {
    pOut[--start] = '_';
  }

  return outBuf.Slice(start).ToString();
}

// Win32 NormalizeString from normaliz.dll. Returns the number of UTF-16 units written
// to `destination`, or a value <= 0 on failure.
[DllImport("normaliz")]
static extern int NormalizeString(
  int normForm,
  ushort* source,
  int sourceLength,
  ushort* destination,
  int destinationLength);

However, this is hardly faster than using `Substring` and `Normalize` (with some custom code involving `CharUnicodeInfo.GetUnicodeCategory`).

Any ideas?

Optimizing some string manipulation

Similar Threads

Optimizing some string manipulation

Similar Threads

Similar Threads

Similar Threads