C# Read Groups of string separated (streaming) from STDIN

Date: 2021-04-28

Update 2023-12-04: See the fastest version, based on ReadOnlySpan<T>, at the bottom of the post.

Console Input

using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using System.Text;
using System.IO;
using System.Diagnostics;

namespace Domainizr.ConsoleApp
{
    /// <summary>
    /// Console demo that streams text from standard input and counts the
    /// separator-delimited groups it contains.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Reads standard input until end of stream, prints a progress dot for every
        /// hundredth group, then reports the group count and the elapsed milliseconds.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static async Task Main(string[] args)
        {
            Console.WriteLine("Starting");
            var sw = Stopwatch.StartNew();
            var bufferSize = 1024 * 1024;
            var buffer = new char[bufferSize];
            var groupCount = 0;
            // To measure against a file instead, replace this stream with File.OpenRead("MOCK_DATA.json").
            var stream = Console.OpenStandardInput(bufferSize);
            var reader = new StreamReader(stream, Encoding.UTF8);
            Console.SetIn(reader); // This will allow input >256 chars
            Console.WriteLine("Reading groups");

            await foreach (var group in ReadGroups(reader, buffer))
            {
                if (groupCount % 100 == 0) Console.Write(".");
                groupCount++;
            }

            Console.WriteLine($"\r\nCount: {groupCount}");
            Console.WriteLine($"Elapsed: {sw.ElapsedMilliseconds}ms");
        }

        /// <summary>
        /// Returns the UTF-8 bytes of <paramref name="plainText"/> as dash-separated
        /// hexadecimal pairs (the <see cref="BitConverter.ToString(byte[])"/> format).
        /// </summary>
        /// <param name="plainText">Text to encode.</param>
        /// <returns>For example <c>"0A"</c> for a line feed.</returns>
        public static string HexString(string plainText)
        {
            var plainTextBytes = Encoding.UTF8.GetBytes(plainText);
            return BitConverter.ToString(plainTextBytes);
        }

        /// <summary>
        /// Streams the groups of text that are separated by a line feed ("\n").
        /// </summary>
        /// <param name="stream">Reader to consume until it reports end of stream.</param>
        /// <param name="buffer">Scratch buffer; its length is the maximum size of one read.</param>
        /// <returns>
        /// Every group in stream order. Text after the last separator is returned as a
        /// final group unless it is empty.
        /// </returns>
        public static async IAsyncEnumerable<string> ReadGroups(StreamReader stream, char[] buffer)
        {
            var separator = "\n";
            Console.WriteLine($"Separator: {HexString(separator)}");
            var lastPart = "";
            while (true)
            {
                var size = await stream.ReadAsync(buffer, 0, buffer.Length);
                if (size <= 0) break;

                // The unterminated tail of the previous read is searched again together
                // with the new characters, so it is fully contained in 'pending'.
                var pending = lastPart + new string(buffer, 0, size);
                lastPart = "";
                foreach (var part in SplitString(pending, separator))
                {
                    if (part.complete)
                    {
                        yield return part.value;
                    }
                    else
                    {
                        // Assign, do not append: the value already starts with the old
                        // tail, appending would duplicate it whenever a read contains
                        // no separator.
                        lastPart = part.value;
                    }
                }
            }

            if (lastPart.Length > 0)
                yield return lastPart;
        }

        /// <summary>
        /// Splits <paramref name="source"/> at every occurrence of <paramref name="separator"/>.
        /// </summary>
        /// <param name="source">Text to split.</param>
        /// <param name="separator">Non-empty separator, matched ordinally.</param>
        /// <returns>
        /// One <c>(true, text)</c> item per separator-terminated part, followed by one
        /// <c>(false, text)</c> item for a non-empty remainder after the last separator.
        /// </returns>
        /// <exception cref="ArgumentNullException">An argument is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="separator"/> is empty.</exception>
        public static IEnumerable<(bool complete, string value)> SplitString(string source, string separator)
        {
            // Validate here rather than in the iterator so that bad arguments fail at
            // the call site instead of on first enumeration.
            if (source == null) throw new ArgumentNullException(nameof(source));
            if (separator == null) throw new ArgumentNullException(nameof(separator));
            if (separator.Length == 0)
                throw new ArgumentException("Separator must not be empty.", nameof(separator));

            return SplitStringIterator(source, separator);
        }

        // Lazily produces the parts for SplitString; arguments are already validated.
        private static IEnumerable<(bool complete, string value)> SplitStringIterator(string source, string separator)
        {
            var start = 0;
            while (true)
            {
                // Ordinal search: the culture-sensitive default can fail to find "\n"
                // inside "\r\n" on runtimes that use ICU for string comparison.
                var end = source.IndexOf(separator, start, StringComparison.Ordinal);
                if (end < 0) break;
                yield return (true, source.Substring(start, end - start));
                start = end + separator.Length;
            }

            if (start < source.Length)
                yield return (false, source.Substring(start));
        }
    }
}

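Usage: Run the application and type: 123#45#67<ENTER>89#<ENTER>. Note that the listing above splits on "\n", so each <ENTER> ends a group; change the separator in ReadGroups to "#" to split this input on '#' instead.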

Update 2023-11-28: Refactored

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace ConsoleApp
{
    /// <summary>
    /// Console demo that streams a text file and reports how many
    /// separator-delimited groups it contains and their average length.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Reads the file named by the first argument, prints the first ten groups and
        /// a progress dot for every thousandth group, then prints the average group
        /// length, the group count and the elapsed time.
        /// </summary>
        /// <param name="args">The first element is the path of the input file.</param>
        public static async Task Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Please provide an input file path as a command-line argument.");
                return;
            }

            string filePath = args[0];

            Console.WriteLine("Starting");
            var sw = Stopwatch.StartNew();

            var bufferSize = 1024 * 32;
            var buffer = new char[bufferSize];
            var groupCount = 0;

            try
            {
                // Console.In is deliberately not redirected to this reader: the input
                // is a file, and the reader is disposed at the end of this block.
                using var stream = File.OpenRead(filePath);
                using var reader = new StreamReader(stream, Encoding.UTF8);

                Console.WriteLine("Reading groups");
                double groupSizes = 0;
                await foreach (var group in ReadGroups(reader, buffer, "\n"))
                {
                    if (groupCount < 10)
                    {
                        Console.WriteLine($"group: {group}");
                    }
                    if (groupCount % 1000 == 0) Console.Write(".");
                    groupCount++;
                    groupSizes += group.Length;
                }

                // An input without any group would otherwise print NaN.
                var averageSize = groupCount == 0 ? 0 : groupSizes / groupCount;
                Console.WriteLine($"\nAverage size: {averageSize}");
            }
            catch (Exception ex)
            {
                Console.WriteLine($"\r\nError: {ex}");
            }

            Console.WriteLine($"\r\nCount: {groupCount}");
            Console.WriteLine($"Elapsed: {FormatElapsedTime(sw.Elapsed)}");
        }

        /// <summary>
        /// Formats an elapsed time as whole hours, minutes, seconds and milliseconds.
        /// </summary>
        /// <param name="elapsed">Duration to format.</param>
        /// <returns>Text such as <c>"0h 1m 5s 250ms"</c>.</returns>
        public static string FormatElapsedTime(TimeSpan elapsed)
        {
            return $"{(int)elapsed.TotalHours}h {elapsed.Minutes}m {elapsed.Seconds}s {elapsed.Milliseconds}ms";
        }

        /// <summary>
        /// Streams the groups of text separated by <paramref name="separator"/>,
        /// collecting the pieces of an unfinished group in a list.
        /// </summary>
        /// <param name="stream">Reader to consume until it reports end of stream.</param>
        /// <param name="buffer">Scratch buffer, at least as long as the separator.</param>
        /// <param name="separator">Non-empty separator, matched ordinally.</param>
        /// <returns>
        /// Every group in stream order. Text after the last separator is returned as a
        /// final group unless it is empty, matching <see cref="ReadGroups2"/>.
        /// </returns>
        /// <exception cref="ArgumentNullException">An argument is null (raised on first enumeration).</exception>
        /// <exception cref="ArgumentException">
        /// The separator is empty or longer than the buffer (raised on first enumeration).
        /// </exception>
        public static async IAsyncEnumerable<string> ReadGroups(StreamReader stream, char[] buffer, string separator)
        {
            ValidateArguments(stream, buffer, separator);

            var openParts = new List<string>();
            // Number of characters at the start of 'buffer' that were held back from the
            // previous read because they might be the beginning of a separator.
            var carried = 0;
            while (true)
            {
                var size = await stream.ReadAsync(buffer, carried, buffer.Length - carried);
                if (size <= 0) break;

                var s = new string(buffer, 0, carried + size);
                var position = 0;
                while (true)
                {
                    // Ordinal search: the culture-sensitive default can fail to find
                    // "\n" inside "\r\n" on runtimes that use ICU for string comparison.
                    var index = s.IndexOf(separator, position, StringComparison.Ordinal);
                    if (index < 0) break;

                    if (index > position) openParts.Add(s.Substring(position, index - position));
                    yield return string.Concat(openParts);
                    openParts.Clear();
                    position = index + separator.Length;
                }

                // Keep the last (separator.Length - 1) unconsumed characters in the
                // buffer so that a separator split across two reads is still found.
                carried = Math.Min(separator.Length - 1, s.Length - position);
                var settled = s.Length - position - carried;
                if (settled > 0) openParts.Add(s.Substring(position, settled));
                s.CopyTo(s.Length - carried, buffer, 0, carried);
            }

            if (carried > 0) openParts.Add(new string(buffer, 0, carried));
            var last = string.Concat(openParts);
            if (last.Length > 0)
                yield return last;
        }

        /// <summary>
        /// Same contract as <see cref="ReadGroups"/>, but accumulates an unfinished
        /// group in a <see cref="StringBuilder"/> instead of a list of pieces.
        /// </summary>
        /// <param name="stream">Reader to consume until it reports end of stream.</param>
        /// <param name="buffer">Scratch buffer, at least as long as the separator.</param>
        /// <param name="separator">Non-empty separator, matched ordinally.</param>
        /// <returns>
        /// Every group in stream order. Text after the last separator is returned as a
        /// final group unless it is empty.
        /// </returns>
        /// <exception cref="ArgumentNullException">An argument is null (raised on first enumeration).</exception>
        /// <exception cref="ArgumentException">
        /// The separator is empty or longer than the buffer (raised on first enumeration).
        /// </exception>
        public static async IAsyncEnumerable<string> ReadGroups2(StreamReader stream, char[] buffer, string separator)
        {
            ValidateArguments(stream, buffer, separator);

            var stringBuilder = new StringBuilder();
            // See ReadGroups: characters held back at the start of 'buffer'.
            var carried = 0;

            while (true)
            {
                var size = await stream.ReadAsync(buffer, carried, buffer.Length - carried);
                if (size <= 0) break;

                var s = new string(buffer, 0, carried + size);
                var position = 0;

                while (true)
                {
                    var index = s.IndexOf(separator, position, StringComparison.Ordinal);
                    if (index < 0) break;

                    stringBuilder.Append(s, position, index - position);
                    yield return stringBuilder.ToString();

                    stringBuilder.Clear();
                    position = index + separator.Length;
                }

                // Hold back a possible separator prefix for the next read.
                carried = Math.Min(separator.Length - 1, s.Length - position);
                stringBuilder.Append(s, position, s.Length - position - carried);
                s.CopyTo(s.Length - carried, buffer, 0, carried);
            }

            stringBuilder.Append(buffer, 0, carried);
            if (stringBuilder.Length > 0)
                yield return stringBuilder.ToString();
        }

        // Rejects arguments with which the readers could not make progress: an empty
        // separator matches everywhere, and a buffer shorter than the separator leaves
        // no room to read once a separator prefix has been held back.
        private static void ValidateArguments(StreamReader stream, char[] buffer, string separator)
        {
            if (stream == null) throw new ArgumentNullException(nameof(stream));
            if (buffer == null) throw new ArgumentNullException(nameof(buffer));
            if (separator == null) throw new ArgumentNullException(nameof(separator));
            if (separator.Length == 0)
                throw new ArgumentException("Separator must not be empty.", nameof(separator));
            if (buffer.Length < separator.Length)
                throw new ArgumentException("Buffer must be at least as long as the separator.", nameof(buffer));
        }
    }
}

Update 2023-12-04: Refactored with ReadOnlySpan<T>

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace ConsoleApp
{
    /// <summary>
    /// Console demo that streams a text file and reports how many
    /// separator-delimited groups it contains and their average length.
    /// The buffer is scanned through spans, without creating a string per read.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Reads the file named by the first argument, prints the first ten groups and
        /// a progress dot for every thousandth group, then prints the average group
        /// length, the group count and the elapsed time.
        /// </summary>
        /// <param name="args">The first element is the path of the input file.</param>
        public static async Task Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Please provide an input file path as a command-line argument.");
                return;
            }

            string filePath = args[0];

            Console.WriteLine("Starting");
            var sw = Stopwatch.StartNew();

            var bufferSize = 1024;
            var buffer = new char[bufferSize];
            var groupCount = 0;

            try
            {
                // Console.In is deliberately not redirected to this reader: the input
                // is a file, and the reader is disposed at the end of this block.
                using var stream = File.OpenRead(filePath);
                using var reader = new StreamReader(stream, Encoding.UTF8);

                Console.WriteLine("Reading groups");
                double groupSizes = 0;
                await foreach (var group in ReadGroups(reader, buffer, "\n"))
                {
                    if (groupCount < 10)
                    {
                        Console.WriteLine($"group: {group}");
                    }
                    if (groupCount % 1000 == 0) Console.Write(".");
                    groupCount++;
                    groupSizes += group.Length;
                }

                // An empty input yields no groups; avoid printing NaN for 0 / 0.
                var averageSize = groupCount == 0 ? 0 : groupSizes / groupCount;
                Console.WriteLine($"\nAverage size: {averageSize}");
            }
            catch (Exception ex)
            {
                Console.WriteLine($"\r\nError: {ex}");
            }

            Console.WriteLine($"\r\nCount: {groupCount}");
            Console.WriteLine($"Elapsed: {FormatElapsedTime(sw.Elapsed)}");
        }

        /// <summary>
        /// Formats an elapsed time as whole hours, minutes, seconds and milliseconds.
        /// </summary>
        /// <param name="elapsed">Duration to format.</param>
        /// <returns>Text such as <c>"0h 1m 5s 250ms"</c>.</returns>
        public static string FormatElapsedTime(TimeSpan elapsed)
        {
            return $"{(int)elapsed.TotalHours}h {elapsed.Minutes}m {elapsed.Seconds}s {elapsed.Milliseconds}ms";
        }

        /// <summary>
        /// Streams the groups of text separated by <paramref name="separator"/>.
        /// </summary>
        /// <param name="stream">Reader to consume until it reports end of stream.</param>
        /// <param name="buffer">Scratch buffer, at least as long as the separator.</param>
        /// <param name="separator">Non-empty separator, matched character by character.</param>
        /// <returns>
        /// Every group in stream order. Text after the last separator is returned as a
        /// final group unless it is empty.
        /// </returns>
        /// <exception cref="ArgumentNullException">An argument is null (raised on first enumeration).</exception>
        /// <exception cref="ArgumentException">
        /// The separator is empty or longer than the buffer (raised on first enumeration).
        /// </exception>
        public static async IAsyncEnumerable<string> ReadGroups(StreamReader stream, char[] buffer, string separator)
        {
            if (stream == null) throw new ArgumentNullException(nameof(stream));
            if (buffer == null) throw new ArgumentNullException(nameof(buffer));
            if (separator == null) throw new ArgumentNullException(nameof(separator));
            // An empty separator matches at index 0 forever and would never terminate.
            if (separator.Length == 0)
                throw new ArgumentException("Separator must not be empty.", nameof(separator));
            // Up to separator.Length - 1 characters are held back between reads, so a
            // shorter buffer would leave no room for new data.
            if (buffer.Length < separator.Length)
                throw new ArgumentException("Buffer must be at least as long as the separator.", nameof(buffer));

            var stringBuilder = new StringBuilder();
            // Number of characters at the start of 'buffer' held back from the last read.
            var carried = 0;
            while (true)
            {
                var size = await stream.ReadAsync(buffer, carried, buffer.Length - carried);
                if (size <= 0) break;

                var available = carried + size;
                // Spans cannot live across 'yield', so the scan happens in a synchronous helper.
                foreach (var item in ReadGroupsSync(stringBuilder, buffer, separator, available, out carried))
                    yield return item;
            }

            stringBuilder.Append(buffer, 0, carried);
            if (stringBuilder.Length > 0)
                yield return stringBuilder.ToString();
        }

        /// <summary>
        /// Scans the first <paramref name="size"/> characters of <paramref name="buffer"/>
        /// and returns every group completed by a separator found there.
        /// </summary>
        /// <param name="stringBuilder">
        /// Holds the unfinished group from earlier reads; on return it holds the
        /// unfinished text of this read, except for the held-back characters.
        /// </param>
        /// <param name="buffer">Characters to scan; its start is reused for the held-back characters.</param>
        /// <param name="separator">Non-empty separator.</param>
        /// <param name="size">Number of valid characters in <paramref name="buffer"/>.</param>
        /// <param name="carried">
        /// Number of trailing characters moved to the start of <paramref name="buffer"/>
        /// because they may be the beginning of a separator that continues in the next read.
        /// </param>
        /// <returns>The completed groups, in order.</returns>
        private static List<string> ReadGroupsSync(StringBuilder stringBuilder, char[] buffer, string separator, int size, out int carried)
        {
            var list = new List<string>();
            ReadOnlySpan<char> remaining = buffer.AsSpan(0, size);
            ReadOnlySpan<char> pattern = separator.AsSpan();
            while (true)
            {
                var index = remaining.IndexOf(pattern);
                if (index < 0) break;

                stringBuilder.Append(remaining.Slice(0, index));
                list.Add(stringBuilder.ToString());

                stringBuilder.Clear();
                remaining = remaining.Slice(index + pattern.Length);
            }

            // Everything except a possible separator prefix is settled text of the open group.
            carried = Math.Min(pattern.Length - 1, remaining.Length);
            stringBuilder.Append(remaining.Slice(0, remaining.Length - carried));
            // Span copies handle overlapping source and destination.
            remaining.Slice(remaining.Length - carried).CopyTo(buffer);
            return list;
        }
    }
}
50200cookie-checkC# Read Groups of string separated (streaming) from STDIN