Update 2023-12-04: See fastest ReadOnlySpan<T> version at the bottom of the post
Console Input
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace Domainizr.ConsoleApp
{
    /// <summary>
    /// Streams text from standard input and counts the groups delimited by a separator.
    /// </summary>
    public class Program
    {
        /// <summary>
        /// Reads standard input in chunks, counts the separator-delimited groups and
        /// prints the count together with the elapsed time.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        public static async Task Main(string[] args)
        {
            Console.WriteLine("Starting");
            var sw = Stopwatch.StartNew();

            var bufferSize = 1024 * 1024;
            var buffer = new char[bufferSize];
            var groupCount = 0;

            // To read from a file instead, wrap File.OpenRead(path) in the StreamReader below.
            var stream = Console.OpenStandardInput(bufferSize);
            var reader = new StreamReader(stream, Encoding.UTF8);

            // Author's note: routing Console.In through this reader allows input > 256 chars.
            Console.SetIn(reader);

            Console.WriteLine("Reading groups");
            await foreach (var _ in ReadGroups(reader, buffer))
            {
                // Progress indicator: one dot per 100 groups.
                if (groupCount % 100 == 0)
                {
                    Console.Write(".");
                }

                groupCount++;
            }

            Console.WriteLine($"\r\nCount: {groupCount}");
            Console.WriteLine($"Elapsed: {sw.ElapsedMilliseconds}ms");
        }

        /// <summary>
        /// Returns the UTF-8 bytes of <paramref name="plainText"/> as dash-separated hex pairs.
        /// </summary>
        /// <param name="plainText">Text to encode.</param>
        /// <returns>The <see cref="BitConverter.ToString(byte[])"/> rendering of the UTF-8 bytes.</returns>
        public static string HexString(string plainText)
        {
            var plainTextBytes = Encoding.UTF8.GetBytes(plainText);
            return BitConverter.ToString(plainTextBytes);
        }

        /// <summary>
        /// Reads <paramref name="stream"/> chunk by chunk and yields every group terminated by "\n".
        /// Text after the last separator is yielded once the stream ends, if it is not empty.
        /// </summary>
        /// <param name="stream">Reader to consume.</param>
        /// <param name="buffer">Scratch buffer; its length is the maximum chunk size per read.</param>
        /// <returns>The groups in input order, without their separators.</returns>
        public static async IAsyncEnumerable<string> ReadGroups(StreamReader stream, char[] buffer)
        {
            var separator = "\n";
            Console.WriteLine($"Separator: {HexString(separator)}");

            // Unterminated text carried over from the previous chunk.
            var lastPart = "";
            while (true)
            {
                var size = await stream.ReadAsync(buffer, 0, buffer.Length);
                if (size <= 0)
                {
                    break;
                }

                // The carried-over text is already part of 's', so the new remainder must
                // REPLACE lastPart; appending to it would duplicate the carried-over prefix.
                var s = lastPart + new string(buffer, 0, size);
                lastPart = "";

                foreach (var part in SplitString(s, separator))
                {
                    if (part.complete)
                    {
                        yield return part.value;
                    }
                    else
                    {
                        lastPart = part.value;
                    }
                }
            }

            if (lastPart.Length > 0)
            {
                yield return lastPart;
            }
        }

        /// <summary>
        /// Splits <paramref name="source"/> at every occurrence of <paramref name="separator"/>.
        /// </summary>
        /// <param name="source">Text to split; null or empty yields nothing.</param>
        /// <param name="separator">Non-empty separator, matched ordinally.</param>
        /// <returns>
        /// One <c>(true, text)</c> item per separator-terminated part, followed by a single
        /// <c>(false, text)</c> item for a non-empty unterminated remainder.
        /// </returns>
        /// <exception cref="ArgumentException"><paramref name="separator"/> is null or empty.</exception>
        public static IEnumerable<(bool complete, string value)> SplitString(string source, string separator)
        {
            // An empty separator matches at every position without advancing: reject it up front.
            if (string.IsNullOrEmpty(separator))
            {
                throw new ArgumentException("Separator must not be null or empty.", nameof(separator));
            }

            return SplitIterator(source ?? "", separator);
        }

        // Lazy part of SplitString, kept separate so the argument check above runs eagerly.
        private static IEnumerable<(bool complete, string value)> SplitIterator(string source, string separator)
        {
            var start = 0;
            while (true)
            {
                // Ordinal: the culture-sensitive overload can miss "\n" inside "\r\n".
                var end = source.IndexOf(separator, start, StringComparison.Ordinal);
                if (end < 0)
                {
                    break;
                }

                yield return (true, source.Substring(start, end - start));
                start = end + separator.Length;
            }

            if (start < source.Length)
            {
                yield return (false, source.Substring(start));
            }
        }
    }
}
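Usage: Run the application and type: 123#45#67<ENTER>89#<ENTER> (the code above splits on "\n", so each <ENTER> ends a group; change the separator to "#" to split on # instead)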
Update 2023-11-28: Refactored
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace ConsoleApp
{
    /// <summary>
    /// Streams a text file and reports the number and average length of its
    /// separator-delimited groups.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Reads the file named by the first argument, prints the first ten groups,
        /// the average group length, the group count and the elapsed time.
        /// </summary>
        /// <param name="args">args[0] is the path of the input file.</param>
        public static async Task Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Please provide an input file path as a command-line argument.");
                return;
            }

            string filePath = args[0];
            Console.WriteLine("Starting");
            var sw = Stopwatch.StartNew();

            var bufferSize = 1024 * 32;
            var buffer = new char[bufferSize];
            var groupCount = 0;

            try
            {
                // Console.In is deliberately left untouched: this reader is disposed at the
                // end of the block and must not remain installed as the console input.
                using (var stream = File.OpenRead(filePath))
                using (var reader = new StreamReader(stream, Encoding.UTF8))
                {
                    Console.WriteLine("Reading groups");
                    double groupSizes = 0;

                    await foreach (var group in ReadGroups(reader, buffer, "\n"))
                    {
                        if (groupCount < 10)
                        {
                            Console.WriteLine($"group: {group}");
                        }

                        // Progress indicator: one dot per 1000 groups.
                        if (groupCount % 1000 == 0)
                        {
                            Console.Write(".");
                        }

                        groupCount++;
                        groupSizes += group.Length;
                    }

                    // Avoid printing NaN when the input contained no groups.
                    var averageSize = groupCount == 0 ? 0 : groupSizes / groupCount;
                    Console.WriteLine($"\nAverage size: {averageSize}");
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure, then still print the statistics below.
                Console.WriteLine($"\r\nError: {ex}");
            }

            Console.WriteLine($"\r\nCount: {groupCount}");
            Console.WriteLine($"Elapsed: {FormatElapsedTime(sw.Elapsed)}");
        }

        /// <summary>
        /// Formats an elapsed time as "&lt;hours&gt;h &lt;minutes&gt;m &lt;seconds&gt;s &lt;milliseconds&gt;ms".
        /// </summary>
        /// <param name="elapsed">Duration to format; hours are the truncated total hours.</param>
        /// <returns>The formatted duration.</returns>
        public static string FormatElapsedTime(TimeSpan elapsed)
        {
            return $"{(int)elapsed.TotalHours}h {elapsed.Minutes}m {elapsed.Seconds}s {elapsed.Milliseconds}ms";
        }

        /// <summary>
        /// Reads <paramref name="stream"/> chunk by chunk and yields each group terminated by
        /// <paramref name="separator"/>. Pieces of a group that spans several reads are
        /// collected in a list and concatenated once the group is complete.
        /// </summary>
        /// <remarks>
        /// Each chunk is searched on its own, so a multi-character separator that is split
        /// across two reads is not detected. The caller in this file uses "\n".
        /// </remarks>
        /// <param name="stream">Reader to consume.</param>
        /// <param name="buffer">Scratch buffer; its length is the maximum chunk size per read.</param>
        /// <param name="separator">Non-empty separator, matched ordinally.</param>
        /// <returns>The groups in input order; a non-empty unterminated tail is yielded last.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="separator"/> is null or empty (raised when enumeration starts).
        /// </exception>
        public static async IAsyncEnumerable<string> ReadGroups(StreamReader stream, char[] buffer, string separator)
        {
            // An empty separator matches at every position without advancing.
            if (string.IsNullOrEmpty(separator))
            {
                throw new ArgumentException("Separator must not be null or empty.", nameof(separator));
            }

            var openParts = new List<string>();
            while (true)
            {
                var size = await stream.ReadAsync(buffer, 0, buffer.Length);
                if (size <= 0)
                {
                    break;
                }

                var s = new string(buffer, 0, size);
                var position = 0;
                while (true)
                {
                    // Ordinal: the culture-sensitive overload can miss "\n" inside "\r\n".
                    var index = s.IndexOf(separator, position, StringComparison.Ordinal);
                    if (index < 0)
                    {
                        break;
                    }

                    openParts.Add(s.Substring(position, index - position));
                    yield return string.Concat(openParts);
                    openParts.Clear();
                    position = index + separator.Length;
                }

                // Keep only a non-empty remainder so an input ending in a separator
                // does not produce a trailing empty group.
                if (position < s.Length)
                {
                    openParts.Add(s.Substring(position));
                }
            }

            if (openParts.Count > 0)
            {
                yield return string.Concat(openParts);
            }
        }

        /// <summary>
        /// Same contract as <see cref="ReadGroups"/>, but accumulates a group that spans
        /// several reads in a <see cref="StringBuilder"/>.
        /// </summary>
        /// <remarks>
        /// Each chunk is searched on its own, so a multi-character separator that is split
        /// across two reads is not detected.
        /// </remarks>
        /// <param name="stream">Reader to consume.</param>
        /// <param name="buffer">Scratch buffer; its length is the maximum chunk size per read.</param>
        /// <param name="separator">Non-empty separator, matched ordinally.</param>
        /// <returns>The groups in input order; a non-empty unterminated tail is yielded last.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="separator"/> is null or empty (raised when enumeration starts).
        /// </exception>
        public static async IAsyncEnumerable<string> ReadGroups2(StreamReader stream, char[] buffer, string separator)
        {
            if (string.IsNullOrEmpty(separator))
            {
                throw new ArgumentException("Separator must not be null or empty.", nameof(separator));
            }

            var stringBuilder = new StringBuilder();
            while (true)
            {
                var size = await stream.ReadAsync(buffer, 0, buffer.Length);
                if (size <= 0)
                {
                    break;
                }

                var s = new string(buffer, 0, size);
                var position = 0;
                while (true)
                {
                    var index = s.IndexOf(separator, position, StringComparison.Ordinal);
                    if (index < 0)
                    {
                        break;
                    }

                    stringBuilder.Append(s, position, index - position);
                    yield return stringBuilder.ToString();
                    stringBuilder.Clear();
                    position = index + separator.Length;
                }

                stringBuilder.Append(s, position, s.Length - position);
            }

            if (stringBuilder.Length > 0)
            {
                yield return stringBuilder.ToString();
            }
        }
    }
}
Update 2023-12-04: Refactored with ReadOnlySpan<T>
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace ConsoleApp
{
    /// <summary>
    /// Streams a text file and reports the number and average length of its
    /// separator-delimited groups, slicing each chunk with spans.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Reads the file named by the first argument, prints the first ten groups,
        /// the average group length, the group count and the elapsed time.
        /// </summary>
        /// <param name="args">args[0] is the path of the input file.</param>
        public static async Task Main(string[] args)
        {
            if (args.Length == 0)
            {
                Console.WriteLine("Please provide an input file path as a command-line argument.");
                return;
            }

            string filePath = args[0];
            Console.WriteLine("Starting");
            var sw = Stopwatch.StartNew();

            var bufferSize = 1024;
            var buffer = new char[bufferSize];
            var groupCount = 0;

            try
            {
                // Console.In is deliberately left untouched: this reader is disposed at the
                // end of the block and must not remain installed as the console input.
                using (var stream = File.OpenRead(filePath))
                using (var reader = new StreamReader(stream, Encoding.UTF8))
                {
                    Console.WriteLine("Reading groups");
                    double groupSizes = 0;

                    await foreach (var group in ReadGroups(reader, buffer, "\n"))
                    {
                        if (groupCount < 10)
                        {
                            Console.WriteLine($"group: {group}");
                        }

                        // Progress indicator: one dot per 1000 groups.
                        if (groupCount % 1000 == 0)
                        {
                            Console.Write(".");
                        }

                        groupCount++;
                        groupSizes += group.Length;
                    }

                    // Avoid printing NaN when the input contained no groups.
                    var averageSize = groupCount == 0 ? 0 : groupSizes / groupCount;
                    Console.WriteLine($"\nAverage size: {averageSize}");
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure, then still print the statistics below.
                Console.WriteLine($"\r\nError: {ex}");
            }

            Console.WriteLine($"\r\nCount: {groupCount}");
            Console.WriteLine($"Elapsed: {FormatElapsedTime(sw.Elapsed)}");
        }

        /// <summary>
        /// Formats an elapsed time as "&lt;hours&gt;h &lt;minutes&gt;m &lt;seconds&gt;s &lt;milliseconds&gt;ms".
        /// </summary>
        /// <param name="elapsed">Duration to format; hours are the truncated total hours.</param>
        /// <returns>The formatted duration.</returns>
        public static string FormatElapsedTime(TimeSpan elapsed)
        {
            return $"{(int)elapsed.TotalHours}h {elapsed.Minutes}m {elapsed.Seconds}s {elapsed.Milliseconds}ms";
        }

        /// <summary>
        /// Reads <paramref name="stream"/> chunk by chunk and yields each group terminated by
        /// <paramref name="separator"/>. A non-empty unterminated tail is yielded at the end.
        /// </summary>
        /// <remarks>
        /// Each chunk is searched on its own, so a multi-character separator that is split
        /// across two reads is not detected. The caller in this file uses "\n".
        /// </remarks>
        /// <param name="stream">Reader to consume.</param>
        /// <param name="buffer">Scratch buffer; its length is the maximum chunk size per read.</param>
        /// <param name="separator">Non-empty separator, compared character by character.</param>
        /// <returns>The groups in input order, without their separators.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="separator"/> is null or empty (raised when enumeration starts).
        /// </exception>
        public static async IAsyncEnumerable<string> ReadGroups(StreamReader stream, char[] buffer, string separator)
        {
            // An empty separator matches at index 0 forever and would never let the scan advance.
            if (string.IsNullOrEmpty(separator))
            {
                throw new ArgumentException("Separator must not be null or empty.", nameof(separator));
            }

            var stringBuilder = new StringBuilder();
            while (true)
            {
                var size = await stream.ReadAsync(buffer, 0, buffer.Length);
                if (size <= 0)
                {
                    break;
                }

                // Spans cannot be used inside an async iterator, so the chunk is scanned
                // in a synchronous helper.
                foreach (var item in ReadGroupsSync(stringBuilder, buffer, separator, size))
                {
                    yield return item;
                }
            }

            if (stringBuilder.Length > 0)
            {
                yield return stringBuilder.ToString();
            }
        }

        // Scans the first 'size' chars of 'buffer': each separator-terminated piece is appended
        // to 'stringBuilder', emitted as a finished group and cleared; the unterminated
        // remainder is left in 'stringBuilder' for the next chunk.
        private static List<string> ReadGroupsSync(StringBuilder stringBuilder, char[] buffer, string separator, int size)
        {
            var list = new List<string>();
            ReadOnlySpan<char> remaining = buffer.AsSpan(0, size);
            ReadOnlySpan<char> separatorSpan = separator.AsSpan();

            while (true)
            {
                var index = remaining.IndexOf(separatorSpan);
                if (index < 0)
                {
                    break;
                }

                stringBuilder.Append(remaining.Slice(0, index));
                list.Add(stringBuilder.ToString());
                stringBuilder.Clear();
                remaining = remaining.Slice(index + separatorSpan.Length);
            }

            stringBuilder.Append(remaining);
            return list;
        }
    }
}
C#: Read groups of separated strings (streaming) from STDIN