// Atomx/Atomx.Utils/Files/FileTypes.cs

using System.Text;
using System.Text.RegularExpressions;

namespace Atomx.Utils.Files
{
    /// <summary>
    /// Identifies common file types by comparing the leading ("magic") bytes of a stream
    /// against a table of known signatures.
    /// </summary>
    /// <remarks>
    /// All probes read from the stream's <em>current</em> position. When the stream is seekable the
    /// position is restored after every probe; a non-seekable stream is consumed by the first probe,
    /// so later probes see different bytes.
    /// </remarks>
    public static class FileTypes
    {
        // The public field signatures are kept unchanged for compatibility with existing callers.

        /// <summary>Image signatures: extension -> leading bytes.</summary>
        public static readonly Dictionary<string, byte[]> ImageHeader = new();

        /// <summary>General file signatures: extension -> descriptor (see <c>TryMatchHeader</c> for the accepted shapes).</summary>
        public static readonly Dictionary<string, object> FilesHeader = new();

        /// <summary>Video signatures: extension -> descriptor (see <c>TryMatchHeader</c> for the accepted shapes).</summary>
        public static readonly Dictionary<string, object> VideoHeader = new();

        static FileTypes()
        {
            // "GIF8" is the common prefix of "GIF87a" and "GIF89a", so both variants are recognised.
            ImageHeader.Add("gif", new byte[] { 71, 73, 70, 56 });
            // "BM"
            ImageHeader.Add("bmp", new byte[] { 66, 77 });
            // FF D8 FF
            ImageHeader.Add("jpg", new byte[] { 255, 216, 255 });
            // 8-byte PNG signature followed by the length and type of the IHDR chunk.
            ImageHeader.Add("png", new byte[] { 137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82 });

            // Entries are probed in enumeration order, so the more specific Office signatures are
            // registered before the generic "zip" one.
            // NOTE(review): Dictionary enumeration order is documented as undefined; this relies on
            // insertion order being preserved when nothing is removed.

            // "%PDF-" only: the version digits that follow differ between files.
            FilesHeader.Add("pdf", new byte[] { 37, 80, 68, 70, 45 });
            FilesHeader.Add("docx", new object[] { new byte[] { 80, 75, 3, 4, 20, 0, 6, 0, 8, 0, 0, 0, 33 }, new Regex(@"word/_rels/document\.xml\.rels", RegexOptions.IgnoreCase) });
            FilesHeader.Add("xlsx", new object[] { new byte[] { 80, 75, 3, 4, 20, 0, 6, 0, 8, 0, 0, 0, 33 }, new Regex(@"xl/_rels/workbook\.xml\.rels", RegexOptions.IgnoreCase) });
            FilesHeader.Add("pptx", new object[] { new byte[] { 80, 75, 3, 4, 20, 0, 6, 0, 8, 0, 0, 0, 33 }, new Regex(@"ppt/_rels/presentation\.xml\.rels", RegexOptions.IgnoreCase) });
            FilesHeader.Add("doc", new object[] { new byte[] { 208, 207, 17, 224, 161, 177, 26, 225 }, new Regex(@"microsoft( office)? word(?![\s\S]*?microsoft)", RegexOptions.IgnoreCase) });
            FilesHeader.Add("xls", new object[] { new byte[] { 208, 207, 17, 224, 161, 177, 26, 225 }, new Regex(@"microsoft( office)? excel(?![\s\S]*?microsoft)", RegexOptions.IgnoreCase) });
            FilesHeader.Add("ppt", new object[] { new byte[] { 208, 207, 17, 224, 161, 177, 26, 225 }, new Regex(@"c.u.r.r.e.n.t. .u.s.e.r(?![\s\S]*?[a-z])", RegexOptions.IgnoreCase) });

            FilesHeader.Add("avi", CreateAviSignature());
            FilesHeader.Add("mpg", new byte[] { 0, 0, 1, 0xBA });
            FilesHeader.Add("mpeg", new byte[] { 0, 0, 1, 0xB3 });
            // "Rar!" 1A 07
            FilesHeader.Add("rar", new byte[] { 82, 97, 114, 33, 26, 7 });
            // "PK" 03 04
            FilesHeader.Add("zip", new byte[] { 80, 75, 3, 4 });

            VideoHeader.Add("avi", CreateAviSignature());
            VideoHeader.Add("mpg", new byte[] { 0, 0, 1, 0xBA });
            VideoHeader.Add("mpeg", new byte[] { 0, 0, 1, 0xB3 });
        }

        /// <summary>
        /// Builds a fresh descriptor for AVI: a RIFF container whose form type, stored at
        /// offset 8 (after "RIFF" and the 4-byte chunk size), is "AVI ".
        /// </summary>
        private static object[] CreateAviSignature() => new object[]
        {
            new byte[] { 82, 73, 70, 70 },                                    // "RIFF"
            new KeyValuePair<int, byte[]>(8, new byte[] { 65, 86, 73, 32 }),  // "AVI " at offset 8
        };

        /// <summary>
        /// Detects the video type of a stream using <see cref="VideoHeader"/>.
        /// </summary>
        /// <param name="stream">Stream to inspect; read from its current position.</param>
        /// <returns>The matching key of <see cref="VideoHeader"/>, or an empty string when nothing matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        public static string VideoType(this Stream stream)
        {
            if (stream == null) throw new ArgumentNullException(nameof(stream));
            return FirstMatch(stream, VideoHeader);
        }

        /// <summary>
        /// Detects the general file type of a stream using <see cref="FilesHeader"/>.
        /// </summary>
        /// <param name="stream">Stream to inspect; read from its current position.</param>
        /// <returns>The matching key of <see cref="FilesHeader"/>, or an empty string when nothing matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        public static string FileType(this Stream stream)
        {
            if (stream == null) throw new ArgumentNullException(nameof(stream));
            return FirstMatch(stream, FilesHeader);
        }

        /// <summary>
        /// Detects the image type of a stream using <see cref="ImageHeader"/>.
        /// </summary>
        /// <param name="stream">Stream to inspect; read from its current position.</param>
        /// <returns>The matching key of <see cref="ImageHeader"/>, or an empty string when nothing matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        /// <remarks>
        /// An earlier version additionally decoded the whole stream to test whether it was plain text,
        /// but returned an empty string whatever that test concluded. The check has been dropped:
        /// the result is identical and the stream is no longer read in full.
        /// </remarks>
        public static string ImageType(this Stream stream)
        {
            if (stream == null) throw new ArgumentNullException(nameof(stream));

            foreach (var kv in ImageHeader)
            {
                var header = kv.Value;
                if (header == null || header.Length == 0)
                {
                    continue;
                }

                if (ReadAndCompare(stream, header))
                {
                    return kv.Key;
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Reads the stream from its current position to the end into a byte array and,
        /// when the stream is seekable, restores the original position.
        /// </summary>
        /// <param name="stream">Stream to read.</param>
        /// <returns>The bytes read.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        public static byte[] StreamToBytes(this Stream stream)
        {
            if (stream == null) throw new ArgumentNullException(nameof(stream));
            return stream.ReadAllBytesAndRestorePosition();
        }

        // ---------- Helpers ----------

        /// <summary>
        /// Probes every descriptor of <paramref name="table"/> in enumeration order and returns the
        /// key of the first one that matches, or an empty string.
        /// </summary>
        private static string FirstMatch(Stream stream, Dictionary<string, object> table)
        {
            foreach (var kv in table)
            {
                if (TryMatchHeader(stream, kv.Value, out var matched) && matched)
                {
                    return kv.Key;
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Evaluates one signature descriptor against the stream.
        /// Supported descriptor shapes:
        /// <list type="bullet">
        /// <item><description><c>byte[]</c>: only the leading bytes are compared.</description></item>
        /// <item><description><c>object[]</c>: element 0 is the leading <c>byte[]</c>. Element 1, if present, refines the match:
        /// a <see cref="Regex"/> is run against the stream content decoded as ASCII;
        /// an <c>int</c> is an offset from the end of the stream, and elements 2.. are candidate <c>byte[]</c> tails of which any one must match;
        /// a <c>KeyValuePair&lt;int, byte[]&gt;</c> is (offset from the current position, bytes expected there).
        /// Any other element 1 is ignored and the leading bytes alone decide.</description></item>
        /// </list>
        /// </summary>
        /// <param name="stream">Stream to inspect.</param>
        /// <param name="value">The descriptor.</param>
        /// <param name="matched">Set to true when the descriptor matches the stream.</param>
        /// <returns>True when the descriptor had a supported shape and was evaluated; false otherwise.</returns>
        private static bool TryMatchHeader(Stream stream, object value, out bool matched)
        {
            matched = false;
            if (value == null) return false;

            if (value is byte[] headerOnly)
            {
                matched = ReadAndCompare(stream, headerOnly);
                return true;
            }

            if (!(value is object[] arr) || arr.Length == 0 || !(arr[0] is byte[] header))
            {
                return false;
            }

            // Leading bytes differ: evaluated, but not this type.
            if (!ReadAndCompare(stream, header))
            {
                return true;
            }

            // No refinement supplied: the leading bytes decide.
            if (arr.Length < 2)
            {
                matched = true;
                return true;
            }

            switch (arr[1])
            {
                case Regex regex:
                {
                    var content = stream.ReadAllBytesAndRestorePosition();
                    matched = regex.IsMatch(Encoding.ASCII.GetString(content));
                    break;
                }
                case int tailOffset:
                {
                    // Any one of the candidate tails is sufficient.
                    matched = arr.Skip(2)
                                 .OfType<byte[]>()
                                 .Any(tail => ReadAndCompareTail(stream, tailOffset, tail));
                    break;
                }
                case KeyValuePair<int, byte[]> located:
                {
                    matched = ReadAndCompare(stream, located.Value, located.Key);
                    break;
                }
                default:
                {
                    // Unknown refinement type: treat as a header-only descriptor.
                    matched = true;
                    break;
                }
            }

            return true;
        }

        /// <summary>
        /// Compares <paramref name="target"/> with the bytes found <paramref name="offset"/> bytes after
        /// the stream's current position, then restores the position when the stream is seekable.
        /// </summary>
        /// <param name="stream">Stream to read from.</param>
        /// <param name="target">Expected bytes.</param>
        /// <param name="offset">Number of bytes to skip before the compared window; 0 compares the leading bytes.</param>
        /// <returns>
        /// True when the window could be read in full and equals <paramref name="target"/>;
        /// false for a null/empty target, a negative offset, a short stream, or a mismatch.
        /// </returns>
        private static bool ReadAndCompare(Stream stream, byte[] target, int offset = 0)
        {
            if (target == null || target.Length == 0 || offset < 0) return false;

            var originalPos = stream.CanSeek ? stream.Position : (long?)null;
            try
            {
                // Read the skipped prefix too instead of seeking, so this also works on non-seekable streams.
                var buffer = new byte[offset + target.Length];
                if (!ReadFull(stream, buffer, 0, buffer.Length))
                {
                    return false;
                }

                return buffer.Skip(offset).SequenceEqual(target);
            }
            finally
            {
                if (originalPos.HasValue)
                {
                    stream.Position = originalPos.Value;
                }
            }
        }

        /// <summary>
        /// Compares <paramref name="target"/> with the bytes that start <paramref name="offsetFromEnd"/>
        /// bytes before the end of the stream, then restores the stream position.
        /// </summary>
        /// <param name="stream">Stream to read from; must be seekable for a match to be possible.</param>
        /// <param name="offsetFromEnd">Distance, in bytes, from the end of the stream to the first compared byte.</param>
        /// <param name="target">Expected bytes.</param>
        /// <returns>
        /// True when the window lies inside the stream and equals <paramref name="target"/>;
        /// false for a null/empty target, a non-seekable stream, a window outside the stream, or a mismatch.
        /// </returns>
        private static bool ReadAndCompareTail(Stream stream, int offsetFromEnd, byte[] target)
        {
            if (target == null || target.Length == 0) return false;
            if (!stream.CanSeek) return false;

            // The window [Length - offsetFromEnd, Length - offsetFromEnd + target.Length) must fit in the stream:
            // it may not start before the beginning nor run past the end.
            if (offsetFromEnd < target.Length || offsetFromEnd > stream.Length) return false;

            var originalPos = stream.Position;
            try
            {
                stream.Position = stream.Length - offsetFromEnd;
                var buffer = new byte[target.Length];
                return ReadFull(stream, buffer, 0, buffer.Length) && buffer.SequenceEqual(target);
            }
            finally
            {
                stream.Position = originalPos;
            }
        }

        /// <summary>
        /// Reads exactly <paramref name="count"/> bytes into <paramref name="buffer"/> starting at
        /// <paramref name="offset"/>, looping because a single <c>Read</c> call may return fewer bytes
        /// than requested.
        /// </summary>
        /// <returns>True when <paramref name="count"/> bytes were read (or count is not positive); false when the stream ended first.</returns>
        private static bool ReadFull(Stream stream, byte[] buffer, int offset, int count)
        {
            if (count <= 0) return true;

            int total = 0;
            while (total < count)
            {
                int read = stream.Read(buffer, offset + total, count - total);
                if (read <= 0)
                {
                    break;
                }

                total += read;
            }

            return total == count;
        }

        /// <summary>
        /// Copies the stream from its current position to the end into a byte array and restores the
        /// original position when the stream is seekable.
        /// </summary>
        private static byte[] ReadAllBytesAndRestorePosition(this Stream stream)
        {
            var originalPos = stream.CanSeek ? stream.Position : (long?)null;
            try
            {
                using var ms = new MemoryStream();
                stream.CopyTo(ms);
                return ms.ToArray();
            }
            finally
            {
                if (originalPos.HasValue)
                {
                    stream.Position = originalPos.Value;
                }
            }
        }
    }
}