Files
Atomx/Atomx.Utils/Files/FileTypes.cs
2025-12-02 13:10:10 +08:00

285 lines
11 KiB
C#
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text;
using System.Text.RegularExpressions;
namespace Atomx.Utils.Files
{
/// <summary>
/// Detects common file formats by comparing the bytes at the start of a stream (and, for
/// some formats, its content or trailing bytes) against a table of known signatures.
/// </summary>
/// <remarks>
/// Detection starts at the stream's current position. Seekable streams are returned to that
/// position afterwards. Non-seekable streams cannot be rewound, so whatever had to be read
/// for detection is consumed; the bytes are buffered internally so that every signature in
/// the table is still compared against the same data.
/// </remarks>
public static class FileTypes
{
    // The public field signatures are kept unchanged for compatibility with existing callers.

    /// <summary>Image signatures: type name mapped to the bytes expected at the start of the stream.</summary>
    public static readonly Dictionary<string, byte[]> ImageHeader = new();

    /// <summary>
    /// General file signatures. A value is either a <c>byte[]</c> header, or an <c>object[]</c>
    /// whose first element is the <c>byte[]</c> header and whose optional second element is a
    /// <see cref="Regex"/> (matched against the content decoded as ASCII) or an <c>int</c>
    /// offset from the end of the stream followed by one or more candidate <c>byte[]</c> tails.
    /// </summary>
    public static readonly Dictionary<string, object> FilesHeader = new();

    /// <summary>Video signatures, using the same descriptor format as <see cref="FilesHeader"/>.</summary>
    public static readonly Dictionary<string, object> VideoHeader = new();

    static FileTypes()
    {
        // "GIF8" is the prefix shared by the GIF87a and GIF89a signatures, so both are recognised.
        ImageHeader.Add("gif", new byte[] { 71, 73, 70, 56 });
        // "BM"
        ImageHeader.Add("bmp", new byte[] { 66, 77 });
        ImageHeader.Add("jpg", new byte[] { 255, 216, 255 });
        // PNG signature followed by the length (13) and type ("IHDR") of the first chunk.
        ImageHeader.Add("png", new byte[] { 137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82 });

        // "%PDF-": the version digits that follow vary between files and are not part of the signature.
        FilesHeader.Add("pdf", new byte[] { 37, 80, 68, 70, 45 });

        // Office Open XML: a specific ZIP local-file-header prefix plus a part name that must occur in the content.
        FilesHeader.Add("docx", new object[] { new byte[] { 80, 75, 3, 4, 20, 0, 6, 0, 8, 0, 0, 0, 33 }, new Regex(@"word/_rels/document\.xml\.rels", RegexOptions.IgnoreCase) });
        FilesHeader.Add("xlsx", new object[] { new byte[] { 80, 75, 3, 4, 20, 0, 6, 0, 8, 0, 0, 0, 33 }, new Regex(@"xl/_rels/workbook\.xml\.rels", RegexOptions.IgnoreCase) });
        FilesHeader.Add("pptx", new object[] { new byte[] { 80, 75, 3, 4, 20, 0, 6, 0, 8, 0, 0, 0, 33 }, new Regex(@"ppt/_rels/presentation\.xml\.rels", RegexOptions.IgnoreCase) });

        // Legacy Office documents share one 8-byte header; the regex tells the applications apart.
        FilesHeader.Add("doc", new object[] { new byte[] { 208, 207, 17, 224, 161, 177, 26, 225 }, new Regex(@"microsoft( office)? word(?![\s\S]*?microsoft)", RegexOptions.IgnoreCase) });
        FilesHeader.Add("xls", new object[] { new byte[] { 208, 207, 17, 224, 161, 177, 26, 225 }, new Regex(@"microsoft( office)? excel(?![\s\S]*?microsoft)", RegexOptions.IgnoreCase) });
        FilesHeader.Add("ppt", new object[] { new byte[] { 208, 207, 17, 224, 161, 177, 26, 225 }, new Regex(@"c.u.r.r.e.n.t. .u.s.e.r(?![\s\S]*?[a-z])", RegexOptions.IgnoreCase) });

        // NOTE(review): real AVI files appear to start with "RIFF" and carry "AVI " at offset 8,
        // so this offset-0 signature probably never matches them. It is left unchanged because
        // fixing it needs a new descriptor shape in these public tables - confirm before relying on it.
        FilesHeader.Add("avi", new byte[] { 65, 86, 73, 32 });
        FilesHeader.Add("mpg", new byte[] { 0, 0, 1, 0xBA });
        FilesHeader.Add("mpeg", new byte[] { 0, 0, 1, 0xB3 });
        // "Rar!" 0x1A 0x07
        FilesHeader.Add("rar", new byte[] { 82, 97, 114, 33, 26, 7 });
        // Generic ZIP ("PK" 3 4); registered after the Office Open XML entries, which share this prefix.
        FilesHeader.Add("zip", new byte[] { 80, 75, 3, 4 });

        VideoHeader.Add("avi", new byte[] { 65, 86, 73, 32 });
        VideoHeader.Add("mpg", new byte[] { 0, 0, 1, 0xBA });
        VideoHeader.Add("mpeg", new byte[] { 0, 0, 1, 0xB3 });
    }

    /// <summary>
    /// Detects the video type of a stream using <see cref="VideoHeader"/>.
    /// </summary>
    /// <param name="stream">The stream to inspect, starting at its current position.</param>
    /// <returns>The key of the first matching entry, or an empty string when nothing matches.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public static string VideoType(this Stream stream)
    {
        if (stream == null) throw new ArgumentNullException(nameof(stream));
        return Detect(stream, VideoHeader);
    }

    /// <summary>
    /// Detects the general file type of a stream using <see cref="FilesHeader"/>.
    /// </summary>
    /// <param name="stream">The stream to inspect, starting at its current position.</param>
    /// <returns>The key of the first matching entry, or an empty string when nothing matches.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    /// <remarks>
    /// Entries that carry a <see cref="Regex"/> need the whole remaining content, which is
    /// read into memory once per call when such an entry's header matches.
    /// </remarks>
    public static string FileType(this Stream stream)
    {
        if (stream == null) throw new ArgumentNullException(nameof(stream));
        return Detect(stream, FilesHeader);
    }

    /// <summary>
    /// Detects the image type of a stream using <see cref="ImageHeader"/>.
    /// </summary>
    /// <param name="stream">The stream to inspect, starting at its current position.</param>
    /// <returns>The key of the first matching entry, or an empty string when nothing matches.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public static string ImageType(this Stream stream)
    {
        if (stream == null) throw new ArgumentNullException(nameof(stream));

        // The former "is this plain text?" pass was removed: every one of its outcomes returned
        // an empty string, so it only cost a full read of the stream plus several decodes.
        return Detect(stream, ImageHeader);
    }

    /// <summary>
    /// Reads the stream from its current position to the end into a byte array and, when the
    /// stream is seekable, restores the original position.
    /// </summary>
    /// <param name="stream">The stream to read.</param>
    /// <returns>The bytes from the current position to the end of the stream.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
    public static byte[] StreamToBytes(this Stream stream)
    {
        if (stream == null) throw new ArgumentNullException(nameof(stream));
        return stream.ReadAllBytesAndRestorePosition();
    }

    // ---------- helpers ----------

    /// <summary>
    /// Walks a signature table in enumeration order and returns the key of the first entry
    /// that matches, or an empty string. The stream position is restored when possible, even
    /// if reading throws.
    /// </summary>
    private static string Detect<TDescriptor>(Stream stream, Dictionary<string, TDescriptor> signatures)
        where TDescriptor : class
    {
        var probe = new StreamProbe(stream);
        try
        {
            foreach (var entry in signatures)
            {
                if (Matches(probe, entry.Value))
                {
                    return entry.Key;
                }
            }

            return string.Empty;
        }
        finally
        {
            probe.Restore();
        }
    }

    /// <summary>
    /// Evaluates one signature descriptor against the probed stream.
    /// Supported descriptor shapes:
    /// - <c>byte[]</c>: the stream must start with these bytes;
    /// - <c>object[]</c>: element 0 is the <c>byte[]</c> header; element 1, when present, is a
    ///   <see cref="Regex"/> applied to the ASCII-decoded content, or an <c>int</c> offset from
    ///   the end of the stream followed by <c>byte[]</c> tails of which any one must match.
    ///   Any other second element means the header alone decides.
    /// Anything else (including null) never matches.
    /// </summary>
    private static bool Matches(StreamProbe probe, object descriptor)
    {
        if (descriptor is byte[] signature)
        {
            return probe.StartsWith(signature);
        }

        if (descriptor is object[] parts && parts.Length > 0 && parts[0] is byte[] head)
        {
            if (!probe.StartsWith(head))
            {
                return false;
            }

            if (parts.Length < 2)
            {
                return true;
            }

            if (parts[1] is Regex pattern)
            {
                return pattern.IsMatch(probe.AsciiText());
            }

            if (parts[1] is int offsetFromEnd)
            {
                // parts[2..] are candidate tails; a single hit is enough.
                for (int i = 2; i < parts.Length; i++)
                {
                    if (parts[i] is byte[] tail && probe.EndsWith(offsetFromEnd, tail))
                    {
                        return true;
                    }
                }

                return false;
            }

            // Unknown second element: treat the descriptor as header-only.
            return true;
        }

        return false;
    }

    /// <summary>
    /// Reads the stream from its current position to the end into a byte array and restores
    /// the original position when the stream is seekable.
    /// </summary>
    private static byte[] ReadAllBytesAndRestorePosition(this Stream stream)
    {
        var originalPos = stream.CanSeek ? stream.Position : (long?)null;
        try
        {
            using var ms = new MemoryStream();
            stream.CopyTo(ms);
            return ms.ToArray();
        }
        finally
        {
            if (originalPos.HasValue)
            {
                stream.Position = originalPos.Value;
            }
        }
    }

    /// <summary>
    /// Reads a stream forward exactly once and remembers every byte it has read, so that many
    /// signatures can be compared against the same data without rewinding. This is what makes
    /// detection work on non-seekable streams and avoids re-reading the content per signature.
    /// </summary>
    private sealed class StreamProbe
    {
        private readonly Stream _source;
        private readonly long? _origin;
        // Every byte read so far, counted from the position the probe started at.
        private readonly MemoryStream _seen = new MemoryStream();
        private bool _exhausted;
        private bool _textDecoded;
        private string _text = string.Empty;

        public StreamProbe(Stream source)
        {
            _source = source;
            _origin = source.CanSeek ? source.Position : (long?)null;
        }

        /// <summary>Returns true when the probed data begins with <paramref name="signature"/>; null or empty signatures never match.</summary>
        public bool StartsWith(byte[] signature)
        {
            if (signature == null || signature.Length == 0)
            {
                return false;
            }

            if (!Fill(signature.Length))
            {
                return false;
            }

            var seen = _seen.GetBuffer();
            for (int i = 0; i < signature.Length; i++)
            {
                if (seen[i] != signature[i])
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>Returns the whole probed content decoded as ASCII; read and decoded at most once.</summary>
        public string AsciiText()
        {
            if (!_textDecoded)
            {
                if (!_exhausted)
                {
                    // _seen is only ever appended to, so the remainder lands after the buffered prefix.
                    _source.CopyTo(_seen);
                    _exhausted = true;
                }

                _text = Encoding.ASCII.GetString(_seen.GetBuffer(), 0, (int)_seen.Length);
                _textDecoded = true;
            }

            return _text;
        }

        /// <summary>
        /// Compares the bytes that start <paramref name="offsetFromEnd"/> bytes before the end
        /// of the stream with <paramref name="signature"/>. Requires a seekable stream.
        /// </summary>
        public bool EndsWith(int offsetFromEnd, byte[] signature)
        {
            if (signature == null || signature.Length == 0)
            {
                return false;
            }

            if (!_source.CanSeek)
            {
                return false;
            }

            // The compared window [Length - offsetFromEnd, Length - offsetFromEnd + signature.Length)
            // has to lie entirely inside the stream.
            if (offsetFromEnd < signature.Length || offsetFromEnd > _source.Length)
            {
                return false;
            }

            long resume = _source.Position;
            try
            {
                _source.Position = _source.Length - offsetFromEnd;
                var tail = new byte[signature.Length];
                if (ReadUpTo(_source, tail) != tail.Length)
                {
                    return false;
                }

                for (int i = 0; i < tail.Length; i++)
                {
                    if (tail[i] != signature[i])
                    {
                        return false;
                    }
                }

                return true;
            }
            finally
            {
                // Resume forward reading exactly where the probe left off.
                _source.Position = resume;
            }
        }

        /// <summary>Moves a seekable stream back to the position it had when the probe was created.</summary>
        public void Restore()
        {
            if (_origin.HasValue)
            {
                _source.Position = _origin.Value;
            }
        }

        /// <summary>Ensures at least <paramref name="count"/> bytes are buffered; returns false when the stream ends first.</summary>
        private bool Fill(int count)
        {
            if (_seen.Length >= count)
            {
                return true;
            }

            if (_exhausted)
            {
                return false;
            }

            var missing = new byte[count - (int)_seen.Length];
            int got = ReadUpTo(_source, missing);
            // Keep partial data too: a shorter signature may still match it.
            _seen.Write(missing, 0, got);
            if (got < missing.Length)
            {
                _exhausted = true;
            }

            return got == missing.Length;
        }

        /// <summary>Reads until <paramref name="buffer"/> is full or the stream ends; returns the number of bytes read.</summary>
        private static int ReadUpTo(Stream stream, byte[] buffer)
        {
            int total = 0;
            int read;
            while (total < buffer.Length && (read = stream.Read(buffer, total, buffer.Length - total)) > 0)
            {
                total += read;
            }

            return total;
        }
    }
}
}