#if !UNITY_WSA_10_0

using OpenCVForUnity.CoreModule;
using OpenCVForUnity.DnnModule;
using OpenCVForUnity.ImgprocModule;
using OpenCVForUnity.ObjdetectModule;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using UnityEngine;

namespace OpenCVForUnityExample.DnnModel
{
    /// <summary>
    /// Facial expression recognizer.
    /// Referring to https://github.com/opencv/opencv_zoo/tree/main/models/facial_expression_recognition
    /// </summary>
    public class FacialExpressionRecognizer
    {
        int backend;
        int target;

        string inputName = "data";
        string outputName = "label";
        Size input_size = new Size(112, 112);
        Scalar mean = new Scalar(0.5, 0.5, 0.5);
        Scalar std = new Scalar(0.5, 0.5, 0.5);

        Net facial_expression_recognition_net;
        List<string> classNames;

        List<Scalar> palette;

        Mat input_sizeMat;

        Mat getDataMat;

        FaceRecognizerSF faceRecognizer;

        public FacialExpressionRecognizer(string modelFilepath, string SF_modelFilepath, string SF_configFilepath, int backend = Dnn.DNN_BACKEND_OPENCV, int target = Dnn.DNN_TARGET_CPU)
        {
            // initialize
            if (!string.IsNullOrEmpty(modelFilepath))
            {
                facial_expression_recognition_net = Dnn.readNet(modelFilepath);
            }

            if (!string.IsNullOrEmpty(SF_modelFilepath))
            {
                faceRecognizer = FaceRecognizerSF.create(SF_modelFilepath, SF_configFilepath, backend, target);
            }

            this.backend = backend;
            this.target = target;

            if (facial_expression_recognition_net != null)
            {
                facial_expression_recognition_net.setPreferableBackend(this.backend);
                facial_expression_recognition_net.setPreferableTarget(this.target);
            }

            classNames = new List<string>();
            classNames.Add("angry");
            classNames.Add("disgust");
            classNames.Add("fearful");
            classNames.Add("happy");
            classNames.Add("neutral");
            classNames.Add("sad");
            classNames.Add("surprised");

            palette = new List<Scalar>();
            palette.Add(new Scalar(255, 56, 56, 255));
            palette.Add(new Scalar(82, 0, 133, 255));
            palette.Add(new Scalar(52, 69, 147, 255));
            palette.Add(new Scalar(255, 178, 29, 255));
            palette.Add(new Scalar(55, 55, 55, 255));
            palette.Add(new Scalar(100, 115, 255, 255));
            palette.Add(new Scalar(255, 112, 31, 255));
        }

        protected virtual Mat preprocess(Mat image, Mat bbox = null)
        {
            if (input_sizeMat == null)
                input_sizeMat = new Mat(input_size, CvType.CV_8UC3);

            if (bbox != null && faceRecognizer != null)
            {
                alignCrop(image, bbox, input_sizeMat);
            }
            else
            {
                Imgproc.resize(image, input_sizeMat, input_size);
            }

            // Create a 4D blob from a frame: HWC to NCHW, BGR to RGB.
            Mat blob = Dnn.blobFromImage(input_sizeMat, 1.0 / 255.0, input_sizeMat.size(), Scalar.all(0), true, false, CvType.CV_32F);

            int c = input_sizeMat.channels();
            int h = input_sizeMat.height();
            int w = input_sizeMat.width();

            Mat blob_cxhxw = blob.reshape(1, new int[] { c, h, w }); // [c, h, w]

            for (int i = 0; i < c; ++i)
            {
                Mat blob_1xhw = blob_cxhxw.row(i).reshape(1, 1); // [1, h, w] => [1, h * w]

                // Subtract blob by mean.
                Core.subtract(blob_1xhw, new Scalar(mean.val[i]), blob_1xhw);
                // Divide blob by std.
                Core.divide(blob_1xhw, new Scalar(std.val[i]), blob_1xhw);
            }

            return blob; // [1, 3, 112, 112]
        }
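        // Together, the 1/255 scale factor in blobFromImage and the per-channel loop above
        // implement the usual (x / 255 - mean) / std normalization. With mean = std = 0.5,
        // pixel values are mapped from [0, 255] into [-1, 1] (e.g. 255 -> 1.0, 0 -> -1.0).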
        public virtual Mat infer(Mat image, Mat bbox = null)
        {
            // Check the input image format.
            if (image.channels() != 3)
            {
                Debug.Log("The input image must be in BGR format.");
                return new Mat();
            }

            // Preprocess
            Mat input_blob = preprocess(image, bbox);

            // Forward
            facial_expression_recognition_net.setInput(input_blob, inputName);
            Mat output_blob = facial_expression_recognition_net.forward(outputName);

            // Postprocess
            Mat results = postprocess(output_blob);

            input_blob.Dispose();

            return results;
        }

        protected virtual Mat postprocess(Mat output_blob)
        {
            Mat results = softmax(output_blob);

            return results; // [1, 7]
        }
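        // The max subtraction below is the standard numerically stable softmax: adding a
        // constant to every logit leaves the result unchanged, and shifting the largest
        // logit to 0 keeps Core.exp from overflowing for large inputs.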
        protected virtual Mat softmax(Mat src)
        {
            Mat dst = src.clone();

            Core.MinMaxLocResult result = Core.minMaxLoc(src);
            Scalar max = new Scalar(result.maxVal);
            Core.subtract(src, max, dst);
            Core.exp(dst, dst);
            Scalar sum = Core.sumElems(dst);
            Core.divide(dst, sum, dst);

            return dst;
        }

        public virtual void visualize(Mat image, List<Mat> results, Mat faces, bool print_results = false, bool isRGB = false)
        {
            if (image.IsDisposed)
                return;

            if (results.Count != faces.rows())
                return;

            StringBuilder sb = null;

            if (print_results)
                sb = new StringBuilder();

            for (int i = 0; i < results.Count; ++i)
            {
                float[] face_box = new float[4];
                faces.get(i, 0, face_box);

                float left = face_box[0] + 2;
                float top = face_box[1] + 2;
                float right = face_box[0] + face_box[2] - 2;
                float bottom = face_box[1] + face_box[3] - 2;

                ClassificationData bmData = getBestMatchData(results[i]);
                int classId = (int)bmData.cls;
                string label = getClassLabel(bmData.cls) + ", " + String.Format("{0:0.0000}", bmData.conf);

                Scalar c = palette[classId % palette.Count];
                Scalar color = isRGB ? c : new Scalar(c.val[2], c.val[1], c.val[0], c.val[3]);

                // draw box
                Imgproc.rectangle(image, new Point(left, top), new Point(right, bottom), color, 2);

                // draw label
                int[] baseLine = new int[1];
                Size labelSize = Imgproc.getTextSize(label, Imgproc.FONT_HERSHEY_SIMPLEX, 0.5, 1, baseLine);

                top = Mathf.Max(top, (float)labelSize.height);
                Imgproc.rectangle(image, new Point(left, top + 2),
                    new Point(left + labelSize.width, top + labelSize.height + baseLine[0] + 2), color, Core.FILLED);
                Imgproc.putText(image, label, new Point(left, top + labelSize.height + 2), Imgproc.FONT_HERSHEY_SIMPLEX, 0.5, Scalar.all(255), 1, Imgproc.LINE_AA);

                // Print results
                if (print_results)
                {
                    sb.AppendLine(String.Format("-----------expression {0}-----------", i + 1));
                    sb.AppendLine("Best match: " + getClassLabel(bmData.cls) + ", " + bmData);
                }
            }

            if (print_results)
                Debug.Log(sb);
        }

        public virtual void dispose()
        {
            if (facial_expression_recognition_net != null)
                facial_expression_recognition_net.Dispose();

            if (input_sizeMat != null)
                input_sizeMat.Dispose();

            input_sizeMat = null;

            if (getDataMat != null)
                getDataMat.Dispose();

            getDataMat = null;

            if (faceRecognizer != null)
                faceRecognizer.Dispose();
        }

        private void alignCrop(Mat src_img, Mat face_box, Mat aligned_img)
        {
            // The alignCrop method of FaceRecognizerSF is used here, because the implementation of the alignment and crop process is cumbersome.
            // This method returns an image of 112x112 pixels, the same size as the Facial Expression Recognition model input.
            faceRecognizer.alignCrop(src_img, face_box, aligned_img);
        }

        [StructLayout(LayoutKind.Sequential)]
        public readonly struct ClassificationData
        {
            public readonly float cls;
            public readonly float conf;

            // sizeof(ClassificationData)
            public const int Size = 2 * sizeof(float);

            public ClassificationData(int cls, float conf)
            {
                this.cls = cls;
                this.conf = conf;
            }

            public override string ToString()
            {
                return "cls:" + cls + " conf:" + conf;
            }
        }

        public virtual ClassificationData[] getData(Mat results)
        {
            if (results.empty())
                return new ClassificationData[0];

            int num = results.cols();

            if (getDataMat == null)
            {
                getDataMat = new Mat(num, 2, CvType.CV_32FC1);
                float[] arange = Enumerable.Range(0, num).Select(i => (float)i).ToArray();
                getDataMat.col(0).put(0, 0, arange);
            }

            Mat results_numx1 = results.reshape(1, num);
            results_numx1.copyTo(getDataMat.col(1));

            var dst = new ClassificationData[num];
            OpenCVForUnity.UtilsModule.MatUtils.copyFromMat(getDataMat, dst);

            return dst;
        }

        public virtual ClassificationData[] getSortedData(Mat results, int topK = 5)
        {
            if (results.empty())
                return new ClassificationData[0];

            int num = results.cols();

            if (topK < 1 || topK > num)
                topK = num;

            var sortedData = getData(results).OrderByDescending(x => x.conf).Take(topK).ToArray();

            return sortedData;
        }

        public virtual ClassificationData getBestMatchData(Mat results)
        {
            if (results.empty())
                return new ClassificationData();

            Core.MinMaxLocResult minmax = Core.minMaxLoc(results);

            return new ClassificationData((int)minmax.maxLoc.x, (float)minmax.maxVal);
        }

        public virtual string getClassLabel(float id)
        {
            int classId = (int)id;

            string className = string.Empty;
            if (classNames != null && classNames.Count != 0)
            {
                if (classId >= 0 && classId < classNames.Count)
                {
                    className = classNames[classId];
                }
            }
            if (string.IsNullOrEmpty(className))
                className = classId.ToString();

            return className;
        }
    }
}
#endif
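
// Usage sketch (comment only, kept outside the compiled region). The model file names
// follow the opencv_zoo naming and are assumptions; adjust them to your project and
// resolve full paths with e.g. OpenCVForUnity.UnityUtils.Utils.getFilePath.
//
//     var recognizer = new OpenCVForUnityExample.DnnModel.FacialExpressionRecognizer(
//         "facial_expression_recognition_mobilefacenet_2022july.onnx", // expression model (assumed name)
//         "face_recognition_sface_2021dec.onnx",                       // SFace model used by alignCrop (assumed name)
//         "");                                                         // SFace config (empty for ONNX models)
//
//     // bgrImage: a 3-channel BGR Mat; faceBox: one detection row (x, y, w, h, landmarks...)
//     // from a face detector such as FaceDetectorYN.
//     using (Mat results = recognizer.infer(bgrImage, faceBox))
//     {
//         var best = recognizer.getBestMatchData(results);
//         Debug.Log(recognizer.getClassLabel(best.cls) + " : " + best.conf);
//     }
//
//     recognizer.dispose();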