// Health/Assets/OpenCVForUnity/Examples/MainModules/dnn/TextRecognitionCRNNExample/TextRecognitionCRNNExample.cs
//
// 324 lines
// 12 KiB
// C#

#if !UNITY_WSA_10_0
using UnityEngine;
using UnityEngine.SceneManagement;
using System;
using System.Collections;
using System.IO;
using System.Collections.Generic;
using OpenCVForUnity.CoreModule;
using OpenCVForUnity.ImgcodecsModule;
using OpenCVForUnity.DnnModule;
using OpenCVForUnity.ImgprocModule;
using OpenCVForUnity.UnityUtils;
namespace OpenCVForUnityExample
{
/// <summary>
/// Text Recognition CRNN Example
/// This example demonstrates text detection and recognition using the TextDetectionModel_DB and TextRecognitionModel classes.
/// https://github.com/opencv/opencv_zoo/tree/master/models/text_detection_db
/// https://github.com/opencv/opencv_zoo/tree/master/models/text_recognition_crnn
/// https://docs.opencv.org/4.x/d4/d43/tutorial_dnn_text_spotting.html
/// </summary>
public class TextRecognitionCRNNExample : MonoBehaviour
{
// Width the input image is resized to before detection. It should be a multiple of 32.
const float detection_inputSize_w = 736f;
// Height the input image is resized to before detection. It should be a multiple of 32.
const float detection_inputSize_h = 736f;
// Scale value passed to the detection model's setInputParams.
const double detection_inputScale = 1.0 / 255.0;
// Mean value passed to the detection model's setInputParams.
Scalar detection_inputMean = new Scalar(122.67891434, 116.66876762, 104.00698793);
// Threshold of the binary map.
const float detection_binary_threshold = 0.3f;
// Threshold of polygons.
const float detection_polygon_threshold = 0.5f;
// Maximum number of candidate polygons.
const int detection_max_candidates = 200;
// The unclip ratio of the detected text region, which determines the output size.
const double detection_unclip_ratio = 2.0;
// Width of the recognition input; also the width of the cropped text image.
const float recogniton_inputSize_w = 100f;
// Height of the recognition input; also the height of the cropped text image.
const float recogniton_inputSize_h = 32f;
// Scale value passed to the recognition model's setInputParams.
const double recogniton_inputScale = 1.0 / 127.5;
// Mean value passed to the recognition model's setInputParams.
Scalar recogniton_inputMean = new Scalar(127.5);
/// <summary>
/// Relative path of the binary .onnx file that contains the trained detection network.
/// </summary>
string DETECTIONMODEL_FILENAME = "OpenCVForUnity/dnn/text_detection_DB_IC15_resnet18_2021sep.onnx";
/// <summary>
/// The resolved detection model file path (set in Start or GetFilePath).
/// </summary>
string detectionmodel_filepath;
/// <summary>
/// Relative path of the binary .onnx file that contains the trained recognition network.
/// </summary>
string RECOGNTIONMODEL_FILENAME = "OpenCVForUnity/dnn/text_recognition_CRNN_EN_2021sep.onnx";
/// <summary>
/// The resolved recognition model file path (set in Start or GetFilePath).
/// </summary>
string recognitionmodel_filepath;
/// <summary>
/// Relative path of the .txt file that contains the charset.
/// </summary>
string CHARSETTXT_FILENAME = "OpenCVForUnity/dnn/charset_36_EN.txt";
/// <summary>
/// The resolved charset txt file path (set in Start or GetFilePath).
/// </summary>
string charsettxt_filepath;
/// <summary>
/// Relative path of the input image that text is detected in.
/// </summary>
string IMAGE_FILENAME = "OpenCVForUnity/text/test_text.jpg";
/// <summary>
/// The resolved image file path (set in Start or GetFilePath).
/// </summary>
string image_filepath;
#if UNITY_WEBGL
// Handle to the running GetFilePath coroutine; set back to null once the paths are resolved,
// and used by OnDisable to stop the coroutine if it is still in flight.
IEnumerator getFilePath_Coroutine;
#endif
// Unity start-up hook: resolves the model, charset and image paths, then processes the image.
void Start()
{
#if !UNITY_WEBGL
    // The paths are returned directly here, so the example can run immediately.
    detectionmodel_filepath = Utils.getFilePath(DETECTIONMODEL_FILENAME);
    recognitionmodel_filepath = Utils.getFilePath(RECOGNTIONMODEL_FILENAME);
    charsettxt_filepath = Utils.getFilePath(CHARSETTXT_FILENAME);
    image_filepath = Utils.getFilePath(IMAGE_FILENAME);
    Run();
#else
    // On WebGL the paths are resolved by a coroutine, which calls Run() when it finishes.
    getFilePath_Coroutine = GetFilePath();
    StartCoroutine(getFilePath_Coroutine);
#endif
}
#if UNITY_WEBGL
/// <summary>
/// Coroutine that resolves the detection model, recognition model, charset and image
/// paths one after another, then clears the coroutine handle and calls Run.
/// </summary>
private IEnumerator GetFilePath()
{
    // Each lookup is awaited before the next one starts.
    yield return Utils.getFilePathAsync(DETECTIONMODEL_FILENAME, (path) =>
    {
        detectionmodel_filepath = path;
    });

    yield return Utils.getFilePathAsync(RECOGNTIONMODEL_FILENAME, (path) =>
    {
        recognitionmodel_filepath = path;
    });

    yield return Utils.getFilePathAsync(CHARSETTXT_FILENAME, (path) =>
    {
        charsettxt_filepath = path;
    });

    yield return Utils.getFilePathAsync(IMAGE_FILENAME, (path) =>
    {
        image_filepath = path;
    });

    // Mark the coroutine as finished so OnDisable does not try to stop it.
    getFilePath_Coroutine = null;

    Run();
}
#endif
/// <summary>
/// Loads the test image, detects text regions in it, recognizes the text of each region,
/// draws the results onto the image and shows it on this object's renderer.
/// </summary>
/// <remarks>
/// If the image cannot be loaded a black 368x368 placeholder is used. If any model or charset
/// path is missing, a hint is drawn onto the image instead of running inference.
/// All native OpenCV objects created here are released before the method returns, and the
/// OpenCV debug mode is switched off again even when an exception is thrown.
/// </remarks>
void Run()
{
    //if true, The error log of the Native side OpenCV will be displayed on the Unity Editor Console.
    Utils.setDebugMode(true);

    Mat img = null;
    TextDetectionModel_DB detectonModel = null;
    TextRecognitionModel recognitonModel = null;
    Mat croppedMat = null;
    Mat croppedGrayMat = null;

    try
    {
        img = Imgcodecs.imread(image_filepath, Imgcodecs.IMREAD_COLOR);
        if (img.empty())
        {
            Debug.LogError(IMAGE_FILENAME + " is not loaded. Please read “StreamingAssets/OpenCVForUnity/dnn/setup_dnn_module.pdf” to make the necessary setup.");

            // Release the empty Mat before swapping in the black placeholder image.
            img.Dispose();
            img = new Mat(368, 368, CvType.CV_8UC3, new Scalar(0, 0, 0));
        }

        // Adjust Quad.transform.localScale so the quad has the image's pixel dimensions.
        gameObject.transform.localScale = new Vector3(img.width(), img.height(), 1);
        Debug.Log("Screen.width " + Screen.width + " Screen.height " + Screen.height + " Screen.orientation " + Screen.orientation);

        // Fit the orthographic camera to whichever image dimension is the limiting one.
        float imageWidth = img.width();
        float imageHeight = img.height();
        float widthScale = (float)Screen.width / imageWidth;
        float heightScale = (float)Screen.height / imageHeight;
        if (widthScale < heightScale)
        {
            Camera.main.orthographicSize = (imageWidth * (float)Screen.height / (float)Screen.width) / 2;
        }
        else
        {
            Camera.main.orthographicSize = imageHeight / 2;
        }

        if (string.IsNullOrEmpty(detectionmodel_filepath) || string.IsNullOrEmpty(recognitionmodel_filepath) || string.IsNullOrEmpty(charsettxt_filepath))
        {
            Debug.LogError(DETECTIONMODEL_FILENAME + " or " + RECOGNTIONMODEL_FILENAME + " or " + CHARSETTXT_FILENAME + " is not loaded. Please read “StreamingAssets/OpenCVForUnity/dnn/setup_dnn_module.pdf” to make the necessary setup.");
        }
        else
        {
            // Create TextDetectionModel.
            detectonModel = new TextDetectionModel_DB(detectionmodel_filepath);
            detectonModel.setBinaryThreshold(detection_binary_threshold);
            detectonModel.setPolygonThreshold(detection_polygon_threshold);
            detectonModel.setUnclipRatio(detection_unclip_ratio);
            detectonModel.setMaxCandidates(detection_max_candidates);
            detectonModel.setInputParams(detection_inputScale, new Size(detection_inputSize_w, detection_inputSize_h), detection_inputMean);

            // Create TextRecognitionModel.
            recognitonModel = new TextRecognitionModel(recognitionmodel_filepath);
            recognitonModel.setDecodeType("CTC-greedy");
            recognitonModel.setVocabulary(loadCharset(charsettxt_filepath));
            recognitonModel.setInputParams(recogniton_inputScale, new Size(recogniton_inputSize_w, recogniton_inputSize_h), recogniton_inputMean);

            // Scratch buffers for the perspective-corrected crop and its grayscale version.
            // They are 8-bit unsigned to match the 8-bit BGR image they are filled from
            // (previously declared with the signed CV_8SC* types).
            croppedMat = new Mat(new Size(recogniton_inputSize_w, recogniton_inputSize_h), CvType.CV_8UC3);
            croppedGrayMat = new Mat(croppedMat.size(), CvType.CV_8UC1);
        }

        if (detectonModel == null || recognitonModel == null)
        {
            Imgproc.putText(img, "model file is not loaded.", new Point(5, img.rows() - 30), Imgproc.FONT_HERSHEY_SIMPLEX, 0.7, new Scalar(255, 255, 255), 2, Imgproc.LINE_AA, false);
            Imgproc.putText(img, "Please read console message.", new Point(5, img.rows() - 10), Imgproc.FONT_HERSHEY_SIMPLEX, 0.7, new Scalar(255, 255, 255), 2, Imgproc.LINE_AA, false);
        }
        else
        {
            // The timer and the detection outputs wrap native objects, so scope them with using.
            using (TickMeter tickMeter = new TickMeter())
            using (MatOfRotatedRect detectons = new MatOfRotatedRect())
            using (MatOfFloat confidences = new MatOfFloat())
            {
                tickMeter.start();
                detectonModel.detectTextRectangles(img, detectons, confidences);
                tickMeter.stop();

                RotatedRect[] detectons_arr = detectons.toArray();
                foreach (var rb in detectons_arr)
                {
                    Point[] vertices = new Point[4];
                    rb.points(vertices);

                    // Outline the detected region.
                    for (int j = 0; j < 4; ++j)
                    {
                        Imgproc.line(img, vertices[j], vertices[(j + 1) % 4], new Scalar(0, 255, 0), 1);
                    }

                    // Create transformed and cropped image.
                    fourPointsTransform(img, croppedMat, vertices);
                    Imgproc.cvtColor(croppedMat, croppedGrayMat, Imgproc.COLOR_BGR2GRAY);

                    // The tick meter is restarted per region, so the logged total covers
                    // detection plus every recognition call.
                    tickMeter.start();
                    string recognitionResult = recognitonModel.recognize(croppedGrayMat);
                    tickMeter.stop();

                    Debug.Log(recognitionResult);
                    Imgproc.putText(img, recognitionResult, vertices[1], Imgproc.FONT_HERSHEY_SIMPLEX, 0.8, new Scalar(0, 0, 255), 2, Imgproc.LINE_AA, false);
                }

                Debug.Log("Inference time, ms: " + tickMeter.getTimeMilli());
            }
        }

        // The image was loaded and annotated in BGR order; the RGB24 texture needs RGB.
        Imgproc.cvtColor(img, img, Imgproc.COLOR_BGR2RGB);
        Texture2D texture = new Texture2D(img.cols(), img.rows(), TextureFormat.RGB24, false);
        Utils.matToTexture2D(img, texture);
        gameObject.GetComponent<Renderer>().material.mainTexture = texture;
    }
    finally
    {
        // Release every native object, whether or not inference succeeded.
        if (detectonModel != null)
        {
            detectonModel.Dispose();
        }
        if (recognitonModel != null)
        {
            recognitonModel.Dispose();
        }
        if (croppedMat != null)
        {
            croppedMat.Dispose();
        }
        if (croppedGrayMat != null)
        {
            croppedGrayMat.Dispose();
        }
        // Once matToTexture2D has run, the texture is expected to hold its own copy of the
        // pixels, so the source Mat is no longer needed.
        if (img != null)
        {
            img.Dispose();
        }

        Utils.setDebugMode(false);
    }
}
// Update is called once per frame.
// Intentionally empty: the image is processed a single time in Run(), so there is no per-frame work.
void Update()
{
}
/// <summary>
/// Called by Unity when the behaviour is disabled.
/// On WebGL, stops and disposes the path-resolving coroutine if it is still pending.
/// </summary>
void OnDisable()
{
#if UNITY_WEBGL
    // A null handle means the coroutine was never started or has already finished.
    if (getFilePath_Coroutine == null)
    {
        return;
    }

    StopCoroutine(getFilePath_Coroutine);

    IDisposable pendingCoroutine = (IDisposable)getFilePath_Coroutine;
    pendingCoroutine.Dispose();
#endif
}
/// <summary>
/// Handler for the back button: loads the "OpenCVForUnityExample" scene.
/// </summary>
public void OnBackButtonClick()
{
    const string exampleListScene = "OpenCVForUnityExample";
    SceneManager.LoadScene(exampleListScene);
}
/// <summary>
/// Warps the quadrilateral described by <paramref name="vertices"/> in <paramref name="src"/>
/// onto the full area of <paramref name="dst"/> using a perspective transform.
/// </summary>
/// <param name="src">Source image to crop from.</param>
/// <param name="dst">Destination image; its current size determines the output size.</param>
/// <param name="vertices">
/// Four corner points of the region in <paramref name="src"/>. They are mapped, in order, to the
/// bottom-left, top-left, top-right and bottom-right corners of <paramref name="dst"/>.
/// </param>
protected void fourPointsTransform(Mat src, Mat dst, Point[] vertices)
{
    Size outputSize = dst.size();

    // Destination corners, in the same order as the incoming vertices.
    Point[] targetVertices = new Point[]
    {
        new Point(0, outputSize.height - 1),
        new Point(0, 0),
        new Point(outputSize.width - 1, 0),
        new Point(outputSize.width - 1, outputSize.height - 1),
    };

    // These temporaries wrap native memory and this method runs once per detected region,
    // so release them deterministically instead of waiting for finalizers.
    using (MatOfPoint2f verticesMat = new MatOfPoint2f(vertices))
    using (MatOfPoint2f targetVerticesMat = new MatOfPoint2f(targetVertices))
    using (Mat rotationMatrix = Imgproc.getPerspectiveTransform(verticesMat, targetVerticesMat))
    {
        Imgproc.warpPerspective(src, dst, rotationMatrix, outputSize);
    }
}
/// <summary>
/// Reads the charset file and returns its lines as a list, one entry per line.
/// </summary>
/// <param name="charsetPath">Path of the charset text file.</param>
/// <returns>A new list containing every line of the file, in file order.</returns>
protected List<string> loadCharset(string charsetPath)
{
    string[] lines = File.ReadAllLines(charsetPath);

    List<string> charset = new List<string>(lines.Length);
    charset.AddRange(lines);
    return charset;
}
}
}
#endif