由爱的恩典机器照管一切
前几天一直在忙着写 iOS 课程的期末项目 Octor(一个笔记软件),还添加了 OCR 功能:
自定义拍照视图
CameraViewController 中实现了自定义的拍照视图。
class CameraViewController: UIViewController {
    // Vision request that locates text rectangles in the live camera feed.
    private var textDetectionRequest: VNDetectTextRectanglesRequest!
    private var captureSession: AVCaptureSession!
    private var stillImageOutput: AVCapturePhotoOutput!
    private var videoPreviewLayer: AVCaptureVideoPreviewLayer!
    private var takePhotoButton: UIButton!
    /// Receives the captured photo once the user confirms it.
    public weak var delegate: CameraPhotoDelegate?

    override func viewDidLoad() {
        super.viewDidLoad()
        self.navigationItem.title = "文字识别"
        setupBackButton()
        setupTextDetection()
        setupCamera()
        addTakePhotoButton()
    }

    override func viewWillDisappear(_ animated: Bool) {
        super.viewWillDisappear(animated)
        self.captureSession.stopRunning()
    }
    // ......
    /// Configures the capture session: camera input, still-photo output,
    /// video-data output (for live text detection) and the preview layer.
    func setupCamera() {
        captureSession = AVCaptureSession()
        captureSession.sessionPreset = .high
        guard let captureDevice = AVCaptureDevice.default(for: .video) else { return }
        guard let input = try? AVCaptureDeviceInput(device: captureDevice) else { return }
        if captureSession.canAddInput(input) {
            videoPreviewLayer = AVCaptureVideoPreviewLayer(session: captureSession)
            // Use bounds (this view's own coordinate space), not frame
            // (superview coordinates) — they only coincide by accident.
            videoPreviewLayer.frame = view.bounds
            // .resize stretches the video to fill the view. The text-box
            // drawing in handleDetection maps normalized Vision coordinates
            // straight onto the view, which is only aligned with this gravity.
            videoPreviewLayer.videoGravity = .resize
            view.layer.addSublayer(videoPreviewLayer)
            // add input
            captureSession.addInput(input)
            // add image output
            stillImageOutput = AVCapturePhotoOutput()
            if captureSession.canAddOutput(stillImageOutput) {
                captureSession.addOutput(stillImageOutput)
            }
            // add video data output
            let videoDataOutput = AVCaptureVideoDataOutput()
            // AVCaptureVideoDataOutput requires a *serial* dispatch queue so
            // frames arrive in order; the previous concurrent queue violated
            // the documented contract.
            videoDataOutput.setSampleBufferDelegate(
                self,
                queue: DispatchQueue(label: "Buffer Queue", qos: .userInteractive))
            if captureSession.canAddOutput(videoDataOutput) {
                captureSession.addOutput(videoDataOutput)
            }
            // startRunning() blocks until the session is up — Apple recommends
            // calling it off the main thread to avoid freezing the UI.
            DispatchQueue.global(qos: .userInitiated).async {
                self.captureSession.startRunning()
            }
        }
    }
    // ......
}
通过 Vision 框架检测摄像头拍摄到的文字,并根据文字大小和位置绘制出文字的边界。
/// Creates the Vision text-rectangle request; detection results are
/// delivered to `handleDetection(request:error:)`.
func setupTextDetection() {
    textDetectionRequest = VNDetectTextRectanglesRequest(completionHandler: handleDetection)
    // Ask Vision for per-character boxes so region bounds can be computed.
    // (No force-unwrap needed: the property was assigned on the line above.)
    textDetectionRequest.reportCharacterBoxes = true
}
/// Completion handler for the Vision request: draws a teal border around
/// every detected text region, on the main thread.
private func handleDetection(request: VNRequest, error: Error?) {
    guard let detectionResults = request.results else {
        return
    }
    // compactMap drops non-text observations. The previous
    // `map { $0 as? VNTextObservation }` produced an array of Optionals,
    // so the isEmpty check below could never fire on a non-empty result.
    let textResults = detectionResults.compactMap { $0 as? VNTextObservation }
    if textResults.isEmpty {
        return
    }
    DispatchQueue.main.async {
        // remove old rects (sublayers 0 and 1 are the preview layer and button)
        self.view.layer.sublayers?.removeSubrange(2...)
        let viewWidth = self.view.frame.size.width
        let viewHeight = self.view.frame.size.height
        for region in textResults {
            // Skip this region (don't abort the whole pass — the old code
            // `return`ed here and silently dropped the remaining regions).
            guard let boxes = region.characterBoxes else {
                continue
            }
            // Union of all character boxes in the region.
            // Vision coordinates are normalized with origin at bottom-left.
            var xMin = CGFloat.greatestFiniteMagnitude
            var xMax: CGFloat = 0
            var yMin = CGFloat.greatestFiniteMagnitude
            var yMax: CGFloat = 0
            for box in boxes {
                xMin = min(xMin, box.bottomLeft.x)
                xMax = max(xMax, box.bottomRight.x)
                yMin = min(yMin, box.bottomRight.y)
                yMax = max(yMax, box.topRight.y)
            }
            // Convert to UIKit coordinates (origin at top-left: flip y).
            let x = xMin * viewWidth
            let y = (1 - yMax) * viewHeight
            let width = (xMax - xMin) * viewWidth
            let height = (yMax - yMin) * viewHeight
            // draw a new rect for current region
            let layer = CALayer()
            layer.frame = CGRect(x: x, y: y, width: width, height: height)
            layer.borderWidth = 1
            layer.borderColor = UIColor.systemTeal.cgColor
            self.view.layer.addSublayer(layer)
        }
        // keep the shutter button above the freshly added box layers
        self.takePhotoButton.layer.zPosition = 1
    }
}
视图中有一个白色按钮,点击该按钮将会弹出选择框,显示当前拍摄的图片以及可选操作。
/// Adds the round white shutter button, centered horizontally and sitting
/// 1.5 diameters above the bottom edge.
func addTakePhotoButton() {
    let buttonDiameter: CGFloat = 50

    let button = UIButton()
    button.translatesAutoresizingMaskIntoConstraints = false
    button.backgroundColor = .white
    // Corner radius of half the side length makes the button a circle.
    button.layer.cornerRadius = buttonDiameter / 2
    button.clipsToBounds = true
    button.addTarget(self, action: #selector(didTapTakePhoto), for: .touchUpInside)
    self.view.addSubview(button)
    takePhotoButton = button

    // Activate all layout constraints in one batch.
    NSLayoutConstraint.activate([
        button.centerXAnchor.constraint(equalTo: self.view.centerXAnchor),
        button.bottomAnchor.constraint(equalTo: self.view.bottomAnchor, constant: -1.5 * buttonDiameter),
        button.widthAnchor.constraint(equalToConstant: buttonDiameter),
        button.heightAnchor.constraint(equalToConstant: buttonDiameter),
    ])
}
/// Target-action for the shutter button: captures one still JPEG frame.
/// The result arrives in `photoOutput(_:didFinishProcessingPhoto:error:)`.
/// (Parameter changed from `UIButton!` to `UIButton` — target-action
/// dispatch matches by selector name, so callers are unaffected.)
@objc func didTapTakePhoto(sender: UIButton) {
    let settings = AVCapturePhotoSettings(format: [AVVideoCodecKey: AVVideoCodecType.jpeg])
    self.stillImageOutput.capturePhoto(with: settings, delegate: self)
}
// MARK: - AVCapturePhotoCaptureDelegate
extension CameraViewController: AVCapturePhotoCaptureDelegate {
    /// Called once the still photo is processed: shows an alert containing
    /// the image with "recognize / retake / discard" choices.
    func photoOutput(_ output: AVCapturePhotoOutput, didFinishProcessingPhoto photo: AVCapturePhoto, error: Error?) {
        // Bail out on capture errors or undecodable data instead of crashing
        // through the old implicitly-unwrapped `UIImage!` (which also ignored
        // the delegate's `error` parameter entirely).
        guard error == nil,
              let imageData = photo.fileDataRepresentation(),
              let image = UIImage(data: imageData) else { return }
        let photoAlertController = UIAlertController(title: "当前图片", message: nil, preferredStyle: .alert)
        photoAlertController.addImage(image: image)
        photoAlertController.addAction(UIAlertAction(title: "识别文字", style: .default) { _ in
            // Hand the confirmed photo to the OCR flow, then return.
            self.delegate?.onCameraPhotoReady(image: image)
            self.navigationController?.popViewController(animated: true)
        })
        photoAlertController.addAction(UIAlertAction(title: "重新选择", style: .default, handler: nil))
        photoAlertController.addAction(UIAlertAction(title: "丢弃", style: .cancel) { _ in
            self.navigationController?.popViewController(animated: true)
        })
        present(photoAlertController, animated: true)
    }
}
图片在 addImage 函数中需要进行等比例缩小。
// MARK: - UIAlertController Extension
// MARK: - UIAlertController Extension
extension UIAlertController {
    /// Embeds `image` in the alert as a disabled action, scaled to fit a
    /// 245×300 box while preserving its aspect ratio.
    func addImage(image: UIImage) {
        let imageAction = UIAlertAction(title: "", style: .default)
        imageAction.isEnabled = false
        let maxSize = CGSize(width: 245, height: 300)
        // scale(maxSize:) can fail; skip the image instead of crashing on
        // the old implicitly-unwrapped optional.
        guard var scaledImage = image.scale(maxSize: maxSize) else {
            self.addAction(imageAction)
            return
        }
        if image.size.height > image.size.width {
            // Portrait images are narrower than the alert: shift right by
            // half the leftover width to center them.
            let left = (maxSize.width - scaledImage.size.width) / 2
            scaledImage = scaledImage.withAlignmentRectInsets(UIEdgeInsets(top: 0, left: -left, bottom: 0, right: 0))
        }
        // NOTE(review): "image" is a private UIAlertAction KVC key, not public
        // API — it may break in a future iOS release; verify before shipping.
        imageAction.setValue(scaledImage.withRenderingMode(.alwaysOriginal), forKey: "image")
        self.addAction(imageAction)
    }
}
// MARK: - UIImage Extension
// MARK: - UIImage Extension
extension UIImage {
    /// Returns a copy scaled (preserving aspect ratio) so that it fits
    /// entirely inside `maxSize`, or `nil` if rendering fails.
    func scale(maxSize: CGSize) -> UIImage? {
        // Use the smaller of the two ratios so BOTH dimensions fit inside
        // maxSize. The old code picked only the longer side's ratio, letting
        // the other dimension overflow: a square image targeted at 245×300
        // came out 300×300 (wider than the box).
        let ratio = min(maxSize.width / size.width, maxSize.height / size.height)
        let targetSize = CGSize(width: size.width * ratio, height: size.height * ratio)
        // draw a new image at the target size
        UIGraphicsBeginImageContext(targetSize)
        draw(in: CGRect(origin: .zero, size: targetSize))
        let scaledImage = UIGraphicsGetImageFromCurrentImageContext()
        UIGraphicsEndImageContext()
        return scaledImage
    }
}
文字识别
TextRecognizer 用于实现文字识别,使用了 Tesseract OCR iOS 框架。
/// Wraps Tesseract OCR (English + French) behind a single `recognize` call.
class TextRecognizer {
    // A plain optional instead of an implicitly-unwrapped one: the
    // G8Tesseract initializer can fail, and every use site below already
    // chains safely with `?.`.
    private let tesseract: G8Tesseract?

    init() {
        tesseract = G8Tesseract(language: "eng+fra")
        tesseract?.engineMode = .tesseractCubeCombined
        tesseract?.pageSegmentationMode = .auto
    }

    /// Runs OCR on `image` and returns the recognized text, or `nil` when
    /// the Tesseract engine could not be created.
    func recognize(_ image: UIImage) -> String? {
        // Tesseract handles large inputs poorly: cap the longer side at
        // 1000 points, then binarize to sharpen the text.
        let scaledImage = image.scale(maxDimension: 1000) ?? image
        let preprocessedImage = scaledImage.preprocess() ?? scaledImage
        tesseract?.image = preprocessedImage
        tesseract?.recognize()
        return tesseract?.recognizedText
    }
}
由于 Tesseract OCR 框架的限制,这里同样需要将图像等比例缩小。preprocess 函数通过 GPUImage 框架的 GPUImageAdaptiveThresholdFilter 组件提高图像的质量。
GPUImage 文档的解释:
GPUImageAdaptiveThresholdFilter: Determines the local luminance around a pixel, then turns the pixel black if it is below that local luminance and white if above. This can be useful for picking out text under varying lighting conditions.
// MARK: - UIImage Extension
// MARK: - UIImage Extension
extension UIImage {
    /// Returns a copy whose longer side equals `maxDimension`, preserving
    /// the width-to-height ratio. Note: smaller images are scaled up too.
    func scale(maxDimension: CGFloat) -> UIImage? {
        let targetSize: CGSize
        if size.width > size.height {
            // Landscape: pin the width, derive the height.
            targetSize = CGSize(width: maxDimension,
                                height: size.height / size.width * maxDimension)
        } else {
            // Portrait or square: pin the height, derive the width.
            targetSize = CGSize(width: size.width / size.height * maxDimension,
                                height: maxDimension)
        }
        // Render the resized image; defer guarantees the context is closed.
        UIGraphicsBeginImageContext(targetSize)
        defer { UIGraphicsEndImageContext() }
        draw(in: CGRect(origin: .zero, size: targetSize))
        return UIGraphicsGetImageFromCurrentImageContext()
    }

    /// Binarizes the image with GPUImage's adaptive threshold filter so text
    /// stands out under uneven lighting (improves Tesseract accuracy).
    func preprocess() -> UIImage? {
        let thresholdFilter = GPUImageAdaptiveThresholdFilter()
        thresholdFilter.blurRadiusInPixels = 15.0
        return thresholdFilter.image(byFilteringImage: self)
    }
}
由爱的恩典机器照管一切
文章开头的演示 GIF 里,扫描出来的文本是布劳提根的诗歌 All Watched Over by Machines of Loving Grace,它比上面的代码更有意思。
I like to think (and
the sooner the better!)
of a cybernetic meadow
where mammals and computers
live together in mutually
programming harmony
like pure water
touching clear sky.
I like to think
(right now, please!)
of a cybernetic forest
filled with pines and electronics
where deer stroll peacefully
past computers
as if they were flowers
with spinning blossoms.
I like to think
(it has to be!)
of a cybernetic ecology
where we are free of our labors
and joined back to nature,
returned to our mammal
brothers and sisters,
and all watched over
by machines of loving grace.