-- Lua Crawl Host

-- Date: 2016-07-04
module(... or 'crawl', package.seeall)

local httputils = require('httputils')

--- Make a URL safe to embed in the double-quoted curl argument.
-- Control characters, the double quote, the backquote, "$" and "\" are
-- special inside a double-quoted shell argument, so a crawled link that
-- contains them could break out of the quotes or trigger command
-- substitution. They are percent-encoded instead of being passed through.
-- @param url URL taken from the crawl queue
-- @return the URL with those bytes replaced by %XX
local function escapeUrlForShell(url)
	-- The parentheses discard gsub's second result (the substitution count).
	return (string.gsub(url, '[%c"`$\\]', function(char)
		return string.format("%%%02X", string.byte(char))
	end))
end

--- Crawl one host, printing every URL just before it is fetched with curl.
-- Links are pulled out of each downloaded page with two Lua patterns
-- (absolute "scheme://..." links and links starting with "/"); every link is
-- fetched at most once, in the order it was discovered.
-- @param host      host name (optionally with ":port"); absolute links are
--                  followed only when they contain this text, and links
--                  starting with "/" are prefixed with "http://"..host
-- @param startURL  first URL to fetch
-- @param useProxy  when truthy, curl is sent through the proxy
--                  http://fw.net.local:8081
-- @return nothing
function scanHost(host, startURL, useProxy)

	local curl = 'curl -s -H "Accept: text/html" --max-filesize 100000 "%s"'

	if (useProxy) then
		curl = 'curl -s -H "Accept: text/html" -x http://fw.net.local:8081 --proxy-user proxyuser --max-filesize 100000 "%s"'
	end

	-- Link patterns are loop-invariant, so they are built once.
	-- Both look for  = or (  followed by a quoted value.
	local absURL = "[=(]['\"]([^'\"]-://[^'\"]-)['\"]"
	local relURL = "[=(]['\"]([%s]-/[^'\"]-)['\"]"

	local abshost = "http://"..host

	-- FIFO queue kept as urls[head..tail]; a moving head index avoids the
	-- O(n) shift that table.remove(urls, 1) performs on every dequeue.
	local urls = {}
	local head, tail = 1, 0
	-- Set of every URL ever queued, so each one is fetched only once.
	local urls_queued = {}

	-- Local on purpose: a plain "function addUrl" would leak into the
	-- module environment and be redefined on every scanHost call.
	local function addUrl(url)
		if (urls_queued[url] == nil and #url < 255) then
			tail = tail + 1
			urls[tail] = url
			urls_queued[url] = true
		end
	end

	addUrl(startURL)

	while head <= tail do

		local url = urls[head]
		urls[head] = nil -- release the consumed slot
		head = head + 1

		print(url)

		local cmd = string.format(curl, escapeUrlForShell(url))
		local cd, err = io.popen(cmd, 'r')
		if not cd then
			error("scanHost: unable to start curl: "..tostring(err))
		end
		local content = cd:read("*all") or ""
		cd:close()

		for word in string.gmatch(content, absURL) do
			local u = httputils.HtmlDecode(word)
			-- Plain-text search: host names contain "." and "-", which are
			-- magic characters when the host is used as a Lua pattern.
			if string.find(u, host, 1, true) then
				addUrl(u)
			end
		end

		for word in string.gmatch(content, relURL) do
			-- relURL tolerates whitespace before the "/"; strip it so it
			-- does not end up in the middle of the absolute URL.
			local u = string.gsub(httputils.HtmlDecode(word), "^%s+", "")
			-- Skip values starting with "/wEP" and anything containing "//".
			if (string.sub(u, 1, 4) ~= "/wEP") and (string.find(u, '//', 1, true) == nil) then
				addUrl(abshost..u)
			end
		end
	end
end

-- NOTE(review): the call below runs at load time, so executing or requiring
-- this file immediately starts a crawl.
-- Alternative target, currently disabled:
--scanHost("localhost:1467", "http://localhost:1467/", false);
-- Crawl localhost:1470 starting at /nl-NL/Home.aspx; the third argument
-- (useProxy) is false, so curl is run without the proxy options.
scanHost("localhost:1470", "http://localhost:1470/nl-NL/Home.aspx", false);

module(... or 'httputils', package.seeall)

-- Character -> entity name (without the surrounding "&" and ";").
-- Used for encoding, and inverted to build the decoding table.
local htmlEncodeEntities = {
	["&"]  = "amp",
	[" "]  = "nbsp",
	["\""] = "quot",
	["<"]  = "lt",
	[">"]  = "gt",
}

-- Entity name -> character; the inverse of htmlEncodeEntities.
local htmlDecodeEntities = {}

--- Populate htmlDecodeEntities by swapping each key/value pair of
-- htmlEncodeEntities.
local function fillHtmlDecodeEntities()
	for character, entityName in pairs(htmlEncodeEntities) do
		htmlDecodeEntities[entityName] = character
	end
end

-- Build the decode table once, while the module is loading.
fillHtmlDecodeEntities()

--- gsub callback for HtmlEncode: turn one character into its entity.
-- @param char a single-character string
-- @return "&name;" when char has an entry in htmlEncodeEntities,
--         otherwise char itself
local function EncodeHtmlEntity(char)
	local entityName = htmlEncodeEntities[char]
	if not entityName then
		return char
	end
	return '&'..entityName..';'
end

--- gsub callback for HtmlDecode: translate one entity name to its character.
-- @param entity the text captured between "&" and ";" (for example "amp")
-- @return the character for a name found in htmlDecodeEntities; for any
--         other name the complete "&name;" sequence, so text that is not a
--         known entity passes through unchanged
local function DecodeHtmlEntity(entity)
	local character = htmlDecodeEntities[entity]
	if character then
		return character
	end
	-- The capture excludes the delimiters, so returning the bare name would
	-- silently delete "&" and ";" from the input (e.g. "&copy;" -> "copy").
	return '&'..entity..';'
end

--- Replace every character that has a named entity with "&name;".
-- Each character of the input is passed to EncodeHtmlEntity, which decides
-- the replacement.
-- @param text string to encode
-- @return the encoded string (a single value)
function HtmlEncode(text)
	-- string.gsub also returns the number of matches. Without the outer
	-- parentheses that count is returned too and ends up as an unexpected
	-- extra argument in calls such as table.insert(t, HtmlEncode(s)).
	return (string.gsub(text, "(.)", EncodeHtmlEntity))
end

--- Decode named HTML entities such as "&amp;" or "&lt;" in a string.
-- Each candidate name is passed to DecodeHtmlEntity, which decides the
-- replacement.
-- @param html text that may contain entities
-- @return the decoded string (a single value)
function HtmlDecode(html)
	-- Only a run of alphanumerics between "&" and ";" counts as an entity
	-- name. The previous lazy ".-" also matched spans like "&b=2;" inside
	-- query strings, and after a stray "&" it ran on to the ";" of the next
	-- real entity, so that entity was never decoded.
	-- The outer parentheses drop gsub's second result (the match count).
	return (string.gsub(html, "&(%w+);", DecodeHtmlEntity))
end

--- Percent-encode a string for use inside a URL component (RFC 3986).
-- Letters, digits, "-", ".", "_" and "~" are kept; every other byte is
-- written as "%XX" with uppercase hex digits.
-- @param text value to encode; when omitted the function returns nothing,
--             which is what the former empty stub did
-- @return the encoded string
function UrlEncode(text)
	if text == nil then
		return
	end
	-- The outer parentheses drop gsub's second result (the match count).
	return (string.gsub(tostring(text), "[^%w%-%._~]", function(char)
		return string.format("%%%02X", string.byte(char))
	end))
end

--- Decode "%XX" percent-escapes in a string (RFC 3986).
-- A "%" that is not followed by two hex digits is left as it is, and "+" is
-- not turned into a space (that convention belongs to form encoding only).
-- @param text value to decode; when omitted the function returns nothing,
--             which is what the former empty stub did
-- @return the decoded string
function UrlDecode(text)
	if text == nil then
		return
	end
	-- The outer parentheses drop gsub's second result (the match count).
	return (string.gsub(tostring(text), "%%(%x%x)", function(hex)
		return string.char(tonumber(hex, 16))
	end))
end

2430cookie-checkLua Crawl Host