module(... or 'crawl', package.seeall)
local httputils = require('httputils')
--- Make a URL safe to embed between double quotes in a shell command.
-- Percent-encodes the characters that remain special inside a double-quoted
-- POSIX shell word (double quote, backtick, dollar, backslash) and all
-- control characters, so text scraped from a remote page cannot break out of
-- the quoted curl argument.
-- @param url string URL to sanitise
-- @return string the URL with the dangerous bytes replaced by %XX escapes
local function shellSafeUrl(url)
  local safe = string.gsub(url, '[%c"`$\\]', function(ch)
    return string.format("%%%02X", string.byte(ch))
  end)
  return safe
end

--- Crawl a site breadth-first, printing every URL that is fetched.
-- Pages are downloaded by shelling out to curl. Quoted values preceded by
-- '=' or '(' are harvested from each response: absolute URLs are followed
-- only when they contain `host`, root-relative URLs are prefixed with
-- "http://" .. host. Each URL is queued at most once and URLs of 255
-- characters or more are ignored.
-- @param host string host (optionally with port) the crawl is restricted to
-- @param startURL string first URL to fetch
-- @param useProxy boolean when truthy, curl is run through the hard-coded proxy
function scanHost(host, startURL, useProxy)
  local curl = 'curl -s -H "Accept: text/html" --max-filesize 100000 "%s"'
  if useProxy then
    curl = 'curl -s -H "Accept: text/html" -x http://fw.net.local:8081 --proxy-user proxyuser --max-filesize 100000 "%s"'
  end

  -- Link patterns are loop-invariant, so they are built once.
  local absURL = "[=(]['\"]([^'\"]-://[^'\"]-)['\"]"
  local relURL = "[=(]['\"]([%s]-/[^'\"]-)['\"]"
  local abshost = "http://" .. host

  -- FIFO work queue with explicit head/tail indices: popping is O(1)
  -- instead of shifting the whole array with table.remove(urls, 1).
  local urls = {}
  local urls_queued = {}
  local head, tail = 1, 0

  -- Kept local so the helper does not leak into the module environment.
  local function addUrl(url)
    if urls_queued[url] == nil and #url < 255 then
      tail = tail + 1
      urls[tail] = url
      urls_queued[url] = true
    end
  end

  addUrl(startURL)

  while head <= tail do
    local url = urls[head]
    urls[head] = nil
    head = head + 1
    print(url)

    local content = ""
    local cd = io.popen(string.format(curl, shellSafeUrl(url)), 'r')
    if cd then
      content = cd:read("*all") or ""
      cd:close()
    end
    -- A failed popen leaves content empty, so the URL is simply skipped.

    for word in string.gmatch(content, absURL) do
      local u = httputils.HtmlDecode(word)
      -- Plain search: host is literal text, not a Lua pattern ('-' and '.'
      -- in a host name would otherwise be treated as magic characters).
      if string.find(u, host, 1, true) then
        addUrl(u)
      end
    end

    for word in string.gmatch(content, relURL) do
      local decoded = httputils.HtmlDecode(word)
      -- relURL tolerates whitespace before the leading '/'; drop it so it
      -- does not end up between the host and the path.
      local u = string.gsub(decoded, "^%s+", "")
      -- "/wEP" prefix is presumably an ASP.NET view-state payload rather
      -- than a path -- TODO confirm. Values containing "//" are skipped too.
      if string.sub(u, 1, 4) ~= "/wEP" and string.find(u, '//', 1, true) == nil then
        addUrl(abshost .. u)
      end
    end
  end
end
--scanHost("localhost:1467", "http://localhost:1467/", false);
-- Runs as soon as this chunk is loaded: crawls localhost:1470 without the
-- proxy, starting at /nl-NL/Home.aspx.
scanHost("localhost:1470", "http://localhost:1470/nl-NL/Home.aspx", false);
module(... or 'httputils', package.seeall)
-- Characters replaced by HtmlEncode, mapped to the entity name that is
-- written between '&' and ';'.
-- NOTE(review): the key for 'nbsp' looks like an ordinary space here, which
-- would turn every space into "&nbsp;" when encoding -- confirm whether a
-- non-breaking space (U+00A0) was intended.
local htmlEncodeEntities = {
['&'] = 'amp',
[' '] = 'nbsp',
['"'] = 'quot',
['<'] = 'lt',
['>'] = 'gt'
}
-- Reverse lookup used when decoding: entity name -> literal character.
local htmlDecodeEntities = {}

--- Fill htmlDecodeEntities by inverting htmlEncodeEntities.
local function fillHtmlDecodeEntities()
  for literal, entityName in next, htmlEncodeEntities do
    htmlDecodeEntities[entityName] = literal
  end
end

-- Build the reverse table once, at load time.
fillHtmlDecodeEntities()
--- Translate one character into its HTML entity form.
-- @param char string the character to look up in htmlEncodeEntities
-- @return string "&name;" when the character has an entry, otherwise the
--   character itself
local function EncodeHtmlEntity(char)
  local entityName = htmlEncodeEntities[char]
  if not entityName then
    return char
  end
  return '&' .. entityName .. ';'
end
--- Resolve an entity name (the text between '&' and ';') to its character.
-- Intended as a string.gsub replacement callback. For a name that is not in
-- htmlDecodeEntities it returns nil, which makes gsub keep the original
-- match unchanged; returning the bare name instead would silently strip the
-- surrounding '&' and ';' from text that is not a known entity.
-- @param entity string entity name without '&' and ';'
-- @return string|nil the decoded character, or nil when the name is unknown
local function DecodeHtmlEntity(entity)
  return htmlDecodeEntities[entity]
end
--- Encode text for HTML by passing every character through EncodeHtmlEntity.
-- @param text string text to encode
-- @return string the encoded text
-- @return number the match count reported by string.gsub (one per character)
function HtmlEncode(text)
  local encoded, matches = string.gsub(text, "(.)", EncodeHtmlEntity)
  return encoded, matches
end
--- Decode named HTML entities in a string via DecodeHtmlEntity.
-- Only "&" .. alphanumeric name .. ";" is treated as an entity. A looser
-- "anything up to the next ';'" match would start at a stray '&' (for
-- example a query-string separator) and swallow a real entity further on.
-- @param html string text that may contain entities such as "&amp;"
-- @return string the decoded text
-- @return number the match count reported by string.gsub
function HtmlDecode(html)
  return string.gsub(html, "&(%w+);", DecodeHtmlEntity)
end
--- Placeholder for URL encoding; currently has no implementation.
-- Takes no arguments and returns nothing.
function UrlEncode()
  return
end
--- Placeholder for URL decoding; currently has no implementation.
-- Takes no arguments and returns nothing.
function UrlDecode()
end