README.md (4 additions, 0 deletions)
@@ -19,6 +19,10 @@ Skip url (by extension) (skip pdf AND xml url):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --skipext pdf --skipext xml

Exclude url (skip any url containing the given string):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --exclude "action=edit"
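
The --exclude option can be given several times; each value is matched as a plain substring of the url. A usage sketch (the second pattern, "action=history", is only illustrative):

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --exclude "action=edit" --exclude "action=history"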

Read the robots.txt to ignore some url:

>>> python main.py --domain http://blog.lesite.us --output sitemap.xml --parserobots
main.py (14 additions, 3 deletions)
@@ -26,13 +26,24 @@ def can_fetch(parserobots, rp, link):
print ("Error during parsing robots.txt")
return True


def exclude_url(exclude, link):
	# Return False when the link contains any of the --exclude patterns
	# (plain substring match); return True otherwise, or when no pattern was given.
	if exclude:
		for ex in exclude:
			if ex in link:
				return False
		return True
	else:
		return True
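
# A minimal illustration of exclude_url() (the values below are only examples):
#   exclude_url(["action=edit"], "http://blog.lesite.us/?action=edit")  -> False (skipped)
#   exclude_url(["action=edit"], "http://blog.lesite.us/article/1")     -> True  (kept)
#   exclude_url([], "http://blog.lesite.us/article/1")                  -> True  (no pattern, kept)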

# Command-line argument handling
parser = argparse.ArgumentParser(version="0.1", description='Crawler for sitemap generation')
parser.add_argument('--domain', action="store", default="",required=True, help="Target domain (ex: http://blog.lesite.us)")
parser.add_argument('--skipext', action="append", default=[], required=False, help="File extension to skip")
parser.add_argument('--parserobots', action="store_true", default=False, required=False, help="Ignore files defined in robots.txt")
parser.add_argument('--debug', action="store_true", default=False, help="Enable debug mode")
parser.add_argument('--output', action="store", default=None, help="Output file")
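# --exclude uses action="append": the flag can be repeated and the collected
# patterns are checked against each link by exclude_url() above.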
parser.add_argument('--exclude', action="append", default=[], required=False, help="Exclude URLs containing this string (can be repeated)")

arg = parser.parse_args()

@@ -113,11 +124,11 @@ def can_fetch(parserobots, rp, link):
parsed_link = urlparse(link)
domain_link = parsed_link.netloc
target_extension = os.path.splitext(parsed_link.path)[1][1:]
if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link) and ("javascript:" not in link) and (target_extension not in arg.skipext):

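# A link is kept only when every check passes: not already crawled or queued,
# on the target domain, allowed by robots.txt, not a javascript: link, its
# extension not in --skipext, and no --exclude pattern found in the url.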
if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
print ("<url><loc>"+link+"</loc></url>", file=outputFile)
tocrawl.add(link)
print (footer, file=outputFile)

if arg.debug:
print ("Number of link crawled : {0}".format(len(crawled)))
print ("Number of link crawled : {0}".format(len(crawled)))