提交 e2a7bbde 编写于 作者: 骆昊的技术专栏's avatar 骆昊的技术专栏

'更新了爬虫部分的代码'

上级 02918d50
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<!-- Project descriptor for the "web1804" project; the comment field records that it was created by HBuilder. -->
<name>web1804</name>
<comment>Create By HBuilder</comment>
<!-- No referenced projects are declared. -->
<projects>
</projects>
<!-- Build configuration: a single build command (the Aptana unified builder) with no arguments. -->
<buildSpec>
<buildCommand>
<name>com.aptana.ide.core.unifiedBuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<!-- Project natures: only the Aptana web nature is declared. -->
<natures>
<nature>com.aptana.projects.webnature</nature>
</natures>
<!-- One resource filter: an OR matcher over two multiFilter matchers on the project-relative path ("bin" and "setting"). -->
<!-- NOTE(review): the meaning of type 10 is not visible here; presumably it excludes matching folders. Confirm against the IDE documentation. -->
<filteredResources>
<filter>
<id>1531706375771</id>
<name></name>
<type>10</type>
<matcher>
<id>org.eclipse.ui.ide.orFilterMatcher</id>
<arguments>
<matcher>
<id>org.eclipse.ui.ide.multiFilter</id>
<arguments>1.0-projectRelativePath-matches-false-false-bin</arguments>
</matcher>
<matcher>
<id>org.eclipse.ui.ide.multiFilter</id>
<arguments>1.0-projectRelativePath-matches-false-false-setting</arguments>
</matcher>
</arguments>
</matcher>
</filter>
</filteredResources>
</projectDescription>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<script>
/*
var sum = 0;
for (var i = 1; i <= 100; i += 1) {
sum += i;
}
window.alert(sum);
*/
/*
var sum = 0;
var i = 1;
while (i <= 100) {
sum += i;
i += 1;
}
window.alert(sum);
*/
// Sum the integers 1..100 with a do...while loop.
// Unlike for/while, the body of do...while runs once before the condition is tested.
var sum = 0;
var i = 1;
do {
sum += i;
i += 1;
} while (i <= 100);
// Show the total in an alert dialog.
window.alert(sum);
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
.right {
float: right;
width: 250px;
height: 30px;
font-size: 16px;
line-height: 30px;
background-color: blue;
color: yellow;
text-align: center;
}
</style>
</head>
<body>
<div id="time" class="right"></div>
<script>
/**
 * Render the current local date and time into the element with id "time",
 * in the form "2018年7月16日&nbsp;&nbsp;星期一&nbsp;&nbsp;09:05:03".
 */
function showDateTime() {
  // Weekday names indexed by Date#getDay() (0 = Sunday ... 6 = Saturday).
  var weekdays = ["日", "一", "二", "三", "四", "五", "六"];

  // Left-pad a clock field (hours, minutes, seconds) to two digits.
  function pad(value) {
    return value < 10 ? "0" + value : "" + value;
  }

  var date = new Date();
  var str = "";
  str += date.getFullYear() + "年"; // year
  str += (date.getMonth() + 1) + "月"; // getMonth() returns 0-11
  str += date.getDate() + "日&nbsp;&nbsp;"; // day of month
  str += "星期" + weekdays[date.getDay()] + "&nbsp;&nbsp;"; // day of week (0-6)
  str += pad(date.getHours()) + ":" + pad(date.getMinutes()) + ":" + pad(date.getSeconds());

  // JavaScript = ECMAScript + BOM (window) + DOM (document).
  // document.getElementById looks an element up by its id.
  var div = document.getElementById("time");
  // The string contains character entities (&nbsp;), so it has to be assigned
  // through innerHTML; textContent would show the entity text literally.
  div.innerHTML = str;
}
// Render once right away; the timer below first fires only after one second.
showDateTime();
// The window object (BOM) represents the browser window.
// Its setInterval method runs a function periodically (here every 1000 ms).
setInterval(showDateTime, 1000);
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<script>
// Number-guessing game: the page picks an integer in [1, 100] and the player
// guesses through prompt() until correct; confirm() offers another round.
do {
  // Math.floor instead of parseInt: parseInt is a string parser, not a rounding function.
  var answer = Math.floor(Math.random() * 100 + 1);
  var total = 0; // number of guesses entered in this round
  var cancelled = false; // set when the player dismisses the prompt
  var thyAnswer;
  do {
    var input = prompt("请输入:");
    // prompt() returns null when cancelled; without this check the round
    // could only end by guessing the number.
    if (input === null) {
      cancelled = true;
      break;
    }
    total += 1;
    thyAnswer = parseInt(input, 10);
    if (thyAnswer > answer) {
      alert("小一点");
    } else if (thyAnswer < answer) {
      alert("大一点");
    } else if (thyAnswer == answer) {
      alert("恭喜你猜对了");
    } else {
      // Reached when the input is not a number (NaN compares false to everything).
      alert("你是猴子派来的逗比吗?");
    }
  } while (thyAnswer != answer);
  if (!cancelled && total > 7) {
    alert("智商捉急!!!");
  }
} while (confirm('再来一局?'));
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<h3><span id="counter">5</span>秒钟以后跳转到百度</h3>
<script>
// Count down from 5 in the #counter span, one step per second, then navigate away.
+function() {
  var counter = 5;
  var span = document.getElementById("counter");
  // A named function expression lets the callback reschedule itself;
  // arguments.callee is deprecated and throws a TypeError in strict mode.
  setTimeout(function tick() {
    counter -= 1;
    if (counter > 0) {
      span.textContent = counter;
      setTimeout(tick, 1000);
    } else {
      location.href = "http://www.baidu.com";
    }
  }, 1000);
}();
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
* {
margin: 0;
padding: 0;
}
#adv {
width: 940px;
margin: 0 auto;
}
#adv ul {
width: 120px;
height: 30px;
margin: 0 auto;
position: relative;
top: -30px;
}
#adv li {
width: 30px;
height: 30px;
list-style: none;
float: left;
color: #ccc;
cursor: pointer;
}
</style>
</head>
<body>
<div id="adv">
<img id="image" src="img/slide-1.jpg" alt="">
<ul>
<li class="dot"></li>
<li class="dot"></li>
<li class="dot"></li>
<li class="dot"></li>
</ul>
</div>
<script src="js/common.js"></script>
<script>
// Image carousel: cycles slide-1..slide-4 every two seconds; clicking a dot
// jumps to its slide and pauses the cycle, leaving the dot resumes it.
(function() {
  var current = 1; // number of the slide being shown
  var picture = document.getElementById("image");
  var timer = 0; // 0 means "no interval running"

  // Point the <img> at the slide with the given number.
  function show(n) {
    picture.src = "img/slide-" + n + ".jpg";
  }

  // Start the auto-advance interval unless one is already running.
  function startTimer() {
    if (timer != 0) {
      return;
    }
    timer = setInterval(function() {
      current += 1;
      if (current > 4) {
        current = 1;
      }
      show(current);
    }, 2000);
  }

  // Click on a dot: show its slide and stop the auto-advance.
  function onDotClick(evt) {
    evt = evt || event;
    var source = evt.target || evt.srcElement;
    current = source.index;
    show(current);
    clearInterval(timer);
    timer = 0;
  }

  // Pointer leaves a dot: resume the auto-advance.
  function onDotLeave(evt) {
    startTimer();
  }

  startTimer();

  // Ways to obtain elements through the document object:
  // 1. document.getElementById("...")
  // 2. document.getElementsByTagName("...")
  // 3. document.getElementsByClassName("...")
  // 4. document.getElementsByName("...")
  // 5. document.querySelector("...")
  // 6. document.querySelectorAll("...")
  var dots = document.querySelectorAll("#adv .dot");
  for (var k = 0; k < dots.length; k += 1) {
    var dot = dots[k];
    dot.index = k + 1; // remember which slide this dot selects
    bind(dot, "click", onDotClick);
    bind(dot, "mouseout", onDotLeave);
  }
})();
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
</head>
<body>
<button id="ok">确定</button>
<!--如果希望点击按钮时会执行对应的操作-->
<!--那么需要通过JavaScript为按钮绑定事件回调函数-->
<!--绑定事件回调函数大致有3种方式: -->
<!--1. 通过标签的onXXX属性来指定需要执行的事件回调函数-->
<!--2. 通过元素的onXXX属性来绑定需要执行的事件回调函数-->
<!--3. 通过元素的addEventListener方法来绑定事件回调函数-->
<script>
// Look up the button that the handlers below are attached to.
var btn = document.getElementById("ok");
// Click handler: shows a greeting.
function sayHello() {
alert("大家好!");
}
// Click handler: shows a farewell.
function sayGoodbye() {
alert("再见!");
}
// Netscape Navigator --> Firefox
// Internet Explorer
// Chrome
// Safari
// Opera
// Feature detection: use addEventListener when the element has it,
// otherwise fall back to attachEvent/detachEvent (event names carry an "on" prefix there).
if (btn.addEventListener) {
btn.addEventListener("click", sayHello);
btn.addEventListener("click", sayGoodbye);
// A third listener unregisters sayGoodbye when a click occurs.
btn.addEventListener("click", function() {
btn.removeEventListener("click", sayGoodbye);
});
} else {
btn.attachEvent("onclick", sayHello);
btn.attachEvent("onclick", sayGoodbye);
// Same idea as above, using the attachEvent counterpart detachEvent.
btn.attachEvent("onclick", function() {
btn.detachEvent("onclick", sayGoodbye);
});
}
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
* {
margin: 0;
padding: 0;
}
#container {
margin: 10px 20px;
}
#container li {
float: left;
list-style: none;
width: 60px;
height: 60px;
}
</style>
</head>
<body>
<div id="container">
<img src="img/hello.jpg" alt="">
<ul>
<li><img src="img/thumb-1.jpg" alt=""></li>
<li><img src="img/thumb-2.jpg" alt=""></li>
<li><img src="img/thumb-3.jpg" alt=""></li>
</ul>
</div>
<script src="js/common.js"></script>
<script>
// Hovering a thumbnail <li> swaps the large preview image.
+function() {
  // Child selector: the <img> that is a direct child of #container is the preview.
  var img = document.querySelector('#container>img');

  /**
   * mouseover handler: show the photo associated with the hovered list item.
   * @param {Event} evt event object (window.event is used when it is missing)
   */
  function showPhoto(evt) {
    evt = evt || window.event;
    // Event source: either the thumbnail <img> or the <li> itself.
    var node = evt.target || evt.srcElement;
    // Climb to the nearest node carrying photoName. Reading it from
    // target.parentNode alone yields "img/undefined" when the <li> is the target.
    while (node && !node.photoName) {
      node = node.parentNode;
    }
    if (node) {
      img.src = "img/" + node.photoName;
    }
  }

  var imgNames = ["hello.jpg", "goodbye.jpg", "oneshit.jpg"];
  // Node navigation properties available on an element:
  //   parentNode      - parent node
  //   children        - child elements
  //   nextSibling     - next sibling node (may be a whitespace text node)
  //   previousSibling - previous sibling node
  // The list is selected directly so the lookup does not depend on the
  // whitespace between the <img> and the <ul> in the markup.
  var ul = document.querySelector("#container>ul");
  for (var i = 0; i < ul.children.length; i += 1) {
    ul.children[i].photoName = imgNames[i];
    bind(ul.children[i], "mouseover", showPhoto);
  }
}();
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
* {
margin: 0;
padding: 0;
}
#container {
margin: 20px 50px;
}
#fruits li {
list-style: none;
width: 200px;
height: 50px;
font-size: 20px;
line-height: 50px;
background-color: cadetblue;
color: white;
text-align: center;
margin: 2px 0;
}
#fruits>li>a {
float: right;
text-decoration: none;
color: white;
position: relative;
right: 5px;
}
#fruits~input {
border: none;
outline: none;
font-size: 18px;
}
#fruits~input[type=text] {
border-bottom: 1px solid darkgray;
width: 200px;
height: 50px;
text-align: center;
}
#fruits~input[type=button] {
width: 80px;
height: 30px;
background-color: coral;
color: white;
vertical-align: bottom;
cursor: pointer;
}
</style>
</head>
<body>
<div id="container">
<ul id="fruits">
<li>苹果<a href="">×</a></li>
<li>香蕉<a href="">×</a></li>
<li>火龙果<a href="">×</a></li>
<li>西瓜<a href="">×</a></li>
</ul>
<input type="text" name="fruit">
<input id="ok" type="button" value="确定">
</div>
<script src="js/common.js"></script>
<script>
/**
 * Click handler for the "×" link: cancel the link's default action and
 * remove the list item that contains the link.
 * @param {Event} evt event object (window.event is used when it is missing)
 */
function removeItem(evt) {
  evt = evt || window.event;
  // Suppress the default behaviour through the shared helper.
  prevent(evt);
  var item = (evt.target || evt.srcElement).parentNode;
  item.parentNode.removeChild(item);
}
// Wire up the fruit list: existing "×" links delete their item, and the text
// box (Enter key) or the button inserts a new item at the top of the list.
(function() {
  var input = document.getElementsByName("fruit")[0];
  var okButton = document.getElementById("ok");

  // Insert the trimmed text as a new first <li>, then clear and refocus the box.
  function addItem(evt) {
    var name = input.value.trim();
    if (name.length > 0) {
      var item = document.createElement("li");
      item.textContent = name;
      item.style.backgroundColor = "rgba(20, 150, 180, 0.5)";

      var closer = document.createElement("a");
      closer.href = "";
      closer.textContent = "×";
      bind(closer, "click", removeItem);
      item.appendChild(closer);

      var list = document.getElementById("fruits");
      list.insertBefore(item, list.children[0]);
    }
    input.value = "";
    input.focus();
  }

  // Delete links that are already in the markup.
  var links = document.querySelectorAll("#fruits>li>a");
  for (var k = 0; k < links.length; k += 1) {
    bind(links[k], "click", removeItem);
  }

  // Key code 13 is the Enter key.
  bind(input, "keyup", function(evt) {
    evt = evt || window.event;
    if ((evt.keyCode || evt.which) == 13) {
      addItem();
    }
  });
  bind(okButton, "click", addItem);
})();
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
#container {
width: 800px;
height: 400px;
margin: 10px auto;
border: 1px solid black;
overflow: hidden;
}
#buttons {
width: 800px;
margin: 10px auto;
text-align: center;
}
#add, #fla {
border: none;
outline: none;
width: 80px;
height: 30px;
background-color: red;
color: white;
}
.small {
width: 80px;
height: 80px;
float: left;
}
</style>
</head>
<body>
<div id="container">
</div>
<div id="buttons">
<button id="add">添加</button>
<button id="fla">闪烁</button>
</div>
<script src="js/common.js"></script>
<script>
// "添加" prepends a randomly coloured tile to #container; "闪烁" toggles an
// interval that recolours every tile five times per second.
(function() {
  var box = document.getElementById("container");
  var addBtn = document.getElementById("add");
  var flashBtn = document.getElementById("fla");
  var timer = 0; // 0 means "not flashing"

  // Give every tile currently in the container a new random colour.
  function recolorAll() {
    var tiles = document.querySelectorAll("#container>div");
    for (var k = 0; k < tiles.length; k += 1) {
      tiles[k].style.backgroundColor = randomColor();
    }
  }

  bind(addBtn, "click", function() {
    var tile = document.createElement("div");
    tile.className = "small";
    tile.style.backgroundColor = randomColor();
    box.insertBefore(tile, box.children[0]);
  });

  bind(flashBtn, "click", function(evt) {
    evt = prepare(evt);
    if (timer != 0) {
      // Currently flashing: stop and restore the button label.
      evt.target.textContent = "闪烁";
      clearInterval(timer);
      timer = 0;
      return;
    }
    evt.target.textContent = "停止";
    timer = setInterval(recolorAll, 200);
  });
})();
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
#one {
width: 400px;
height: 400px;
background-color: red;
}
#two {
width: 300px;
height: 300px;
background-color: green;
}
#three {
width: 200px;
height: 200px;
background-color: blue;
}
#two, #three {
position: relative;
left: 50px;
top: 50px;
}
</style>
</head>
<body>
<div id="container">
<div id="one">
<div id="two">
<div id="three"></div>
</div>
</div>
</div>
<script src="js/common.js"></script>
<script>
// Event-bubbling demo with three nested boxes: clicks on #one and #two
// announce themselves; the innermost #three stops propagation first.
(function() {
  var outer = document.getElementById("one");
  var middle = document.getElementById("two");
  var inner = document.getElementById("three");

  // Build a click handler that alerts the given label.
  function announce(label) {
    return function() {
      alert(label);
    };
  }

  bind(outer, "click", announce("one"));
  bind(middle, "click", announce("two"));
  bind(inner, "click", function(evt) {
    // Use stopPropagation when it exists; otherwise set cancelBubble.
    if (!evt.stopPropagation) {
      evt.cancelBubble = true;
    } else {
      evt.stopPropagation();
    }
    alert("three");
  });
})();
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
#adv {
position: fixed;
right: 10px;
top: 10px;
width: 200px;
height: 200px;
background-color: blue;
color: yellow;
}
#close {
float: right;
}
</style>
</head>
<body>
<div id="adv">
<span>此广告位招租</span>
<button id="close">关闭</button>
</div>
<script src="js/common.js"></script>
<script>
// The "关闭" button moves the ad panel down 30px per click and hides it
// once its top offset is no longer below 300px.
(function() {
  var panel = document.getElementById("adv");
  var closeBtn = document.getElementById("close");

  bind(closeBtn, "click", function() {
    // currentStyle when the element has it, otherwise the computed style.
    var computed = panel.currentStyle ||
      document.defaultView.getComputedStyle(panel);
    var offset = parseInt(computed.top);
    if (!(offset < 300)) {
      // display:none removes the panel from the layout
      // (visibility:hidden would keep its space).
      panel.style.display = "none";
      return;
    }
    panel.style.top = (offset + 30) + "px";
  });
})();
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
* {
margin: 0;
padding: 0;
}
#container {
margin: 20px 50px;
}
#fruits li {
list-style: none;
width: 200px;
height: 50px;
font-size: 20px;
line-height: 50px;
background-color: cadetblue;
color: white;
text-align: center;
margin: 2px 0;
}
#fruits>li>a {
float: right;
text-decoration: none;
color: white;
position: relative;
right: 5px;
}
#fruits~input {
border: none;
outline: none;
font-size: 18px;
}
#fruits~input[type=text] {
border-bottom: 1px solid darkgray;
width: 200px;
height: 50px;
text-align: center;
}
#fruits~input[type=button] {
width: 80px;
height: 30px;
background-color: coral;
color: white;
vertical-align: bottom;
cursor: pointer;
}
</style>
</head>
<body>
<div id="container">
<ul id="fruits">
<li>苹果<a href="">×</a></li>
<li>香蕉<a href="">×</a></li>
<li>火龙果<a href="">×</a></li>
<li>西瓜<a href="">×</a></li>
</ul>
<input type="text" name="fruit">
<input id="ok" type="button" value="确定">
</div>
<script src="js/jquery.min.js"></script>
<script>
// 写JavaScript代码时为什么推荐使用jQuery而不写原生JavaScript
// 因为jQuery对象有更多的属性和方法, 能够用更少的代码做更多的事情
// 而且jQuery对象的方法使用灵活且没有浏览器兼容性问题
// 当加载jQuery成功时会在window对象上绑定名为jQuery的属性
// 该属性还有一个名字叫$, $既是一个对象也是一个函数
// 当$作为函数时有以下四种最常用的用法:
// 1. 如果$函数的参数是一个函数, 传入的函数是页面加载完成时要执行的回调函数
// 2. 如果$函数的参数是选择器字符串, 那么$函数会返回代表元素的jQuery对象(其本质是一个数组)
// 3. 如果$函数的参数是标签字符串, 那么$函数会创建该标签并返回对应的jQuery对象
// 4. 如果$函数的参数是原生JavaScript对象(DOM), 那么$函数将该对象处理成jQuery对象
// 用法1
// $(fn): run fn once the page has finished loading.
$(function() {
  // Click handler for a "×" link: cancel navigation and remove its <li>.
  function removeItem(evt) {
    evt.preventDefault();
    // $(domNode): wrap a native DOM object in a jQuery object.
    $(evt.target).parent().remove();
  }

  // Append the trimmed text as a new list item, then clear and refocus the box.
  function addItem(evt) {
    // $(selector): the input directly after #fruits.
    var $input = $("#fruits+input");
    var fruitName = $input.val().trim();
    if (fruitName.length > 0) {
      // $(tagString): create new elements.
      var $a = $("<a href=''>").text("×").on("click", removeItem);
      var $li = $("<li>").text(fruitName);
      $li.append($a);
      $("#fruits").append($li);
    }
    $input.val("");
    $input.focus();
  }

  // Delete links already in the markup, and the confirm button.
  $("#fruits a").on("click", removeItem);
  $("#ok").on("click", addItem);
});
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
#data {
border-collapse: collapse;
}
#data td, #data th {
width: 120px;
height: 40px;
text-align: center;
border: 1px solid black;
}
#buttons {
margin: 10px 0;
}
</style>
</head>
<body>
<table id="data">
<caption>数据统计表</caption>
<tr>
<th>姓名</th>
<th>年龄</th>
<th>性别</th>
<th>身高</th>
<th>体重</th>
</tr>
<tr>
<td>Item1</td>
<td>Item2</td>
<td>Item3</td>
<td>Item4</td>
<td>Item5</td>
</tr>
<tr>
<td>Item1</td>
<td>Item2</td>
<td>Item3</td>
<td>Item4</td>
<td>Item5</td>
</tr>
<tr>
<td>Item1</td>
<td>Item2</td>
<td>Item3</td>
<td>Item4</td>
<td>Item5</td>
</tr>
<tr>
<td>Item1</td>
<td>Item2</td>
<td>Item3</td>
<td>Item4</td>
<td>Item5</td>
</tr>
<tr>
<td>Item1</td>
<td>Item2</td>
<td><a>Item3</a></td>
<td>Item4</td>
<td>Item5</td>
</tr>
<tr>
<td>Item1</td>
<td>Item2</td>
<td>Item3</td>
<td>Item4</td>
<td><a>Item5</a></td>
</tr>
</table>
<div id="buttons">
<button id="pretty">美化表格</button>
<button id="clear">清除数据</button>
<button id="remove">删单元格</button>
<button id="hide">隐藏表格</button>
</div>
<script src="js/jquery.min.js"></script>
<script>
// Button handlers for the #data table, registered once the page has loaded.
$(function() {
  // Colour the rows: white text below the header, alternating row
  // backgrounds, and a white background for the header row.
  function prettify() {
    $("#data tr:gt(0)").css("color", "white");
    $("#data tr:odd").css("background-color", "darkgreen");
    $("#data tr:even").css("background-color", "darkmagenta");
    $("#data tr:eq(0)").css("background-color", "white");
  }

  // Empty every data cell (rows after the header).
  function clearCells() {
    $("#data tr:gt(0) td").html("");
  }

  // Remove the last row after the header.
  function removeLast() {
    $("#data tr:gt(0):last").remove();
  }

  // Fade the table out over two seconds, then switch it to
  // display:block with visibility:hidden.
  function hideTable() {
    var $table = $("#data");
    $table.fadeOut(2000, function() {
      $table.css("display", "block").css("visibility", "hidden");
    });
  }

  $("#pretty").on("click", prettify);
  $("#clear").on("click", clearCells);
  $("#remove").on("click", removeLast);
  $("#hide").on("click", hideTable);
});
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title></title>
<style>
* {
margin: 0;
padding: 0;
}
#container {
width: 960px;
margin: 0 auto;
}
</style>
</head>
<body>
<button id="load">加载</button>
<div id="container"></div>
<script src="https://cdn.bootcss.com/jquery/3.3.1/jquery.min.js"></script>
<script>
// Clicking "加载" fetches a list of pictures via Ajax and appends them to #container.
$(function() {
  $("#load").on("click", function() {
    // Demo of percent-encoding; both functions take a single argument and always use UTF-8.
    console.log(encodeURIComponent("手机"));
    console.log(decodeURIComponent("%E7%8B%97%E5%B1%8E"));
    // An Ajax request fetches data so that only part of the page is refreshed.
    // jQuery wraps several Ajax helpers:
    //   - $.ajax(): flexible and powerful (recommended)
    //   - $.getJSON(): simple shortcut
    // Uniform Resource Locator:
    //   protocol://host-or-ip:port/path/resource?query-string
    // HTTP(S) defines several request methods; the ones likely to be used are:
    //   - GET: fetch a resource from the server
    //   - POST: submit a resource to the server
    //   - DELETE: delete a resource on the server
    //   - PUT / PATCH: update a resource on the server
    var url = "http://api.tianapi.com/meinv/";
    $.ajax({
      "url": url,
      "type": "get",
      "data": {
        // NOTE(review): API key embedded in client-side code is visible to every visitor.
        "key": "772a81a51ae5c780251b1f98ea431b84",
        "num": 15
      },
      "dataType": "json",
      "success": function(json) {
        // An error payload may carry no newslist; treat that as an empty result.
        var list = (json && json.newslist) || [];
        for (var i = 0; i < list.length; i += 1) {
          var mm = list[i];
          // Declared with var: the original assignment created a global $img.
          var $img = $("<img>").attr({ "src": mm.picUrl, "alt": "" });
          $("#container").append($img.width(300));
        }
      },
      "error": function(xhr, status, err) {
        console.error("Ajax request failed:", status, err);
      }
    });
  });
});
</script>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<title></title>
<style>
.t99 {
border-collapse: collapse;
}
.t99 td {
padding: 0 10px;
border: 1px solid black;
}
</style>
</head>
<body>
<script src="js/common.js"></script>
<script>
// Emit the table at this point in the body.
// NOTE(review): createTable is presumably provided by js/common.js loaded just above; confirm.
createTable();
</script>
</body>
</html>
/**
 * Attach an event handler to an element.
 * Uses addEventListener when the element has it, otherwise attachEvent
 * with the event name prefixed by "on".
 * @param {HTMLElement} elem element to attach the handler to
 * @param {String} en event name without the "on" prefix
 * @param {Function} fn callback function
 * @param {Boolean} capture whether to use event capturing (addEventListener only)
 */
function bind(elem, en, fn, capture) {
  if (!elem.addEventListener) {
    elem.attachEvent('on' + en, fn);
    return;
  }
  elem.addEventListener(en, fn, capture);
}
/**
 * Detach an event handler from an element.
 * Uses removeEventListener when the element has it, otherwise detachEvent
 * with the event name prefixed by "on".
 * @param {HTMLElement} elem element to detach the handler from
 * @param {String} en event name without the "on" prefix
 * @param {Function} fn callback function that was attached
 */
function unbind(elem, en, fn) {
  if (!elem.removeEventListener) {
    elem.detachEvent('on' + en, fn);
    return;
  }
  elem.removeEventListener(en, fn);
}
/**
 * Normalise an event object so handlers can rely on `target` and
 * `preventDefault` being present.
 * @param {Event} evt event object; window.event is used when it is missing
 * @returns {Event} the same event object, patched in place
 */
function prepare(evt) {
  var e = evt ? evt : window.event;
  // Fall back to srcElement when target is missing.
  e.target = e.target ? e.target : e.srcElement;
  // Fallback preventDefault sets returnValue to false on the event.
  e.preventDefault = e.preventDefault ? e.preventDefault : function() {
    this.returnValue = false;
  };
  return e;
}
/**
 * Cancel the default action of an event.
 * Calls preventDefault when the event has it, otherwise sets returnValue to false.
 * @param {Event} evt event object
 */
function prevent(evt) {
  if (!evt.preventDefault) {
    evt.returnValue = false;
    return;
  }
  evt.preventDefault();
}
/**
 * Return a random integer in the range [min, max).
 * @param {Number} min lower bound (inclusive)
 * @param {Number} max upper bound (exclusive)
 * @returns {Number} random integer n with min <= n < max
 */
function randomInt(min, max) {
  // Math.floor rather than parseInt: parseInt stringifies its argument, so a
  // tiny value such as 2.56e-7 parses as 2, and negative values are truncated
  // toward zero, which can produce the excluded upper bound.
  return Math.floor(Math.random() * (max - min) + min);
}
/**
 * Build a random CSS colour string of the form "rgb(r,g,b)".
 * Each channel comes from randomInt(0, 256), drawn in red, green, blue order.
 * @returns {String} CSS rgb() colour
 */
function randomColor() {
  var channels = [];
  for (var k = 0; k < 3; k += 1) {
    channels.push(randomInt(0, 256));
  }
  return "rgb(" + channels.join(",") + ")";
}
/**
 * Write a triangular 9x9 multiplication table into the document with
 * document.write: row i holds the cells "i*1=..." through "i*i=...".
 * The table carries the class "t99".
 */
function createTable() {
  document.write("<table class='t99'>");
  var row = 1;
  while (row <= 9) {
    document.write("<tr>");
    var col = 1;
    while (col <= row) {
      document.write("<td>");
      document.write(row + "*" + col + "=" + row * col);
      document.write("</td>");
      col += 1;
    }
    document.write("</tr>");
    row += 1;
  }
  document.write("</table>");
}
此差异已折叠。
......@@ -8,11 +8,8 @@
import scrapy
class DoubanItem(scrapy.Item):
class MovieItem(scrapy.Item):
name = scrapy.Field()
year = scrapy.Field()
title = scrapy.Field()
score = scrapy.Field()
director = scrapy.Field()
classification = scrapy.Field()
actor = scrapy.Field()
motto = scrapy.Field()
......@@ -78,7 +78,7 @@ class DoubanDownloaderMiddleware(object):
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None
request.meta['proxy'] = 'http://144.52.232.155:80'
def process_response(self, request, response, spider):
# Called with the response returned from the downloader.
......
......@@ -4,40 +4,17 @@
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.exceptions import DropItem
from scrapy.conf import settings
from scrapy import log
class DoubanPipeline(object):
def __init__(self):
connection = pymongo.MongoClient(settings['MONGODB_SERVER'], settings['MONGODB_PORT'])
db = connection[settings['MONGODB_DB']]
self.collection = db[settings['MONGODB_COLLECTION']]
# def __init__(self, server, port):
# pass
# @classmethod
# def from_crawler(cls, crawler):
# return cls(crawler.settings['MONGO_SERVER'],
# crawler.settings['MONGO_PORT'])
def process_item(self, item, spider):
#Remove invalid data
valid = True
for data in item:
if not data:
valid = False
raise DropItem("Missing %s of blogpost from %s" %(data, item['url']))
if valid:
#Insert data into database
new_moive=[{
"name":item['name'][0],
"year":item['year'][0],
"score":item['score'],
"director":item['director'],
"classification":item['classification'],
"actor":item['actor']
}]
self.collection.insert(new_moive)
log.msg("Item wrote to MongoDB database %s/%s" %
(settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
level=log.DEBUG, spider=spider)
return item
......@@ -11,35 +11,33 @@
BOT_NAME = 'douban'
MONGO_SERVER = '120.77.222.217'
MONGO_PORT = 27017
SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
'Chrome/65.0.3325.181 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS = 2
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
DOWNLOAD_DELAY = 5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
COOKIES_ENABLED = True
MONGODB_SERVER = '120.77.222.217'
MONGODB_PORT = 27017
MONGODB_DB = 'douban'
MONGODB_COLLECTION = 'movie'
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
......@@ -58,9 +56,9 @@ MONGODB_COLLECTION = 'movie'
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'douban.middlewares.DoubanDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
'douban.middlewares.DoubanDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
......@@ -71,11 +69,9 @@ MONGODB_COLLECTION = 'movie'
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'douban.pipelines.DoubanPipeline': 400,
'douban.pipelines.DoubanPipeline': 300,
}
LOG_LEVEL = 'DEBUG'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
......
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from douban.items import DoubanItem
from douban.items import MovieItem
class MovieSpider(CrawlSpider):
class MovieSpider(scrapy.Spider):
name = 'movie'
allowed_domains = ['movie.douban.com']
start_urls = ['https://movie.douban.com/top250']
rules = (
Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*'))),
Rule(LinkExtractor(allow=(r'https://movie.douban.com/subject/\d+')), callback='parse_item'),
)
def parse_item(self, response):
sel = Selector(response)
item = DoubanItem()
item['name']=sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
item['year']=sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
item['score']=sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
item['director']=sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()
item['classification']= sel.xpath('//span[@property="v:genre"]/text()').extract()
item['actor']= sel.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract()
#i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
#i['name'] = response.xpath('//div[@id="name"]').extract()
#i['description'] = response.xpath('//div[@id="description"]').extract()
return item
def parse(self, response):
li_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
for li in li_list:
item = MovieItem()
item['title'] = li.xpath('div/div[2]/div[1]/a/span[1]/text()').extract_first()
item['score'] = li.xpath('div/div[2]/div[2]/div/span[2]/text()').extract_first()
item['motto'] = li.xpath('div/div[2]/div[2]/p[2]/span/text()').extract_first()
yield item
href_list = response.css('a[href]::attr("href")').re('\?start=.*')
for href in href_list:
url = response.urljoin(href)
yield scrapy.Request(url=url, callback=self.parse)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册